|
|
|
@@ -6,13 +6,14 @@
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import datetime, time\n",
|
|
|
|
|
"import numpy as np\n",
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"from sklearn.svm import LinearSVC\n",
|
|
|
|
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
|
|
|
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
|
|
|
|
"from trees.Stree import Stree\n",
|
|
|
|
|
"import time"
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"from sklearn import tree\n",
|
|
|
|
|
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
|
|
|
|
|
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
|
|
|
|
|
"from trees.Stree import Stree"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@@ -35,66 +36,23 @@
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
|
|
|
|
|
"text": "2020-05-17 16:15:24\n"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"def load_creditcard(n_examples=0):\n",
"    \"\"\"Load the credit card fraud data from data/creditcard.csv.\n",
"\n",
"    n_examples > 0 keeps only the first n_examples rows; n_examples < 0 keeps every\n",
"    fraud row plus -n_examples randomly drawn non-fraud rows; 0 keeps all rows.\n",
"    Prints the class balance before and after the selection.\n",
"\n",
"    Returns (X, y): X holds every column except Class, Time and Amount, and y is\n",
"    the Class column expanded to an (n, 1) array.\n",
"    \"\"\"\n",
"    import random  # local import: random is not imported by the notebook's import cell\n",
"\n",
"    df = pd.read_csv('data/creditcard.csv')\n",
"    n_fraud = df.Class[df.Class == 1].count()\n",
"    n_valid = df.Class[df.Class == 0].count()\n",
"    print(\"*Original Fraud: {0:.3f}% {1}\".format(n_fraud*100/df.shape[0], n_fraud))\n",
"    print(\"*Original Valid: {0:.3f}% {1}\".format(n_valid*100/df.shape[0], n_valid))\n",
"    y = np.expand_dims(df.Class.values, axis=1)\n",
"    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
"    if n_examples > 0:\n",
"        X = X[:n_examples, :]\n",
"        y = y[:n_examples, :]\n",
"    elif n_examples < 0:\n",
"        fraud_mask = (y == 1).ravel()\n",
"        # Draw only from non-fraud rows so the fraud rows kept below are not duplicated\n",
"        valid_rows = np.flatnonzero(~fraud_mask).tolist()\n",
"        indices = random.sample(valid_rows, -1 * n_examples)\n",
"        X = np.append(X[fraud_mask], X[indices], axis=0)\n",
"        y = np.append(y[fraud_mask], y[indices], axis=0)\n",
"    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
"    print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
"    print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
"    return X, y\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"random_state = 1\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# Datasets\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
|
|
|
|
|
"# n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
|
|
|
|
|
"# class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"#X, y = load_wine(return_X_y=True)\n",
|
|
|
|
|
"#X, y = load_iris(return_X_y=True)\n",
|
|
|
|
|
"#y[y==2]=0\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"X, y = load_creditcard()"
|
|
|
|
|
"print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "root\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nroot - Down - Up - Down - Down, <cgaf> - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nroot - Down - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nroot - Down - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up - Up - Up, <cgaf> - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nroot - Up - Down - Down - Down, <cgaf> - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up - Down - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Up, <cgaf> - Leaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n41.5053 secs\n"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"t = time.time()\n",
|
|
|
|
|
"clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n",
|
|
|
|
|
"clf.fit(X, y)\n",
|
|
|
|
|
"print(clf)\n",
|
|
|
|
|
"print(f\"{time.time() - t:.4f} secs\")"
|
|
|
|
|
"# Load Dataset\n",
|
|
|
|
|
"df = pd.read_csv('data/creditcard.csv')\n",
|
|
|
|
|
"df.shape\n",
|
|
|
|
|
"random_state = 2020"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@@ -105,22 +63,24 @@
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "Accuracy: 0.999512\n0.2389 secs\n"
|
|
|
|
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"t = time.time()\n",
|
|
|
|
|
"clf.score(X, y)\n",
|
|
|
|
|
"print(f\"{time.time() - t:.4f} secs\")"
|
|
|
|
|
"print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
|
|
|
|
"print(\"Valid: {0:.3f}% {1:,}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# outcomes without optimization executing predict_proba. 87 seconds\n",
|
|
|
|
|
"(284807, 2)\n",
|
|
|
|
|
"87.5212 secs"
|
|
|
|
|
"# Normalize Amount\n",
|
|
|
|
|
"from sklearn.preprocessing import RobustScaler\n",
|
|
|
|
|
"values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))\n",
|
|
|
|
|
"df['Amount_Scaled'] = values"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
@@ -131,47 +91,214 @@
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "0.9991397683343457\n13.6326 secs\n"
|
|
|
|
|
"text": "X shape: (284807, 29)\ny shape: (284807,)\n"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"t = time.time()\n",
|
|
|
|
|
"clf2 = LinearSVC(C=.01, random_state=random_state)\n",
|
|
|
|
|
"clf2.fit(X, y)\n",
|
|
|
|
|
"print(clf2.score(X, y))\n",
|
|
|
|
|
"print(f\"{time.time() - t:.4f} secs\")"
|
|
|
|
|
"# Remove unneeded features\n",
|
|
|
|
|
"y = df.Class.values\n",
|
|
|
|
|
"X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
|
|
|
|
"print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Divide dataset\n",
|
|
|
|
|
"train_size = .7\n",
|
|
|
|
|
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Linear Tree\n",
|
|
|
|
|
"linear_tree = tree.DecisionTreeClassifier(random_state=random_state)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 10,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Random Forest\n",
|
|
|
|
|
"random_forest = RandomForestClassifier(random_state=random_state, n_jobs=-1, n_estimators=100)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Stree\n",
|
|
|
|
|
"stree = Stree(random_state=random_state, C=.01)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# AdaBoost\n",
|
|
|
|
|
"adaboost = AdaBoostClassifier(random_state=random_state)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 13,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# Gradient Boosting\n",
|
|
|
|
|
"gradient = GradientBoostingClassifier(random_state=random_state)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 14,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"def try_model(name, model):\n",
"    \"\"\"Fit `model` on the notebook-level train split and print its evaluation.\n",
"\n",
"    Uses the globals Xtrain, ytrain, Xtest and ytest. Only the call to fit() is timed.\n",
"    Prints a classification report and a confusion matrix for both splits.\n",
"\n",
"    Returns (f1 score on the test split, seconds spent fitting).\n",
"    \"\"\"\n",
"    print(f\"************************** {name} **********************\")\n",
"    started = time.time()\n",
"    model.fit(Xtrain, ytrain)\n",
"    spent = time.time() - started\n",
"    print(f\"Train Model {name} took: {spent:.4} seconds\")\n",
"    train_pred = model.predict(Xtrain)\n",
"    test_pred = model.predict(Xtest)\n",
"    # Per-split classification reports\n",
"    print(f\"=========== {name} - Train {Xtrain.shape[0]:,} samples =============\")\n",
"    print(classification_report(ytrain, train_pred, digits=6))\n",
"    print(f\"=========== {name} - Test {Xtest.shape[0]:,} samples =============\")\n",
"    print(classification_report(ytest, test_pred, digits=6))\n",
"    # Per-split confusion matrices\n",
"    print(\"Confusion Matrix in Train\")\n",
"    print(confusion_matrix(ytrain, train_pred))\n",
"    print(\"Confusion Matrix in Test\")\n",
"    print(confusion_matrix(ytest, test_pred))\n",
"    test_f1 = f1_score(ytest, test_pred)\n",
"    return test_f1, spent"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "1.0\n18.8308 secs\n"
|
|
|
|
|
"text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 14.13 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\n"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"output_type": "error",
|
|
|
|
|
"ename": "KeyboardInterrupt",
|
|
|
|
|
"evalue": "",
|
|
|
|
|
"traceback": [
|
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
|
|
|
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
|
|
|
|
"\u001b[0;32m<ipython-input-15-29ecc2c3d67b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0moutcomes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mmodels\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mf1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime_spent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtry_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0moutcomes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime_spent\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mf1\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0mbest_f1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m<ipython-input-14-b6f4fa54c657>\u001b[0m in \u001b[0;36mtry_model\u001b[0;34m(name, model)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"************************** {name} **********************\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mnow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mytrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0mspent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mnow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Train Model {name} took: {spent:.4} seconds\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m~/Code/pyblique/venv/lib/python3.7/site-packages/sklearn/ensemble/_forest.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 381\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclass_weight\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 382\u001b[0m n_samples_bootstrap=n_samples_bootstrap)\n\u001b[0;32m--> 383\u001b[0;31m for i, t in enumerate(trees))\n\u001b[0m\u001b[1;32m 384\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 385\u001b[0m \u001b[0;31m# Collect newly grown trees\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m~/Code/pyblique/venv/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 1015\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1016\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieval_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1017\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mretrieve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1018\u001b[0m \u001b[0;31m# Make sure that we get a last message telling us we are done\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1019\u001b[0m \u001b[0melapsed_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_start_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m~/Code/pyblique/venv/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36mretrieve\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 907\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 908\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'supports_timeout'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 909\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 910\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_output\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 649\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 651\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 652\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/pool.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 553\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;32m/usr/local/Cellar/python/3.7.6_1/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 296\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 297\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"t = time.time()\n",
|
|
|
|
|
"clf3 = DecisionTreeClassifier(random_state=random_state)\n",
|
|
|
|
|
"clf3.fit(X, y)\n",
|
|
|
|
|
"print(clf3.score(X, y))\n",
|
|
|
|
|
"print(f\"{time.time() - t:.4f} secs\")"
|
|
|
|
|
"# Train & Test models\n",
"models = {\n",
"    'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n",
"    'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n",
"}\n",
"\n",
"# Bind every best_* name up front so the report cell never hits an undefined name;\n",
"# floats keep the report's '.4' / '.6' format specs valid even if no model scores above 0.\n",
"best_model, best_f1, best_time = None, 0.0, 0.0\n",
"outcomes = []  # (name, f1 on test, fit seconds) per model, in execution order\n",
"for name, model in models.items():\n",
"    f1, time_spent = try_model(name, model)\n",
"    outcomes.append((name, f1, time_spent))\n",
"    if f1 > best_f1:\n",
"        best_model, best_f1, best_time = name, f1, time_spent"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"text": "************************************************************************************************************************************\n"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"output_type": "error",
|
|
|
|
|
"ename": "NameError",
|
|
|
|
|
"evalue": "name 'best_model' is not defined",
|
|
|
|
|
"traceback": [
|
|
|
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
|
|
|
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
|
|
|
"\u001b[0;32m<ipython-input-1-71789a67ee66>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"*\"\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;36m132\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"*\"\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;36m132\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtime_spent\u001b[0m \u001b[0;32min\u001b[0m \u001b[0moutcomes\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
|
|
|
"\u001b[0;31mNameError\u001b[0m: name 'best_model' is not defined"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"print(\"*\"*132)\n",
"# Report the number of rows in the train split (Xtrain), not the train_size fraction\n",
"print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {Xtrain.shape[0]:,} samples in train dataset\")\n",
"print(\"*\"*132)\n",
"# One summary line per model, in the order they were trained\n",
"for name, f1, time_spent in outcomes:\n",
"    print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "markdown",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"from sklearn.utils.estimator_checks import check_estimator\n",
|
|
|
|
|
"clf = Stree()\n",
|
|
|
|
|
"check_estimator(clf)"
|
|
|
|
|
"************************************************************************************************************************************\n",
|
|
|
|
|
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n",
|
|
|
|
|
"************************************************************************************************************************************\n",
|
|
|
|
|
"Model: Linear Tree\t Time: 23.05 seconds\t f1: 0.7645\n",
|
|
|
|
|
"Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
|
|
|
|
|
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n",
|
|
|
|
|
"Model: AdaBoost model\t Time: 73.83 seconds\t f1: 0.7509\n",
|
|
|
|
|
"Model: Gradient Boost.\t Time: 388.69 seconds\t f1: 0.5259\n",
|
|
|
|
|
"Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "raw",
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"source": [
|
|
|
|
|
"************************************************************************************************************************************\n",
|
|
|
|
|
"*The best f1 model is Random Forest, with a f1 score: 0.8791 in 1513.23 seconds with 0.7 samples in train dataset\n",
|
|
|
|
|
"************************************************************************************************************************************\n",
|
|
|
|
|
"Model: Linear Tree\t Time: 25.18 seconds\t f1: 0.7645\n",
|
|
|
|
|
"Model: Random Forest\t Time: 1513.23 seconds\t f1: 0.8791"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"hide_input": false,
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
@@ -184,12 +311,56 @@
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.7.6-final"
|
|
|
|
|
},
|
|
|
|
|
"orig_nbformat": 2,
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
|
|
|
|
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
|
|
|
|
"toc": {
|
|
|
|
|
"base_numbering": 1,
|
|
|
|
|
"nav_menu": {},
|
|
|
|
|
"number_sections": true,
|
|
|
|
|
"sideBar": true,
|
|
|
|
|
"skip_h1_title": false,
|
|
|
|
|
"title_cell": "Table of Contents",
|
|
|
|
|
"title_sidebar": "Contents",
|
|
|
|
|
"toc_cell": false,
|
|
|
|
|
"toc_position": {},
|
|
|
|
|
"toc_section_display": true,
|
|
|
|
|
"toc_window_display": false
|
|
|
|
|
},
|
|
|
|
|
"varInspector": {
|
|
|
|
|
"cols": {
|
|
|
|
|
"lenName": 16,
|
|
|
|
|
"lenType": 16,
|
|
|
|
|
"lenVar": 40
|
|
|
|
|
},
|
|
|
|
|
"kernels_config": {
|
|
|
|
|
"python": {
|
|
|
|
|
"delete_cmd_postfix": "",
|
|
|
|
|
"delete_cmd_prefix": "del ",
|
|
|
|
|
"library": "var_list.py",
|
|
|
|
|
"varRefreshCmd": "print(var_dic_list())"
|
|
|
|
|
},
|
|
|
|
|
"r": {
|
|
|
|
|
"delete_cmd_postfix": ") ",
|
|
|
|
|
"delete_cmd_prefix": "rm(",
|
|
|
|
|
"library": "var_list.r",
|
|
|
|
|
"varRefreshCmd": "cat(var_dic_list()) "
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"position": {
|
|
|
|
|
"height": "392px",
|
|
|
|
|
"left": "1518px",
|
|
|
|
|
"right": "20px",
|
|
|
|
|
"top": "40px",
|
|
|
|
|
"width": "392px"
|
|
|
|
|
},
|
|
|
|
|
"types_to_exclude": [
|
|
|
|
|
"module",
|
|
|
|
|
"function",
|
|
|
|
|
"builtin_function_or_method",
|
|
|
|
|
"instance",
|
|
|
|
|
"_Feature"
|
|
|
|
|
],
|
|
|
|
|
"window_display": true
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
"nbformat_minor": 4
|
|
|
|
|
}
|