diff --git a/main.py b/main.py
index 36ed38d..d4f045a 100644
--- a/main.py
+++ b/main.py
@@ -46,4 +46,12 @@ clf.fit(Xtrain, ytrain)
 print(f"Took {time.time() - now:.2f} seconds to train")
 print(clf)
 print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
-print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
\ No newline at end of file
+print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
+proba = clf.predict_proba(Xtest)
+res0 = proba[proba[:, 0] == 0]
+res1 = proba[proba[:, 0] == 1]
+print("++++++++++res0++++++++++++")
+print(res0[res0[:, 1] > .8])
+print("**********res1************")
+print(res1[res1[:, 1] < .4])
+print(clf.predict_proba(Xtest))
\ No newline at end of file
diff --git a/test.ipynb b/test.ipynb
index c94aaee..0f4ac90 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,16 +30,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "2020-05-18 16:33:17\n"
+    }
+   ],
    "source": [
     "print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,9 +57,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
+    }
+   ],
    "source": [
     "print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
     "print(\"Valid: {0:.3f}% {1:,}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))"
@@ -61,7 +73,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -73,9 +85,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "X shape: (284807, 29)\ny shape: (284807,)\n"
+    }
+   ],
    "source": [
     "# Remove unneeded features\n",
     "y = df.Class.values\n",
@@ -85,7 +103,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -96,7 +114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -106,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -116,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -126,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -136,7 +154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -146,7 +164,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,9 +189,15 @@
   },
   {
"cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 24.33 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 235.9 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 53.47 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999869 0.999741 199020\n 1 0.911263 0.776163 0.838305 344\n\n accuracy 0.999483 199364\n macro avg 0.955438 0.888016 0.919023 199364\nweighted avg 0.999461 0.999483 0.999463 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999883 0.999748 85295\n 1 0.920000 0.777027 0.842491 148\n\n accuracy 0.999497 85443\n macro avg 0.959807 0.888455 0.921119 85443\nweighted avg 0.999475 0.999497 0.999476 85443\n\nConfusion Matrix in Train\n[[198994 26]\n [ 77 267]]\nConfusion Matrix in Test\n[[85285 10]\n [ 33 115]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 66.55 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n************************** Gradient Boost. **********************\nTrain Model Gradient Boost. 
took: 331.6 seconds\n=========== Gradient Boost. - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999096 0.999854 0.999475 199020\n 1 0.849741 0.476744 0.610801 344\n\n accuracy 0.998952 199364\n macro avg 0.924419 0.738299 0.805138 199364\nweighted avg 0.998839 0.998952 0.998804 199364\n\n=========== Gradient Boost. - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.998981 0.999730 0.999355 85295\n 1 0.726190 0.412162 0.525862 148\n\n accuracy 0.998713 85443\n macro avg 0.862586 0.705946 0.762609 85443\nweighted avg 0.998508 0.998713 0.998535 85443\n\nConfusion Matrix in Train\n[[198991 29]\n [ 180 164]]\nConfusion Matrix in Test\n[[85272 23]\n [ 87 61]]\n" + } + ], "source": [ "# Train & Test models\n", "models = {\n", @@ -194,13 +218,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "************************************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 235.909 seconds with 0.7 samples in train dataset\n************************************************************************************************************************************\nModel: Linear Tree\t Time: 24.33 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 235.91 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 53.47 seconds\t f1: 0.8425\nModel: AdaBoost model\t Time: 66.55 seconds\t f1: 0.7509\nModel: Gradient Boost.\t Time: 331.56 seconds\t f1: 0.5259\n" + } + ], "source": [ - "print(\"*\"*132)\n", + "print(\"*\"*110)\n", "print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n", - "print(\"*\"*132)\n", + "print(\"*\"*110)\n", "for name, f1, time_spent in outcomes:\n", " print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")" ] @@ -240,7 +270,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" }, "toc": { "base_numbering": 1, @@ -294,4 +324,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/test2.ipynb b/test2.ipynb index 404123e..2e197f8 100644 --- a/test2.ipynb +++ b/test2.ipynb @@ -35,7 +35,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n" + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807,)\nFraud: 0.173% 492\nValid: 99.827% 284315\n" } ], "source": [ @@ -74,7 +74,7 @@ "\n", "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", "# data = load_creditcard(5000) # Take the first 5000 samples\n", - "data = load_creditcard(-1000) # Take all the samples\n", + "data = load_creditcard() # Take all the samples\n", "\n", "Xtrain = data[0]\n", "Xtest = data[1]\n", @@ -90,7 +90,7 @@ { "output_type": "stream", "name": "stdout", - "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693, 40]))\n\n\n0.0277 secs\n" + 
"text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief=0.941799 counts=(array([0, 1]), array([ 11, 178]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Down - Up - Up, - Leaf class=0 belief=0.952381 counts=(array([0, 1]), array([20, 1]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief=0.902174 counts=(array([0, 1]), array([ 9, 83]))\nroot - Up - Down - Up, - Leaf class=0 belief=1.000000 counts=(array([0]), array([14]))\nroot - Up - Up, - Leaf class=0 belief=0.999598 counts=(array([0, 1]), array([198966, 80]))\n\n42.9141 secs\n" } ], "source": [ @@ -98,7 +98,23 @@ "clf = Stree(C=.01, random_state=random_state)\n", "clf.fit(Xtrain, ytrain)\n", "print(clf)\n", - "print()\n", + "print(f\"{time.time() - t:.4f} secs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n0.2084 secs\n" + } + ], + "source": [ + "t = time.time()\n", + "print(clf.predict(Xtest)[:17])\n", "print(f\"{time.time() - t:.4f} secs\")" ] }, @@ -110,19 +126,26 @@ { "output_type": "stream", "name": "stdout", - "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0. 0.94542974]\n [1. 0.98392283]\n [0. 0.94542974]\n ...\n [0. 0.94542974]\n [0. 0.94542974]\n [1. 0.98392283]]\n" + "text": "[[0. 0.26356965]\n [0. 0.22665372]\n [0. 0.25678353]\n [0. 0.26056019]\n [0. 0.26583006]\n [0. 0.24360041]\n [0. 0.26366182]\n [0. 0.26012045]\n [0. 0.2298345 ]\n [0. 0.25726294]\n [0. 0.25909988]\n [0. 0.25940575]\n [0. 0.24256254]\n [0. 0.15094485]\n [0. 0.26327588]\n [0. 0.26382949]\n [0. 0.26290957]]\n0.2083 secs\n" } ], "source": [ - "k = clf.predict_proba(Xtrain)\n", - "print(k)" + "t = time.time()\n", + "print(clf.predict_proba(Xtest)[:17, :])\n", + "print(f\"{time.time() - t:.4f} secs\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Classifier's accuracy (train): 0.9995\nClassifier's accuracy (test) : 0.9995\n0.5074 secs\n" + } + ], "source": [ "t = time.time()\n", "print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n", @@ -130,20 +153,17 @@ "print(f\"{time.time() - t:.4f} secs\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# outcomes without optimization executing predict_proba. 
87 seconds\n", - "(284807, 2)\n", - "87.5212 secs" - ] - }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "0.9993211848834895\n13.3835 secs\n" + } + ], "source": [ "t = time.time()\n", "clf2 = LinearSVC(C=.01, random_state=random_state)\n", @@ -154,9 +174,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "0.9991573329588147\n22.7635 secs\n" + } + ], "source": [ "t = time.time()\n", "clf3 = DecisionTreeClassifier(random_state=random_state)\n", diff --git a/tests/Stree_test.py b/tests/Stree_test.py index 65cd82a..a70c6ea 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -148,7 +148,7 @@ class Stree_test(unittest.TestCase): yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1])) self.assertEqual(0, yp[0:, 0]) self.assertEqual(1, y[28]) - self.assertEqual(0.29026400765649235, yp[0, 1]) + self.assertEqual(0.29026400766, round(yp[0, 1], 11)) def test_multiple_predict_proba(self): # First 27 elements the predictions are the same as the truth
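
Note on the main.py hunk: the outputs captured in test2.ipynb suggest that Stree.predict_proba returns one row per sample, with column 0 holding the predicted class and column 1 the belief of that prediction. A minimal sketch of the class-split filtering the hunk performs, using made-up rows in place of clf.predict_proba(Xtest):

import numpy as np

# Made-up [predicted_class, belief] rows standing in for clf.predict_proba(Xtest)
proba = np.array([[0., 0.95],
                  [1., 0.98],
                  [0., 0.35],
                  [1., 0.30]])

res0 = proba[proba[:, 0] == 0]   # samples predicted as class 0
res1 = proba[proba[:, 0] == 1]   # samples predicted as class 1

print(res0[res0[:, 1] > .8])     # confident class-0 predictions
print(res1[res1[:, 1] < .4])     # doubtful class-1 predictions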
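
Note on the test.ipynb "Train & Test models" cell: the cell body is only partially visible in the diff. The sketch below is a hedged reconstruction of a loop consistent with the printed summary, not the notebook's actual code; models, Xtrain, ytrain, Xtest and ytest are assumed to come from earlier cells, and try_model is a hypothetical helper:

import time
from sklearn.metrics import f1_score

def try_model(name, model):
    # Hypothetical helper: train one model, return its test f1 and training time
    start = time.time()
    model.fit(Xtrain, ytrain)
    spent = time.time() - start
    return f1_score(ytest, model.predict(Xtest)), spent

outcomes = []
best_f1, best_model, best_time = 0.0, None, 0.0
for name, model in models.items():
    f1, spent = try_model(name, model)
    outcomes.append((name, f1, spent))
    if f1 > best_f1:
        best_f1, best_model, best_time = f1, name, spent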
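
Note on the tests/Stree_test.py change: comparing a rounded float with assertEqual works here because 0.29026400766 is exactly the value round() produces, but unittest also ships a tolerance-based check, assertAlmostEqual. A self-contained sketch with the probability hard-coded from the test above:

import unittest

class ProbaValue(unittest.TestCase):
    def test_belief(self):
        yp_01 = 0.29026400765649235  # stand-in for yp[0, 1] in the real test
        # assertAlmostEqual passes when round(first - second, places) == 0
        self.assertAlmostEqual(0.29026400766, yp_01, places=11)

if __name__ == "__main__":
    unittest.main()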