From 736ab7ef2073830da806b920ab1bb59a289ce117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 15 Jun 2020 10:33:51 +0200 Subject: [PATCH] #2 update benchmark notebook --- notebooks/benchmark.ipynb | 254 +++++++------------------------------- stree/Strees.py | 4 +- 2 files changed, 48 insertions(+), 210 deletions(-) diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb index 9e19ff0..b87cf36 100644 --- a/notebooks/benchmark.ipynb +++ b/notebooks/benchmark.ipynb @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -64,15 +64,13 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "2020-06-14 23:45:42\n" - ] + "name": "stdout", + "text": "2020-06-15 10:17:17\n" } ], "source": [ @@ -88,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -100,16 +98,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "Fraud: 0.173% 492\n", - "Valid: 99.827% 284,315\n" - ] + "name": "stdout", + "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n" } ], "source": [ @@ -119,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -131,16 +126,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "X shape: (284807, 29)\n", - "y shape: (284807,)\n" - ] + "name": "stdout", + "text": "X shape: (284807, 29)\ny shape: (284807,)\n" } ], "source": [ @@ -159,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -252,179 +244,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { + "output_type": "stream", "name": "stdout", - "output_type": "stream", - "text": [ - "************************** Linear Tree **********************\n", - "Train Model Linear Tree took: 13.52 seconds\n", - "=========== Linear Tree - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Linear Tree - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999578 0.999613 0.999596 85295\n", - " 1 0.772414 0.756757 0.764505 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.885996 0.878185 0.882050 85443\n", - "weighted avg 0.999184 0.999192 0.999188 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85262 33]\n", - " [ 36 112]]\n", - "************************** Random Forest **********************\n", - "Train Model Random Forest took: 152.5 seconds\n", - "=========== Random Forest - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Random Forest - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999660 0.999965 0.999812 85295\n", - " 1 0.975410 0.804054 0.881481 148\n", - "\n", - " accuracy 0.999625 85443\n", - " macro avg 0.987535 0.902009 0.940647 85443\n", - "weighted avg 0.999618 0.999625 0.999607 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85292 3]\n", - " [ 29 119]]\n", - "************************** Stree (SVM Tree) **********************\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Model Stree (SVM Tree) took: 32.55 seconds\n", - "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999623 0.999864 0.999744 199020\n", - " 1 0.908784 0.781977 0.840625 344\n", - "\n", - " accuracy 0.999488 199364\n", - " macro avg 0.954204 0.890921 0.920184 199364\n", - "weighted avg 0.999467 0.999488 0.999469 199364\n", - "\n", - "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999637 0.999918 0.999777 85295\n", - " 1 0.943548 0.790541 0.860294 148\n", - "\n", - " accuracy 0.999555 85443\n", - " macro avg 0.971593 0.895229 0.930036 85443\n", - "weighted avg 0.999540 0.999555 0.999536 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198993 27]\n", - " [ 75 269]]\n", - "Confusion Matrix in Test\n", - "[[85288 7]\n", - " [ 31 117]]\n", - "************************** AdaBoost model **********************\n", - "Train Model AdaBoost model took: 47.34 seconds\n", - "=========== AdaBoost model - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999392 0.999678 0.999535 199020\n", - " 1 0.777003 0.648256 0.706815 344\n", - "\n", - " accuracy 0.999072 199364\n", - " macro avg 0.888198 0.823967 0.853175 199364\n", - "weighted avg 0.999008 0.999072 0.999030 199364\n", - "\n", - "=========== AdaBoost model - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999484 0.999707 0.999596 85295\n", - " 1 0.806202 0.702703 0.750903 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.902843 0.851205 0.875249 85443\n", - "weighted avg 0.999149 0.999192 0.999165 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198956 64]\n", - " [ 121 223]]\n", - "Confusion Matrix in Test\n", - "[[85270 25]\n", - " [ 44 104]]\n", - "************************** Gradient Boost. **********************\n", - "Train Model Gradient Boost. took: 244.1 seconds\n", - "=========== Gradient Boost. - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999096 0.999854 0.999475 199020\n", - " 1 0.849741 0.476744 0.610801 344\n", - "\n", - " accuracy 0.998952 199364\n", - " macro avg 0.924419 0.738299 0.805138 199364\n", - "weighted avg 0.998839 0.998952 0.998804 199364\n", - "\n", - "=========== Gradient Boost. - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.998981 0.999730 0.999355 85295\n", - " 1 0.726190 0.412162 0.525862 148\n", - "\n", - " accuracy 0.998713 85443\n", - " macro avg 0.862586 0.705946 0.762609 85443\n", - "weighted avg 0.998508 0.998713 0.998535 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198991 29]\n", - " [ 180 164]]\n", - "Confusion Matrix in Test\n", - "[[85272 23]\n", - " [ 87 61]]\n" - ] + "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n" } ], "source": [ "# Train & Test models\n", "models = {\n", " 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n", - " 'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n", + " 'AdaBoost model': adaboost\n", "}\n", "\n", "best_f1 = 0\n", @@ -440,22 +273,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "**************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", - "**************************************************************************************************************\n", - "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", - "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", - "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", - "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" - ] + "name": "stdout", + "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 47.21 seconds\t f1: 0.7509\n" } ], "source": [ @@ -466,6 +290,20 @@ " print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")" ] }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -487,9 +325,9 @@ "metadata": { "hide_input": false, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.6 64-bit ('general': venv)", "language": "python", - "name": "python3" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" }, "language_info": { "codemirror_mode": { @@ -501,7 +339,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" }, "toc": { "base_numbering": 1, @@ -555,4 +393,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/stree/Strees.py b/stree/Strees.py index e2e33c3..7624972 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -193,8 +193,8 @@ class Splitter: def information_gain( self, labels_up: np.array, labels_dn: np.array ) -> float: - card_up = labels_up.shape[0] - card_dn = labels_dn.shape[0] + card_up = labels_up.shape[0] if labels_up is not None else 0 + card_dn = labels_dn.shape[0] if labels_dn is not None else 0 samples = card_up + card_dn up = card_up / samples * self.criterion_function(labels_up) dn = card_dn / samples * self.criterion_function(labels_dn)