From 736ab7ef2073830da806b920ab1bb59a289ce117 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Mon, 15 Jun 2020 10:33:51 +0200
Subject: [PATCH] #2 update benchmark notebook

---
 notebooks/benchmark.ipynb | 254 +++++++-------------------------------
 stree/Strees.py           |   4 +-
 2 files changed, 48 insertions(+), 210 deletions(-)

diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb
index 9e19ff0..b87cf36 100644
--- a/notebooks/benchmark.ipynb
+++ b/notebooks/benchmark.ipynb
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,15 +64,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "2020-06-14 23:45:42\n"
-     ]
+     "name": "stdout",
+     "text": "2020-06-15 10:17:17\n"
     }
    ],
    "source": [
@@ -88,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,16 +98,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "Fraud: 0.173% 492\n",
-      "Valid: 99.827% 284,315\n"
-     ]
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
     }
    ],
    "source": [
@@ -119,7 +114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,16 +126,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "X shape: (284807, 29)\n",
-      "y shape: (284807,)\n"
-     ]
+     "name": "stdout",
+     "text": "X shape: (284807, 29)\ny shape: (284807,)\n"
     }
    ],
    "source": [
@@ -159,7 +151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -180,7 +172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -190,7 +182,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -200,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -210,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -227,7 +219,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -252,179 +244,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
+     "output_type": "stream",
      "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "************************** Linear Tree **********************\n",
-      "Train Model Linear Tree took: 13.52 seconds\n",
-      "=========== Linear Tree - Train 199,364 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   1.000000  1.000000  1.000000    199020\n",
-      "           1   1.000000  1.000000  1.000000       344\n",
-      "\n",
-      "    accuracy                       1.000000    199364\n",
-      "   macro avg   1.000000  1.000000  1.000000    199364\n",
-      "weighted avg   1.000000  1.000000  1.000000    199364\n",
-      "\n",
-      "=========== Linear Tree - Test 85,443 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999578  0.999613  0.999596     85295\n",
-      "           1   0.772414  0.756757  0.764505       148\n",
-      "\n",
-      "    accuracy                       0.999192     85443\n",
-      "   macro avg   0.885996  0.878185  0.882050     85443\n",
-      "weighted avg   0.999184  0.999192  0.999188     85443\n",
-      "\n",
-      "Confusion Matrix in Train\n",
-      "[[199020      0]\n",
-      " [     0    344]]\n",
-      "Confusion Matrix in Test\n",
-      "[[85262    33]\n",
-      " [   36   112]]\n",
-      "************************** Random Forest **********************\n",
-      "Train Model Random Forest took: 152.5 seconds\n",
-      "=========== Random Forest - Train 199,364 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   1.000000  1.000000  1.000000    199020\n",
-      "           1   1.000000  1.000000  1.000000       344\n",
-      "\n",
-      "    accuracy                       1.000000    199364\n",
-      "   macro avg   1.000000  1.000000  1.000000    199364\n",
-      "weighted avg   1.000000  1.000000  1.000000    199364\n",
-      "\n",
-      "=========== Random Forest - Test 85,443 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999660  0.999965  0.999812     85295\n",
-      "           1   0.975410  0.804054  0.881481       148\n",
-      "\n",
-      "    accuracy                       0.999625     85443\n",
-      "   macro avg   0.987535  0.902009  0.940647     85443\n",
-      "weighted avg   0.999618  0.999625  0.999607     85443\n",
-      "\n",
-      "Confusion Matrix in Train\n",
-      "[[199020      0]\n",
-      " [     0    344]]\n",
-      "Confusion Matrix in Test\n",
-      "[[85292     3]\n",
-      " [   29   119]]\n",
-      "************************** Stree (SVM Tree) **********************\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
-      "  \"the number of iterations.\", ConvergenceWarning)\n",
-      "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
-      "  \"the number of iterations.\", ConvergenceWarning)\n",
-      "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
-      "  \"the number of iterations.\", ConvergenceWarning)\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train Model Stree (SVM Tree) took: 32.55 seconds\n",
-      "=========== Stree (SVM Tree) - Train 199,364 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999623  0.999864  0.999744    199020\n",
-      "           1   0.908784  0.781977  0.840625       344\n",
-      "\n",
-      "    accuracy                       0.999488    199364\n",
-      "   macro avg   0.954204  0.890921  0.920184    199364\n",
-      "weighted avg   0.999467  0.999488  0.999469    199364\n",
-      "\n",
-      "=========== Stree (SVM Tree) - Test 85,443 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999637  0.999918  0.999777     85295\n",
-      "           1   0.943548  0.790541  0.860294       148\n",
-      "\n",
-      "    accuracy                       0.999555     85443\n",
-      "   macro avg   0.971593  0.895229  0.930036     85443\n",
-      "weighted avg   0.999540  0.999555  0.999536     85443\n",
-      "\n",
-      "Confusion Matrix in Train\n",
-      "[[198993     27]\n",
-      " [    75    269]]\n",
-      "Confusion Matrix in Test\n",
-      "[[85288     7]\n",
-      " [   31   117]]\n",
-      "************************** AdaBoost model **********************\n",
-      "Train Model AdaBoost model took: 47.34 seconds\n",
-      "=========== AdaBoost model - Train 199,364 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999392  0.999678  0.999535    199020\n",
-      "           1   0.777003  0.648256  0.706815       344\n",
-      "\n",
-      "    accuracy                       0.999072    199364\n",
-      "   macro avg   0.888198  0.823967  0.853175    199364\n",
-      "weighted avg   0.999008  0.999072  0.999030    199364\n",
-      "\n",
-      "=========== AdaBoost model - Test 85,443 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999484  0.999707  0.999596     85295\n",
-      "           1   0.806202  0.702703  0.750903       148\n",
-      "\n",
-      "    accuracy                       0.999192     85443\n",
-      "   macro avg   0.902843  0.851205  0.875249     85443\n",
-      "weighted avg   0.999149  0.999192  0.999165     85443\n",
-      "\n",
-      "Confusion Matrix in Train\n",
-      "[[198956     64]\n",
-      " [   121    223]]\n",
-      "Confusion Matrix in Test\n",
-      "[[85270    25]\n",
-      " [   44   104]]\n",
-      "************************** Gradient Boost. **********************\n",
-      "Train Model Gradient Boost. took: 244.1 seconds\n",
-      "=========== Gradient Boost. - Train 199,364 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.999096  0.999854  0.999475    199020\n",
-      "           1   0.849741  0.476744  0.610801       344\n",
-      "\n",
-      "    accuracy                       0.998952    199364\n",
-      "   macro avg   0.924419  0.738299  0.805138    199364\n",
-      "weighted avg   0.998839  0.998952  0.998804    199364\n",
-      "\n",
-      "=========== Gradient Boost. - Test 85,443 samples =============\n",
-      "              precision    recall  f1-score   support\n",
-      "\n",
-      "           0   0.998981  0.999730  0.999355     85295\n",
-      "           1   0.726190  0.412162  0.525862       148\n",
-      "\n",
-      "    accuracy                       0.998713     85443\n",
-      "   macro avg   0.862586  0.705946  0.762609     85443\n",
-      "weighted avg   0.998508  0.998713  0.998535     85443\n",
-      "\n",
-      "Confusion Matrix in Train\n",
-      "[[198991     29]\n",
-      " [   180    164]]\n",
-      "Confusion Matrix in Test\n",
-      "[[85272    23]\n",
-      " [   87    61]]\n"
-     ]
+     "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   1.000000  1.000000  1.000000    199020\n           1   1.000000  1.000000  1.000000       344\n\n    accuracy                       1.000000    199364\n   macro avg   1.000000  1.000000  1.000000    199364\nweighted avg   1.000000  1.000000  1.000000    199364\n\n=========== Linear Tree - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999578  0.999613  0.999596     85295\n           1   0.772414  0.756757  0.764505       148\n\n    accuracy                       0.999192     85443\n   macro avg   0.885996  0.878185  0.882050     85443\nweighted avg   0.999184  0.999192  0.999188     85443\n\nConfusion Matrix in Train\n[[199020      0]\n [     0    344]]\nConfusion Matrix in Test\n[[85262    33]\n [   36   112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   1.000000  1.000000  1.000000    199020\n           1   1.000000  1.000000  1.000000       344\n\n    accuracy                       1.000000    199364\n   macro avg   1.000000  1.000000  1.000000    199364\nweighted avg   1.000000  1.000000  1.000000    199364\n\n=========== Random Forest - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999660  0.999965  0.999812     85295\n           1   0.975410  0.804054  0.881481       148\n\n    accuracy                       0.999625     85443\n   macro avg   0.987535  0.902009  0.940647     85443\nweighted avg   0.999618  0.999625  0.999607     85443\n\nConfusion Matrix in Train\n[[199020      0]\n [     0    344]]\nConfusion Matrix in Test\n[[85292     3]\n [   29   119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999623  0.999864  0.999744    199020\n           1   0.908784  0.781977  0.840625       344\n\n    accuracy                       0.999488    199364\n   macro avg   0.954204  0.890921  0.920184    199364\nweighted avg   0.999467  0.999488  0.999469    199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999637  0.999918  0.999777     85295\n           1   0.943548  0.790541  0.860294       148\n\n    accuracy                       0.999555     85443\n   macro avg   0.971593  0.895229  0.930036     85443\nweighted avg   0.999540  0.999555  0.999536     85443\n\nConfusion Matrix in Train\n[[198993     27]\n [    75    269]]\nConfusion Matrix in Test\n[[85288     7]\n [   31   117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999392  0.999678  0.999535    199020\n           1   0.777003  0.648256  0.706815       344\n\n    accuracy                       0.999072    199364\n   macro avg   0.888198  0.823967  0.853175    199364\nweighted avg   0.999008  0.999072  0.999030    199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n              precision    recall  f1-score   support\n\n           0   0.999484  0.999707  0.999596     85295\n           1   0.806202  0.702703  0.750903       148\n\n    accuracy                       0.999192     85443\n   macro avg   0.902843  0.851205  0.875249     85443\nweighted avg   0.999149  0.999192  0.999165     85443\n\nConfusion Matrix in Train\n[[198956     64]\n [   121    223]]\nConfusion Matrix in Test\n[[85270    25]\n [   44   104]]\n"
     }
    ],
    "source": [
     "# Train & Test models\n",
     "models = {\n",
     "    'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree,  \n",
-    "    'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n",
+    "    'AdaBoost model': adaboost\n",
     "}\n",
     "\n",
     "best_f1 = 0\n",
@@ -440,22 +273,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "**************************************************************************************************************\n",
-      "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
-      "**************************************************************************************************************\n",
-      "Model: Linear Tree\t Time:  13.52 seconds\t f1: 0.7645\n",
-      "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
-      "Model: Stree (SVM Tree)\t Time:  32.55 seconds\t f1: 0.8603\n",
-      "Model: AdaBoost model\t Time:  47.34 seconds\t f1: 0.7509\n",
-      "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n"
-     ]
+     "name": "stdout",
+     "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time:  13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time:  38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time:  47.21 seconds\t f1: 0.7509\n"
     }
    ],
    "source": [
@@ -466,6 +290,20 @@
     "    print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")"
    ]
   },
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "**************************************************************************************************************\n",
+    "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
+    "**************************************************************************************************************\n",
+    "Model: Linear Tree\t Time:  13.52 seconds\t f1: 0.7645\n",
+    "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
+    "Model: Stree (SVM Tree)\t Time:  32.55 seconds\t f1: 0.8603\n",
+    "Model: AdaBoost model\t Time:  47.34 seconds\t f1: 0.7509\n",
+    "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -487,9 +325,9 @@
  "metadata": {
   "hide_input": false,
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.7.6 64-bit ('general': venv)",
    "language": "python",
-   "name": "python3"
+   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
   },
   "language_info": {
    "codemirror_mode": {
@@ -501,7 +339,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.6-final"
   },
   "toc": {
    "base_numbering": 1,
@@ -555,4 +393,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/stree/Strees.py b/stree/Strees.py
index e2e33c3..7624972 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -193,8 +193,8 @@ class Splitter:
     def information_gain(
         self, labels_up: np.array, labels_dn: np.array
     ) -> float:
-        card_up = labels_up.shape[0]
-        card_dn = labels_dn.shape[0]
+        card_up = labels_up.shape[0] if labels_up is not None else 0
+        card_dn = labels_dn.shape[0] if labels_dn is not None else 0
         samples = card_up + card_dn
         up = card_up / samples * self.criterion_function(labels_up)
         dn = card_dn / samples * self.criterion_function(labels_dn)