From 80b5cf8e722a0fc648624fbab69cd686847f906d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Fri, 15 May 2020 23:35:33 +0200
Subject: [PATCH] Implement predict & predict_proba optimization

Reduces prediction time by two orders of magnitude on the creditcard dataset
---
 README.md           |  2 +-
 main.py             |  3 +-
 test.ipynb          | 41 ++++++++++++++++---------
 tests/Stree_test.py | 63 ++++++++++++++++++++++++++++++++------
 trees/Snode.py      |  6 ++--
 trees/Stree.py      | 73 ++++++++++++++++++++++++++-------------------
 6 files changed, 129 insertions(+), 59 deletions(-)

diff --git a/README.md b/README.md
index 3bbea76..713d726 100644
--- a/README.md
+++ b/README.md
@@ -10,5 +10,5 @@ python main.py
 ## Tests
 
 ```python
-python -m unittest tests.Stree_test tests.Snode_test
+python -m unittest -v tests.Stree_test tests.Snode_test
 ```
diff --git a/main.py b/main.py
index 9179ccb..ad7598d 100644
--- a/main.py
+++ b/main.py
@@ -33,8 +33,9 @@ def load_creditcard(n_examples=0):
     print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
     print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
     return X, y
-X, y = load_creditcard(-5000)
+#X, y = load_creditcard(-5000)
 #X, y = load_creditcard()
+X, y = load_creditcard()
 
 clf = Stree(C=.01, max_iter=100, random_state=random_state)
 clf.fit(X, y)
diff --git a/test.ipynb b/test.ipynb
index 0cb291e..beee72b 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -19,13 +19,7 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28)  y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
-    }
-   ],
+   "outputs": [],
    "source": [
     "def load_creditcard(n_examples=0):\n",
     "    df = pd.read_csv('data/creditcard.csv')\n",
@@ -61,6 +55,7 @@
     "\n",
     "#X, y = load_wine(return_X_y=True)\n",
     "#X, y = load_iris(return_X_y=True)\n",
+    "#y[y==2]=0\n",
     "\n",
     "X, y = load_creditcard()"
    ]
@@ -73,7 +68,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))\n\n44.3767 secs\n"
+     "text": "root\nroot - Down\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))\nroot - Down - Up\nroot - Down - Up - Down\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n60.9873 secs\n"
     }
    ],
    "source": [
@@ -92,7 +87,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "Accuracy: 0.999512\n33.1651 secs\n"
+     "text": "Accuracy: 0.999512\n0.3226 secs\n"
     }
    ],
    "source": [
@@ -109,7 +104,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "(284807, 2)\n87.5212 secs\n"
+     "text": "(284807, 2)\n0.4148 secs\n"
     }
    ],
    "source": [
@@ -119,6 +114,15 @@
     "print(f\"{time.time() - t:.4f} secs\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# outcomes without optimization executing predict_proba. 87 seconds\n",
+    "(284807, 2)\n",
+    "87.5212 secs"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
@@ -127,7 +131,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "0.9991397683343457\n12.6601 secs\n"
+     "text": "0.9991397683343457\n20.9481 secs\n"
     }
    ],
    "source": [
@@ -146,7 +150,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "1.0\n18.2638 secs\n"
+     "text": "1.0\n32.2779 secs\n"
     }
    ],
    "source": [
@@ -156,6 +160,15 @@
     "print(clf3.score(X, y))\n",
     "print(f\"{time.time() - t:.4f} secs\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "from sklearn.utils.estimator_checks import check_estimator\n",
+    "clf = Stree()\n",
+    "check_estimator(clf)"
+   ]
   }
  ],
  "metadata": {
@@ -173,8 +186,8 @@
   },
   "orig_nbformat": 2,
   "kernelspec": {
-   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
-   "display_name": "Python 3.7.6 64-bit ('general': venv)"
+   "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
+   "display_name": "Python 3.7.6 64-bit ('stree': venv)"
   }
  },
  "nbformat": 4,
diff --git a/tests/Stree_test.py b/tests/Stree_test.py
index 60db265..c89d7f5 100644
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
             os.environ.pop('TESTING')
         except:
             pass
-
+        
     def _get_Xy(self):
         X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                    n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
         return X, y
 
     def _check_tree(self, node: Snode):
+        """Check recursively that the nodes that are not leaves have the correct
+        number of labels and that their children have the right number of elements in their datasets
+
+        Arguments:
+            node {Snode} -- node to check
+        """
         if node.is_leaf():
             return
         y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
         unique_y, count_y = np.unique(node._y, return_counts=True)
         _, count_d = np.unique(y_down, return_counts=True)
         _, count_u = np.unique(y_up, return_counts=True)
+        #
         for i in unique_y:
             try:
                 number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
             self.assertEqual(count_y[i], number_down + number_up)
         # Is the partition made the same as the prediction?
         # as the node is not a leaf...
-        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
-        self.assertEqual(count_yp[1], y_down.shape[0])
-        self.assertEqual(count_yp[0], y_up.shape[0])
+        _, count_yp = np.unique(y_prediction, return_counts=True)
+        self.assertEqual(count_yp[1], y_up.shape[0])
+        self.assertEqual(count_yp[0], y_down.shape[0])
         self._check_tree(node.get_down())
         self._check_tree(node.get_up())
 
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
         return res
 
     def test_subdatasets(self):
-        """Check if the subdatasets files have the same predictions as the tree itself
+        """Check if the subdatasets files have the same labels as the original dataset
         """
-        model = self._clf._tree._clf
-        X, y = self._get_Xy()
-        model.fit(X, y)
         self._clf.save_sub_datasets()
         with open(self._clf.get_catalog_name()) as cat_file:
             catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
         right = (yp == y).astype(int)
         accuracy_computed = sum(right) / len(y)
         self.assertEqual(accuracy_score, accuracy_computed)
+        self.assertGreater(accuracy_score, 0.8)
     
     def test_single_predict_proba(self):
+        """Check that element 28 has a prediction different than its true label
+        """
         # Element 28 has a different prediction than the truth
         X, y = self._get_Xy()
         yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
         self.assertEqual(0, yp[0:, 0])
+        self.assertEqual(1, y[28])
         self.assertEqual(0.9282970550576184, yp[0:, 1])
 
     def test_multiple_predict_proba(self):
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = self._get_Xy()
-        yp = self._clf.predict_proba(X[:num, :])
+        yp = self._clf.predict_proba(X[:num,:])
         self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
         expected_proba = [0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.9759887, 
                         0.92829706, 0.9759887,  0.9759887,  0.9759887,  0.9759887,  0.92829706, 
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
                         0.92829706, 0.92829706, 0.9759887 ]
         self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
 
+    def test_use_model_predictions(self):
+        """Check that we get the same results whether we use the estimator in nodes
+        to compute labels or we use the hyperplane and the position of samples w.r.t. it
+        """
+        model_predictions = Stree(random_state=self._random_state,
+                            use_predictions=True)
+        model_hyperplane = Stree(random_state=self._random_state,
+                            use_predictions=False)
+        X, y = self._get_Xy()
+        model_predictions.fit(X, y)
+        model_hyperplane.fit(X, y)
+        self.assertListEqual(
+            model_predictions.predict(X).tolist(),
+            model_hyperplane.predict(X).tolist()
+        )
+        a = model_predictions.score(X, y, print_out=False)
+        b = model_hyperplane.score(X, y, print_out=False)
+        self.assertEqual(a, b)
+        self.assertGreater(b, .95)
+
+    def test_single_vs_multiple_prediction(self):
+        """Check if predicting sample by sample gives the same result as predicting
+        all samples at once
+        """
+        X, _ = self._get_Xy()
+        # Compute prediction line by line
+        yp_line = np.array([], dtype=int)
+        for xp in X:
+            yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
+        # Compute prediction at once
+        yp_once = self._clf.predict(X)
+        #
+        self.assertListEqual(yp_line.tolist(), yp_once.tolist())
+        
+
+
+
 
 
diff --git a/trees/Snode.py b/trees/Snode.py
index 8e1ca59..c3d8473 100644
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -18,8 +18,8 @@ class Snode:
         self._interceptor = 0. if clf is None else clf.intercept_
         self._title = title
         self._belief = 0.  # belief of the prediction in a leaf node based on samples
-        self._X = X if os.environ.get(
-            'TESTING', 'Not Set') != 'Not Set' else None
+        # Only store the dataset when the TESTING environment variable is set
+        self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
         self._y = y
         self._down = None
         self._up = None
@@ -64,6 +64,6 @@ class Snode:
 
     def __str__(self) -> str:
         if self.is_leaf():
-            return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
+            return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
         else:
             return f"{self._title}\n"
diff --git a/trees/Stree.py b/trees/Stree.py
index 25c9fe9..75d34ca 100644
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -43,26 +43,25 @@ class Stree(BaseEstimator, ClassifierMixin):
             setattr(self, parameter, value)
         return self
 
-    def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
+    def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
         if self.__use_predictions:
-            yp = clf.predict(X)
+            yp = node._clf.predict(data)
             down = (yp == 1).reshape(-1, 1)
         else:
             # doesn't work with multiclass as each sample has to do inner product with its own coeficients
             # computes positition of every sample is w.r.t. the hyperplane
-            coef = clf.coef_[0, :].reshape(-1, X.shape[1])
-            intercept = clf.intercept_[0]
-            res = X.dot(coef.T) + intercept
+            coef = node._vector[0, :].reshape(-1, data.shape[1])
+            res = data.dot(coef.T) + node._interceptor[0]
             down = res > 0
         up = ~down
-        X_down = X[down[:, 0]] if any(down) else None
-        y_down = y[down[:, 0]] if any(down) else None
-        X_up = X[up[:, 0]] if any(up) else None
-        y_up = y[up[:, 0]] if any(up) else None
-        return [X_up, y_up, X_down, y_down]
+        data_down = data[down[:, 0]] if any(down) else None
+        indices_down = indices[down[:, 0]] if any(down) else None
+        data_up = data[up[:, 0]] if any(up) else None
+        indices_up = indices[up[:, 0]] if any(up) else None
+        return [data_down, indices_down, data_up, indices_up]
 
     def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y.ravel())
         self.n_features_in_ = X.shape[1]
         self._tree = self.train(X, y.ravel(), title)
         self._build_predictor()
@@ -83,47 +82,59 @@ class Stree(BaseEstimator, ClassifierMixin):
     def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
-            return Snode(None, X, y, title + ', <pure> ')
+            return Snode(None, X, y, title + ', <pure>')
         # Train the model
         clf = LinearSVC(max_iter=self._max_iter, C=self._C,
                         random_state=self._random_state)
         clf.fit(X, y)
         tree = Snode(clf, X, y, title)
-        X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
+        X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
         if X_U is None or X_D is None:
             # didn't part anything
-            return Snode(clf, X, y, title + ', <couldn\'t go any further>')
+            return Snode(clf, X, y, title + ', <cgaf>')
         tree.set_up(self.train(X_U, y_u, title + ' - Up'))
         tree.set_down(self.train(X_D, y_d, title + ' - Down'))
         return tree
 
-    def predict(self, X: np.array) -> np.array:
-        def predict_class(xp: np.array, tree: Snode) -> np.array:
-            if tree.is_leaf():
+    def _predict_values(self, X: np.array) -> np.array:
+        def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
+            if xp is None:
+                return [], []
+            if node.is_leaf():
+                # set a class for every sample in dataset
+                prediction = np.full((xp.shape[0], 1), node._class)
                 if self.__proba:
-                    return [tree._class, tree._belief]
+                    prediction_proba = np.full((xp.shape[0], 1), node._belief)
+                    return np.append(prediction, prediction_proba, axis=1), indices
                 else:
-                    return tree._class
-            coef = tree._vector[0, :].reshape(-1, xp.shape[1])
-            if xp.dot(coef.T) + tree._interceptor[0] > 0:
-                return predict_class(xp, tree.get_down())
-            return predict_class(xp, tree.get_up())
-
+                    return prediction, indices
+            u, i_u, d, i_d = self._split_data(node, xp, indices)
+            k, l = predict_class(d, i_d, node.get_down())
+            m, n = predict_class(u, i_u, node.get_up())
+            return np.append(k, m), np.append(l, n)
         # sklearn check
         check_is_fitted(self)
         # Input validation
         X = check_array(X)
         # setup prediction & make it happen
-        y = np.array([], dtype=int)
-        for xp in X:
-            y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
-        return y
+        indices = np.arange(X.shape[0])
+        return predict_class(X, indices, self._tree)
+    
+    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
+        """Return y with every row moved back to the position given by indices
+        (y[i] belongs to the sample whose original position is indices[i])"""
+        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
+        y_ordered[indices.astype(int)] = y
+        return y_ordered
+
+    def predict(self, X: np.array) -> np.array:
+        return self._reorder_results(*self._predict_values(X))
 
     def predict_proba(self, X: np.array) -> np.array:
         self.__proba = True
-        result = self.predict(X).reshape(X.shape[0], 2)
+        result, indices = self._predict_values(X)
         self.__proba = False
-        return result
+        return self._reorder_results(result.reshape(X.shape[0], 2), indices)
 
     def score(self, X: np.array, y: np.array, print_out=True) -> float:
         if not self.__trained:
@@ -180,4 +191,4 @@ class Stree(BaseEstimator, ClassifierMixin):
         """Save the every dataset stored in the tree to check with manual classifier
         """
         with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
-            self._save_datasets(self._tree, catalog, 1)
+            self._save_datasets(self._tree, catalog, 1)
\ No newline at end of file