Implement predict & predict_proba optimization

Reduces prediction time by two orders of magnitude on the creditcard dataset
2025-08-15 23:46:02 +00:00 · 2020-05-15 23:35:33 +02:00
parent e56b955b92
commit 80b5cf8e72
6 changed files with 129 additions and 59 deletions
--- a/README.md
+++ b/README.md
@@ -10,5 +10,5 @@ python main.py
 ## Tests

 ```python
-python -m unittest tests.Stree_test tests.Snode_test
+python -m unittest -v tests.Stree_test tests.Snode_test
 ```
--- a/main.py
+++ b/main.py
@@ -33,8 +33,9 @@ def load_creditcard(n_examples=0):
    print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
    print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
    return X, y
-X, y = load_creditcard(-5000)
+#X, y = load_creditcard(-5000)
 #X, y = load_creditcard()
+X, y = load_creditcard()

 clf = Stree(C=.01, max_iter=100, random_state=random_state)
 clf.fit(X, y)
--- a/test.ipynb
+++ b/test.ipynb
@@ -19,13 +19,7 @@
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28)  y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
-    }
-   ],
+   "outputs": [],
   "source": [
    "def load_creditcard(n_examples=0):\n",
    "    df = pd.read_csv('data/creditcard.csv')\n",
@@ -61,6 +55,7 @@
    "\n",
    "#X, y = load_wine(return_X_y=True)\n",
    "#X, y = load_iris(return_X_y=True)\n",
+    "#y[y==2]=0\n",
    "\n",
    "X, y = load_creditcard()"
   ]
@@ -73,7 +68,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))\n\n44.3767 secs\n"
+     "text": "root\nroot - Down\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))\nroot - Down - Up\nroot - Down - Up - Down\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n60.9873 secs\n"
    }
   ],
   "source": [
@@ -92,7 +87,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "Accuracy: 0.999512\n33.1651 secs\n"
+     "text": "Accuracy: 0.999512\n0.3226 secs\n"
    }
   ],
   "source": [
@@ -109,7 +104,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "(284807, 2)\n87.5212 secs\n"
+     "text": "(284807, 2)\n0.4148 secs\n"
    }
   ],
   "source": [
@@ -119,6 +114,15 @@
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# outcomes without optimization executing predict_proba. 87 seconds\n",
+    "(284807, 2)\n",
+    "87.5212 secs"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 6,
@@ -127,7 +131,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "0.9991397683343457\n12.6601 secs\n"
+     "text": "0.9991397683343457\n20.9481 secs\n"
    }
   ],
   "source": [
@@ -146,7 +150,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "1.0\n18.2638 secs\n"
+     "text": "1.0\n32.2779 secs\n"
    }
   ],
   "source": [
@@ -156,6 +160,15 @@
    "print(clf3.score(X, y))\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "from sklearn.utils.estimator_checks import check_estimator\n",
+    "clf = Stree()\n",
+    "check_estimator(clf)"
+   ]
  }
 ],
 "metadata": {
@@ -173,8 +186,8 @@
  },
  "orig_nbformat": 2,
  "kernelspec": {
-   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
-   "display_name": "Python 3.7.6 64-bit ('general': venv)"
+   "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
+   "display_name": "Python 3.7.6 64-bit ('stree': venv)"
  }
 },
 "nbformat": 4,
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
            os.environ.pop('TESTING')
        except:
            pass
-
+        
    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
        return X, y

    def _check_tree(self, node: Snode):
+        """Check recursively that every node that is not a leaf has the correct
+        number of labels and that its children have the right number of elements in their datasets
+
+        Arguments:
+            node {Snode} -- node to check
+        """
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
        unique_y, count_y = np.unique(node._y, return_counts=True)
        _, count_d = np.unique(y_down, return_counts=True)
        _, count_u = np.unique(y_up, return_counts=True)
+        #
        for i in unique_y:
            try:
                number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
            self.assertEqual(count_y[i], number_down + number_up)
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
-        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
-        self.assertEqual(count_yp[1], y_down.shape[0])
-        self.assertEqual(count_yp[0], y_up.shape[0])
+        _, count_yp = np.unique(y_prediction, return_counts=True)
+        self.assertEqual(count_yp[1], y_up.shape[0])
+        self.assertEqual(count_yp[0], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
        return res

    def test_subdatasets(self):
-        """Check if the subdatasets files have the same predictions as the tree itself
+        """Check if the subdatasets files have the same labels as the original dataset
        """
-        model = self._clf._tree._clf
-        X, y = self._get_Xy()
-        model.fit(X, y)
        self._clf.save_sub_datasets()
        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
        right = (yp == y).astype(int)
        accuracy_computed = sum(right) / len(y)
        self.assertEqual(accuracy_score, accuracy_computed)
+        self.assertGreater(accuracy_score, 0.8)
    
    def test_single_predict_proba(self):
+        """Check that the prediction for element 28 differs from its true label
+        """
        # Element 28 has a different prediction than the truth
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
        self.assertEqual(0, yp[0:, 0])
+        self.assertEqual(1, y[28])
        self.assertEqual(0.9282970550576184, yp[0:, 1])

    def test_multiple_predict_proba(self):
        # First 27 elements the predictions are the same as the truth
        num = 27
        X, y = self._get_Xy()
-        yp = self._clf.predict_proba(X[:num, :])
+        yp = self._clf.predict_proba(X[:num,:])
        self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
        expected_proba = [0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.9759887, 
                        0.92829706, 0.9759887,  0.9759887,  0.9759887,  0.9759887,  0.92829706, 
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
                        0.92829706, 0.92829706, 0.9759887 ]
        self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())

+    def test_use_model_predictions(self):
+        """Check that we get the same results whether we use the estimator in nodes
+        to compute labels or the hyperplane and the position of samples with respect to it
+        """
+        model_predictions = Stree(random_state=self._random_state,
+                            use_predictions=True)
+        model_hyperplane = Stree(random_state=self._random_state,
+                            use_predictions=False)
+        X, y = self._get_Xy()
+        model_predictions.fit(X, y)
+        model_hyperplane.fit(X, y)
+        self.assertListEqual(
+            model_predictions.predict(X).tolist(),
+            model_hyperplane.predict(X).tolist()
+        )
+        a = model_predictions.score(X, y, print_out=False)  # scalar, not a 1-tuple
+        b = model_hyperplane.score(X, y, print_out=False)
+        self.assertEqual(a, b)
+        self.assertGreater(b, .95)
+
+    def test_single_vs_multiple_prediction(self):
+        """Check if predicting sample by sample gives the same result as predicting
+        all samples at once
+        """
+        X, _ = self._get_Xy()
+        # Compute prediction line by line
+        yp_line = np.array([], dtype=int)
+        for xp in X:
+            yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
+        # Compute prediction at once
+        yp_once = self._clf.predict(X)
+        #
+        self.assertListEqual(yp_line.tolist(), yp_once.tolist())
+        
+
+
+


--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -18,8 +18,8 @@ class Snode:
        self._interceptor = 0. if clf is None else clf.intercept_
        self._title = title
        self._belief = 0.  # belief of the prediction in a leaf node based on samples
-        self._X = X if os.environ.get(
-            'TESTING', 'Not Set') != 'Not Set' else None
+        # Only store dataset in Testing 
+        self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
        self._y = y
        self._down = None
        self._up = None
@@ -64,6 +64,6 @@ class Snode:

    def __str__(self) -> str:
        if self.is_leaf():
-            return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
+            return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
        else:
            return f"{self._title}\n"
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -43,26 +43,25 @@ class Stree(BaseEstimator, ClassifierMixin):
            setattr(self, parameter, value)
        return self

-    def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
+    def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
        if self.__use_predictions:
-            yp = clf.predict(X)
+            yp = node._clf.predict(data)
            down = (yp == 1).reshape(-1, 1)
        else:
            # doesn't work with multiclass as each sample has to do inner product with its own coeficients
            # computes positition of every sample is w.r.t. the hyperplane
-            coef = clf.coef_[0, :].reshape(-1, X.shape[1])
-            intercept = clf.intercept_[0]
-            res = X.dot(coef.T) + intercept
+            coef = node._vector[0, :].reshape(-1, data.shape[1])
+            res = data.dot(coef.T) + node._interceptor[0]
            down = res > 0
        up = ~down
-        X_down = X[down[:, 0]] if any(down) else None
-        y_down = y[down[:, 0]] if any(down) else None
-        X_up = X[up[:, 0]] if any(up) else None
-        y_up = y[up[:, 0]] if any(up) else None
-        return [X_up, y_up, X_down, y_down]
+        data_down = data[down[:, 0]] if any(down) else None
+        indices_down = indices[down[:, 0]] if any(down) else None
+        data_up = data[up[:, 0]] if any(up) else None
+        indices_up = indices[up[:, 0]] if any(up) else None
+        return [data_down, indices_down, data_up, indices_up]

    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
-        X, y = check_X_y(X, y)
+        X, y = check_X_y(X, y.ravel())
        self.n_features_in_ = X.shape[1]
        self._tree = self.train(X, y.ravel(), title)
        self._build_predictor()
@@ -83,47 +82,59 @@ class Stree(BaseEstimator, ClassifierMixin):
    def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
-            return Snode(None, X, y, title + ', <pure> ')
+            return Snode(None, X, y, title + ', <pure>')
        # Train the model
        clf = LinearSVC(max_iter=self._max_iter, C=self._C,
                        random_state=self._random_state)
        clf.fit(X, y)
        tree = Snode(clf, X, y, title)
-        X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
+        X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
        if X_U is None or X_D is None:
            # didn't part anything
-            return Snode(clf, X, y, title + ', <couldn\'t go any further>')
+            return Snode(clf, X, y, title + ', <cgaf>')
        tree.set_up(self.train(X_U, y_u, title + ' - Up'))
        tree.set_down(self.train(X_D, y_d, title + ' - Down'))
        return tree

-    def predict(self, X: np.array) -> np.array:
-        def predict_class(xp: np.array, tree: Snode) -> np.array:
-            if tree.is_leaf():
+    def _predict_values(self, X: np.array) -> np.array:
+        def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
+            if xp is None:
+                return [], []
+            if node.is_leaf():
+                # set a class for every sample in dataset
+                prediction = np.full((xp.shape[0], 1), node._class)
                if self.__proba:
-                    return [tree._class, tree._belief]
+                    prediction_proba = np.full((xp.shape[0], 1), node._belief)
+                    return np.append(prediction, prediction_proba, axis=1), indices
                else:
-                    return tree._class
-            coef = tree._vector[0, :].reshape(-1, xp.shape[1])
-            if xp.dot(coef.T) + tree._interceptor[0] > 0:
-                return predict_class(xp, tree.get_down())
-            return predict_class(xp, tree.get_up())
-
+                    return prediction, indices
+            u, i_u, d, i_d = self._split_data(node, xp, indices)
+            k, l = predict_class(d, i_d, node.get_down())
+            m, n = predict_class(u, i_u, node.get_up())
+            return np.append(k, m), np.append(l, n)
        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
-        y = np.array([], dtype=int)
-        for xp in X:
-            y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
-        return y
+        indices = np.arange(X.shape[0])
+        return predict_class(X, indices, self._tree)
+    
+    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
+        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
+        indices = indices.astype(int)
+        for i, index in enumerate(indices):
+            y_ordered[index] = y[i]
+        return y_ordered
+
+    def predict(self, X: np.array) -> np.array:
+        return self._reorder_results(*self._predict_values(X))

    def predict_proba(self, X: np.array) -> np.array:
        self.__proba = True
-        result = self.predict(X).reshape(X.shape[0], 2)
+        result, indices = self._predict_values(X)
        self.__proba = False
-        return result
+        return self._reorder_results(result.reshape(X.shape[0], 2), indices)

    def score(self, X: np.array, y: np.array, print_out=True) -> float:
        if not self.__trained:
@@ -180,4 +191,4 @@ class Stree(BaseEstimator, ClassifierMixin):
        """Save the every dataset stored in the tree to check with manual classifier
        """
        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
-            self._save_datasets(self._tree, catalog, 1)
+            self._save_datasets(self._tree, catalog, 1)