Implement predict & predict_proba optimization

Reduces prediction time by two orders of magnitude on the creditcard dataset.
2020-05-15 23:35:33 +02:00
parent e56b955b92
commit 80b5cf8e72
6 changed files with 129 additions and 59 deletions
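The summary above claims roughly a hundredfold speedup. A minimal timing sketch in the spirit of the new test_single_vs_multiple_prediction test below, comparing per-sample calls against one batched call; the import path, the synthetic dataset (standing in for the creditcard data) and the parameters are assumptions for illustration, not part of the commit:

```python
# Timing sketch (assumptions: `from stree import Stree` is the import path,
# and a synthetic dataset stands in for the creditcard data).
import time

import numpy as np
from sklearn.datasets import make_classification
from stree import Stree

X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, random_state=1)
clf = Stree(random_state=1)
clf.fit(X, y)

# Per-sample prediction: one tree traversal per call.
start = time.perf_counter()
yp_line = np.array([clf.predict(x.reshape(1, -1))[0] for x in X])
t_line = time.perf_counter() - start

# Batched prediction: the optimized path handles all samples at once.
start = time.perf_counter()
yp_once = clf.predict(X)
t_once = time.perf_counter() - start

assert (yp_line == yp_once).all()
print(f"per sample: {t_line:.3f}s  batched: {t_once:.3f}s")
```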


@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
return X, y
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct
number of labels and that their children have the right number of elements in their datasets
Arguments:
node {Snode} -- node to check
"""
if node.is_leaf():
return
y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
unique_y, count_y = np.unique(node._y, return_counts=True)
_, count_d = np.unique(y_down, return_counts=True)
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
# as the node is not a leaf...
unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_down.shape[0])
self.assertEqual(count_yp[0], y_up.shape[0])
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_up.shape[0])
self.assertEqual(count_yp[0], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same predictions as the tree itself
"""Check if the subdatasets files have the same labels as the original dataset
"""
model = self._clf._tree._clf
X, y = self._get_Xy()
model.fit(X, y)
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label
"""
# Element 28 has a different prediction than the truth
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0])
self.assertEqual(1, y[28])
self.assertEqual(0.9282970550576184, yp[0:, 1])
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
yp = self._clf.predict_proba(X[:num,:])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
0.92829706, 0.92829706, 0.9759887 ]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
def test_use_model_predictions(self):
"""Check that we get the same results wether we use the estimator in nodes
to compute labes or we use the hyperplane and the position of samples wrt to it
"""
model_predictions = Stree(random_state=self._random_state,
use_predictions=True)
model_hyperplane = Stree(random_state=self._random_state,
use_predictions=False)
X, y = self._get_Xy()
model_predictions.fit(X, y)
model_hyperplane.fit(X, y)
self.assertListEqual(
model_predictions.predict(X).tolist(),
model_hyperplane.predict(X).tolist()
)
a = model_predictions.score(X, y, print_out=False)
b = model_hyperplane.score(X, y, print_out=False)
self.assertEqual(a, b)
self.assertGreater(b, .95)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting
all samples at once
"""
X, _ = self._get_Xy()
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
# Compute prediction at once
yp_once = self._clf.predict(X)
#
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
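For context on what the predict optimization typically amounts to, a generic sketch of batched tree prediction that routes index arrays through the tree instead of traversing it once per sample. Only is_leaf, get_down, get_up and _clf appear in the diff above; the leaf label attribute _class and the helper names are assumptions, and the "class 1 goes up, class 0 goes down" routing is inferred from the changed assertions in _check_tree.

```python
# Generic sketch of batched prediction over a binary oblique tree; not
# necessarily the exact implementation introduced by this commit.
import numpy as np

def _predict_batch(node, X, indices, out):
    """Fill out[indices] with labels for the samples X[indices] routed to node."""
    if indices.shape[0] == 0:
        return
    if node.is_leaf():
        out[indices] = node._class  # assumed attribute: label stored at the leaf
        return
    # The node's linear classifier decides which child each sample goes to:
    # class 1 goes up, class 0 goes down (as the updated _check_tree asserts).
    split = node._clf.predict(X[indices])
    _predict_batch(node.get_up(), X, indices[split == 1], out)
    _predict_batch(node.get_down(), X, indices[split == 0], out)

def predict_batched(root, X):
    out = np.empty(X.shape[0], dtype=int)
    _predict_batch(root, X, np.arange(X.shape[0]), out)
    return out
```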