#3 Rewrite some tests & remove use_predictions

Remove the use_predictions parameter: as of now, the model always uses the node classifier's predictions (decision_function)
2020-06-08 01:51:21 +02:00
parent 05b462716e
commit 3a48d8b405
3 changed files with 64 additions and 139 deletions
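
The removed code path computed each sample's distance to the node hyperplane by hand from the SVM coefficients; after this commit the estimator's decision_function is always used instead. A minimal sketch (not from the repository) of why the two agree for a linear kernel:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import SVC

# Sketch: for a fitted linear SVC the hand-rolled hyperplane distance
# X @ coef_.T + intercept_ matches decision_function, which is why the
# use_predictions branch could be dropped. Non-linear kernels expose no
# coef_, so decision_function is the only option there anyway.
X, y = make_classification(random_state=1)
clf = SVC(kernel="linear", random_state=1).fit(X, y)
manual = X @ clf.coef_.T + clf.intercept_
delegated = np.expand_dims(clf.decision_function(X), 1)
assert np.allclose(manual, delegated)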

View File

@@ -126,14 +126,12 @@ class Stree(BaseEstimator, ClassifierMixin):
random_state: int = None,
max_depth: int = None,
tol: float = 1e-4,
use_predictions: bool = False,
min_samples_split: int = 0,
):
self.max_iter = max_iter
self.C = C
self.kernel = kernel
self.random_state = random_state
self.use_predictions = use_predictions
self.max_depth = max_depth
self.tol = tol
self.min_samples_split = min_samples_split
@@ -172,6 +170,7 @@ class Stree(BaseEstimator, ClassifierMixin):
:rtype: list
"""
up = ~down
print(self.kernel, up.shape, down.shape)
return (
origin[up[:, 0]] if any(up) else None,
origin[down[:, 0]] if any(down) else None,
@@ -188,14 +187,7 @@ class Stree(BaseEstimator, ClassifierMixin):
the hyperplane of the node
:rtype: np.array
"""
if self.use_predictions:
res = np.expand_dims(node._clf.decision_function(data), 1)
else:
# doesn't work with multiclass as each sample has to do an inner
# product with its own coefficients; computes the position of every
# sample w.r.t. the hyperplane
res = self._linear_function(data, node)
return res
return np.expand_dims(node._clf.decision_function(data), 1)
def _split_criteria(self, data: np.array) -> np.array:
"""Set the criteria to split arrays

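The split above partitions a node's samples by the side of the hyperplane they fall on: down holds one side, up = ~down the other, and an empty side becomes None. A hedged sketch of that pattern with hypothetical names (the down = decision > 0 criterion is an assumption for illustration, not the repository's code):

import numpy as np

def split_by_hyperplane(origin: np.ndarray, decision: np.ndarray):
    # decision is an (n, 1) column of signed distances to the hyperplane
    down = decision > 0  # assumed criterion, for illustration only
    up = ~down
    return (
        origin[up[:, 0]] if up.any() else None,
        origin[down[:, 0]] if down.any() else None,
    )

decision = np.array([[1.2], [-0.3], [0.7]])
data = np.arange(6).reshape(3, 2)
up_side, down_side = split_by_hyperplane(data, decision)  # 1 vs 2 samples
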
View File

@@ -32,9 +32,7 @@ class Stree_grapher_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ["TESTING"] = "1"
self._random_state = 1
self._clf = Stree_grapher(
dict(random_state=self._random_state, use_predictions=False)
)
self._clf = Stree_grapher(dict(random_state=self._random_state))
self._clf.fit(*get_dataset(self._random_state, n_features=4))
super().__init__(*args, **kwargs)
@@ -102,9 +100,7 @@ class Snode_graph_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ["TESTING"] = "1"
self._random_state = 1
self._clf = Stree_grapher(
dict(random_state=self._random_state, use_predictions=False)
)
self._clf = Stree_grapher(dict(random_state=self._random_state))
self._clf.fit(*get_dataset(self._random_state))
super().__init__(*args, **kwargs)

View File

@@ -28,10 +28,7 @@ class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ["TESTING"] = "1"
self._random_state = 1
self._clf = Stree(
random_state=self._random_state, use_predictions=False
)
self._clf.fit(*get_dataset(self._random_state))
self._kernels = ["linear", "rbf", "poly"]
super().__init__(*args, **kwargs)
@classmethod
@@ -82,7 +79,10 @@ class Stree_test(unittest.TestCase):
def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models
"""
self._check_tree(self._clf.tree_)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(*get_dataset(self._random_state))
self._check_tree(clf.tree_)
def _find_out(
self, px: np.array, x_original: np.array, y_original
@@ -105,148 +105,85 @@ class Stree_test(unittest.TestCase):
return res
def test_single_prediction(self):
probs = [0.29026400766, 0.73105613, 0.0307635]
X, y = get_dataset(self._random_state)
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
self.assertEqual(yp[0], y[0])
for kernel, prob in zip(self._kernels, probs):
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
self.assertEqual(yp[0], y[0])
def test_multiple_prediction(self):
# For the first 27 elements the predictions are the same as the truth
num = 27
X, y = get_dataset(self._random_state)
yp = self._clf.predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_score(self):
X, y = get_dataset(self._random_state)
for kernel in ["linear"]:
clf = Stree(
random_state=self._random_state,
kernel=kernel,
use_predictions=True,
)
for kernel, accuracy_expected in zip(
self._kernels,
[0.9506666666666667, 0.9606666666666667, 0.9433333333333334],
):
clf = Stree(random_state=self._random_state, kernel=kernel)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.9)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
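test_score relies on score being plain accuracy: for a scikit-learn ClassifierMixin, score is the mean of correct predictions, so computing it by hand with np.mean must match. A quick stand-in illustration (LogisticRegression instead of Stree):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# ClassifierMixin.score is mean accuracy, hence the assertEqual above
X, y = make_classification(random_state=1)
clf = LogisticRegression().fit(X, y)
assert clf.score(X, y) == np.mean(clf.predict(X) == y)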
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current
label
"""Check the element 28 probability of being 1
"""
# Element 28 has a different prediction than the truth
decimals = 5
prob = 0.29026400766
element = 28
probs = [0.29026400766, 0.73105613, 0.0307635]
X, y = get_dataset(self._random_state)
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
)
self.assertEqual(1, y[28])
self.assertAlmostEqual(
round(prob, decimals), round(yp[0, 1], decimals), decimals
)
self.assertEqual(1, y[element])
for kernel, prob in zip(self._kernels, probs):
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict_proba(
X[element, :].reshape(-1, X.shape[1])
)
self.assertAlmostEqual(
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
)
self.assertAlmostEqual(
round(prob, decimals), round(yp[0, 1], decimals), decimals
)
def test_multiple_predict_proba(self):
# For the first 27 elements the predictions are the same as the truth
num = 27
decimals = 5
X, y = get_dataset(self._random_state)
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
)
expected_proba = [
0.88395641,
0.36746962,
0.84158767,
0.34106833,
0.14269291,
0.85193236,
0.29876058,
0.7282164,
0.85958616,
0.89517877,
0.99745224,
0.18860349,
0.30756427,
0.8318412,
0.18981198,
0.15564624,
0.25740655,
0.22923355,
0.87365959,
0.49928689,
0.95574351,
0.28761257,
0.28906333,
0.32643692,
0.29788483,
0.01657364,
0.81149083,
]
expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist()
for i in range(len(expected)):
self.assertAlmostEqual(expected[i], computed[i], decimals)
def build_models(self):
"""Build and train two models, model_clf will use the sklearn
classifier to compute predictions and split data. model_computed will
use vector of coefficients to compute both predictions and splitted
data
"""
model_clf = Stree(
random_state=self._random_state, use_predictions=True
)
model_computed = Stree(
random_state=self._random_state, use_predictions=False
)
X, y = get_dataset(self._random_state)
model_clf.fit(X, y)
model_computed.fit(X, y)
return model_clf, model_computed, X, y
def test_use_model_predict(self):
"""Check that we get the same results wether we use the estimator in
nodes to compute labels or we use the hyperplane and the position of
samples wrt to it
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
use_clf.predict(X).tolist(), use_math.predict(X).tolist()
)
def test_use_model_score(self):
use_clf, use_math, X, y = self.build_models()
b = use_math.score(X, y)
self.assertEqual(use_clf.score(X, y), b)
self.assertGreater(b, 0.95)
def test_use_model_predict_proba(self):
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
use_clf.predict_proba(X).tolist(),
use_math.predict_proba(X).tolist(),
)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(X, y)
yp = clf.predict_proba(X[:num, :])
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
)
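The two predict_proba tests lean on binary-classification invariants: the probability columns are complementary, so column 0 equals 1 - column 1 (the 1 - prob assertion above), and the argmax over the columns reproduces the hard prediction. Illustrated with a stock classifier standing in for Stree:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(random_state=1)
clf = LogisticRegression().fit(X, y)
proba = clf.predict_proba(X)
assert np.allclose(proba.sum(axis=1), 1.0)  # column 0 is 1 - column 1
assert (np.argmax(proba, axis=1) == clf.predict(X)).all()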
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
"""
X, _ = get_dataset(self._random_state)
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(
yp_line, self._clf.predict(xp.reshape(-1, X.shape[1]))
)
# Compute prediction at once
yp_once = self._clf.predict(X)
#
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
X, y = get_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(X, y)
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
)
# Compute prediction at once
yp_once = clf.predict(X)
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
def test_iterator_and_str(self):
"""Check preorder iterator
@@ -266,11 +203,13 @@ class Stree_test(unittest.TestCase):
]
computed = []
expected_string = ""
for node in self._clf:
clf = Stree(kernel="linear", random_state=self._random_state)
clf.fit(*get_dataset(self._random_state))
for node in clf:
computed.append(str(node))
expected_string += str(node) + "\n"
self.assertListEqual(expected, computed)
self.assertEqual(expected_string, str(self._clf))
self.assertEqual(expected_string, str(clf))
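test_iterator_and_str walks the fitted tree in preorder and compares the visited nodes against str(clf). A minimal sketch of a preorder __iter__ for a binary node (hypothetical Node class; the child order is an assumption, not necessarily Snode's):

class Node:
    # hypothetical stand-in for a tree node with two children
    def __init__(self, title, down=None, up=None):
        self.title = title
        self.down = down
        self.up = up

    def __iter__(self):
        yield self  # preorder: the node itself comes first
        for child in (self.down, self.up):
            if child is not None:
                yield from child

root = Node("root", Node("leaf-down"), Node("leaf-up"))
print([n.title for n in root])  # ['root', 'leaf-down', 'leaf-up']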
def test_is_a_sklearn_classifier(self):
import warnings
@@ -323,9 +262,7 @@ class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ["TESTING"] = "1"
self._random_state = 1
self._clf = Stree(
random_state=self._random_state, use_predictions=True
)
self._clf = Stree(random_state=self._random_state)
self._clf.fit(*get_dataset(self._random_state))
super().__init__(*args, **kwargs)