Implement optimized predict and new predict_proba

2025-08-18 17:06:01 +00:00 · 2022-05-31 19:12:48 +02:00
parent 65923af9b4
commit 0a78d5be67
3 changed files with 58 additions and 52 deletions
--- a/stree/Splitter.py
+++ b/stree/Splitter.py
@@ -135,7 +135,7 @@ class Snode:
        if not self.is_leaf():
            return
        classes, card = np.unique(self._y, return_counts=True)
-        self._proba = np.zeros((num_classes,))
+        self._proba = np.zeros((num_classes,), dtype=np.int64)
        for c, n in zip(classes, card):
            self._proba[c] = n
        try:
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -367,28 +367,66 @@ class Stree(BaseEstimator, ClassifierMixin):
            )
        )
-    @staticmethod
+    def __predict_class(self, X: np.array) -> np.array:
-    def _reorder_results(y: np.array, indices: np.array) -> np.array:
+        def compute_prediction(xp, indices, node):
-        """Reorder an array based on the array of indices passed
+            if xp is None:
                return
            if node.is_leaf():
                # set a class for indices
                result[indices] = node._proba
                return
            self.splitter_.partition(xp, node, train=False)
            x_u, x_d = self.splitter_.part(xp)
            i_u, i_d = self.splitter_.part(indices)
            compute_prediction(x_u, i_u, node.get_up())
            compute_prediction(x_d, i_d, node.get_down())
        # setup prediction & make it happen
        result = np.zeros((X.shape[0], self.n_classes_))
        indices = np.arange(X.shape[0])
        compute_prediction(X, indices, self.tree_)
        return result
    def check_predict(self, X) -> np.array:
        check_is_fitted(self, ["tree_"])
        # Input validation
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f"Expected {self.n_features_} features but got "
                f"({X.shape[1]})"
            )
        return X
    def predict_proba(self, X: np.array) -> np.array:
        """Predict class probabilities of the input samples X.
        The predicted class probability is the fraction of samples of the same
        class in a leaf.
        Parameters
        ----------
-        y : np.array
+        X : dataset of samples.
            data untidy
        indices : np.array
            indices used to set order
        Returns
        -------
-        np.array
+        proba : array of shape (n_samples, n_classes)
-            array y ordered
+            The class probabilities of the input samples.
        Raises
        ------
        ValueError
            if dataset with inconsistent number of features
        NotFittedError
            if model is not fitted
        """
-        # return array of same type given in y
+
-        y_ordered = y.copy()
+        X = self.check_predict(X)
-        indices = indices.astype(int)
+        # return # of samples of each class in leaf node
-        for i, index in enumerate(indices):
+        values = self.__predict_class(X)
-            y_ordered[index] = y[i]
+        normalizer = values.sum(axis=1)[:, np.newaxis]
-        return y_ordered
+        normalizer[normalizer == 0.0] = 1.0
        return values / normalizer
    def predict(self, X: np.array) -> np.array:
        """Predict labels for each sample in dataset passed
@@ -410,40 +448,8 @@ class Stree(BaseEstimator, ClassifierMixin):
        NotFittedError
            if model is not fitted
        """
-
+        X = self.check_predict(X)
-        def predict_class(
+        return self.classes_[np.argmax(self.__predict_class(X), axis=1)]
            xp: np.array, indices: np.array, node: Snode
        ) -> np.array:
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                return prediction, indices
            self.splitter_.partition(xp, node, train=False)
            x_u, x_d = self.splitter_.part(xp)
            i_u, i_d = self.splitter_.part(indices)
            prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
            prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
            return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
        # sklearn check
        check_is_fitted(self, ["tree_"])
        # Input validation
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f"Expected {self.n_features_} features but got "
                f"({X.shape[1]})"
            )
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        result = (
            self._reorder_results(*predict_class(X, indices, self.tree_))
            .astype(int)
            .ravel()
        )
        return self.classes_[result]
    def nodes_leaves(self) -> tuple:
        """Compute the number of nodes and leaves in the built tree
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -695,7 +695,7 @@ class Stree_test(unittest.TestCase):
        )
        expected_tail = (
            ' [shape=box style=filled label="class=1 impurity=0.000 '
-            'counts=[0. 1. 0.]"];\n}\n'
+            'counts=[0 1 0]"];\n}\n'
        )
        self.assertEqual(clf.graph(), expected_head + "}\n")
        clf.fit(X, y)
@@ -715,7 +715,7 @@ class Stree_test(unittest.TestCase):
        )
        expected_tail = (
            ' [shape=box style=filled label="class=1 impurity=0.000 '
-            'counts=[0. 1. 0.]"];\n}\n'
+            'counts=[0 1 0]"];\n}\n'
        )
        self.assertEqual(clf.graph("Sample title"), expected_head + "}\n")
        clf.fit(X, y)