Implement predict_proba with test.

Fix tree overload with dataset in nodes only needed in tests
2025-08-16 07:56:06 +00:00 · 2020-05-14 18:42:17 +02:00
parent e3ae3a3a6c
commit e56b955b92
7 changed files with 154 additions and 281 deletions
--- a/trees/Snode.py
+++ b/trees/Snode.py
@@ -6,6 +6,7 @@ __version__ = "0.9"
 Node of the Stree (binary tree)
 '''

+import os
 import numpy as np
 from sklearn.svm import LinearSVC

@@ -17,11 +18,12 @@ class Snode:
        self._interceptor = 0. if clf is None else clf.intercept_
        self._title = title
        self._belief = 0.  # belief of the prediction in a leaf node based on samples
-        self._X = X
+        self._X = X if os.environ.get(
+            'TESTING', 'Not Set') != 'Not Set' else None
        self._y = y
        self._down = None
        self._up = None
-        self._class = None  # really needed?
+        self._class = None

    def set_down(self, son):
        self._down = son
@@ -42,6 +44,9 @@ class Snode:
        """Compute the class of the predictor and its belief based on the subdataset of the node
        only if it is a leaf
        """
+        # Clean memory
+        #self._X = None
+        #self._y = None
        if not self.is_leaf():
            return
        classes, card = np.unique(self._y, return_counts=True)
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -1,3 +1,4 @@
+# This Python file uses the following encoding: utf-8
 '''
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
@@ -10,23 +11,37 @@ Uses LinearSVC
 import numpy as np
 import typing
 from sklearn.svm import LinearSVC
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

 from trees.Snode import Snode


-class Stree:
+class Stree(BaseEstimator, ClassifierMixin):
    """
    """

-    def __init__(self, C=1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
+    def __init__(self, C=1.0, max_iter: int=1000, random_state: int=0, use_predictions: bool=False):
        self._max_iter = max_iter
        self._C = C
        self._random_state = random_state
-        self._outcomes = None
        self._tree = None
        self.__folder = 'data/'
        self.__use_predictions = use_predictions
        self.__trained = False
+        self.__proba = False
+
+    def get_params(self, deep=True):
+        """Get dict with hyperparameters and its values to accomplish sklearn rules
+        """
+        return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter}
+
+    def set_params(self, **parameters):
+        """Set hyperparmeters as specified by sklearn, needed in Gridsearchs
+        """
+        for parameter, value in parameters.items():
+            setattr(self, parameter, value)
+        return self

    def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
        if self.__use_predictions:
@@ -47,6 +62,8 @@ class Stree:
        return [X_up, y_up, X_down, y_down]

    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
+        X, y = check_X_y(X, y)
+        self.n_features_in_ = X.shape[1]
        self._tree = self.train(X, y.ravel(), title)
        self._build_predictor()
        self.__trained = True
@@ -83,16 +100,31 @@ class Stree:
    def predict(self, X: np.array) -> np.array:
        def predict_class(xp: np.array, tree: Snode) -> np.array:
            if tree.is_leaf():
-                return tree._class
+                if self.__proba:
+                    return [tree._class, tree._belief]
+                else:
+                    return tree._class
            coef = tree._vector[0, :].reshape(-1, xp.shape[1])
            if xp.dot(coef.T) + tree._interceptor[0] > 0:
                return predict_class(xp, tree.get_down())
            return predict_class(xp, tree.get_up())
+
+        # sklearn check
+        check_is_fitted(self)
+        # Input validation
+        X = check_array(X)
+        # setup prediction & make it happen
        y = np.array([], dtype=int)
        for xp in X:
            y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
        return y

+    def predict_proba(self, X: np.array) -> np.array:
+        self.__proba = True
+        result = self.predict(X).reshape(X.shape[0], 2)
+        self.__proba = False
+        return result
+
    def score(self, X: np.array, y: np.array, print_out=True) -> float:
        if not self.__trained:
            self.fit(X, y)