#2 - Add gini and entropy measures

Rename get_dataset to load_dataset; add features and impurity to __str__ of node.
2025-08-18 17:06:01 +00:00 · 2020-06-14 03:08:55 +02:00
parent ae1c199e21
commit f1ee4de37b
5 changed files with 118 additions and 98 deletions
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -29,7 +29,15 @@ class Snode:
    dataset assigned to it
    """

-    def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str):
+    def __init__(
+        self,
+        clf: SVC,
+        X: np.ndarray,
+        y: np.ndarray,
+        features: np.array,
+        impurity: float,
+        title: str,
+    ):
        self._clf = clf
        self._title = title
        self._belief = 0.0
@@ -39,10 +47,21 @@ class Snode:
        self._down = None
        self._up = None
        self._class = None
+        self._feature = None
+        self._sample_weight = None
+        self._features = features
+        self._impurity = impurity

    @classmethod
    def copy(cls, node: "Snode") -> "Snode":
-        return cls(node._clf, node._X, node._y, node._title)
+        return cls(  # goes through __init__, so _up, _down and _class are None on the result
+            node._clf,
+            node._X,
+            node._y,
+            node._features,
+            node._impurity,
+            node._title,
+        )

    def set_down(self, son):
        self._down = son
@@ -83,11 +102,15 @@ class Snode:
            count_values = np.unique(self._y, return_counts=True)
            result = (
                f"{self._title} - Leaf class={self._class} belief="
-                f"{self._belief: .6f} counts={count_values}"
+                f"{self._belief: .6f} impurity={self._impurity:.4f} "
+                f"counts={count_values}"
            )
            return result
        else:
-            return f"{self._title}"
+            return (
+                f"{self._title} feaures={self._features} impurity="
+                f"{self._impurity:.4f}"
+            )


 class Siterator:
@@ -130,6 +153,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        degree: int = 3,
        gamma="scale",
        split_criteria: str = "max_samples",
+        criterion: str = "gini",
        min_samples_split: int = 0,
        max_features=None,
    ):
@@ -144,6 +168,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        self.min_samples_split = min_samples_split
        self.split_criteria = split_criteria
        self.max_features = max_features
+        self.criterion = criterion

    def _more_tags(self) -> dict:
        """Required by sklearn to supply features of the classifier
@@ -251,6 +276,10 @@ class Stree(BaseEstimator, ClassifierMixin):
                f"split_criteria has to be min_distance or \
                max_samples got ({self.split_criteria})"
            )
+        if self.criterion not in ["gini", "entropy"]:
+            raise ValueError(
+                f"criterion must be gini or entropy got({self.criterion})"
+            )

        check_classification_targets(y)
        X, y = check_X_y(X, y)
@@ -263,6 +292,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        self.depth_ = 0
        self.n_features_ = X.shape[1]
        self.max_features_ = self._initialize_max_features()
+        self.criterion_function_ = getattr(self, f"_{self.criterion}")
        self.tree_ = self.train(X, y, sample_weight, 1, "root")
        self._build_predictor()
        return self
@@ -296,12 +326,20 @@ class Stree(BaseEstimator, ClassifierMixin):
            return None
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
-            return Snode(None, X, y, title + ", <pure>")
+            return Snode(
+                clf=None,
+                X=X,
+                y=y,
+                features=X.shape[1],
+                impurity=0.0,
+                title=title + ", <pure>",
+            )
        # Train the model
        clf = self._build_clf()
        Xs, indices_subset = self._get_subspace(X)
        clf.fit(Xs, y, sample_weight=sample_weight)
-        node = Snode(clf, Xs, y, title)
+        impurity = self.criterion_function_(y)
+        node = Snode(clf, X, y, indices_subset, impurity, title)
        self.depth_ = max(depth, self.depth_)
        down = self._split_criteria(self._distances(node, Xs), node)
        X_U, X_D = self._split_array(X, down)
@@ -309,7 +347,14 @@ class Stree(BaseEstimator, ClassifierMixin):
        sw_u, sw_d = self._split_array(sample_weight, down)
        if X_U is None or X_D is None:
            # didn't part anything
-            return Snode(clf, X, y, title + ", <cgaf>")
+            return Snode(
+                clf,
+                X,
+                y,
+                features=X.shape[1],
+                impurity=impurity,
+                title=title + ", <cgaf>",
+            )
        node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
        node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
        return node
@@ -484,6 +529,17 @@ class Stree(BaseEstimator, ClassifierMixin):
                )
        return max_features

+    @staticmethod
+    def _gini(y: np.array) -> float:  # Gini impurity of the labels in y
+        _, count = np.unique(y, return_counts=True)  # occurrences of each distinct label
+        return 1 - np.sum(np.square(count / np.sum(count)))  # 1 - sum(p_i ** 2), p_i = share of label i
+
+    @staticmethod
+    def _entropy(y: np.array) -> float:  # Shannon entropy (base 2) of the labels in y
+        _, count = np.unique(y, return_counts=True)  # occurrences of each distinct label
+        proportion = count / np.sum(count)  # share of each label; only labels present in y are listed
+        return -np.sum(proportion * np.log2(proportion))  # -sum(p_i * log2(p_i))
+
    def _get_subspace(self, dataset: np.array) -> list:
        """Return the best subspace to make a split
        """