Better solution to the sklearn bagging problem

Add better tests
Enhance .coveragerc
2020-06-26 11:22:45 +02:00
parent 76723993fd
commit 4b7e4a3fb0
5 changed files with 66 additions and 58 deletions
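
Context for the fix: sklearn's BaggingClassifier materializes each bootstrap sample by passing the draw counts as sample_weight, so out-of-bag rows arrive with weight 0 and a whole class can reach the inner SVC with zero total weight, at which point libsvm prints "WARNING: class label 0 specified in weight is not found". A minimal, hedged reproducer sketch; the dataset and estimator settings are illustrative, and base_estimator is the parameter name in sklearn releases of this era:

    # Illustrative repro of the warning this commit addresses; assumes
    # the stree package is installed. BaggingClassifier passes bootstrap
    # counts as sample_weight, so unsampled rows arrive with weight 0.
    from sklearn.datasets import load_wine
    from sklearn.ensemble import BaggingClassifier
    from stree import Stree

    X, y = load_wine(return_X_y=True)
    ensemble = BaggingClassifier(
        base_estimator=Stree(random_state=0),
        n_estimators=10,
        random_state=0,
    )
    ensemble.fit(X, y)  # without the patch libsvm may print the warning
    print(ensemble.score(X, y))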

View File

@@ -10,5 +10,4 @@ exclude_lines =
if __name__ == .__main__.:
ignore_errors = True
omit =
stree/tests/*
stree/__init__.py

View File

@@ -40,6 +40,7 @@ class Snode:
features: np.array,
impurity: float,
title: str,
weight: np.ndarray = None,
):
self._clf = clf
self._title = title
@@ -51,7 +52,9 @@ class Snode:
self._up = None
self._class = None
self._feature = None
self._sample_weight = None
self._sample_weight = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features = features
self._impurity = impurity
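
The stored copy is gated on a TESTING environment variable so that production trees do not keep per-node weight arrays; "NS" is just a "not set" sentinel. A self-contained sketch of the check (the value "1" is arbitrary, only the variable's existence matters):

    import os

    # "NS" stands in for "not set": if TESTING is absent the comparison
    # is False and the node discards its sample weights.
    os.environ.pop("TESTING", None)
    print(os.environ.get("TESTING", "NS") != "NS")  # False -> weight dropped
    os.environ["TESTING"] = "1"
    print(os.environ.get("TESTING", "NS") != "NS")  # True -> weight kept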
@@ -443,9 +446,6 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
# solve WARNING: class label 0 specified in weight is not found
# in bagging
sample_weight += 1e-5
check_classification_targets(y)
# Initialize computed parameters
self.splitter_ = Splitter(
@@ -505,13 +505,22 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=0.0,
title=title + ", <pure>",
weight=sample_weight,
)
# Train the model
clf = self._build_clf()
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
# solve "WARNING: class label 0 specified in weight is not found"
# printed by libsvm when bagging leaves a class with only zero weights
zero_weights = sample_weight == 0
if np.any(zero_weights):
    y_next = y[~zero_weights]
    # bump the weights only if removing the zero-weight
    # samples would erase a whole class from this node
    if np.unique(y_next).shape[0] != self.n_classes_:
        sample_weight += 1e-5
clf.fit(Xs, y, sample_weight=sample_weight)
impurity = self.splitter_.impurity(y)
node = Snode(clf, X, y, features, impurity, title)
node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_)
self.splitter_.partition(X, node)
X_U, X_D = self.splitter_.part(X)
@@ -526,6 +535,7 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=impurity,
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
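
Compared with the previous unconditional sample_weight += 1e-5 in fit (removed above), which turned deliberate zero weights into small positive ones for every sample, the new check only nudges the weights when the zero-weight rows would actually erase a class. The same logic as a standalone sketch; the function name and signature are illustrative:

    import numpy as np

    def bump_weights_if_class_lost(sample_weight, y, n_classes, eps=1e-5):
        """Mirror of the patch: add eps to every weight only when the
        zero-weight samples would remove a whole class from the node."""
        zero = sample_weight == 0
        if np.any(zero) and np.unique(y[~zero]).shape[0] != n_classes:
            sample_weight += eps  # in place, as in train()
        return sample_weight

    w = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
    y = np.array([0, 0, 0, 1, 1, 1])
    print(bump_weights_if_class_lost(w, y, n_classes=2))
    # every weight bumped by 1e-5 because class 1 would disappear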

View File

@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
else:
belief = 1
self.assertEqual(belief, node._belief)
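
The deleted ZeroDivisionError guard was unreachable: assuming card holds the class counts from np.unique(..., return_counts=True), as the surrounding test suggests, every count is at least 1, so with more than one class the denominator is at least 2. A quick check under that assumption:

    import numpy as np

    # counts from np.unique are >= 1, so max_card + min_card >= 2
    # whenever there is more than one class and the division is safe
    y = np.array([0, 0, 1])
    _, card = np.unique(y, return_counts=True)
    max_card, min_card = max(card), min(card)
    print(max_card / (max_card + min_card))  # 0.666...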

View File

@@ -178,20 +178,23 @@ class Splitter_test(unittest.TestCase):
def test_splitter_parameter(self):
expected_values = [
[1, 3, 4, 5], # random gini min_distance
[0, 1, 3, 4], # random gini max_samples
[1, 2, 4, 5], # random gini max_distance
[0, 2, 3, 5], # random entropy min_distance
[0, 2, 3, 5], # random entropy max_samples
[0, 1, 3, 4], # random entropy max_distance
[0, 1, 2, 5], # best gini min_distance
[2, 3, 4, 5], # best gini max_samples
[0, 2, 3, 4], # best gini max_distance
[0, 1, 2, 5], # best entropy min_distance
[2, 3, 4, 5], # best entropy max_samples
[0, 1, 2, 4], # best entropy max_distance
[1, 2], # random gini min_distance
[0, 2], # random gini max_samples
[1, 3], # random gini max_distance
[1, 2], # random entropy min_distance
[1, 2], # random entropy max_samples
[0, 2], # random entropy max_distance
[1, 2], # best gini min_distance
[0, 2], # best gini max_samples
[0, 2], # best gini max_distance
[0, 1], # best entropy min_distance
[0, 1], # best entropy max_samples
[0, 1], # best entropy max_distance
]
X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
rn = 0
for splitter_type in ["random", "best"]:
for criterion in ["gini", "entropy"]:
@@ -208,21 +211,11 @@ class Splitter_test(unittest.TestCase):
)
rn += 3
expected = expected_values.pop(0)
dataset, computed = tcl.get_subspace(X, y, max_features=4)
# Flaky test
if (
splitter_type == "best"
and criteria == "max_distance"
and criterion == "gini"
and computed == (1, 2, 3, 4)
):
# sometimes returns (0, 2, 3, 4) and sometimes
# (1, 2, 3, 4)
expected = [1, 2, 3, 4]
dataset, computed = tcl.get_subspace(X, y, max_features=2)
# print(
# "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), splitter_type,
# criterion, criteria,
# list(computed), splitter_type, criterion,
# criteria,
# )
# )
self.assertListEqual(expected, list(computed))
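
Switching from the synthetic six-feature dataset to iris trades generated data for a fixed, well-known one, which is presumably what allows the flaky best/gini/max_distance special case above to be dropped. A quick sanity check of the new setup:

    import numpy as np
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    print(X.shape)       # (150, 4): four features to pick 2-subspaces from
    print(np.unique(y))  # [0 1 2]: still three classes, as before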

View File

@@ -41,10 +41,7 @@ class Stree_test(unittest.TestCase):
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
except IndexError:
number_down = 0
try:
number_up = count_u[i]
except IndexError:
@@ -67,25 +64,6 @@ class Stree_test(unittest.TestCase):
clf.fit(*load_dataset(self._random_state))
self._check_tree(clf.tree_)
@staticmethod
def _find_out(px: np.array, x_original: np.array, y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
px {np.array} -- array of samples to search for
x_original {np.array} -- original dataset
y_original {[type]} -- original classes
Returns:
np.array -- classes of the given samples
"""
res = []
for needle in px:
for row in range(x_original.shape[0]):
if all(x_original[row, :] == needle):
res.append(y_original[row])
return res
def test_single_prediction(self):
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
@@ -405,3 +383,34 @@ class Stree_test(unittest.TestCase):
clf = Stree(splitter="duck")
with self.assertRaises(ValueError):
clf.fit(*load_dataset())
def test_weights_removing_class(self):
# This patch silences a stderr message from the sklearn svm lib:
# "WARNING: class label x specified in weight is not found"
X = np.array(
[
[0.1, 0.1],
[0.1, 0.2],
[0.2, 0.1],
[5, 6],
[8, 9],
[6, 7],
[0.2, 0.2],
]
)
y = np.array([0, 0, 0, 1, 1, 1, 0])
epsilon = 1e-5
weights = [1, 1, 1, 0, 0, 0, 1]
weights = np.array(weights, dtype="float64")
weights_epsilon = [x + epsilon for x in weights]
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
original = weights_no_zero.copy()
clf = Stree()
clf.fit(X, y)
node = clf.train(X, y, weights, 1, "test")
# when the zero weights would drop a class, the patch adds epsilon
# to every weight, mutating the array passed in
self.assertListEqual(weights.tolist(), weights_epsilon)
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
# zero weights are fine while no class loses all of its weight
_ = clf.train(X, y, weights_no_zero, 1, "test")
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
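
The assertions on weights itself pass because the patch uses numpy's augmented assignment, which mutates the very array the caller handed to train(). A minimal demonstration of that semantics:

    import numpy as np

    def bump(a):
        a += 1e-5  # augmented assignment mutates the caller's array

    w = np.zeros(3, dtype="float64")
    bump(w)
    print(w)  # [1.e-05 1.e-05 1.e-05], visible outside the function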