diff --git a/.coveragerc b/.coveragerc index e0f489f..a93868c 100644 --- a/.coveragerc +++ b/.coveragerc @@ -10,5 +10,4 @@ exclude_lines = if __name__ == .__main__.: ignore_errors = True omit = - stree/tests/* stree/__init__.py \ No newline at end of file diff --git a/stree/Strees.py b/stree/Strees.py index 7c32d32..9baebae 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -40,6 +40,7 @@ class Snode: features: np.array, impurity: float, title: str, + weight: np.ndarray = None, ): self._clf = clf self._title = title @@ -51,7 +52,9 @@ class Snode: self._up = None self._class = None self._feature = None - self._sample_weight = None + self._sample_weight = ( + weight if os.environ.get("TESTING", "NS") != "NS" else None + ) self._features = features self._impurity = impurity @@ -443,9 +446,6 @@ class Stree(BaseEstimator, ClassifierMixin): sample_weight = _check_sample_weight( sample_weight, X, dtype=np.float64 ) - # solve WARNING: class label 0 specified in weight is not found - # in bagging - sample_weight += 1e-5 check_classification_targets(y) # Initialize computed parameters self.splitter_ = Splitter( @@ -505,13 +505,22 @@ class Stree(BaseEstimator, ClassifierMixin): features=X.shape[1], impurity=0.0, title=title + ", ", + weight=sample_weight, ) # Train the model clf = self._build_clf() Xs, features = self.splitter_.get_subspace(X, y, self.max_features_) + # solve WARNING: class label 0 specified in weight is not found + # in bagging + if any(sample_weight == 0): + indices = sample_weight == 0 + y_next = y[~indices] + # touch weights if removing any class + if np.unique(y_next).shape[0] != self.n_classes_: + sample_weight += 1e-5 clf.fit(Xs, y, sample_weight=sample_weight) impurity = self.splitter_.impurity(y) - node = Snode(clf, X, y, features, impurity, title) + node = Snode(clf, X, y, features, impurity, title, sample_weight) self.depth_ = max(depth, self.depth_) self.splitter_.partition(X, node) X_U, X_D = self.splitter_.part(X) @@ -526,6 +535,7 @@ class Stree(BaseEstimator, ClassifierMixin): features=X.shape[1], impurity=impurity, title=title + ", ", + weight=sample_weight, ) node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up")) node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down")) diff --git a/stree/tests/Snode_test.py b/stree/tests/Snode_test.py index 6f3c4d2..27e5d0a 100644 --- a/stree/tests/Snode_test.py +++ b/stree/tests/Snode_test.py @@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase): max_card = max(card) min_card = min(card) if len(classes) > 1: - try: - belief = max_card / (max_card + min_card) - except ZeroDivisionError: - belief = 0.0 + belief = max_card / (max_card + min_card) else: belief = 1 self.assertEqual(belief, node._belief) diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py index 16b150a..1dcdbbd 100644 --- a/stree/tests/Splitter_test.py +++ b/stree/tests/Splitter_test.py @@ -178,20 +178,23 @@ class Splitter_test(unittest.TestCase): def test_splitter_parameter(self): expected_values = [ - [1, 3, 4, 5], # random gini min_distance - [0, 1, 3, 4], # random gini max_samples - [1, 2, 4, 5], # random gini max_distance - [0, 2, 3, 5], # random entropy min_distance - [0, 2, 3, 5], # random entropy max_samples - [0, 1, 3, 4], # random entropy max_distance - [0, 1, 2, 5], # best gini min_distance - [2, 3, 4, 5], # best gini max_samples - [0, 2, 3, 4], # best gini max_distance - [0, 1, 2, 5], # best entropy min_distance - [2, 3, 4, 5], # best entropy max_samples - [0, 1, 2, 4], # best entropy max_distance + [1, 2], # random gini min_distance + [0, 2], # random gini max_samples + [1, 3], # random gini max_distance + [1, 2], # random entropy min_distance + [1, 2], # random entropy max_samples + [0, 2], # random entropy max_distance + [1, 2], # best gini min_distance + [0, 2], # best gini max_samples + [0, 2], # best gini max_distance + [0, 1], # best entropy min_distance + [0, 1], # best entropy max_samples + [0, 1], # best entropy max_distance ] X, y = load_dataset(self._random_state, n_features=6, n_classes=3) + from sklearn.datasets import load_iris + + X, y = load_iris(return_X_y=True) rn = 0 for splitter_type in ["random", "best"]: for criterion in ["gini", "entropy"]: @@ -208,21 +211,11 @@ class Splitter_test(unittest.TestCase): ) rn += 3 expected = expected_values.pop(0) - dataset, computed = tcl.get_subspace(X, y, max_features=4) - # Flaky test - if ( - splitter_type == "best" - and criteria == "max_distance" - and criterion == "gini" - and computed == (1, 2, 3, 4) - ): - # sometimes returns (0, 2, 3, 4) and sometimes - # (1, 2, 3, 4) - expected = [1, 2, 3, 4] + dataset, computed = tcl.get_subspace(X, y, max_features=2) # print( # "{}, # {:7s}{:8s}{:15s}".format( - # list(computed), splitter_type, - # criterion, criteria, + # list(computed), splitter_type, criterion, + # criteria, # ) # ) self.assertListEqual(expected, list(computed)) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0a41874..e16a69f 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -41,10 +41,7 @@ class Stree_test(unittest.TestCase): _, count_u = np.unique(y_up, return_counts=True) # for i in unique_y: - try: - number_down = count_d[i] - except IndexError: - number_down = 0 + number_down = count_d[i] try: number_up = count_u[i] except IndexError: @@ -67,25 +64,6 @@ class Stree_test(unittest.TestCase): clf.fit(*load_dataset(self._random_state)) self._check_tree(clf.tree_) - @staticmethod - def _find_out(px: np.array, x_original: np.array, y_original) -> list: - """Find the original values of y for a given array of samples - - Arguments: - px {np.array} -- array of samples to search for - x_original {np.array} -- original dataset - y_original {[type]} -- original classes - - Returns: - np.array -- classes of the given samples - """ - res = [] - for needle in px: - for row in range(x_original.shape[0]): - if all(x_original[row, :] == needle): - res.append(y_original[row]) - return res - def test_single_prediction(self): X, y = load_dataset(self._random_state) for kernel in self._kernels: @@ -405,3 +383,34 @@ class Stree_test(unittest.TestCase): clf = Stree(splitter="duck") with self.assertRaises(ValueError): clf.fit(*load_dataset()) + + def test_weights_removing_class(self): + # This patch solves an stderr message from sklearn svm lib + # "WARNING: class label x specified in weight is not found" + X = np.array( + [ + [0.1, 0.1], + [0.1, 0.2], + [0.2, 0.1], + [5, 6], + [8, 9], + [6, 7], + [0.2, 0.2], + ] + ) + y = np.array([0, 0, 0, 1, 1, 1, 0]) + epsilon = 1e-5 + weights = [1, 1, 1, 0, 0, 0, 1] + weights = np.array(weights, dtype="float64") + weights_epsilon = [x + epsilon for x in weights] + weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1]) + original = weights_no_zero.copy() + clf = Stree() + clf.fit(X, y) + node = clf.train(X, y, weights, 1, "test",) + # if a class is lost with zero weights the patch adds epsilon + self.assertListEqual(weights.tolist(), weights_epsilon) + self.assertListEqual(node._sample_weight.tolist(), weights_epsilon) + # zero weights are ok when they don't erase a class + _ = clf.train(X, y, weights_no_zero, 1, "test") + self.assertListEqual(weights_no_zero.tolist(), original.tolist())