Better solution to the sklearn bagging problem

Add better tests
Enhance .coveragerc
2020-06-26 11:22:45 +02:00
parent 76723993fd
commit 4b7e4a3fb0
5 changed files with 66 additions and 58 deletions
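
Context for the fix: sklearn's BaggingClassifier materializes each bootstrap sample by passing the draw counts as sample_weight, so out-of-bag rows arrive with weight 0 and a whole class can reach the inner SVC with zero total weight, at which point libsvm prints "WARNING: class label 0 specified in weight is not found". A minimal, hedged reproducer sketch; the dataset and estimator settings are illustrative, and base_estimator is the parameter name in sklearn releases of this era:

    # Illustrative repro of the warning this commit addresses; assumes
    # the stree package is installed. BaggingClassifier passes bootstrap
    # counts as sample_weight, so unsampled rows arrive with weight 0.
    from sklearn.datasets import load_wine
    from sklearn.ensemble import BaggingClassifier
    from stree import Stree

    X, y = load_wine(return_X_y=True)
    ensemble = BaggingClassifier(
        base_estimator=Stree(random_state=0),
        n_estimators=10,
        random_state=0,
    )
    ensemble.fit(X, y)  # without the patch libsvm may print the warning
    print(ensemble.score(X, y))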

View File

@@ -10,5 +10,4 @@ exclude_lines =
if __name__ == .__main__.:
ignore_errors = True
omit =
stree/tests/*
stree/__init__.py

View File

@@ -40,6 +40,7 @@ class Snode:
features: np.array,
impurity: float,
title: str,
weight: np.ndarray = None,
):
self._clf = clf
self._title = title
@@ -51,7 +52,9 @@ class Snode:
self._up = None
self._class = None
self._feature = None
self._sample_weight = None
self._sample_weight = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features = features
self._impurity = impurity
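
The stored copy is gated on a TESTING environment variable so that production trees do not keep per-node weight arrays; "NS" is just a "not set" sentinel. A self-contained sketch of the check (the value "1" is arbitrary, only the variable's existence matters):

    import os

    # "NS" stands in for "not set": if TESTING is absent the comparison
    # is False and the node discards its sample weights.
    os.environ.pop("TESTING", None)
    print(os.environ.get("TESTING", "NS") != "NS")  # False -> weight dropped
    os.environ["TESTING"] = "1"
    print(os.environ.get("TESTING", "NS") != "NS")  # True -> weight kept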
@@ -443,9 +446,6 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
# solve WARNING: class label 0 specified in weight is not found
# in bagging
sample_weight += 1e-5
check_classification_targets(y)
# Initialize computed parameters
self.splitter_ = Splitter(
@@ -505,13 +505,22 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=0.0,
title=title + ", <pure>",
weight=sample_weight,
)
# Train the model
clf = self._build_clf()
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
# solve "WARNING: class label 0 specified in weight is not found"
# printed by libsvm when bagging leaves a class with only zero weights
zero_weights = sample_weight == 0
if np.any(zero_weights):
    y_next = y[~zero_weights]
    # bump the weights only if removing the zero-weight
    # samples would erase a whole class from this node
    if np.unique(y_next).shape[0] != self.n_classes_:
        sample_weight += 1e-5
clf.fit(Xs, y, sample_weight=sample_weight)
impurity = self.splitter_.impurity(y)
node = Snode(clf, X, y, features, impurity, title)
node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_)
self.splitter_.partition(X, node)
X_U, X_D = self.splitter_.part(X)
@@ -526,6 +535,7 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=impurity,
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
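
Compared with the previous unconditional sample_weight += 1e-5 in fit (removed above), which turned deliberate zero weights into small positive ones for every sample, the new check only nudges the weights when the zero-weight rows would actually erase a class. The same logic as a standalone sketch; the function name and signature are illustrative:

    import numpy as np

    def bump_weights_if_class_lost(sample_weight, y, n_classes, eps=1e-5):
        """Mirror of the patch: add eps to every weight only when the
        zero-weight samples would remove a whole class from the node."""
        zero = sample_weight == 0
        if np.any(zero) and np.unique(y[~zero]).shape[0] != n_classes:
            sample_weight += eps  # in place, as in train()
        return sample_weight

    w = np.array([1.0, 1.0, 1.0, 0.0, 0.0, 0.0])
    y = np.array([0, 0, 0, 1, 1, 1])
    print(bump_weights_if_class_lost(w, y, n_classes=2))
    # every weight bumped by 1e-5 because class 1 would disappear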

View File

@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
else:
belief = 1
self.assertEqual(belief, node._belief)
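
The deleted ZeroDivisionError guard was unreachable: assuming card holds the class counts from np.unique(..., return_counts=True), as the surrounding test suggests, every count is at least 1, so with more than one class the denominator is at least 2. A quick check under that assumption:

    import numpy as np

    # counts from np.unique are >= 1, so max_card + min_card >= 2
    # whenever there is more than one class and the division is safe
    y = np.array([0, 0, 1])
    _, card = np.unique(y, return_counts=True)
    max_card, min_card = max(card), min(card)
    print(max_card / (max_card + min_card))  # 0.666...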

View File

@@ -178,20 +178,23 @@ class Splitter_test(unittest.TestCase):
def test_splitter_parameter(self):
expected_values = [
[1, 3, 4, 5], # random gini min_distance
[0, 1, 3, 4], # random gini max_samples
[1, 2, 4, 5], # random gini max_distance
[0, 2, 3, 5], # random entropy min_distance
[0, 2, 3, 5], # random entropy max_samples
[0, 1, 3, 4], # random entropy max_distance
[0, 1, 2, 5], # best gini min_distance
[2, 3, 4, 5], # best gini max_samples
[0, 2, 3, 4], # best gini max_distance
[0, 1, 2, 5], # best entropy min_distance
[2, 3, 4, 5], # best entropy max_samples
[0, 1, 2, 4], # best entropy max_distance
[1, 2], # random gini min_distance
[0, 2], # random gini max_samples
[1, 3], # random gini max_distance
[1, 2], # random entropy min_distance
[1, 2], # random entropy max_samples
[0, 2], # random entropy max_distance
[1, 2], # best gini min_distance
[0, 2], # best gini max_samples
[0, 2], # best gini max_distance
[0, 1], # best entropy min_distance
[0, 1], # best entropy max_samples
[0, 1], # best entropy max_distance
]
X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
rn = 0
for splitter_type in ["random", "best"]:
for criterion in ["gini", "entropy"]:
@@ -208,21 +211,11 @@ class Splitter_test(unittest.TestCase):
)
rn += 3
expected = expected_values.pop(0)
dataset, computed = tcl.get_subspace(X, y, max_features=4)
# Flaky test
if (
splitter_type == "best"
and criteria == "max_distance"
and criterion == "gini"
and computed == (1, 2, 3, 4)
):
# sometimes returns (0, 2, 3, 4) and sometimes
# (1, 2, 3, 4)
expected = [1, 2, 3, 4]
dataset, computed = tcl.get_subspace(X, y, max_features=2)
# print(
# "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), splitter_type,
# criterion, criteria,
# list(computed), splitter_type, criterion,
# criteria,
# )
# )
self.assertListEqual(expected, list(computed))
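
Switching from the synthetic six-feature dataset to iris trades generated data for a fixed, well-known one, which is presumably what allows the flaky best/gini/max_distance special case above to be dropped. A quick sanity check of the new setup:

    import numpy as np
    from sklearn.datasets import load_iris

    X, y = load_iris(return_X_y=True)
    print(X.shape)       # (150, 4): four features to pick 2-subspaces from
    print(np.unique(y))  # [0 1 2]: still three classes, as before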

View File

@@ -41,10 +41,7 @@ class Stree_test(unittest.TestCase):
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
except IndexError:
number_down = 0
try:
number_up = count_u[i]
except IndexError:
@@ -67,25 +64,6 @@ class Stree_test(unittest.TestCase):
clf.fit(*load_dataset(self._random_state))
self._check_tree(clf.tree_)
@staticmethod
def _find_out(px: np.array, x_original: np.array, y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
px {np.array} -- array of samples to search for
x_original {np.array} -- original dataset
y_original {[type]} -- original classes
Returns:
np.array -- classes of the given samples
"""
res = []
for needle in px:
for row in range(x_original.shape[0]):
if all(x_original[row, :] == needle):
res.append(y_original[row])
return res
def test_single_prediction(self):
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
@@ -405,3 +383,34 @@ class Stree_test(unittest.TestCase):
clf = Stree(splitter="duck")
with self.assertRaises(ValueError):
clf.fit(*load_dataset())
def test_weights_removing_class(self):
# This patch silences a stderr message from the sklearn svm lib:
# "WARNING: class label x specified in weight is not found"
X = np.array(
[
[0.1, 0.1],
[0.1, 0.2],
[0.2, 0.1],
[5, 6],
[8, 9],
[6, 7],
[0.2, 0.2],
]
)
y = np.array([0, 0, 0, 1, 1, 1, 0])
epsilon = 1e-5
weights = [1, 1, 1, 0, 0, 0, 1]
weights = np.array(weights, dtype="float64")
weights_epsilon = [x + epsilon for x in weights]
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
original = weights_no_zero.copy()
clf = Stree()
clf.fit(X, y)
node = clf.train(X, y, weights, 1, "test")
# when the zero weights would drop a class, the patch adds epsilon
# to every weight, mutating the array passed in
self.assertListEqual(weights.tolist(), weights_epsilon)
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
# zero weights are fine while no class loses all of its weight
_ = clf.train(X, y, weights_no_zero, 1, "test")
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
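
The assertions on weights itself pass because the patch uses numpy's augmented assignment, which mutates the very array the caller handed to train(). A minimal demonstration of that semantics:

    import numpy as np

    def bump(a):
        a += 1e-5  # augmented assignment mutates the caller's array

    w = np.zeros(3, dtype="float64")
    bump(w)
    print(w)  # [1.e-05 1.e-05 1.e-05], visible outside the function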