Mirror of https://github.com/Doctorado-ML/STree.git (synced 2025-08-15 07:26:01 +00:00)
Better solution to the sklearn bagging problem
Add better tests; enhance .coveragerc
@@ -10,5 +10,4 @@ exclude_lines =
    if __name__ == .__main__.:
ignore_errors = True
omit =
    stree/tests/*
    stree/__init__.py
@@ -40,6 +40,7 @@ class Snode:
         features: np.array,
         impurity: float,
         title: str,
+        weight: np.ndarray = None,
     ):
         self._clf = clf
         self._title = title
@@ -51,7 +52,9 @@ class Snode:
         self._up = None
         self._class = None
         self._feature = None
-        self._sample_weight = None
+        self._sample_weight = (
+            weight if os.environ.get("TESTING", "NS") != "NS" else None
+        )
         self._features = features
         self._impurity = impurity
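The constructor now keeps the sample weights on the node only while the TESTING environment variable is set, so ordinary trees do not retain a weight vector per node. A minimal sketch of how a test suite might opt in (the setUp/tearDown placement is an assumption, not part of this commit):

import os
import unittest

class SnodeWeightTest(unittest.TestCase):
    def setUp(self):
        # Any value other than the "NS" sentinel makes Snode keep the
        # weight array passed to its constructor.
        os.environ["TESTING"] = "1"

    def tearDown(self):
        # Restore the default so non-test code paths skip the storage.
        os.environ.pop("TESTING", None)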
@@ -443,9 +446,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         sample_weight = _check_sample_weight(
             sample_weight, X, dtype=np.float64
         )
-        # solve WARNING: class label 0 specified in weight is not found
-        # in bagging
-        sample_weight += 1e-5
         check_classification_targets(y)
         # Initialize computed parameters
         self.splitter_ = Splitter(
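The unconditional sample_weight += 1e-5 is removed from fit: it silenced the libsvm warning, but it also gave every deliberately zero-weighted (out-of-bag) sample a small positive influence. A toy illustration of that side effect:

import numpy as np

# A bootstrap weight vector such as bagging produces:
sample_weight = np.array([2.0, 0.0, 1.0, 0.0], dtype=np.float64)
patched = sample_weight + 1e-5  # the old, unconditional patch in fit()
# Samples meant to be excluded (weight 0) now count in training:
assert (patched > 0).all()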
@@ -505,13 +505,22 @@ class Stree(BaseEstimator, ClassifierMixin):
                 features=X.shape[1],
                 impurity=0.0,
                 title=title + ", <pure>",
+                weight=sample_weight,
             )
         # Train the model
         clf = self._build_clf()
         Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
+        # solve WARNING: class label 0 specified in weight is not found
+        # in bagging
+        if any(sample_weight == 0):
+            indices = sample_weight == 0
+            y_next = y[~indices]
+            # touch weights if removing any class
+            if np.unique(y_next).shape[0] != self.n_classes_:
+                sample_weight += 1e-5
         clf.fit(Xs, y, sample_weight=sample_weight)
         impurity = self.splitter_.impurity(y)
-        node = Snode(clf, X, y, features, impurity, title)
+        node = Snode(clf, X, y, features, impurity, title, sample_weight)
         self.depth_ = max(depth, self.depth_)
         self.splitter_.partition(X, node)
         X_U, X_D = self.splitter_.part(X)
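The replacement logic in train only touches the weights when the zero entries would remove an entire class, which is exactly the situation that triggers libsvm's "class label ... not found" warning. The same check, extracted as a standalone sketch (the helper name patch_sample_weight is hypothetical; the commit inlines this code):

import numpy as np

def patch_sample_weight(sample_weight, y, n_classes):
    # Add epsilon only if the zero-weighted samples are the sole
    # carriers of some class; otherwise leave the weights untouched.
    zeroed = sample_weight == 0
    if zeroed.any():
        surviving = np.unique(y[~zeroed])
        if surviving.shape[0] != n_classes:
            sample_weight += 1e-5  # in place, as in the commit
    return sample_weight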
@@ -526,6 +535,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                 features=X.shape[1],
                 impurity=impurity,
                 title=title + ", <cgaf>",
+                weight=sample_weight,
             )
         node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
         node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
        max_card = max(card)
        min_card = min(card)
        if len(classes) > 1:
            try:
                belief = max_card / (max_card + min_card)
            except ZeroDivisionError:
                belief = 0.0
        else:
            belief = 1
        self.assertEqual(belief, node._belief)
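For reference, the belief checked here is max_card / (max_card + min_card); with two or more classes each cardinality is at least 1, so the denominator is at least 2 and the ZeroDivisionError branch is unreachable, which presumably motivates simplifying this block. A worked example with assumed counts:

# Assumed class counts at a node: 30 samples of one class, 10 of another.
card = [30, 10]
belief = max(card) / (max(card) + min(card))
assert belief == 0.75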
@@ -178,20 +178,23 @@ class Splitter_test(unittest.TestCase):

    def test_splitter_parameter(self):
        expected_values = [
            [1, 3, 4, 5],  # random gini min_distance
            [0, 1, 3, 4],  # random gini max_samples
            [1, 2, 4, 5],  # random gini max_distance
            [0, 2, 3, 5],  # random entropy min_distance
            [0, 2, 3, 5],  # random entropy max_samples
            [0, 1, 3, 4],  # random entropy max_distance
            [0, 1, 2, 5],  # best gini min_distance
            [2, 3, 4, 5],  # best gini max_samples
            [0, 2, 3, 4],  # best gini max_distance
            [0, 1, 2, 5],  # best entropy min_distance
            [2, 3, 4, 5],  # best entropy max_samples
            [0, 1, 2, 4],  # best entropy max_distance
            [1, 2],  # random gini min_distance
            [0, 2],  # random gini max_samples
            [1, 3],  # random gini max_distance
            [1, 2],  # random entropy min_distance
            [1, 2],  # random entropy max_samples
            [0, 2],  # random entropy max_distance
            [1, 2],  # best gini min_distance
            [0, 2],  # best gini max_samples
            [0, 2],  # best gini max_distance
            [0, 1],  # best entropy min_distance
            [0, 1],  # best entropy max_samples
            [0, 1],  # best entropy max_distance
        ]
        X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
        from sklearn.datasets import load_iris

        X, y = load_iris(return_X_y=True)
        rn = 0
        for splitter_type in ["random", "best"]:
            for criterion in ["gini", "entropy"]:
@@ -208,21 +211,11 @@ class Splitter_test(unittest.TestCase):
                )
                rn += 3
                expected = expected_values.pop(0)
                dataset, computed = tcl.get_subspace(X, y, max_features=4)
                # Flaky test
                if (
                    splitter_type == "best"
                    and criteria == "max_distance"
                    and criterion == "gini"
                    and computed == (1, 2, 3, 4)
                ):
                    # sometimes returns (0, 2, 3, 4) and sometimes
                    # (1, 2, 3, 4)
                    expected = [1, 2, 3, 4]
                dataset, computed = tcl.get_subspace(X, y, max_features=2)
                # print(
                #     "{}, # {:7s}{:8s}{:15s}".format(
                #         list(computed), splitter_type,
                #         criterion, criteria,
                #         list(computed), splitter_type, criterion,
                #         criteria,
                #     )
                # )
                self.assertListEqual(expected, list(computed))
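Pinning the expected list when a known-flaky combination appears keeps the assertion meaningful without failing on either valid outcome. A hypothetical alternative pattern (not in the commit) is to accept the whole set of observed results:

# Standalone illustration of the idea:
computed = (1, 2, 3, 4)  # one of the two subsets seen in practice
acceptable = {(0, 2, 3, 4), (1, 2, 3, 4)}
assert computed in acceptable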
@@ -41,10 +41,7 @@ class Stree_test(unittest.TestCase):
        _, count_u = np.unique(y_up, return_counts=True)
        #
        for i in unique_y:
            try:
                number_down = count_d[i]
            except IndexError:
                number_down = 0
            try:
                number_up = count_u[i]
            except IndexError:
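The try/except on IndexError compensates for np.unique only counting the classes that are present in a partition. A hypothetical vectorized rewrite that makes absent classes read as zero without exception handling:

import numpy as np

y_down = np.array([0, 0, 2])        # example: class 1 missing from partition
unique_y = np.array([0, 1, 2])      # all classes in the dataset
counts = np.zeros(unique_y.shape[0], dtype=int)
present, card = np.unique(y_down, return_counts=True)
counts[present] = card              # -> array([2, 0, 1])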
@@ -67,25 +64,6 @@ class Stree_test(unittest.TestCase):
         clf.fit(*load_dataset(self._random_state))
         self._check_tree(clf.tree_)

-    @staticmethod
-    def _find_out(px: np.array, x_original: np.array, y_original) -> list:
-        """Find the original values of y for a given array of samples
-
-        Arguments:
-            px {np.array} -- array of samples to search for
-            x_original {np.array} -- original dataset
-            y_original {[type]} -- original classes
-
-        Returns:
-            np.array -- classes of the given samples
-        """
-        res = []
-        for needle in px:
-            for row in range(x_original.shape[0]):
-                if all(x_original[row, :] == needle):
-                    res.append(y_original[row])
-        return res
-
     def test_single_prediction(self):
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
@@ -405,3 +383,34 @@ class Stree_test(unittest.TestCase):
         clf = Stree(splitter="duck")
         with self.assertRaises(ValueError):
             clf.fit(*load_dataset())

+    def test_weights_removing_class(self):
+        # This patch solves a stderr message from the sklearn svm lib
+        # "WARNING: class label x specified in weight is not found"
+        X = np.array(
+            [
+                [0.1, 0.1],
+                [0.1, 0.2],
+                [0.2, 0.1],
+                [5, 6],
+                [8, 9],
+                [6, 7],
+                [0.2, 0.2],
+            ]
+        )
+        y = np.array([0, 0, 0, 1, 1, 1, 0])
+        epsilon = 1e-5
+        weights = [1, 1, 1, 0, 0, 0, 1]
+        weights = np.array(weights, dtype="float64")
+        weights_epsilon = [x + epsilon for x in weights]
+        weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
+        original = weights_no_zero.copy()
+        clf = Stree()
+        clf.fit(X, y)
+        node = clf.train(X, y, weights, 1, "test")
+        # if a class is lost with zero weights the patch adds epsilon
+        self.assertListEqual(weights.tolist(), weights_epsilon)
+        self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
+        # zero weights are ok when they don't erase a class
+        _ = clf.train(X, y, weights_no_zero, 1, "test")
+        self.assertListEqual(weights_no_zero.tolist(), original.tolist())
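For context, the warning this test pins down surfaces when STree runs inside bagging, where bootstrap resampling is implemented by handing each ensemble member a weight vector with zeros for out-of-bag samples. A minimal reproduction sketch (dataset values assumed; in older scikit-learn releases the first argument is named base_estimator):

import numpy as np
from sklearn.ensemble import BaggingClassifier
from stree import Stree

X = np.random.rand(50, 2)
y = np.array([0] * 25 + [1] * 25)
# Each member gets bootstrap counts as sample weights; a draw whose
# zeros erase a class used to emit the libsvm warning on stderr.
ensemble = BaggingClassifier(Stree(), n_estimators=5, random_state=0)
ensemble.fit(X, y)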