mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 07:26:01 +00:00
Better solution to the sklearn bagging problem
Add better tests; enhance .coveragerc
This commit is contained in:
@@ -10,5 +10,4 @@ exclude_lines =
|
|||||||
if __name__ == .__main__.:
|
if __name__ == .__main__.:
|
||||||
ignore_errors = True
|
ignore_errors = True
|
||||||
omit =
|
omit =
|
||||||
stree/tests/*
|
|
||||||
stree/__init__.py
|
stree/__init__.py
|
@@ -40,6 +40,7 @@ class Snode:
|
|||||||
features: np.array,
|
features: np.array,
|
||||||
impurity: float,
|
impurity: float,
|
||||||
title: str,
|
title: str,
|
||||||
|
weight: np.ndarray = None,
|
||||||
):
|
):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._title = title
|
self._title = title
|
||||||
@@ -51,7 +52,9 @@ class Snode:
|
|||||||
self._up = None
|
self._up = None
|
||||||
self._class = None
|
self._class = None
|
||||||
self._feature = None
|
self._feature = None
|
||||||
self._sample_weight = None
|
self._sample_weight = (
|
||||||
|
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
|
)
|
||||||
self._features = features
|
self._features = features
|
||||||
self._impurity = impurity
|
self._impurity = impurity
|
||||||
|
|
||||||
@@ -443,9 +446,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
sample_weight = _check_sample_weight(
|
sample_weight = _check_sample_weight(
|
||||||
sample_weight, X, dtype=np.float64
|
sample_weight, X, dtype=np.float64
|
||||||
)
|
)
|
||||||
# solve WARNING: class label 0 specified in weight is not found
|
|
||||||
# in bagging
|
|
||||||
sample_weight += 1e-5
|
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
# Initialize computed parameters
|
# Initialize computed parameters
|
||||||
self.splitter_ = Splitter(
|
self.splitter_ = Splitter(
|
||||||
@@ -505,13 +505,22 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
features=X.shape[1],
|
features=X.shape[1],
|
||||||
impurity=0.0,
|
impurity=0.0,
|
||||||
title=title + ", <pure>",
|
title=title + ", <pure>",
|
||||||
|
weight=sample_weight,
|
||||||
)
|
)
|
||||||
# Train the model
|
# Train the model
|
||||||
clf = self._build_clf()
|
clf = self._build_clf()
|
||||||
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||||
|
# solve WARNING: class label 0 specified in weight is not found
|
||||||
|
# in bagging
|
||||||
|
if any(sample_weight == 0):
|
||||||
|
indices = sample_weight == 0
|
||||||
|
y_next = y[~indices]
|
||||||
|
# touch weights if removing any class
|
||||||
|
if np.unique(y_next).shape[0] != self.n_classes_:
|
||||||
|
sample_weight += 1e-5
|
||||||
clf.fit(Xs, y, sample_weight=sample_weight)
|
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||||
impurity = self.splitter_.impurity(y)
|
impurity = self.splitter_.impurity(y)
|
||||||
node = Snode(clf, X, y, features, impurity, title)
|
node = Snode(clf, X, y, features, impurity, title, sample_weight)
|
||||||
self.depth_ = max(depth, self.depth_)
|
self.depth_ = max(depth, self.depth_)
|
||||||
self.splitter_.partition(X, node)
|
self.splitter_.partition(X, node)
|
||||||
X_U, X_D = self.splitter_.part(X)
|
X_U, X_D = self.splitter_.part(X)
|
||||||
@@ -526,6 +535,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
features=X.shape[1],
|
features=X.shape[1],
|
||||||
impurity=impurity,
|
impurity=impurity,
|
||||||
title=title + ", <cgaf>",
|
title=title + ", <cgaf>",
|
||||||
|
weight=sample_weight,
|
||||||
)
|
)
|
||||||
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||||
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||||
|
@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
|
|||||||
max_card = max(card)
|
max_card = max(card)
|
||||||
min_card = min(card)
|
min_card = min(card)
|
||||||
if len(classes) > 1:
|
if len(classes) > 1:
|
||||||
try:
|
belief = max_card / (max_card + min_card)
|
||||||
belief = max_card / (max_card + min_card)
|
|
||||||
except ZeroDivisionError:
|
|
||||||
belief = 0.0
|
|
||||||
else:
|
else:
|
||||||
belief = 1
|
belief = 1
|
||||||
self.assertEqual(belief, node._belief)
|
self.assertEqual(belief, node._belief)
|
||||||
|
@@ -178,20 +178,23 @@ class Splitter_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_splitter_parameter(self):
|
def test_splitter_parameter(self):
|
||||||
expected_values = [
|
expected_values = [
|
||||||
[1, 3, 4, 5], # random gini min_distance
|
[1, 2], # random gini min_distance
|
||||||
[0, 1, 3, 4], # random gini max_samples
|
[0, 2], # random gini max_samples
|
||||||
[1, 2, 4, 5], # random gini max_distance
|
[1, 3], # random gini max_distance
|
||||||
[0, 2, 3, 5], # random entropy min_distance
|
[1, 2], # random entropy min_distance
|
||||||
[0, 2, 3, 5], # random entropy max_samples
|
[1, 2], # random entropy max_samples
|
||||||
[0, 1, 3, 4], # random entropy max_distance
|
[0, 2], # random entropy max_distance
|
||||||
[0, 1, 2, 5], # best gini min_distance
|
[1, 2], # best gini min_distance
|
||||||
[2, 3, 4, 5], # best gini max_samples
|
[0, 2], # best gini max_samples
|
||||||
[0, 2, 3, 4], # best gini max_distance
|
[0, 2], # best gini max_distance
|
||||||
[0, 1, 2, 5], # best entropy min_distance
|
[0, 1], # best entropy min_distance
|
||||||
[2, 3, 4, 5], # best entropy max_samples
|
[0, 1], # best entropy max_samples
|
||||||
[0, 1, 2, 4], # best entropy max_distance
|
[0, 1], # best entropy max_distance
|
||||||
]
|
]
|
||||||
X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
|
X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
|
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
rn = 0
|
rn = 0
|
||||||
for splitter_type in ["random", "best"]:
|
for splitter_type in ["random", "best"]:
|
||||||
for criterion in ["gini", "entropy"]:
|
for criterion in ["gini", "entropy"]:
|
||||||
@@ -208,21 +211,11 @@ class Splitter_test(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
rn += 3
|
rn += 3
|
||||||
expected = expected_values.pop(0)
|
expected = expected_values.pop(0)
|
||||||
dataset, computed = tcl.get_subspace(X, y, max_features=4)
|
dataset, computed = tcl.get_subspace(X, y, max_features=2)
|
||||||
# Flaky test
|
|
||||||
if (
|
|
||||||
splitter_type == "best"
|
|
||||||
and criteria == "max_distance"
|
|
||||||
and criterion == "gini"
|
|
||||||
and computed == (1, 2, 3, 4)
|
|
||||||
):
|
|
||||||
# sometimes returns (0, 2, 3, 4) and sometimes
|
|
||||||
# (1, 2, 3, 4)
|
|
||||||
expected = [1, 2, 3, 4]
|
|
||||||
# print(
|
# print(
|
||||||
# "{}, # {:7s}{:8s}{:15s}".format(
|
# "{}, # {:7s}{:8s}{:15s}".format(
|
||||||
# list(computed), splitter_type,
|
# list(computed), splitter_type, criterion,
|
||||||
# criterion, criteria,
|
# criteria,
|
||||||
# )
|
# )
|
||||||
# )
|
# )
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
|
@@ -41,10 +41,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
_, count_u = np.unique(y_up, return_counts=True)
|
_, count_u = np.unique(y_up, return_counts=True)
|
||||||
#
|
#
|
||||||
for i in unique_y:
|
for i in unique_y:
|
||||||
try:
|
number_down = count_d[i]
|
||||||
number_down = count_d[i]
|
|
||||||
except IndexError:
|
|
||||||
number_down = 0
|
|
||||||
try:
|
try:
|
||||||
number_up = count_u[i]
|
number_up = count_u[i]
|
||||||
except IndexError:
|
except IndexError:
|
||||||
@@ -67,25 +64,6 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
self._check_tree(clf.tree_)
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _find_out(px: np.array, x_original: np.array, y_original) -> list:
|
|
||||||
"""Find the original values of y for a given array of samples
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
px {np.array} -- array of samples to search for
|
|
||||||
x_original {np.array} -- original dataset
|
|
||||||
y_original {[type]} -- original classes
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
np.array -- classes of the given samples
|
|
||||||
"""
|
|
||||||
res = []
|
|
||||||
for needle in px:
|
|
||||||
for row in range(x_original.shape[0]):
|
|
||||||
if all(x_original[row, :] == needle):
|
|
||||||
res.append(y_original[row])
|
|
||||||
return res
|
|
||||||
|
|
||||||
def test_single_prediction(self):
|
def test_single_prediction(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
@@ -405,3 +383,34 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf = Stree(splitter="duck")
|
clf = Stree(splitter="duck")
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
clf.fit(*load_dataset())
|
clf.fit(*load_dataset())
|
||||||
|
|
||||||
|
def test_weights_removing_class(self):
|
||||||
|
# This patch solves an stderr message from sklearn svm lib
|
||||||
|
# "WARNING: class label x specified in weight is not found"
|
||||||
|
X = np.array(
|
||||||
|
[
|
||||||
|
[0.1, 0.1],
|
||||||
|
[0.1, 0.2],
|
||||||
|
[0.2, 0.1],
|
||||||
|
[5, 6],
|
||||||
|
[8, 9],
|
||||||
|
[6, 7],
|
||||||
|
[0.2, 0.2],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
y = np.array([0, 0, 0, 1, 1, 1, 0])
|
||||||
|
epsilon = 1e-5
|
||||||
|
weights = [1, 1, 1, 0, 0, 0, 1]
|
||||||
|
weights = np.array(weights, dtype="float64")
|
||||||
|
weights_epsilon = [x + epsilon for x in weights]
|
||||||
|
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
|
||||||
|
original = weights_no_zero.copy()
|
||||||
|
clf = Stree()
|
||||||
|
clf.fit(X, y)
|
||||||
|
node = clf.train(X, y, weights, 1, "test",)
|
||||||
|
# if a class is lost with zero weights the patch adds epsilon
|
||||||
|
self.assertListEqual(weights.tolist(), weights_epsilon)
|
||||||
|
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
|
||||||
|
# zero weights are ok when they don't erase a class
|
||||||
|
_ = clf.train(X, y, weights_no_zero, 1, "test")
|
||||||
|
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
|
||||||
|
Reference in New Issue
Block a user