Complete implementation of splitter_type = impurity with tests

Remove max_distance & min_distance splitter types
This commit is contained in:
2020-10-17 16:56:15 +02:00
parent 044918f834
commit c593b55bec
4 changed files with 67 additions and 35 deletions

View File

@@ -57,6 +57,7 @@ class Snode:
) )
self._features = features self._features = features
self._impurity = impurity self._impurity = impurity
self._partition_column: int = -1
@classmethod @classmethod
def copy(cls, node: "Snode") -> "Snode": def copy(cls, node: "Snode") -> "Snode":
@@ -69,6 +70,12 @@ class Snode:
node._title, node._title,
) )
def set_partition_column(self, col: int):
self._partition_column = col
def get_partition_column(self) -> int:
return self._partition_column
def set_down(self, son): def set_down(self, son):
self._down = son self._down = son
@@ -239,7 +246,7 @@ class Splitter:
node = Snode( node = Snode(
self._clf, dataset, labels, feature_set, 0.0, "subset" self._clf, dataset, labels, feature_set, 0.0, "subset"
) )
self.partition(dataset, node) self.partition(dataset, node, train=True)
y1, y2 = self.part(labels) y1, y2 = self.part(labels)
gain = self.information_gain(labels, y1, y2) gain = self.information_gain(labels, y1, y2)
if gain > max_gain: if gain > max_gain:
@@ -272,7 +279,7 @@ class Splitter:
return dataset[:, indices], indices return dataset[:, indices], indices
def _impurity(self, data: np.array, _) -> np.array: def _impurity(self, data: np.array, _) -> np.array:
"""return distances of the class whose partition has less impurity """return column of dataset to be taken into account to split dataset
:param data: distances to hyper plane of every class :param data: distances to hyper plane of every class
:type data: np.array (m, n_classes) :type data: np.array (m, n_classes)
@@ -289,15 +296,15 @@ class Splitter:
y[data > 0] = 1 y[data > 0] = 1
y = y.astype(int) y = y.astype(int)
for col in range(data.shape[1]): for col in range(data.shape[1]):
impurity_of_class = self.partition_impurity(y[col]) impurity_of_class = self.partition_impurity(y[:, col])
if impurity_of_class < min_impurity: if impurity_of_class < min_impurity:
selected = col selected = col
min_impurity = impurity_of_class min_impurity = impurity_of_class
return data[:, selected] return selected
@staticmethod @staticmethod
def _max_samples(data: np.array, y: np.array) -> np.array: def _max_samples(data: np.array, y: np.array) -> np.array:
"""return distances of the class with more samples """return column of dataset to be taken into account to split dataset
:param data: distances to hyper plane of every class :param data: distances to hyper plane of every class
:type data: np.array (m, n_classes) :type data: np.array (m, n_classes)
@@ -308,10 +315,9 @@ class Splitter:
""" """
# select the class with max number of samples # select the class with max number of samples
_, samples = np.unique(y, return_counts=True) _, samples = np.unique(y, return_counts=True)
selected = np.argmax(samples) return np.argmax(samples)
return data[:, selected]
def partition(self, samples: np.array, node: Snode): def partition(self, samples: np.array, node: Snode, train: bool):
"""Set the criteria to split arrays. Compute the indices of the samples """Set the criteria to split arrays. Compute the indices of the samples
that should go to one side of the tree (down) that should go to one side of the tree (down)
@@ -325,7 +331,16 @@ class Splitter:
if data.ndim > 1: if data.ndim > 1:
# split criteria for multiclass # split criteria for multiclass
# Convert data to a (m, 1) array selecting values for samples # Convert data to a (m, 1) array selecting values for samples
data = self.decision_criteria(data, node._y) if train:
# in train time we have to compute the column to take into
# account to split the dataset
col = self.decision_criteria(data, node._y)
node.set_partition_column(col)
else:
# in predcit time just use the column computed in train time
# is taking the classifier of class <col>
col = node.get_partition_column()
data = data[:, col]
self._down = data > 0 self._down = data > 0
@staticmethod @staticmethod
@@ -344,6 +359,7 @@ class Splitter:
def part(self, origin: np.array) -> list: def part(self, origin: np.array) -> list:
"""Split an array in two based on indices (down) and its complement """Split an array in two based on indices (down) and its complement
partition has to be called first to establish down indices
:param origin: dataset to split :param origin: dataset to split
:type origin: np.array :type origin: np.array
@@ -377,7 +393,7 @@ class Stree(BaseEstimator, ClassifierMixin):
tol: float = 1e-4, tol: float = 1e-4,
degree: int = 3, degree: int = 3,
gamma="scale", gamma="scale",
split_criteria: str = "max_samples", split_criteria: str = "impurity",
criterion: str = "gini", criterion: str = "gini",
min_samples_split: int = 0, min_samples_split: int = 0,
max_features=None, max_features=None,
@@ -518,7 +534,7 @@ class Stree(BaseEstimator, ClassifierMixin):
impurity = self.splitter_.partition_impurity(y) impurity = self.splitter_.partition_impurity(y)
node = Snode(clf, X, y, features, impurity, title, sample_weight) node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_) self.depth_ = max(depth, self.depth_)
self.splitter_.partition(X, node) self.splitter_.partition(X, node, True)
X_U, X_D = self.splitter_.part(X) X_U, X_D = self.splitter_.part(X)
y_u, y_d = self.splitter_.part(y) y_u, y_d = self.splitter_.part(y)
sw_u, sw_d = self.splitter_.part(sample_weight) sw_u, sw_d = self.splitter_.part(sample_weight)
@@ -605,7 +621,7 @@ class Stree(BaseEstimator, ClassifierMixin):
# set a class for every sample in dataset # set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class) prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices return prediction, indices
self.splitter_.partition(xp, node) self.splitter_.partition(xp, node, train=False)
x_u, x_d = self.splitter_.part(xp) x_u, x_d = self.splitter_.part(xp)
i_u, i_d = self.splitter_.part(indices) i_u, i_d = self.splitter_.part(indices)
prx_u, prin_u = predict_class(x_u, i_u, node.get_up()) prx_u, prin_u = predict_class(x_u, i_u, node.get_up())

View File

@@ -40,12 +40,13 @@ class Snode_test(unittest.TestCase):
# Check Class # Check Class
class_computed = classes[card == max_card] class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class) self.assertEqual(class_computed, node._class)
# Check Partition column
self.assertEqual(node._partition_column, -1)
check_leave(self._clf.tree_) check_leave(self._clf.tree_)
def test_nodes_coefs(self): def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled """Check if the nodes of the tree have the right attributes filled"""
"""
def run_tree(node: Snode): def run_tree(node: Snode):
if node._belief < 1: if node._belief < 1:
@@ -54,16 +55,19 @@ class Snode_test(unittest.TestCase):
self.assertIsNotNone(node._clf.coef_) self.assertIsNotNone(node._clf.coef_)
if node.is_leaf(): if node.is_leaf():
return return
run_tree(node.get_down())
run_tree(node.get_up()) run_tree(node.get_up())
run_tree(node.get_down())
run_tree(self._clf.tree_) model = Stree(self._random_state)
model.fit(*load_dataset(self._random_state, 3, 4))
run_tree(model.tree_)
def test_make_predictor_on_leaf(self): def test_make_predictor_on_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
test.make_predictor() test.make_predictor()
self.assertEqual(1, test._class) self.assertEqual(1, test._class)
self.assertEqual(0.75, test._belief) self.assertEqual(0.75, test._belief)
self.assertEqual(-1, test._partition_column)
def test_make_predictor_on_not_leaf(self): def test_make_predictor_on_not_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
@@ -71,11 +75,14 @@ class Snode_test(unittest.TestCase):
test.make_predictor() test.make_predictor()
self.assertIsNone(test._class) self.assertIsNone(test._class)
self.assertEqual(0, test._belief) self.assertEqual(0, test._belief)
self.assertEqual(-1, test._partition_column)
self.assertEqual(-1, test.get_up()._partition_column)
def test_make_predictor_on_leaf_bogus_data(self): def test_make_predictor_on_leaf_bogus_data(self):
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test") test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
test.make_predictor() test.make_predictor()
self.assertIsNone(test._class) self.assertIsNone(test._class)
self.assertEqual(-1, test._partition_column)
def test_copy_node(self): def test_copy_node(self):
px = [1, 2, 3, 4] px = [1, 2, 3, 4]
@@ -86,3 +93,4 @@ class Snode_test(unittest.TestCase):
self.assertListEqual(computed._y, py) self.assertListEqual(computed._y, py)
self.assertEqual("test", computed._title) self.assertEqual("test", computed._title)
self.assertIsInstance(computed._clf, Stree) self.assertIsInstance(computed._clf, Stree)
self.assertEqual(test._partition_column, computed._partition_column)

View File

@@ -134,13 +134,17 @@ class Splitter_test(unittest.TestCase):
[0.7, 0.01, -0.1], [0.7, 0.01, -0.1],
[0.7, -0.9, 0.5], [0.7, -0.9, 0.5],
[0.1, 0.2, 0.3], [0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
] ]
) )
expected = np.array([0.2, 0.01, -0.9, 0.2]) expected = np.array([-0.1, 0.7, 0.7, 0.1, -0.1, -0.1])
y = [1, 2, 1, 0] y = [1, 2, 1, 0, 0, 0]
computed = tcl._max_samples(data, y) computed = tcl._max_samples(data, y)
self.assertEqual((4,), computed.shape) self.assertEqual(0, computed)
self.assertListEqual(expected.tolist(), computed.tolist()) computed_data = data[:, computed]
self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist())
def test_impurity(self): def test_impurity(self):
tcl = self.build(criteria="impurity") tcl = self.build(criteria="impurity")
@@ -150,12 +154,16 @@ class Splitter_test(unittest.TestCase):
[0.7, 0.01, -0.1], [0.7, 0.01, -0.1],
[0.7, -0.9, 0.5], [0.7, -0.9, 0.5],
[0.1, 0.2, 0.3], [0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
] ]
) )
expected = np.array([-0.1, 0.7, 0.7, 0.1]) expected = np.array([0.2, 0.01, -0.9, 0.2, 0.2, 0.2])
computed = tcl._impurity(data, None) computed = tcl._impurity(data, None)
self.assertEqual((4,), computed.shape) self.assertEqual(1, computed)
self.assertListEqual(expected.tolist(), computed.tolist()) computed_data = data[:, computed]
self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist())
def test_best_splitter_few_sets(self): def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
@@ -168,9 +176,9 @@ class Splitter_test(unittest.TestCase):
def test_splitter_parameter(self): def test_splitter_parameter(self):
expected_values = [ expected_values = [
[0, 1, 7, 9], # best entropy max_samples [0, 1, 7, 9], # best entropy max_samples
[3, 8, 10, 11], # best entropy impurity [0, 2, 4, 5], # best entropy impurity
[0, 2, 8, 12], # best gini max_samples [0, 2, 8, 12], # best gini max_samples
[1, 2, 5, 12], # best gini impurity [4, 5, 9, 12], # best gini impurity
[1, 2, 5, 10], # random entropy max_samples [1, 2, 5, 10], # random entropy max_samples
[4, 8, 9, 12], # random entropy impurity [4, 8, 9, 12], # random entropy impurity
[3, 9, 11, 12], # random gini max_samples [3, 9, 11, 12], # random gini max_samples

View File

@@ -217,7 +217,7 @@ class Stree_test(unittest.TestCase):
random_state=self._random_state, random_state=self._random_state,
) )
clf.fit(px, py) clf.fit(px, py)
print(f"{name} {criteria} {kernel}") # print(f"{name} {criteria} {kernel}")
outcome = outcomes[name][f"{criteria} {kernel}"] outcome = outcomes[name][f"{criteria} {kernel}"]
self.assertAlmostEqual(outcome, clf.score(px, py)) self.assertAlmostEqual(outcome, clf.score(px, py))
@@ -310,23 +310,23 @@ class Stree_test(unittest.TestCase):
def test_score_multi_class(self): def test_score_multi_class(self):
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
accuracies = [ accuracies = [
0.651685393258427, # Wine linear impurity 0.7022472, # Wine linear impurity
0.8314607, # Wine linear max_samples 0.8314607, # Wine linear max_samples
0.6629213483146067, # Wine rbf impurity 0.4044944, # Wine rbf impurity
0.4044944, # Wine rbf max_samples 0.4044944, # Wine rbf max_samples
0.9157303, # Wine poly impurity 0.3988764, # Wine poly impurity
0.7640449, # Wine poly max_samples 0.7640449, # Wine poly max_samples
0.9933333, # Iris linear impurity 0.6600000, # Iris linear impurity
0.9666667, # Iris linear max_samples 0.9666667, # Iris linear max_samples
0.9800000, # Iris rbf impurity 0.3333333, # Iris rbf impurity
0.9800000, # Iris rbf max_samples 0.9800000, # Iris rbf max_samples
1.0000000, # Iris poly impurity 0.3333333, # Iris poly impurity
1.0000000, # Iris poly max_samples 1.0000000, # Iris poly max_samples
0.8993333, # Synthetic linear impurity 0.7153333, # Synthetic linear impurity
0.9313333, # Synthetic linear max_samples 0.9313333, # Synthetic linear max_samples
0.8320000, # Synthetic rbf impurity 0.4806667, # Synthetic rbf impurity
0.8320000, # Synthetic rbf max_samples 0.8320000, # Synthetic rbf max_samples
0.6066667, # Synthetic poly impurity 0.4786667, # Synthetic poly impurity
0.6340000, # Synthetic poly max_samples 0.6340000, # Synthetic poly max_samples
] ]
datasets = [ datasets = [