mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 15:36:00 +00:00
Fix entropy and information_gain functions
This commit is contained in:
@@ -10,6 +10,7 @@ import os
|
|||||||
import numbers
|
import numbers
|
||||||
import random
|
import random
|
||||||
import warnings
|
import warnings
|
||||||
|
from math import log
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
@@ -163,10 +164,10 @@ class Splitter:
|
|||||||
f"criterion must be gini or entropy got({criterion})"
|
f"criterion must be gini or entropy got({criterion})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if criteria not in ["min_distance", "max_samples"]:
|
if criteria not in ["min_distance", "max_samples", "max_distance"]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"split_criteria has to be min_distance or \
|
"split_criteria has to be min_distance "
|
||||||
max_samples got ({criteria})"
|
f"max_distance or max_samples got ({criteria})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if splitter_type not in ["random", "best"]:
|
if splitter_type not in ["random", "best"]:
|
||||||
@@ -186,24 +187,47 @@ class Splitter:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _entropy(y: np.array) -> float:
|
def _entropy(y: np.array) -> float:
|
||||||
_, count = np.unique(y, return_counts=True)
|
n_labels = len(y)
|
||||||
proportion = count / np.sum(count)
|
if n_labels <= 1:
|
||||||
return -np.sum(proportion * np.log2(proportion))
|
return 0
|
||||||
|
counts = np.bincount(y)
|
||||||
|
proportions = counts / n_labels
|
||||||
|
n_classes = np.count_nonzero(proportions)
|
||||||
|
if n_classes <= 1:
|
||||||
|
return 0
|
||||||
|
entropy = 0.0
|
||||||
|
# Compute standard entropy.
|
||||||
|
for prop in proportions:
|
||||||
|
if prop != 0.0:
|
||||||
|
entropy -= prop * log(prop, n_classes)
|
||||||
|
return entropy
|
||||||
|
|
||||||
def information_gain(
|
def information_gain(
|
||||||
self, labels_up: np.array, labels_dn: np.array
|
self, labels: np.array, labels_up: np.array, labels_dn: np.array
|
||||||
) -> float:
|
) -> float:
|
||||||
card_up = labels_up.shape[0] if labels_up is not None else 0
|
imp_prev = self.criterion_function(labels)
|
||||||
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
card_up = card_dn = imp_up = imp_dn = 0
|
||||||
|
if labels_up is not None:
|
||||||
|
card_up = labels_up.shape[0]
|
||||||
|
imp_up = self.criterion_function(labels_up)
|
||||||
|
if labels_dn is not None:
|
||||||
|
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||||
|
imp_dn = self.criterion_function(labels_dn)
|
||||||
samples = card_up + card_dn
|
samples = card_up + card_dn
|
||||||
up = card_up / samples * self.criterion_function(labels_up)
|
if samples == 0:
|
||||||
dn = card_dn / samples * self.criterion_function(labels_dn)
|
return 0
|
||||||
return up + dn
|
else:
|
||||||
|
result = (
|
||||||
|
imp_prev
|
||||||
|
- (card_up / samples) * imp_up
|
||||||
|
- (card_dn / samples) * imp_dn
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
def _select_best_set(
|
def _select_best_set(
|
||||||
self, dataset: np.array, labels: np.array, features_sets: list
|
self, dataset: np.array, labels: np.array, features_sets: list
|
||||||
) -> list:
|
) -> list:
|
||||||
min_impurity = 1
|
max_gain = 0
|
||||||
selected = None
|
selected = None
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
for feature_set in features_sets:
|
for feature_set in features_sets:
|
||||||
@@ -213,11 +237,12 @@ class Splitter:
|
|||||||
)
|
)
|
||||||
self.partition(dataset, node)
|
self.partition(dataset, node)
|
||||||
y1, y2 = self.part(labels)
|
y1, y2 = self.part(labels)
|
||||||
impurity = self.information_gain(y1, y2)
|
gain = self.information_gain(labels, y1, y2)
|
||||||
if impurity < min_impurity:
|
if gain > max_gain:
|
||||||
min_impurity = impurity
|
max_gain = gain
|
||||||
selected = feature_set
|
selected = feature_set
|
||||||
return selected
|
|
||||||
|
return selected if selected is not None else feature_set
|
||||||
|
|
||||||
def _get_subspaces_set(
|
def _get_subspaces_set(
|
||||||
self, dataset: np.array, labels: np.array, max_features: int
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
@@ -226,7 +251,8 @@ class Splitter:
|
|||||||
features_sets = list(combinations(features, max_features))
|
features_sets = list(combinations(features, max_features))
|
||||||
if len(features_sets) > 1:
|
if len(features_sets) > 1:
|
||||||
if self._splitter_type == "random":
|
if self._splitter_type == "random":
|
||||||
return features_sets[random.randint(0, len(features_sets) - 1)]
|
index = random.randint(0, len(features_sets) - 1)
|
||||||
|
return features_sets[index]
|
||||||
else:
|
else:
|
||||||
return self._select_best_set(dataset, labels, features_sets)
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
else:
|
else:
|
||||||
@@ -248,6 +274,14 @@ class Splitter:
|
|||||||
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
|
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _max_distance(data: np.array, _) -> np.array:
|
||||||
|
# chooses the greatest distance of every sample
|
||||||
|
indices = np.argmax(np.abs(data), axis=1)
|
||||||
|
return np.array(
|
||||||
|
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _max_samples(data: np.array, y: np.array) -> np.array:
|
def _max_samples(data: np.array, y: np.array) -> np.array:
|
||||||
# select the class with max number of samples
|
# select the class with max number of samples
|
||||||
|
@@ -46,7 +46,11 @@ class Splitter_test(unittest.TestCase):
|
|||||||
self.build(clf=None)
|
self.build(clf=None)
|
||||||
for splitter_type in ["best", "random"]:
|
for splitter_type in ["best", "random"]:
|
||||||
for criterion in ["gini", "entropy"]:
|
for criterion in ["gini", "entropy"]:
|
||||||
for criteria in ["min_distance", "max_samples"]:
|
for criteria in [
|
||||||
|
"min_distance",
|
||||||
|
"max_samples",
|
||||||
|
"max_distance",
|
||||||
|
]:
|
||||||
tcl = self.build(
|
tcl = self.build(
|
||||||
splitter_type=splitter_type,
|
splitter_type=splitter_type,
|
||||||
criterion=criterion,
|
criterion=criterion,
|
||||||
@@ -57,30 +61,66 @@ class Splitter_test(unittest.TestCase):
|
|||||||
self.assertEqual(criteria, tcl._criteria)
|
self.assertEqual(criteria, tcl._criteria)
|
||||||
|
|
||||||
def test_gini(self):
|
def test_gini(self):
|
||||||
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
|
expected_values = [
|
||||||
expected = 0.48
|
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
|
||||||
self.assertEqual(expected, Splitter._gini(y))
|
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
|
||||||
tcl = self.build(criterion="gini")
|
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
|
||||||
self.assertEqual(expected, tcl.criterion_function(y))
|
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
|
||||||
|
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
|
||||||
|
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
|
||||||
|
([0], 0),
|
||||||
|
([1, 1, 1, 1], 0),
|
||||||
|
]
|
||||||
|
for labels, expected in expected_values:
|
||||||
|
self.assertAlmostEqual(expected, Splitter._gini(labels))
|
||||||
|
tcl = self.build(criterion="gini")
|
||||||
|
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||||
|
|
||||||
def test_entropy(self):
|
def test_entropy(self):
|
||||||
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
|
expected_values = [
|
||||||
expected = 0.9709505944546686
|
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
|
||||||
self.assertAlmostEqual(expected, Splitter._entropy(y))
|
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
|
||||||
tcl = self.build(criterion="entropy")
|
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
|
||||||
self.assertEqual(expected, tcl.criterion_function(y))
|
([0, 0, 1, 1, 1, 1, 0, 0], 1),
|
||||||
|
([0, 0, 1, 1, 2, 2, 3, 3], 1),
|
||||||
|
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
|
||||||
|
([1], 0),
|
||||||
|
([0, 0, 0, 0], 0),
|
||||||
|
]
|
||||||
|
for labels, expected in expected_values:
|
||||||
|
self.assertAlmostEqual(expected, Splitter._entropy(labels))
|
||||||
|
tcl = self.build(criterion="entropy")
|
||||||
|
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||||
|
|
||||||
def test_information_gain(self):
|
def test_information_gain(self):
|
||||||
yu = np.array([0, 1, 1, 1, 1, 1])
|
expected_values = [
|
||||||
yd = np.array([0, 0, 0, 1])
|
(
|
||||||
values_expected = [
|
[0, 1, 1, 1, 1, 1],
|
||||||
("gini", 0.31666666666666665),
|
[0, 0, 0, 1],
|
||||||
("entropy", 0.7145247027726656),
|
0.16333333333333333,
|
||||||
|
0.25642589168200297,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
|
||||||
|
[5, 3, 2, 1, 1],
|
||||||
|
0.007381776239907684,
|
||||||
|
-0.03328610916207225,
|
||||||
|
),
|
||||||
|
([], [], 0.0, 0.0),
|
||||||
|
([1], [], 0.0, 0.0),
|
||||||
|
([], [1], 0.0, 0.0),
|
||||||
|
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
|
||||||
|
([], [1, 1, 1, 2], 0.0, 0.0),
|
||||||
]
|
]
|
||||||
for criterion, expected in values_expected:
|
for yu, yd, expected_gini, expected_entropy in expected_values:
|
||||||
tcl = self.build(criterion=criterion)
|
yu = np.array(yu, dtype=np.int32)
|
||||||
computed = tcl.information_gain(yu, yd)
|
yd = np.array(yd, dtype=np.int32)
|
||||||
self.assertAlmostEqual(expected, computed)
|
tcl = self.build(criterion="gini")
|
||||||
|
computed = tcl.information_gain(np.append(yu, yd), yu, yd)
|
||||||
|
self.assertAlmostEqual(expected_gini, computed)
|
||||||
|
tcl = self.build(criterion="entropy")
|
||||||
|
computed = tcl.information_gain(np.append(yu, yd), yu, yd)
|
||||||
|
self.assertAlmostEqual(expected_entropy, computed)
|
||||||
|
|
||||||
def test_max_samples(self):
|
def test_max_samples(self):
|
||||||
tcl = self.build(criteria="max_samples")
|
tcl = self.build(criteria="max_samples")
|
||||||
@@ -113,27 +153,52 @@ class Splitter_test(unittest.TestCase):
|
|||||||
self.assertEqual((4,), computed.shape)
|
self.assertEqual((4,), computed.shape)
|
||||||
self.assertListEqual(expected.tolist(), computed.tolist())
|
self.assertListEqual(expected.tolist(), computed.tolist())
|
||||||
|
|
||||||
|
def test_max_distance(self):
|
||||||
|
tcl = self.build(criteria="max_distance")
|
||||||
|
data = np.array(
|
||||||
|
[
|
||||||
|
[-0.1, 0.2, -0.3],
|
||||||
|
[0.7, 0.01, -0.1],
|
||||||
|
[0.7, -0.9, 0.5],
|
||||||
|
[0.1, 0.2, 0.3],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
expected = np.array([-0.3, 0.7, -0.9, 0.3])
|
||||||
|
computed = tcl._max_distance(data, None)
|
||||||
|
self.assertEqual((4,), computed.shape)
|
||||||
|
self.assertListEqual(expected.tolist(), computed.tolist())
|
||||||
|
|
||||||
def test_splitter_parameter(self):
|
def test_splitter_parameter(self):
|
||||||
expected_values = [
|
expected_values = [
|
||||||
[1, 7, 9],
|
[1, 5, 6], # random gini min_distance
|
||||||
[1, 7, 9],
|
[1, 2, 3], # random gini max_samples
|
||||||
[1, 7, 9],
|
[0, 2, 3], # random gini max_distance
|
||||||
[1, 7, 9],
|
[2, 4, 6], # random entropy min_distance
|
||||||
[0, 5, 6],
|
[2, 5, 6], # random entropy max_samples
|
||||||
[0, 5, 6],
|
[0, 4, 6], # random entropy max_distance
|
||||||
[0, 5, 6],
|
[3, 4, 6], # best gini min_distance
|
||||||
[0, 5, 6],
|
[3, 4, 6], # best gini max_samples
|
||||||
|
[1, 4, 6], # best gini max_distance
|
||||||
|
[3, 4, 6], # best entropy min_distance
|
||||||
|
[3, 4, 6], # best entropy max_samples
|
||||||
|
[1, 4, 6], # best entropy max_distance
|
||||||
]
|
]
|
||||||
X, y = load_dataset(self._random_state, n_features=12)
|
X, y = load_dataset(self._random_state, n_features=7, n_classes=3)
|
||||||
for splitter_type in ["best", "random"]:
|
rn = 0
|
||||||
|
for splitter_type in ["random", "best"]:
|
||||||
for criterion in ["gini", "entropy"]:
|
for criterion in ["gini", "entropy"]:
|
||||||
for criteria in ["min_distance", "max_samples"]:
|
for criteria in [
|
||||||
|
"min_distance",
|
||||||
|
"max_samples",
|
||||||
|
"max_distance",
|
||||||
|
]:
|
||||||
tcl = self.build(
|
tcl = self.build(
|
||||||
splitter_type=splitter_type,
|
splitter_type=splitter_type,
|
||||||
criterion=criterion,
|
criterion=criterion,
|
||||||
criteria=criteria,
|
criteria=criteria,
|
||||||
random_state=self._random_state,
|
random_state=rn,
|
||||||
)
|
)
|
||||||
|
rn += 3
|
||||||
expected = expected_values.pop(0)
|
expected = expected_values.pop(0)
|
||||||
dataset, computed = tcl.get_subspace(X, y, max_features=3)
|
dataset, computed = tcl.get_subspace(X, y, max_features=3)
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
|
@@ -239,6 +239,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
"min_distance linear": 0.9533333333333334,
|
"min_distance linear": 0.9533333333333334,
|
||||||
"min_distance rbf": 0.836,
|
"min_distance rbf": 0.836,
|
||||||
"min_distance poly": 0.9473333333333334,
|
"min_distance poly": 0.9473333333333334,
|
||||||
|
"max_distance linear": 0.9533333333333334,
|
||||||
|
"max_distance rbf": 0.836,
|
||||||
|
"max_distance poly": 0.9473333333333334,
|
||||||
},
|
},
|
||||||
"Iris": {
|
"Iris": {
|
||||||
"max_samples linear": 0.98,
|
"max_samples linear": 0.98,
|
||||||
@@ -247,11 +250,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
"min_distance linear": 0.98,
|
"min_distance linear": 0.98,
|
||||||
"min_distance rbf": 1.0,
|
"min_distance rbf": 1.0,
|
||||||
"min_distance poly": 1.0,
|
"min_distance poly": 1.0,
|
||||||
|
"max_distance linear": 0.98,
|
||||||
|
"max_distance rbf": 1.0,
|
||||||
|
"max_distance poly": 1.0,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
for name, dataset in datasets.items():
|
for name, dataset in datasets.items():
|
||||||
px, py = dataset
|
px, py = dataset
|
||||||
for criteria in ["max_samples", "min_distance"]:
|
for criteria in ["max_samples", "min_distance", "max_distance"]:
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
C=1e4,
|
C=1e4,
|
||||||
|
Reference in New Issue
Block a user