Fix entropy and information_gain functions

2020-06-16 13:56:02 +02:00
parent a20e45e8e7
commit 3e52a4746c
3 changed files with 156 additions and 51 deletions

View File

@@ -10,6 +10,7 @@ import os
 import numbers
 import random
 import warnings
+from math import log
 from itertools import combinations
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -163,10 +164,10 @@ class Splitter:
f"criterion must be gini or entropy got({criterion})" f"criterion must be gini or entropy got({criterion})"
) )
if criteria not in ["min_distance", "max_samples"]: if criteria not in ["min_distance", "max_samples", "max_distance"]:
raise ValueError( raise ValueError(
f"split_criteria has to be min_distance or \ "split_criteria has to be min_distance "
max_samples got ({criteria})" f"max_distance or max_samples got ({criteria})"
) )
if splitter_type not in ["random", "best"]: if splitter_type not in ["random", "best"]:
@@ -186,24 +187,47 @@ class Splitter:
     @staticmethod
     def _entropy(y: np.array) -> float:
-        _, count = np.unique(y, return_counts=True)
-        proportion = count / np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
+        n_labels = len(y)
+        if n_labels <= 1:
+            return 0
+        counts = np.bincount(y)
+        proportions = counts / n_labels
+        n_classes = np.count_nonzero(proportions)
+        if n_classes <= 1:
+            return 0
+        entropy = 0.0
+        # Compute standard entropy.
+        for prop in proportions:
+            if prop != 0.0:
+                entropy -= prop * log(prop, n_classes)
+        return entropy
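
The rewritten _entropy uses the number of distinct classes present as the log base, so the result is normalized to [0, 1] whatever the class count; switching to np.bincount also means labels must be non-negative integers. A minimal standalone sketch of the new behavior (the normalized_entropy helper is hypothetical, mirroring the diff above):

from math import log

import numpy as np


def normalized_entropy(y: np.ndarray) -> float:
    # Log base = number of distinct classes, so the maximum is 1.0.
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    # bincount requires non-negative integer labels.
    proportions = np.bincount(y) / n_labels
    n_classes = np.count_nonzero(proportions)
    if n_classes <= 1:
        return 0.0
    return -sum(p * log(p, n_classes) for p in proportions if p != 0.0)


# Two classes: base-2 log reproduces classic Shannon entropy.
print(normalized_entropy(np.array([0, 1, 1, 1, 1, 1, 0, 0, 0, 1])))
# 0.9709505944546686

# Four equiprobable classes yield 1.0 instead of log2(4) = 2.0.
print(normalized_entropy(np.array([0, 0, 1, 1, 2, 2, 3, 3])))
# 1.0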
     def information_gain(
-        self, labels_up: np.array, labels_dn: np.array
+        self, labels: np.array, labels_up: np.array, labels_dn: np.array
     ) -> float:
-        card_up = labels_up.shape[0] if labels_up is not None else 0
-        card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+        imp_prev = self.criterion_function(labels)
+        card_up = card_dn = imp_up = imp_dn = 0
+        if labels_up is not None:
+            card_up = labels_up.shape[0]
+            imp_up = self.criterion_function(labels_up)
+        if labels_dn is not None:
+            card_dn = labels_dn.shape[0]
+            imp_dn = self.criterion_function(labels_dn)
         samples = card_up + card_dn
-        up = card_up / samples * self.criterion_function(labels_up)
-        dn = card_dn / samples * self.criterion_function(labels_dn)
-        return up + dn
+        if samples == 0:
+            return 0
+        else:
+            result = (
+                imp_prev
+                - (card_up / samples) * imp_up
+                - (card_dn / samples) * imp_dn
+            )
+            return result
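
information_gain now receives the parent labels as well and returns an impurity decrease: parent impurity minus the size-weighted impurities of the two partitions, instead of the weighted child impurity alone. The first value asserted in the tests further down can be checked by hand; a sketch with a local gini helper (hypothetical, mirroring Splitter._gini):

import numpy as np


def gini(y: np.ndarray) -> float:
    # Gini impurity: 1 minus the sum of squared class proportions.
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / counts.sum()
    return 1.0 - float(np.sum(proportions ** 2))


yu = np.array([0, 1, 1, 1, 1, 1])  # "up" partition
yd = np.array([0, 0, 0, 1])        # "down" partition
labels = np.append(yu, yd)         # parent node labels

# gini(labels) = 0.48, gini(yu) = 10/36, gini(yd) = 0.375
gain = gini(labels) - 0.6 * gini(yu) - 0.4 * gini(yd)
print(gain)  # 0.16333333333333333, the value asserted in the tests

Note that with the normalized entropy this gain can come out negative (the second test case below expects -0.03328610916207225), because the parent and each child may be normalized with different log bases.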
     def _select_best_set(
         self, dataset: np.array, labels: np.array, features_sets: list
     ) -> list:
-        min_impurity = 1
+        max_gain = 0
         selected = None
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         for feature_set in features_sets:
@@ -213,11 +237,12 @@ class Splitter:
             )
             self.partition(dataset, node)
             y1, y2 = self.part(labels)
-            impurity = self.information_gain(y1, y2)
-            if impurity < min_impurity:
-                min_impurity = impurity
+            gain = self.information_gain(labels, y1, y2)
+            if gain > max_gain:
+                max_gain = gain
                 selected = feature_set
-        return selected
+        return selected if selected is not None else feature_set

     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
@@ -226,7 +251,8 @@ class Splitter:
         features_sets = list(combinations(features, max_features))
         if len(features_sets) > 1:
             if self._splitter_type == "random":
-                return features_sets[random.randint(0, len(features_sets) - 1)]
+                index = random.randint(0, len(features_sets) - 1)
+                return features_sets[index]
             else:
                 return self._select_best_set(dataset, labels, features_sets)
         else:
@@ -248,6 +274,14 @@ class Splitter:
             [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
         )

+    @staticmethod
+    def _max_distance(data: np.array, _) -> np.array:
+        # chooses the greatest distance of every sample
+        indices = np.argmax(np.abs(data), axis=1)
+        return np.array(
+            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        )
+
     @staticmethod
     def _max_samples(data: np.array, y: np.array) -> np.array:
         # select the class with max number of samples
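
The new _max_distance criterion keeps, for every sample, the signed distance of greatest magnitude across that sample's hyperplane distances. A short sketch of the selection rule, reusing the data from the test added below; the fancy indexing here is an equivalent, vectorized form of the list comprehension in the diff:

import numpy as np

# Each row holds one sample's distances to the class hyperplanes.
data = np.array(
    [
        [-0.1, 0.2, -0.3],
        [0.7, 0.01, -0.1],
        [0.7, -0.9, 0.5],
        [0.1, 0.2, 0.3],
    ]
)
# Column index of the largest absolute value in each row.
indices = np.argmax(np.abs(data), axis=1)  # [2 0 1 2]
# Keep the signed entry at that column, one per row.
print(data[np.arange(data.shape[0]), indices])  # [-0.3  0.7 -0.9  0.3]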

View File

@@ -46,7 +46,11 @@ class Splitter_test(unittest.TestCase):
         self.build(clf=None)
         for splitter_type in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
@@ -57,30 +61,66 @@ class Splitter_test(unittest.TestCase):
                     self.assertEqual(criteria, tcl._criteria)

     def test_gini(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.48
-        self.assertEqual(expected, Splitter._gini(y))
-        tcl = self.build(criterion="gini")
-        self.assertEqual(expected, tcl.criterion_function(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
+            ([0], 0),
+            ([1, 1, 1, 1], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._gini(labels))
+            tcl = self.build(criterion="gini")
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))

     def test_entropy(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.9709505944546686
-        self.assertAlmostEqual(expected, Splitter._entropy(y))
-        tcl = self.build(criterion="entropy")
-        self.assertEqual(expected, tcl.criterion_function(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 1),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 1),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
+            ([1], 0),
+            ([0, 0, 0, 0], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._entropy(labels))
+            tcl = self.build(criterion="entropy")
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))

     def test_information_gain(self):
-        yu = np.array([0, 1, 1, 1, 1, 1])
-        yd = np.array([0, 0, 0, 1])
-        values_expected = [
-            ("gini", 0.31666666666666665),
-            ("entropy", 0.7145247027726656),
-        ]
-        for criterion, expected in values_expected:
-            tcl = self.build(criterion=criterion)
-            computed = tcl.information_gain(yu, yd)
-            self.assertAlmostEqual(expected, computed)
+        expected_values = [
+            (
+                [0, 1, 1, 1, 1, 1],
+                [0, 0, 0, 1],
+                0.16333333333333333,
+                0.25642589168200297,
+            ),
+            (
+                [0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
+                [5, 3, 2, 1, 1],
+                0.007381776239907684,
+                -0.03328610916207225,
+            ),
+            ([], [], 0.0, 0.0),
+            ([1], [], 0.0, 0.0),
+            ([], [1], 0.0, 0.0),
+            ([0, 0, 0, 0], [0, 0], 0.0, 0.0),
+            ([], [1, 1, 1, 2], 0.0, 0.0),
+        ]
+        for yu, yd, expected_gini, expected_entropy in expected_values:
+            yu = np.array(yu, dtype=np.int32)
+            yd = np.array(yd, dtype=np.int32)
+            tcl = self.build(criterion="gini")
+            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            self.assertAlmostEqual(expected_gini, computed)
+            tcl = self.build(criterion="entropy")
+            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            self.assertAlmostEqual(expected_entropy, computed)

     def test_max_samples(self):
         tcl = self.build(criteria="max_samples")
@@ -113,27 +153,52 @@ class Splitter_test(unittest.TestCase):
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())

+    def test_max_distance(self):
+        tcl = self.build(criteria="max_distance")
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([-0.3, 0.7, -0.9, 0.3])
+        computed = tcl._max_distance(data, None)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
     def test_splitter_parameter(self):
         expected_values = [
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
+            [1, 5, 6],  # random gini min_distance
+            [1, 2, 3],  # random gini max_samples
+            [0, 2, 3],  # random gini max_distance
+            [2, 4, 6],  # random entropy min_distance
+            [2, 5, 6],  # random entropy max_samples
+            [0, 4, 6],  # random entropy max_distance
+            [3, 4, 6],  # best gini min_distance
+            [3, 4, 6],  # best gini max_samples
+            [1, 4, 6],  # best gini max_distance
+            [3, 4, 6],  # best entropy min_distance
+            [3, 4, 6],  # best entropy max_samples
+            [1, 4, 6],  # best entropy max_distance
         ]
-        X, y = load_dataset(self._random_state, n_features=12)
-        for splitter_type in ["best", "random"]:
+        X, y = load_dataset(self._random_state, n_features=7, n_classes=3)
+        rn = 0
+        for splitter_type in ["random", "best"]:
             for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
                         criteria=criteria,
-                        random_state=self._random_state,
+                        random_state=rn,
                     )
+                    rn += 3
                     expected = expected_values.pop(0)
                     dataset, computed = tcl.get_subspace(X, y, max_features=3)
                     self.assertListEqual(expected, list(computed))

View File

@@ -239,6 +239,9 @@ class Stree_test(unittest.TestCase):
             "min_distance linear": 0.9533333333333334,
             "min_distance rbf": 0.836,
             "min_distance poly": 0.9473333333333334,
+            "max_distance linear": 0.9533333333333334,
+            "max_distance rbf": 0.836,
+            "max_distance poly": 0.9473333333333334,
         },
         "Iris": {
             "max_samples linear": 0.98,
@@ -247,11 +250,14 @@ class Stree_test(unittest.TestCase):
             "min_distance linear": 0.98,
             "min_distance rbf": 1.0,
             "min_distance poly": 1.0,
+            "max_distance linear": 0.98,
+            "max_distance rbf": 1.0,
+            "max_distance poly": 1.0,
         },
     }
     for name, dataset in datasets.items():
         px, py = dataset
-        for criteria in ["max_samples", "min_distance"]:
+        for criteria in ["max_samples", "min_distance", "max_distance"]:
             for kernel in self._kernels:
                 clf = Stree(
                     C=1e4,