From 3e52a4746c434563b32781543e5c4a1e4b8a05b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 16 Jun 2020 13:56:02 +0200 Subject: [PATCH] Fix entropy and information_gain functions --- stree/Strees.py | 70 ++++++++++++++----- stree/tests/Splitter_test.py | 129 ++++++++++++++++++++++++++--------- stree/tests/Stree_test.py | 8 ++- 3 files changed, 156 insertions(+), 51 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index ceeed7a..965b214 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -10,6 +10,7 @@ import os import numbers import random import warnings +from math import log from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin @@ -163,10 +164,10 @@ class Splitter: f"criterion must be gini or entropy got({criterion})" ) - if criteria not in ["min_distance", "max_samples"]: + if criteria not in ["min_distance", "max_samples", "max_distance"]: raise ValueError( - f"split_criteria has to be min_distance or \ - max_samples got ({criteria})" + "split_criteria has to be min_distance " + f"max_distance or max_samples got ({criteria})" ) if splitter_type not in ["random", "best"]: @@ -186,24 +187,47 @@ class Splitter: @staticmethod def _entropy(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - proportion = count / np.sum(count) - return -np.sum(proportion * np.log2(proportion)) + n_labels = len(y) + if n_labels <= 1: + return 0 + counts = np.bincount(y) + proportions = counts / n_labels + n_classes = np.count_nonzero(proportions) + if n_classes <= 1: + return 0 + entropy = 0.0 + # Compute standard entropy. 
+ for prop in proportions: + if prop != 0.0: + entropy -= prop * log(prop, n_classes) + return entropy def information_gain( - self, labels_up: np.array, labels_dn: np.array + self, labels: np.array, labels_up: np.array, labels_dn: np.array ) -> float: - card_up = labels_up.shape[0] if labels_up is not None else 0 - card_dn = labels_dn.shape[0] if labels_dn is not None else 0 + imp_prev = self.criterion_function(labels) + card_up = card_dn = imp_up = imp_dn = 0 + if labels_up is not None: + card_up = labels_up.shape[0] + imp_up = self.criterion_function(labels_up) + if labels_dn is not None: + card_dn = labels_dn.shape[0] if labels_dn is not None else 0 + imp_dn = self.criterion_function(labels_dn) samples = card_up + card_dn - up = card_up / samples * self.criterion_function(labels_up) - dn = card_dn / samples * self.criterion_function(labels_dn) - return up + dn + if samples == 0: + return 0 + else: + result = ( + imp_prev + - (card_up / samples) * imp_up + - (card_dn / samples) * imp_dn + ) + return result def _select_best_set( self, dataset: np.array, labels: np.array, features_sets: list ) -> list: - min_impurity = 1 + max_gain = 0 selected = None warnings.filterwarnings("ignore", category=ConvergenceWarning) for feature_set in features_sets: @@ -213,11 +237,12 @@ class Splitter: ) self.partition(dataset, node) y1, y2 = self.part(labels) - impurity = self.information_gain(y1, y2) - if impurity < min_impurity: - min_impurity = impurity + gain = self.information_gain(labels, y1, y2) + if gain > max_gain: + max_gain = gain selected = feature_set - return selected + + return selected if selected is not None else feature_set def _get_subspaces_set( self, dataset: np.array, labels: np.array, max_features: int @@ -226,7 +251,8 @@ class Splitter: features_sets = list(combinations(features, max_features)) if len(features_sets) > 1: if self._splitter_type == "random": - return features_sets[random.randint(0, len(features_sets) - 1)] + index = random.randint(0, 
len(features_sets) - 1) + return features_sets[index] else: return self._select_best_set(dataset, labels, features_sets) else: @@ -248,6 +274,14 @@ class Splitter: [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)] ) + @staticmethod + def _max_distance(data: np.array, _) -> np.array: + # chooses the greatest distance of every sample + indices = np.argmax(np.abs(data), axis=1) + return np.array( + [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)] + ) + @staticmethod def _max_samples(data: np.array, y: np.array) -> np.array: # select the class with max number of samples diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py index 68c6123..8099e04 100644 --- a/stree/tests/Splitter_test.py +++ b/stree/tests/Splitter_test.py @@ -46,7 +46,11 @@ class Splitter_test(unittest.TestCase): self.build(clf=None) for splitter_type in ["best", "random"]: for criterion in ["gini", "entropy"]: - for criteria in ["min_distance", "max_samples"]: + for criteria in [ + "min_distance", + "max_samples", + "max_distance", + ]: tcl = self.build( splitter_type=splitter_type, criterion=criterion, @@ -57,30 +61,66 @@ class Splitter_test(unittest.TestCase): self.assertEqual(criteria, tcl._criteria) def test_gini(self): - y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] - expected = 0.48 - self.assertEqual(expected, Splitter._gini(y)) - tcl = self.build(criterion="gini") - self.assertEqual(expected, tcl.criterion_function(y)) + expected_values = [ + ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48), + ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778), + ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306), + ([0, 0, 1, 1, 1, 1, 0, 0], 0.5), + ([0, 0, 1, 1, 2, 2, 3, 3], 0.75), + ([0, 0, 1, 1, 1, 1, 1, 1], 0.375), + ([0], 0), + ([1, 1, 1, 1], 0), + ] + for labels, expected in expected_values: + self.assertAlmostEqual(expected, Splitter._gini(labels)) + tcl = self.build(criterion="gini") + self.assertAlmostEqual(expected, tcl.criterion_function(labels)) def 
test_entropy(self): - y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] - expected = 0.9709505944546686 - self.assertAlmostEqual(expected, Splitter._entropy(y)) - tcl = self.build(criterion="entropy") - self.assertEqual(expected, tcl.criterion_function(y)) + expected_values = [ + ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686), + ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589), + ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999), + ([0, 0, 1, 1, 1, 1, 0, 0], 1), + ([0, 0, 1, 1, 2, 2, 3, 3], 1), + ([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328), + ([1], 0), + ([0, 0, 0, 0], 0), + ] + for labels, expected in expected_values: + self.assertAlmostEqual(expected, Splitter._entropy(labels)) + tcl = self.build(criterion="entropy") + self.assertAlmostEqual(expected, tcl.criterion_function(labels)) def test_information_gain(self): - yu = np.array([0, 1, 1, 1, 1, 1]) - yd = np.array([0, 0, 0, 1]) - values_expected = [ - ("gini", 0.31666666666666665), - ("entropy", 0.7145247027726656), + expected_values = [ + ( + [0, 1, 1, 1, 1, 1], + [0, 0, 0, 1], + 0.16333333333333333, + 0.25642589168200297, + ), + ( + [0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], + [5, 3, 2, 1, 1], + 0.007381776239907684, + -0.03328610916207225, + ), + ([], [], 0.0, 0.0), + ([1], [], 0.0, 0.0), + ([], [1], 0.0, 0.0), + ([0, 0, 0, 0], [0, 0], 0.0, 0.0), + ([], [1, 1, 1, 2], 0.0, 0.0), ] - for criterion, expected in values_expected: - tcl = self.build(criterion=criterion) - computed = tcl.information_gain(yu, yd) - self.assertAlmostEqual(expected, computed) + for yu, yd, expected_gini, expected_entropy in expected_values: + yu = np.array(yu, dtype=np.int32) + yd = np.array(yd, dtype=np.int32) + tcl = self.build(criterion="gini") + computed = tcl.information_gain(np.append(yu, yd), yu, yd) + self.assertAlmostEqual(expected_gini, computed) + tcl = self.build(criterion="entropy") + computed = tcl.information_gain(np.append(yu, yd), yu, yd) + self.assertAlmostEqual(expected_entropy, computed) def 
test_max_samples(self): tcl = self.build(criteria="max_samples") @@ -113,27 +153,52 @@ class Splitter_test(unittest.TestCase): self.assertEqual((4,), computed.shape) self.assertListEqual(expected.tolist(), computed.tolist()) + def test_max_distance(self): + tcl = self.build(criteria="max_distance") + data = np.array( + [ + [-0.1, 0.2, -0.3], + [0.7, 0.01, -0.1], + [0.7, -0.9, 0.5], + [0.1, 0.2, 0.3], + ] + ) + expected = np.array([-0.3, 0.7, -0.9, 0.3]) + computed = tcl._max_distance(data, None) + self.assertEqual((4,), computed.shape) + self.assertListEqual(expected.tolist(), computed.tolist()) + def test_splitter_parameter(self): expected_values = [ - [1, 7, 9], - [1, 7, 9], - [1, 7, 9], - [1, 7, 9], - [0, 5, 6], - [0, 5, 6], - [0, 5, 6], - [0, 5, 6], + [1, 5, 6], # random gini min_distance + [1, 2, 3], # random gini max_samples + [0, 2, 3], # random gini max_distance + [2, 4, 6], # random entropy min_distance + [2, 5, 6], # random entropy max_samples + [0, 4, 6], # random entropy max_distance + [3, 4, 6], # best gini min_distance + [3, 4, 6], # best gini max_samples + [1, 4, 6], # best gini max_distance + [3, 4, 6], # best entropy min_distance + [3, 4, 6], # best entropy max_samples + [1, 4, 6], # best entropy max_distance ] - X, y = load_dataset(self._random_state, n_features=12) - for splitter_type in ["best", "random"]: + X, y = load_dataset(self._random_state, n_features=7, n_classes=3) + rn = 0 + for splitter_type in ["random", "best"]: for criterion in ["gini", "entropy"]: - for criteria in ["min_distance", "max_samples"]: + for criteria in [ + "min_distance", + "max_samples", + "max_distance", + ]: tcl = self.build( splitter_type=splitter_type, criterion=criterion, criteria=criteria, - random_state=self._random_state, + random_state=rn, ) + rn += 3 expected = expected_values.pop(0) dataset, computed = tcl.get_subspace(X, y, max_features=3) self.assertListEqual(expected, list(computed)) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py 
index ccc0442..5a6c08e 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -239,6 +239,9 @@ class Stree_test(unittest.TestCase): "min_distance linear": 0.9533333333333334, "min_distance rbf": 0.836, "min_distance poly": 0.9473333333333334, + "max_distance linear": 0.9533333333333334, + "max_distance rbf": 0.836, + "max_distance poly": 0.9473333333333334, }, "Iris": { "max_samples linear": 0.98, @@ -247,11 +250,14 @@ class Stree_test(unittest.TestCase): "min_distance linear": 0.98, "min_distance rbf": 1.0, "min_distance poly": 1.0, + "max_distance linear": 0.98, + "max_distance rbf": 1.0, + "max_distance poly": 1.0, }, } for name, dataset in datasets.items(): px, py = dataset - for criteria in ["max_samples", "min_distance"]: + for criteria in ["max_samples", "min_distance", "max_distance"]: for kernel in self._kernels: clf = Stree( C=1e4,