From 3e52a4746c434563b32781543e5c4a1e4b8a05b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Tue, 16 Jun 2020 13:56:02 +0200
Subject: [PATCH] Fix entropy and information_gain functions

---
 stree/Strees.py              |  70 ++++++++++++++-----
 stree/tests/Splitter_test.py | 129 ++++++++++++++++++++++++++---------
 stree/tests/Stree_test.py    |   8 ++-
 3 files changed, 156 insertions(+), 51 deletions(-)

diff --git a/stree/Strees.py b/stree/Strees.py
index ceeed7a..965b214 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -10,6 +10,7 @@ import os
 import numbers
 import random
 import warnings
+from math import log
 from itertools import combinations
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -163,10 +164,10 @@ class Splitter:
                 f"criterion must be gini or entropy got({criterion})"
             )
 
-        if criteria not in ["min_distance", "max_samples"]:
+        if criteria not in ["min_distance", "max_samples", "max_distance"]:
             raise ValueError(
-                f"split_criteria has to be min_distance or \
-                max_samples got ({criteria})"
+                "split_criteria has to be min_distance "
+                f"max_distance or max_samples got ({criteria})"
             )
 
         if splitter_type not in ["random", "best"]:
@@ -186,24 +187,47 @@ class Splitter:
 
     @staticmethod
     def _entropy(y: np.array) -> float:
-        _, count = np.unique(y, return_counts=True)
-        proportion = count / np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
+        n_labels = len(y)
+        if n_labels <= 1:
+            return 0
+        counts = np.bincount(y)
+        proportions = counts / n_labels
+        n_classes = np.count_nonzero(proportions)
+        if n_classes <= 1:
+            return 0
+        entropy = 0.0
+        # Entropy with log base n_classes, so the result lies in [0, 1].
+        for prop in proportions:
+            if prop != 0.0:
+                entropy -= prop * log(prop, n_classes)
+        return entropy
 
     def information_gain(
-        self, labels_up: np.array, labels_dn: np.array
+        self, labels: np.array, labels_up: np.array, labels_dn: np.array
     ) -> float:
-        card_up = labels_up.shape[0] if labels_up is not None else 0
-        card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+        imp_prev = self.criterion_function(labels)
+        card_up = card_dn = imp_up = imp_dn = 0
+        if labels_up is not None:
+            card_up = labels_up.shape[0]
+            imp_up = self.criterion_function(labels_up)
+        if labels_dn is not None:
+            card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+            imp_dn = self.criterion_function(labels_dn)
         samples = card_up + card_dn
-        up = card_up / samples * self.criterion_function(labels_up)
-        dn = card_dn / samples * self.criterion_function(labels_dn)
-        return up + dn
+        if samples == 0:
+            return 0
+        else:
+            result = (
+                imp_prev
+                - (card_up / samples) * imp_up
+                - (card_dn / samples) * imp_dn
+            )
+            return result
 
     def _select_best_set(
         self, dataset: np.array, labels: np.array, features_sets: list
     ) -> list:
-        min_impurity = 1
+        max_gain = 0
         selected = None
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         for feature_set in features_sets:
@@ -213,11 +237,12 @@ class Splitter:
             )
             self.partition(dataset, node)
             y1, y2 = self.part(labels)
-            impurity = self.information_gain(y1, y2)
-            if impurity < min_impurity:
-                min_impurity = impurity
+            gain = self.information_gain(labels, y1, y2)
+            if gain > max_gain:
+                max_gain = gain
                 selected = feature_set
-        return selected
+
+        return selected if selected is not None else feature_set
 
     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
@@ -226,7 +251,8 @@ class Splitter:
         features_sets = list(combinations(features, max_features))
         if len(features_sets) > 1:
             if self._splitter_type == "random":
-                return features_sets[random.randint(0, len(features_sets) - 1)]
+                index = random.randint(0, len(features_sets) - 1)
+                return features_sets[index]
             else:
                 return self._select_best_set(dataset, labels, features_sets)
         else:
@@ -248,6 +274,14 @@ class Splitter:
             [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
         )
 
+    @staticmethod
+    def _max_distance(data: np.array, _) -> np.array:
+        # picks each sample's distance with the largest absolute value
+        indices = np.argmax(np.abs(data), axis=1)
+        return np.array(
+            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        )
+
     @staticmethod
     def _max_samples(data: np.array, y: np.array) -> np.array:
         # select the class with max number of samples
diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py
index 68c6123..8099e04 100644
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -46,7 +46,11 @@ class Splitter_test(unittest.TestCase):
             self.build(clf=None)
         for splitter_type in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
@@ -57,30 +61,66 @@ class Splitter_test(unittest.TestCase):
                     self.assertEqual(criteria, tcl._criteria)
 
     def test_gini(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.48
-        self.assertEqual(expected, Splitter._gini(y))
-        tcl = self.build(criterion="gini")
-        self.assertEqual(expected, tcl.criterion_function(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
+            ([0], 0),
+            ([1, 1, 1, 1], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._gini(labels))
+            tcl = self.build(criterion="gini")
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))
 
     def test_entropy(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.9709505944546686
-        self.assertAlmostEqual(expected, Splitter._entropy(y))
-        tcl = self.build(criterion="entropy")
-        self.assertEqual(expected, tcl.criterion_function(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 1),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 1),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
+            ([1], 0),
+            ([0, 0, 0, 0], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._entropy(labels))
+            tcl = self.build(criterion="entropy")
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))
 
     def test_information_gain(self):
-        yu = np.array([0, 1, 1, 1, 1, 1])
-        yd = np.array([0, 0, 0, 1])
-        values_expected = [
-            ("gini", 0.31666666666666665),
-            ("entropy", 0.7145247027726656),
+        expected_values = [
+            (
+                [0, 1, 1, 1, 1, 1],
+                [0, 0, 0, 1],
+                0.16333333333333333,
+                0.25642589168200297,
+            ),
+            (
+                [0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
+                [5, 3, 2, 1, 1],
+                0.007381776239907684,
+                -0.03328610916207225,
+            ),
+            ([], [], 0.0, 0.0),
+            ([1], [], 0.0, 0.0),
+            ([], [1], 0.0, 0.0),
+            ([0, 0, 0, 0], [0, 0], 0.0, 0.0),
+            ([], [1, 1, 1, 2], 0.0, 0.0),
         ]
-        for criterion, expected in values_expected:
-            tcl = self.build(criterion=criterion)
-            computed = tcl.information_gain(yu, yd)
-            self.assertAlmostEqual(expected, computed)
+        for yu, yd, expected_gini, expected_entropy in expected_values:
+            yu = np.array(yu, dtype=np.int32)
+            yd = np.array(yd, dtype=np.int32)
+            tcl = self.build(criterion="gini")
+            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            self.assertAlmostEqual(expected_gini, computed)
+            tcl = self.build(criterion="entropy")
+            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            self.assertAlmostEqual(expected_entropy, computed)
 
     def test_max_samples(self):
         tcl = self.build(criteria="max_samples")
@@ -113,27 +153,52 @@ class Splitter_test(unittest.TestCase):
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())
 
+    def test_max_distance(self):
+        tcl = self.build(criteria="max_distance")
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([-0.3, 0.7, -0.9, 0.3])
+        computed = tcl._max_distance(data, None)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
     def test_splitter_parameter(self):
         expected_values = [
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
+            [1, 5, 6],  # random gini    min_distance
+            [1, 2, 3],  # random gini    max_samples
+            [0, 2, 3],  # random gini    max_distance
+            [2, 4, 6],  # random entropy min_distance
+            [2, 5, 6],  # random entropy max_samples
+            [0, 4, 6],  # random entropy max_distance
+            [3, 4, 6],  # best   gini    min_distance
+            [3, 4, 6],  # best   gini    max_samples
+            [1, 4, 6],  # best   gini    max_distance
+            [3, 4, 6],  # best   entropy min_distance
+            [3, 4, 6],  # best   entropy max_samples
+            [1, 4, 6],  # best   entropy max_distance
         ]
-        X, y = load_dataset(self._random_state, n_features=12)
-        for splitter_type in ["best", "random"]:
+        X, y = load_dataset(self._random_state, n_features=7, n_classes=3)
+        rn = 0
+        for splitter_type in ["random", "best"]:
             for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
                         criteria=criteria,
-                        random_state=self._random_state,
+                        random_state=rn,
                     )
+                    rn += 3
                     expected = expected_values.pop(0)
                     dataset, computed = tcl.get_subspace(X, y, max_features=3)
                     self.assertListEqual(expected, list(computed))
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index ccc0442..5a6c08e 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -239,6 +239,9 @@ class Stree_test(unittest.TestCase):
                 "min_distance linear": 0.9533333333333334,
                 "min_distance rbf": 0.836,
                 "min_distance poly": 0.9473333333333334,
+                "max_distance linear": 0.9533333333333334,
+                "max_distance rbf": 0.836,
+                "max_distance poly": 0.9473333333333334,
             },
             "Iris": {
                 "max_samples linear": 0.98,
@@ -247,11 +250,14 @@ class Stree_test(unittest.TestCase):
                 "min_distance linear": 0.98,
                 "min_distance rbf": 1.0,
                 "min_distance poly": 1.0,
+                "max_distance linear": 0.98,
+                "max_distance rbf": 1.0,
+                "max_distance poly": 1.0,
             },
         }
         for name, dataset in datasets.items():
             px, py = dataset
-            for criteria in ["max_samples", "min_distance"]:
+            for criteria in ["max_samples", "min_distance", "max_distance"]:
                 for kernel in self._kernels:
                     clf = Stree(
                         C=1e4,