Fix entropy and information_gain functions

2020-06-16 13:56:02 +02:00
parent a20e45e8e7
commit 3e52a4746c
3 changed files with 156 additions and 51 deletions


@@ -10,6 +10,7 @@ import os
 import numbers
 import random
 import warnings
+from math import log
 from itertools import combinations
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -163,10 +164,10 @@ class Splitter:
                 f"criterion must be gini or entropy got({criterion})"
             )
-        if criteria not in ["min_distance", "max_samples"]:
+        if criteria not in ["min_distance", "max_samples", "max_distance"]:
             raise ValueError(
-                f"split_criteria has to be min_distance or \
-                    max_samples got ({criteria})"
+                "split_criteria has to be min_distance, "
+                f"max_distance or max_samples got ({criteria})"
             )
         if splitter_type not in ["random", "best"]:
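
Note: the new "max_distance" criterion is selected through the same constructor parameter validated here. A hypothetical usage sketch, assuming the Splitter keyword names match the locals checked above (other constructor arguments omitted):

splitter = Splitter(
    criterion="entropy",      # "gini" or "entropy"
    criteria="max_distance",  # new option added by this commit
    splitter_type="random",   # "random" or "best"
)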
@@ -186,24 +187,47 @@ class Splitter:
     @staticmethod
     def _entropy(y: np.array) -> float:
-        _, count = np.unique(y, return_counts=True)
-        proportion = count / np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
+        n_labels = len(y)
+        if n_labels <= 1:
+            return 0
+        counts = np.bincount(y)
+        proportions = counts / n_labels
+        n_classes = np.count_nonzero(proportions)
+        if n_classes <= 1:
+            return 0
+        entropy = 0.0
+        # compute entropy normalized to [0, 1] with log base n_classes
+        for prop in proportions:
+            if prop != 0.0:
+                entropy -= prop * log(prop, n_classes)
+        return entropy

     def information_gain(
-        self, labels_up: np.array, labels_dn: np.array
+        self, labels: np.array, labels_up: np.array, labels_dn: np.array
     ) -> float:
-        card_up = labels_up.shape[0] if labels_up is not None else 0
-        card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+        imp_prev = self.criterion_function(labels)
+        card_up = card_dn = imp_up = imp_dn = 0
+        if labels_up is not None:
+            card_up = labels_up.shape[0]
+            imp_up = self.criterion_function(labels_up)
+        if labels_dn is not None:
+            card_dn = labels_dn.shape[0]
+            imp_dn = self.criterion_function(labels_dn)
         samples = card_up + card_dn
-        up = card_up / samples * self.criterion_function(labels_up)
-        dn = card_dn / samples * self.criterion_function(labels_dn)
-        return up + dn
+        if samples == 0:
+            return 0
+        else:
+            result = (
+                imp_prev
+                - (card_up / samples) * imp_up
+                - (card_dn / samples) * imp_dn
+            )
+            return result

     def _select_best_set(
         self, dataset: np.array, labels: np.array, features_sets: list
     ) -> list:
-        min_impurity = 1
+        max_gain = 0
         selected = None
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         for feature_set in features_sets:
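
A minimal standalone sketch of the fixed math, assuming nothing beyond numpy and math.log: _entropy now normalizes with log base n_classes (so values stay in [0, 1]), and information_gain subtracts the weighted child impurities from the parent impurity instead of returning the weighted child impurity alone. The names here are illustrative, not part of the diff:

import numpy as np
from math import log

def entropy(y: np.array) -> float:
    # normalized entropy: log base n_classes keeps the result in [0, 1]
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    proportions = np.bincount(y) / n_labels
    n_classes = np.count_nonzero(proportions)
    if n_classes <= 1:
        return 0.0
    return -sum(p * log(p, n_classes) for p in proportions if p != 0.0)

def information_gain(labels, labels_up, labels_dn):
    # gain = impurity(parent) - weighted impurity of the two partitions
    card_up, card_dn = len(labels_up), len(labels_dn)
    samples = card_up + card_dn
    if samples == 0:
        return 0.0
    return (
        entropy(labels)
        - card_up / samples * entropy(labels_up)
        - card_dn / samples * entropy(labels_dn)
    )

labels = np.array([0, 0, 0, 1, 1, 1])
print(entropy(labels))                                   # 1.0, evenly mixed
print(information_gain(labels, labels[:3], labels[3:]))  # 1.0, pure split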
@@ -213,11 +237,12 @@ class Splitter:
             )
             self.partition(dataset, node)
             y1, y2 = self.part(labels)
-            impurity = self.information_gain(y1, y2)
-            if impurity < min_impurity:
-                min_impurity = impurity
+            gain = self.information_gain(labels, y1, y2)
+            if gain > max_gain:
+                max_gain = gain
                 selected = feature_set
-        return selected
+        return selected if selected is not None else feature_set

     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
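
With the corrected sign, _select_best_set now keeps the candidate with the highest gain and falls back to the last evaluated set when no split improves on the parent. A toy run of the same loop, reusing the entropy/information_gain sketch above; the candidate splits are made up for illustration:

labels = np.array([0, 0, 1, 1])
candidates = [
    ((0,), labels[:2], labels[2:]),     # pure split, gain 1.0
    ((1,), labels[::2], labels[1::2]),  # mixed split, gain 0.0
]
max_gain, selected = 0, None
for feature_set, y1, y2 in candidates:
    gain = information_gain(labels, y1, y2)
    if gain > max_gain:
        max_gain = gain
        selected = feature_set
print(selected if selected is not None else feature_set)  # (0,)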
@@ -226,7 +251,8 @@ class Splitter:
         features_sets = list(combinations(features, max_features))
         if len(features_sets) > 1:
             if self._splitter_type == "random":
-                return features_sets[random.randint(0, len(features_sets) - 1)]
+                index = random.randint(0, len(features_sets) - 1)
+                return features_sets[index]
             else:
                 return self._select_best_set(dataset, labels, features_sets)
         else:
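
For reference, the random branch draws one of the itertools.combinations of feature indices by index; a self-contained illustration:

import random
from itertools import combinations

features = range(4)  # assume a 4-feature dataset
features_sets = list(combinations(features, 2))  # 6 candidate pairs
index = random.randint(0, len(features_sets) - 1)
print(features_sets[index])  # e.g. (0, 3)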
@@ -248,6 +274,14 @@ class Splitter:
             [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
         )

+    @staticmethod
+    def _max_distance(data: np.array, _) -> np.array:
+        # choose the greatest absolute distance of every sample
+        indices = np.argmax(np.abs(data), axis=1)
+        return np.array(
+            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        )
+
     @staticmethod
     def _max_samples(data: np.array, y: np.array) -> np.array:
         # select the class with max number of samples
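
The new _max_distance mirrors the existing _min_distance but keeps, per sample, the signed distance to the hyperplane it lies farthest from. A standalone sketch on a toy matrix of signed distances (rows are samples, columns are class hyperplanes):

import numpy as np

def max_distance(data: np.array) -> np.array:
    # per sample, pick the column with the largest absolute distance
    indices = np.argmax(np.abs(data), axis=1)
    return np.array(
        [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
    )

data = np.array([[0.3, -2.0], [-0.1, 0.5]])
print(max_distance(data))  # [-2.   0.5]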