Solve the mistake of min and max distance

The split criteria functions min and max distance return classes while max_samples return distances positives and negatives to hyperplane of the class with more samples in node
2025-08-15 23:46:02 +00:00 · 2020-06-17 00:13:52 +02:00
parent 3e52a4746c
commit ecd0b86f4d
3 changed files with 171 additions and 53 deletions
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -268,29 +268,56 @@ class Splitter:
    @staticmethod
    def _min_distance(data: np.array, _) -> np.array:
-        # chooses the lowest distance of every sample
+        """Assign class to min distances
-        indices = np.argmin(np.abs(data), axis=1)
+
-        return np.array(
+        return a vector of classes so partition can separate class 0 from
-            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        the rest of classes, ie. class 0 goes to one splitted node and the
-        )
+        rest of classes go to the other
        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param _: enable call compat with other measures
        :type _: None
        :return: vector with the class assigned to each sample
        :rtype: np.array shape (m,)
        """
        return np.argmin(data, axis=1)
    @staticmethod
    def _max_distance(data: np.array, _) -> np.array:
-        # chooses the greatest distance of every sample
+        """Assign class to max distances
-        indices = np.argmax(np.abs(data), axis=1)
+
-        return np.array(
+        return a vector of classes so partition can separate class 0 from
-            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        the rest of classes, ie. class 0 goes to one splitted node and the
-        )
+        rest of classes go to the other
        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param _: enable call compat with other measures
        :type _: None
        :return: vector with the class assigned to each sample values
        (can be 0, 1, ...)
        :rtype: np.array shape (m,)
        """
        return np.argmax(data, axis=1)
    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """return distances of the class with more samples
        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param y: vector of labels (classes)
        :type y: np.array (m,)
        :return: vector with distances to hyperplane (can be positive or neg.)
        :rtype: np.array shape (m,)
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        selected = np.argmax(samples)
        return data[:, selected]
    def partition(self, samples: np.array, node: Snode):
-        """Set the criteria to split arrays
+        """Set the criteria to split arrays. Compute the indices of the samples
        that should go to one side of the tree (down)
        """
        data = self._distances(node, samples)
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -111,15 +111,23 @@ class Splitter_test(unittest.TestCase):
            ([], [1], 0.0, 0.0),
            ([0, 0, 0, 0], [0, 0], 0.0, 0.0),
            ([], [1, 1, 1, 2], 0.0, 0.0),
            (None, [1, 2, 3], 0.0, 0.0),
            ([1, 2, 3], None, 0.0, 0.0),
        ]
        for yu, yd, expected_gini, expected_entropy in expected_values:
-            yu = np.array(yu, dtype=np.int32)
+            yu = np.array(yu, dtype=np.int32) if yu is not None else None
-            yd = np.array(yd, dtype=np.int32)
+            yd = np.array(yd, dtype=np.int32) if yd is not None else None
            if yu is not None and yd is not None:
                complete = np.append(yu, yd)
            elif yd is not None:
                complete = yd
            else:
                complete = yu
            tcl = self.build(criterion="gini")
-            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            computed = tcl.information_gain(complete, yu, yd)
            self.assertAlmostEqual(expected_gini, computed)
            tcl = self.build(criterion="entropy")
-            computed = tcl.information_gain(np.append(yu, yd), yu, yd)
+            computed = tcl.information_gain(complete, yu, yd)
            self.assertAlmostEqual(expected_entropy, computed)
    def test_max_samples(self):
@@ -148,7 +156,7 @@ class Splitter_test(unittest.TestCase):
                [0.1, 0.2, 0.3],
            ]
        )
-        expected = np.array([-0.1, 0.01, 0.5, 0.1])
+        expected = np.array([2, 2, 1, 0])
        computed = tcl._min_distance(data, None)
        self.assertEqual((4,), computed.shape)
        self.assertListEqual(expected.tolist(), computed.tolist())
@@ -163,27 +171,27 @@ class Splitter_test(unittest.TestCase):
                [0.1, 0.2, 0.3],
            ]
        )
-        expected = np.array([-0.3, 0.7, -0.9, 0.3])
+        expected = np.array([1, 0, 0, 2])
        computed = tcl._max_distance(data, None)
        self.assertEqual((4,), computed.shape)
        self.assertListEqual(expected.tolist(), computed.tolist())
    def test_splitter_parameter(self):
        expected_values = [
-            [1, 5, 6],  # random gini    min_distance
+            [1, 3, 4, 5],  # random gini    min_distance
-            [1, 2, 3],  # random gini    max_samples
+            [0, 1, 3, 4],  # random gini    max_samples
-            [0, 2, 3],  # random gini    max_distance
+            [1, 2, 4, 5],  # random gini    max_distance
-            [2, 4, 6],  # random entropy min_distance
+            [0, 2, 3, 5],  # random entropy min_distance
-            [2, 5, 6],  # random entropy max_samples
+            [0, 2, 3, 5],  # random entropy max_samples
-            [0, 4, 6],  # random entropy max_distance
+            [0, 1, 3, 4],  # random entropy max_distance
-            [3, 4, 6],  # best   gini    min_distance
+            [0, 1, 2, 5],  # best   gini    min_distance
-            [3, 4, 6],  # best   gini    max_samples
+            [2, 3, 4, 5],  # best   gini    max_samples
-            [1, 4, 6],  # best   gini    max_distance
+            [0, 2, 3, 4],  # best   gini    max_distance
-            [3, 4, 6],  # best   entropy min_distance
+            [0, 1, 2, 5],  # best   entropy min_distance
-            [3, 4, 6],  # best   entropy max_samples
+            [2, 3, 4, 5],  # best   entropy max_samples
-            [1, 4, 6],  # best   entropy max_distance
+            [0, 1, 2, 4],  # best   entropy max_distance
        ]
-        X, y = load_dataset(self._random_state, n_features=7, n_classes=3)
+        X, y = load_dataset(self._random_state, n_features=6, n_classes=3)
        rn = 0
        for splitter_type in ["random", "best"]:
            for criterion in ["gini", "entropy"]:
@@ -200,7 +208,23 @@ class Splitter_test(unittest.TestCase):
                    )
                    rn += 3
                    expected = expected_values.pop(0)
-                    dataset, computed = tcl.get_subspace(X, y, max_features=3)
+                    dataset, computed = tcl.get_subspace(X, y, max_features=4)
                    # Flaky test
                    if (
                        splitter_type == "best"
                        and criteria == "max_distance"
                        and criterion == "gini"
                        and computed == (1, 2, 3, 4)
                    ):
                        # sometimes returns (0, 2, 3, 4) and sometimes
                        # (1, 2, 3, 4)
                        expected = [1, 2, 3, 4]
                    # print(
                    #     "{},  # {:7s}{:8s}{:15s}".format(
                    #         list(computed), splitter_type,
                    #         criterion, criteria,
                    #     )
                    # )
                    self.assertListEqual(expected, list(computed))
                    self.assertListEqual(
                        X[:, computed].tolist(), dataset.tolist()
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -1,8 +1,10 @@
 import os
 import unittest
 import warnings
 import numpy as np
-from sklearn.datasets import load_iris
+from sklearn.datasets import load_iris, load_wine
 from sklearn.exceptions import ConvergenceWarning
 from stree import Stree, Snode
 from .utils import load_dataset
@@ -59,8 +61,6 @@ class Stree_test(unittest.TestCase):
    def test_build_tree(self):
        """Check if the tree is built the same way as predictions of models
        """
        import warnings
        warnings.filterwarnings("ignore")
        for kernel in self._kernels:
            clf = Stree(kernel=kernel, random_state=self._random_state)
@@ -102,22 +102,6 @@ class Stree_test(unittest.TestCase):
            yp = clf.fit(X, y).predict(X[:num, :])
            self.assertListEqual(y[:num].tolist(), yp.tolist())
    def test_score(self):
        X, y = load_dataset(self._random_state)
        accuracies = [
            0.9506666666666667,
            0.9606666666666667,
            0.9433333333333334,
        ]
        for kernel, accuracy_expected in zip(self._kernels, accuracies):
            clf = Stree(random_state=self._random_state, kernel=kernel,)
            clf.fit(X, y)
            accuracy_score = clf.score(X, y)
            yp = clf.predict(X)
            accuracy_computed = np.mean(yp == y)
            self.assertEqual(accuracy_score, accuracy_computed)
            self.assertAlmostEqual(accuracy_expected, accuracy_score)
    def test_single_vs_multiple_prediction(self):
        """Check if predicting sample by sample gives the same result as
        predicting all samples at once
@@ -164,9 +148,6 @@ class Stree_test(unittest.TestCase):
    @staticmethod
    def test_is_a_sklearn_classifier():
        import warnings
        from sklearn.exceptions import ConvergenceWarning
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        from sklearn.utils.estimator_checks import check_estimator
@@ -328,12 +309,98 @@ class Stree_test(unittest.TestCase):
        with self.assertRaises(ValueError):
            clf.predict(X[:, :3])
    # Tests of score
    def test_score_binary(self):
        X, y = load_dataset(self._random_state)
        accuracies = [
            0.9506666666666667,
            0.9606666666666667,
            0.9433333333333334,
        ]
        for kernel, accuracy_expected in zip(self._kernels, accuracies):
            clf = Stree(random_state=self._random_state, kernel=kernel,)
            clf.fit(X, y)
            accuracy_score = clf.score(X, y)
            yp = clf.predict(X)
            accuracy_computed = np.mean(yp == y)
            self.assertEqual(accuracy_score, accuracy_computed)
            self.assertAlmostEqual(accuracy_expected, accuracy_score)
    def test_score_max_features(self):
        X, y = load_dataset(self._random_state)
        clf = Stree(random_state=self._random_state, max_features=2)
        clf.fit(X, y)
        self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
    def test_score_multi_class(self):
        warnings.filterwarnings("ignore")
        accuracies = [
            0.8258427,  # Wine    linear min_distance
            0.6741573,  # Wine    linear max_distance
            0.8314607,  # Wine    linear max_samples
            0.6629213,  # Wine    rbf   min_distance
            1.0000000,  # Wine    rbf   max_distance
            0.4044944,  # Wine    rbf   max_samples
            0.9157303,  # Wine    poly  min_distance
            1.0000000,  # Wine    poly  max_distance
            0.7640449,  # Wine    poly  max_samples
            0.9933333,  # Iris    linear min_distance
            0.9666667,  # Iris    linear max_distance
            0.9666667,  # Iris    linear max_samples
            0.9800000,  # Iris    rbf   min_distance
            0.9800000,  # Iris    rbf   max_distance
            0.9800000,  # Iris    rbf   max_samples
            1.0000000,  # Iris    poly  min_distance
            1.0000000,  # Iris    poly  max_distance
            1.0000000,  # Iris    poly  max_samples
            0.8993333,  # Synthetic linear min_distance
            0.6533333,  # Synthetic linear max_distance
            0.9313333,  # Synthetic linear max_samples
            0.8320000,  # Synthetic rbf   min_distance
            0.6660000,  # Synthetic rbf   max_distance
            0.8320000,  # Synthetic rbf   max_samples
            0.6066667,  # Synthetic poly  min_distance
            0.6840000,  # Synthetic poly  max_distance
            0.6340000,  # Synthetic poly  max_samples
        ]
        datasets = [
            ("Wine", load_wine(return_X_y=True)),
            ("Iris", load_iris(return_X_y=True)),
            (
                "Synthetic",
                load_dataset(self._random_state, n_classes=3, n_features=5),
            ),
        ]
        for dataset_name, dataset in datasets:
            X, y = dataset
            for kernel in self._kernels:
                for criteria in [
                    "min_distance",
                    "max_distance",
                    "max_samples",
                ]:
                    clf = Stree(
                        C=17,
                        random_state=self._random_state,
                        kernel=kernel,
                        split_criteria=criteria,
                        degree=5,
                        gamma="auto",
                    )
                    clf.fit(X, y)
                    accuracy_score = clf.score(X, y)
                    yp = clf.predict(X)
                    accuracy_computed = np.mean(yp == y)
                    # print(
                    #     "{:.7f},  # {:7} {:5} {}".format(
                    #         accuracy_score, dataset_name, kernel, criteria
                    #     )
                    # )
                    accuracy_expected = accuracies.pop(0)
                    self.assertEqual(accuracy_score, accuracy_computed)
                    self.assertAlmostEqual(accuracy_expected, accuracy_score)
    def test_bogus_splitter_parameter(self):
        clf = Stree(splitter="duck")
        with self.assertRaises(ValueError):