Compare commits


4 Commits

Author SHA1 Message Date
6896f76ca9 Generates 5 random subspaces at most 2020-12-11 11:32:54 +01:00
3f01234ebf Remove itertools combinations from subspaces 2020-12-10 14:14:42 +01:00
475ad7e752 Fix mistakes in function comments 2020-11-11 19:14:36 +01:00
Ricardo Montañana Gómez
1c869e154e Enhance partition (#16) 2020-11-03 11:36:05 +01:00

    #15 Create impurity function in Stree (consistent name, same criteria as other splitter parameter)
    Create test for the new function
    Update init test
    Update test splitter parameters
    Rename old impurity function to partition_impurity
    close #15

    * Complete implementation of splitter_type = impurity with tests
    Remove max_distance & min_distance splitter types

    * Fix mistake in computing multiclass node belief
    Set default criterion for split to entropy instead of gini
    Set default max_iter to 1e5 instead of 1e3
    change up-down criterion to match SVC multiclass
    Fix impurity method of splitting nodes
    Update jupyter Notebooks
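The "impurity" splitter and the switch of the default split criterion from gini to entropy mentioned in the commit message refer to the standard class-impurity measures. As a reminder of what those criteria compute, here is a minimal sketch using the textbook definitions, not the repository's actual implementation:

import numpy as np

def entropy(y: np.ndarray) -> float:
    # Shannon entropy of the class distribution in y (lower is purer).
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(-np.sum(p * np.log2(p)))

def gini(y: np.ndarray) -> float:
    # Gini impurity of the class distribution in y (lower is purer).
    _, counts = np.unique(y, return_counts=True)
    p = counts / counts.sum()
    return float(1.0 - np.sum(p ** 2))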
3 changed files with 42 additions and 22 deletions

View File

@@ -10,8 +10,8 @@ import os
 import numbers
 import random
 import warnings
-from math import log
-from itertools import combinations
+from math import log, factorial
+from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
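The import swap above reflects how the new subspace generation works: instead of materialising every feature combination with itertools.combinations, the code only needs to count them, and math.factorial is enough for that. A minimal sketch of the count (the helper name n_combinations is ours, not the repository's):

from math import factorial

def n_combinations(features: int, max_features: int) -> int:
    # Binomial coefficient C(features, max_features): number of distinct
    # feature subsets of size max_features.
    return factorial(features) // (
        factorial(max_features) * factorial(features - max_features)
    )

# With 4 features and subsets of size 3 there are only 4 combinations,
# while 250 features taken 2 at a time give 31125.
assert n_combinations(4, 3) == 4
assert n_combinations(250, 2) == 31125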
@@ -253,19 +253,32 @@ class Splitter:
                 selected = feature_set
         return selected if selected is not None else feature_set
 
+    @staticmethod
+    def _generate_spaces(features: int, max_features: int) -> list:
+        comb = set()
+        # Generate at most 5 combinations
+        if max_features == features:
+            set_length = 1
+        else:
+            number = factorial(features) / (
+                factorial(max_features) * factorial(features - max_features)
+            )
+            set_length = min(5, number)
+        while len(comb) < set_length:
+            comb.add(
+                tuple(sorted(random.sample(range(features), max_features)))
+            )
+        return list(comb)
+
     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
     ) -> np.array:
-        features = range(dataset.shape[1])
-        features_sets = list(combinations(features, max_features))
+        features_sets = self._generate_spaces(dataset.shape[1], max_features)
         if len(features_sets) > 1:
             if self._splitter_type == "random":
                 index = random.randint(0, len(features_sets) - 1)
                 return features_sets[index]
             else:
-                # get only 3 sets at most
-                if len(features_sets) > 3:
-                    features_sets = random.sample(features_sets, 3)
                 return self._select_best_set(dataset, labels, features_sets)
         else:
             return features_sets[0]
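A hedged usage sketch of the new helper, mirroring the unit test added below (the import path is an assumption; _generate_spaces is a staticmethod, so it can be called on the class directly):

from stree.Strees import Splitter  # import path is an assumption

# C(250, 2) = 31125 possible index pairs, far more than the cap, so only
# 5 random subspaces are generated.
assert len(Splitter._generate_spaces(250, 2)) == 5

# C(4, 3) = 4 and C(3, 2) = 3, below the cap, so every combination is
# returned (as sorted index tuples, in random order).
assert len(Splitter._generate_spaces(4, 3)) == 4
assert len(Splitter._generate_spaces(3, 2)) == 3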
@@ -284,9 +297,8 @@ class Splitter:
         :type data: np.array (m, n_classes)
         :param y: vector of labels (classes)
         :type y: np.array (m,)
-        :return: vector with the class assigned to each sample values
-        (can be 0, 1, ...) -1 if none produces information gain
-        :rtype: np.array shape (m,)
+        :return: column of dataset to be taken into account to split dataset
+        :rtype: int
         """
         max_gain = 0
         selected = -1
@@ -307,8 +319,8 @@ class Splitter:
         :type data: np.array (m, n_classes)
         :param y: vector of labels (classes)
         :type y: np.array (m,)
-        :return: vector with distances to hyperplane (can be positive or neg.)
-        :rtype: np.array shape (m,)
+        :return: column of dataset to be taken into account to split dataset
+        :rtype: int
         """
         # select the class with max number of samples
         _, samples = np.unique(y, return_counts=True)
@@ -489,7 +501,7 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight: np.ndarray,
depth: int,
title: str,
) -> Snode:
) -> Optional[Snode]:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
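The wider return annotation pairs with the new typing import in the first hunk. A minimal sketch of what Optional[Snode] expresses (placeholder code, not taken from the repository): the recursive builder may return either a node or None.

from typing import Optional

class Snode:  # stand-in placeholder for the real Snode class
    pass

def train_stub(n_samples: int) -> Optional[Snode]:
    # Hypothetical helper: returning None signals that this branch produced
    # no node, which the previous "-> Snode" annotation did not allow.
    if n_samples == 0:
        return None
    return Snode()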

View File

@@ -166,6 +166,14 @@ class Splitter_test(unittest.TestCase):
         self.assertEqual((6,), computed_data.shape)
         self.assertListEqual(expected.tolist(), computed_data.tolist())
 
+    def test_generate_subspaces(self):
+        features = 250
+        for max_features in range(2, features):
+            num = len(Splitter._generate_spaces(features, max_features))
+            self.assertEqual(5, num)
+        self.assertEqual(3, len(Splitter._generate_spaces(3, 2)))
+        self.assertEqual(4, len(Splitter._generate_spaces(4, 3)))
+
     def test_best_splitter_few_sets(self):
         X, y = load_iris(return_X_y=True)
         X = np.delete(X, 3, 1)
@@ -176,14 +184,14 @@ class Splitter_test(unittest.TestCase):
     def test_splitter_parameter(self):
         expected_values = [
-            [0, 1, 7, 9],  # best entropy max_samples
-            [3, 8, 10, 11],  # best entropy impurity
-            [0, 2, 8, 12],  # best gini max_samples
-            [1, 2, 5, 12],  # best gini impurity
-            [1, 2, 5, 10],  # random entropy max_samples
-            [4, 8, 9, 12],  # random entropy impurity
-            [3, 9, 11, 12],  # random gini max_samples
-            [1, 5, 6, 9],  # random gini impurity
+            [1, 4, 9, 12],  # best entropy max_samples
+            [1, 3, 6, 10],  # best entropy impurity
+            [6, 8, 10, 12],  # best gini max_samples
+            [7, 8, 10, 11],  # best gini impurity
+            [0, 3, 8, 12],  # random entropy max_samples
+            [0, 3, 9, 11],  # random entropy impurity
+            [0, 4, 7, 12],  # random gini max_samples
+            [0, 2, 5, 6],  # random gini impurity
         ]
         X, y = load_wine(return_X_y=True)
         rn = 0

View File

@@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase):
         X, y = load_dataset(self._random_state)
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
-        self.assertAlmostEqual(0.944, clf.score(X, y))
+        self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
 
     def test_bogus_splitter_parameter(self):
         clf = Stree(splitter="duck")