From 3f01234ebf1f70ba5e73e85b63e8419fb4ebdf8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Thu, 10 Dec 2020 14:14:42 +0100 Subject: [PATCH] Remove itertools combinations from subspaces --- stree/Strees.py | 18 ++++++++++++------ stree/tests/Splitter_test.py | 16 ++++++++-------- stree/tests/Stree_test.py | 2 +- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index f47775a..e04bee3 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -11,7 +11,6 @@ import numbers import random import warnings from math import log -from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC @@ -253,19 +252,26 @@ class Splitter: selected = feature_set return selected if selected is not None else feature_set + @staticmethod + def _generate_spaces(features: int, max_features: int) -> list: + comb = set() + # Generate at most 3 combinations + set_length = 1 if max_features == features else 3 + while len(comb) < set_length: + comb.add( + tuple(sorted(random.sample(range(features), max_features))) + ) + return list(comb) + def _get_subspaces_set( self, dataset: np.array, labels: np.array, max_features: int ) -> np.array: - features = range(dataset.shape[1]) - features_sets = list(combinations(features, max_features)) + features_sets = self._generate_spaces(dataset.shape[1], max_features) if len(features_sets) > 1: if self._splitter_type == "random": index = random.randint(0, len(features_sets) - 1) return features_sets[index] else: - # get only 3 sets at most - if len(features_sets) > 3: - features_sets = random.sample(features_sets, 3) return self._select_best_set(dataset, labels, features_sets) else: return features_sets[0] diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py index a0dbc96..6a5e4f8 100644 --- a/stree/tests/Splitter_test.py +++ b/stree/tests/Splitter_test.py @@ -176,14 +176,14 @@ class Splitter_test(unittest.TestCase): def test_splitter_parameter(self): expected_values = [ - [0, 1, 7, 9], # best entropy max_samples - [3, 8, 10, 11], # best entropy impurity - [0, 2, 8, 12], # best gini max_samples - [1, 2, 5, 12], # best gini impurity - [1, 2, 5, 10], # random entropy max_samples - [4, 8, 9, 12], # random entropy impurity - [3, 9, 11, 12], # random gini max_samples - [1, 5, 6, 9], # random gini impurity + [0, 4, 6, 12], # best entropy max_samples + [1, 3, 6, 10], # best entropy impurity + [0, 1, 5, 11], # best gini max_samples + [0, 1, 7, 9], # best gini impurity + [0, 4, 6, 8], # random entropy max_samples + [4, 5, 8, 9], # random entropy impurity + [0, 4, 10, 12], # random gini max_samples + [1, 5, 8, 12], # random gini impurity ] X, y = load_wine(return_X_y=True) rn = 0 diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 77fa82a..3ebfb70 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -313,7 +313,7 @@ class Stree_test(unittest.TestCase): X, y = load_dataset(self._random_state) clf = Stree(random_state=self._random_state, max_features=2) clf.fit(X, y) - self.assertAlmostEqual(0.944, clf.score(X, y)) + self.assertAlmostEqual(0.9246666666666666, clf.score(X, y)) def test_bogus_splitter_parameter(self): clf = Stree(splitter="duck")