From 02de394c961bf72ae5b1de563f03c4b7ad919382 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Mon, 26 Apr 2021 01:48:50 +0200 Subject: [PATCH] Add select KBest features #17 (#35) --- stree/Strees.py | 54 +++++++++++++++++++----------------- stree/tests/Splitter_test.py | 52 ++++++++++++++++++++++------------ stree/tests/Stree_test.py | 2 +- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index d81a3ec..b99140d 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -15,6 +15,7 @@ from typing import Optional import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC +from sklearn.feature_selection import SelectKBest from sklearn.preprocessing import StandardScaler from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import check_classification_targets @@ -179,7 +180,7 @@ class Splitter: self, clf: SVC = None, criterion: str = None, - splitter_type: str = None, + feature_select: str = None, criteria: str = None, min_samples_split: int = None, random_state=None, @@ -192,7 +193,7 @@ class Splitter: self._criterion = criterion self._min_samples_split = min_samples_split self._criteria = criteria - self._splitter_type = splitter_type + self._feature_select = feature_select self._normalize = normalize if clf is None: @@ -211,9 +212,10 @@ class Splitter: f"criteria has to be max_samples or impurity; got ({criteria})" ) - if splitter_type not in ["random", "best"]: + if feature_select not in ["random", "best"]: raise ValueError( - f"splitter must be either random or best, got({splitter_type})" + "splitter must be either random or best, got " + f"({feature_select})" ) self.criterion_function = getattr(self, f"_{self._criterion}") self.decision_criteria = getattr(self, f"_{self._criteria}") @@ -330,13 +332,10 @@ class Splitter: """ comb = set() # Generate at most 5 combinations - if max_features == features: - set_length = 1 - else: - number = factorial(features) / ( - factorial(max_features) * factorial(features - max_features) - ) - set_length = min(5, number) + number = factorial(features) / ( + factorial(max_features) * factorial(features - max_features) + ) + set_length = min(5, number) while len(comb) < set_length: comb.add( tuple(sorted(random.sample(range(features), max_features))) @@ -345,9 +344,9 @@ class Splitter: def _get_subspaces_set( self, dataset: np.array, labels: np.array, max_features: int - ) -> np.array: + ) -> tuple: """Compute the indices of the features selected by splitter depending - on the self._splitter_type hyper parameter + on the self._feature_select hyper parameter Parameters ---------- @@ -361,23 +360,28 @@ class Splitter: Returns ------- - np.array + tuple indices of the features selected """ - features_sets = self._generate_spaces(dataset.shape[1], max_features) - if len(features_sets) > 1: - if self._splitter_type == "random": - index = random.randint(0, len(features_sets) - 1) - return features_sets[index] - else: - return self._select_best_set(dataset, labels, features_sets) - else: - return features_sets[0] + if dataset.shape[1] == max_features: + # No feature reduction applies + return tuple(range(dataset.shape[1])) + if self._feature_select == "random": + features_sets = self._generate_spaces( + dataset.shape[1], max_features + ) + return self._select_best_set(dataset, labels, features_sets) + # Take KBest features + return ( + SelectKBest(k=max_features) + .fit(dataset, labels) + 
.get_support(indices=True)
+        )
 
     def get_subspace(
         self, dataset: np.array, labels: np.array, max_features: int
     ) -> tuple:
         """Return a subspace of the selected dataset of max_features length.
         Depending on hyperparameter
@@ -613,7 +617,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.splitter_ = Splitter(
             clf=self._build_clf(),
             criterion=self.criterion,
-            splitter_type=self.splitter,
+            feature_select=self.splitter,
             criteria=self.split_criteria,
             random_state=self.random_state,
             min_samples_split=self.min_samples_split,
diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py
index c70039e..3e45f29 100644
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -6,6 +6,7 @@ import numpy as np
 from sklearn.svm import SVC
 from sklearn.datasets import load_wine, load_iris
 from stree import Splitter
+from .utils import load_dataset
 
 
 class Splitter_test(unittest.TestCase):
@@ -17,7 +18,7 @@ class Splitter_test(unittest.TestCase):
     def build(
         clf=SVC,
         min_samples_split=0,
-        splitter_type="random",
+        feature_select="random",
         criterion="gini",
         criteria="max_samples",
         random_state=None,
@@ -25,7 +26,7 @@ class Splitter_test(unittest.TestCase):
         return Splitter(
             clf=clf(random_state=random_state, kernel="rbf"),
             min_samples_split=min_samples_split,
-            splitter_type=splitter_type,
+            feature_select=feature_select,
             criterion=criterion,
             criteria=criteria,
             random_state=random_state,
@@ -39,20 +40,20 @@ class Splitter_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             self.build(criterion="duck")
         with self.assertRaises(ValueError):
-            self.build(splitter_type="duck")
+            self.build(feature_select="duck")
         with self.assertRaises(ValueError):
             self.build(criteria="duck")
         with self.assertRaises(ValueError):
             _ = Splitter(clf=None)
-        for splitter_type in ["best", "random"]:
+        for feature_select in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
                 for criteria in ["max_samples", "impurity"]:
                     tcl = self.build(
-                        splitter_type=splitter_type,
+                        feature_select=feature_select,
                         criterion=criterion,
                         criteria=criteria,
                     )
-                    self.assertEqual(splitter_type, tcl._splitter_type)
+                    self.assertEqual(feature_select, tcl._feature_select)
                     self.assertEqual(criterion, tcl._criterion)
                     self.assertEqual(criteria, tcl._criteria)
@@ -177,32 +178,34 @@ class Splitter_test(unittest.TestCase):
     def test_best_splitter_few_sets(self):
         X, y = load_iris(return_X_y=True)
         X = np.delete(X, 3, 1)
-        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        tcl = self.build(
+            feature_select="best", random_state=self._random_state
+        )
         dataset, computed = tcl.get_subspace(X, y, max_features=2)
         self.assertListEqual([0, 2], list(computed))
         self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
 
     def test_splitter_parameter(self):
         expected_values = [
-            [1, 4, 9, 12],  # best entropy max_samples
-            [1, 3, 6, 10],  # best entropy impurity
-            [6, 8, 10, 12],  # best gini max_samples
-            [7, 8, 10, 11],  # best gini impurity
+            [0, 6, 11, 12],  # best entropy max_samples
+            [0, 6, 11, 12],  # best entropy impurity
+            [0, 6, 11, 12],  # best gini max_samples
+            [0, 6, 11, 12],  # best gini impurity
             [0, 3, 8, 12],  # random entropy max_samples
-            [0, 3, 9, 11],  # random entropy impurity
-            [0, 4, 7, 12],  # random gini max_samples
-            [0, 2, 5, 6],  # random gini impurity
+            [0, 3, 7, 12],  # random entropy impurity
+            [1, 7, 9, 12],  # random gini max_samples
+            [1, 5, 8, 12],  # random gini impurity
         ]
        X, y = load_wine(return_X_y=True)
        rn = 
0 - for splitter_type in ["best", "random"]: + for feature_select in ["best", "random"]: for criterion in ["entropy", "gini"]: for criteria in [ "max_samples", "impurity", ]: tcl = self.build( - splitter_type=splitter_type, + feature_select=feature_select, criterion=criterion, criteria=criteria, ) @@ -213,7 +216,7 @@ class Splitter_test(unittest.TestCase): # print( # "{}, # {:7s}{:8s}{:15s}".format( # list(computed), - # splitter_type, + # feature_select, # criterion, # criteria, # ) @@ -222,3 +225,18 @@ class Splitter_test(unittest.TestCase): self.assertListEqual( X[:, computed].tolist(), dataset.tolist() ) + + def test_get_best_subspaces(self): + results = [ + (4, [3, 4, 11, 13]), + (7, [1, 3, 4, 5, 11, 13, 16]), + (9, [1, 3, 4, 5, 7, 10, 11, 13, 16]), + ] + X, y = load_dataset(n_features=20) + for k, expected in results: + tcl = self.build( + feature_select="best", + ) + Xs, computed = tcl.get_subspace(X, y, k) + self.assertListEqual(expected, list(computed)) + self.assertListEqual(X[:, expected].tolist(), Xs.tolist()) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index c954126..afbab36 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -315,7 +315,7 @@ class Stree_test(unittest.TestCase): X, y = load_dataset(self._random_state) clf = Stree(random_state=self._random_state, max_features=2) clf.fit(X, y) - self.assertAlmostEqual(0.9246666666666666, clf.score(X, y)) + self.assertAlmostEqual(0.9453333333333334, clf.score(X, y)) def test_bogus_splitter_parameter(self): clf = Stree(splitter="duck")
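
Note (editorial sketch, not part of the patch): with the new feature_select="best" option, Splitter delegates column selection to sklearn's SelectKBest (ANOVA F-test by default) and keeps the indices returned by get_support(indices=True), while feature_select="random" still draws candidate index tuples via _generate_spaces and keeps the best of them. A minimal standalone illustration of the "best" path follows; the dataset and sizes are assumptions made only for this example:

    # Sketch of the SelectKBest-based "best" feature selection introduced above.
    from sklearn.datasets import make_classification
    from sklearn.feature_selection import SelectKBest

    # Synthetic stand-in for the node dataset handed to Splitter.get_subspace
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=5, random_state=0
    )
    max_features = 4

    # Same call chain the patch adds in _get_subspaces_set
    selected = (
        SelectKBest(k=max_features)
        .fit(X, y)
        .get_support(indices=True)
    )
    print(selected)              # indices of the 4 highest-scoring columns
    print(X[:, selected].shape)  # (200, 4): the subspace used to train the node classifier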