Implement hyperparameter context-based normalization

2025-08-18 08:56:00 +00:00 · 2021-04-15 02:13:30 +02:00
4 changed files with 43 additions and 121 deletions
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -1,56 +0,0 @@
 name: "CodeQL"
 on:
  push:
    branches: [ master ]
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [ master ]
  schedule:
    - cron: '16 17 * * 3'
 jobs:
  analyze:
    name: Analyze
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        language: [ 'python' ]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
        # Learn more:
        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
    steps:
    - name: Checkout repository
      uses: actions/checkout@v2
    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v1
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.
        # queries: ./path/to/local/query, your-org/your-repo/queries@main
    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
      uses: github/codeql-action/autobuild@v1
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl
    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
    #    and modify them (or add more) to build your code if your project
    #    uses a compiled language
    #- run: |
    #   make bootstrap
    #   make release
    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v1
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -15,7 +15,6 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
 from sklearn.feature_selection import SelectKBest
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_consistent_length
 from sklearn.utils.multiclass import check_classification_targets
@@ -180,7 +179,7 @@ class Splitter:
        self,
        clf: SVC = None,
        criterion: str = None,
-        feature_select: str = None,
+        splitter_type: str = None,
        criteria: str = None,
        min_samples_split: int = None,
        random_state=None,
@@ -193,7 +192,7 @@ class Splitter:
        self._criterion = criterion
        self._min_samples_split = min_samples_split
        self._criteria = criteria
-        self._feature_select = feature_select
+        self._splitter_type = splitter_type
        self._normalize = normalize
        if clf is None:
@@ -212,10 +211,9 @@ class Splitter:
                f"criteria has to be max_samples or impurity; got ({criteria})"
            )
-        if feature_select not in ["random", "best"]:
+        if splitter_type not in ["random", "best"]:
            raise ValueError(
-                "splitter must be either random or best, got "
+                f"splitter must be either random or best, got({splitter_type})"
                f"({feature_select})"
            )
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")
@@ -332,10 +330,13 @@ class Splitter:
        """
        comb = set()
        # Generate at most 5 combinations
-        number = factorial(features) / (
+        if max_features == features:
-            factorial(max_features) * factorial(features - max_features)
+            set_length = 1
-        )
+        else:
-        set_length = min(5, number)
+            number = factorial(features) / (
                factorial(max_features) * factorial(features - max_features)
            )
            set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
@@ -344,9 +345,9 @@ class Splitter:
    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
-    ) -> tuple:
+    ) -> np.array:
        """Compute the indices of the features selected by splitter depending
-        on the self._feature_select hyper parameter
+        on the self._splitter_type hyper parameter
        Parameters
        ----------
@@ -360,28 +361,23 @@ class Splitter:
        Returns
        -------
-        tuple
+        np.array
            indices of the features selected
        """
-        if dataset.shape[1] == max_features:
+        features_sets = self._generate_spaces(dataset.shape[1], max_features)
-            # No feature reduction applies
+        if len(features_sets) > 1:
-            return tuple(range(dataset.shape[1]))
+            if self._splitter_type == "random":
-        if self._feature_select == "random":
+                index = random.randint(0, len(features_sets) - 1)
-            features_sets = self._generate_spaces(
+                return features_sets[index]
-                dataset.shape[1], max_features
+            else:
-            )
+                return self._select_best_set(dataset, labels, features_sets)
-            return self._select_best_set(dataset, labels, features_sets)
+        else:
-        # Take KBest features
+            return features_sets[0]
        return (
            SelectKBest(k=max_features)
            .fit(dataset, labels)
            .get_support(indices=True)
        )
    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
-        """Re3turn a subspace of the selected dataset of max_features length.
+        """Return a subspace of the selected dataset of max_features length.
        Depending on hyperparmeter
        Parameters
@@ -617,7 +613,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        self.splitter_ = Splitter(
            clf=self._build_clf(),
            criterion=self.criterion,
-            feature_select=self.splitter,
+            splitter_type=self.splitter,
            criteria=self.split_criteria,
            random_state=self.random_state,
            min_samples_split=self.min_samples_split,
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -6,7 +6,6 @@ import numpy as np
 from sklearn.svm import SVC
 from sklearn.datasets import load_wine, load_iris
 from stree import Splitter
 from .utils import load_dataset
 class Splitter_test(unittest.TestCase):
@@ -18,7 +17,7 @@ class Splitter_test(unittest.TestCase):
    def build(
        clf=SVC,
        min_samples_split=0,
-        feature_select="random",
+        splitter_type="random",
        criterion="gini",
        criteria="max_samples",
        random_state=None,
@@ -26,7 +25,7 @@ class Splitter_test(unittest.TestCase):
        return Splitter(
            clf=clf(random_state=random_state, kernel="rbf"),
            min_samples_split=min_samples_split,
-            feature_select=feature_select,
+            splitter_type=splitter_type,
            criterion=criterion,
            criteria=criteria,
            random_state=random_state,
@@ -40,20 +39,20 @@ class Splitter_test(unittest.TestCase):
        with self.assertRaises(ValueError):
            self.build(criterion="duck")
        with self.assertRaises(ValueError):
-            self.build(feature_select="duck")
+            self.build(splitter_type="duck")
        with self.assertRaises(ValueError):
            self.build(criteria="duck")
        with self.assertRaises(ValueError):
            _ = Splitter(clf=None)
-        for feature_select in ["best", "random"]:
+        for splitter_type in ["best", "random"]:
            for criterion in ["gini", "entropy"]:
                for criteria in ["max_samples", "impurity"]:
                    tcl = self.build(
-                        feature_select=feature_select,
+                        splitter_type=splitter_type,
                        criterion=criterion,
                        criteria=criteria,
                    )
-                    self.assertEqual(feature_select, tcl._feature_select)
+                    self.assertEqual(splitter_type, tcl._splitter_type)
                    self.assertEqual(criterion, tcl._criterion)
                    self.assertEqual(criteria, tcl._criteria)
@@ -178,34 +177,32 @@ class Splitter_test(unittest.TestCase):
    def test_best_splitter_few_sets(self):
        X, y = load_iris(return_X_y=True)
        X = np.delete(X, 3, 1)
-        tcl = self.build(
+        tcl = self.build(splitter_type="best", random_state=self._random_state)
            feature_select="best", random_state=self._random_state
        )
        dataset, computed = tcl.get_subspace(X, y, max_features=2)
        self.assertListEqual([0, 2], list(computed))
        self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
    def test_splitter_parameter(self):
        expected_values = [
-            [0, 6, 11, 12],  # best   entropy max_samples
+            [1, 4, 9, 12],  # best   entropy max_samples
-            [0, 6, 11, 12],  # best   entropy impurity
+            [1, 3, 6, 10],  # best   entropy impurity
-            [0, 6, 11, 12],  # best   gini    max_samples
+            [6, 8, 10, 12],  # best   gini    max_samples
-            [0, 6, 11, 12],  # best   gini    impurity
+            [7, 8, 10, 11],  # best   gini    impurity
            [0, 3, 8, 12],  # random entropy max_samples
-            [0, 3, 7, 12],  # random entropy impurity
+            [0, 3, 9, 11],  # random entropy impurity
-            [1, 7, 9, 12],  # random gini    max_samples
+            [0, 4, 7, 12],  # random gini    max_samples
-            [1, 5, 8, 12],  # random gini    impurity
+            [0, 2, 5, 6],  # random gini    impurity
        ]
        X, y = load_wine(return_X_y=True)
        rn = 0
-        for feature_select in ["best", "random"]:
+        for splitter_type in ["best", "random"]:
            for criterion in ["entropy", "gini"]:
                for criteria in [
                    "max_samples",
                    "impurity",
                ]:
                    tcl = self.build(
-                        feature_select=feature_select,
+                        splitter_type=splitter_type,
                        criterion=criterion,
                        criteria=criteria,
                    )
@@ -216,7 +213,7 @@ class Splitter_test(unittest.TestCase):
                    # print(
                    #     "{},  # {:7s}{:8s}{:15s}".format(
                    #         list(computed),
-                    #         feature_select,
+                    #         splitter_type,
                    #         criterion,
                    #         criteria,
                    #     )
@@ -225,18 +222,3 @@ class Splitter_test(unittest.TestCase):
                    self.assertListEqual(
                        X[:, computed].tolist(), dataset.tolist()
                    )
    def test_get_best_subspaces(self):
        results = [
            (4, [3, 4, 11, 13]),
            (7, [1, 3, 4, 5, 11, 13, 16]),
            (9, [1, 3, 4, 5, 7, 10, 11, 13, 16]),
        ]
        X, y = load_dataset(n_features=20)
        for k, expected in results:
            tcl = self.build(
                feature_select="best",
            )
            Xs, computed = tcl.get_subspace(X, y, k)
            self.assertListEqual(expected, list(computed))
            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -315,7 +315,7 @@ class Stree_test(unittest.TestCase):
        X, y = load_dataset(self._random_state)
        clf = Stree(random_state=self._random_state, max_features=2)
        clf.fit(X, y)
-        self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
+        self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
    def test_bogus_splitter_parameter(self):
        clf = Stree(splitter="duck")