Compare commits

..

1 Commits

Author SHA1 Message Date
9cb69ebc75 Implement hyperparam. context based normalization 2021-04-15 02:13:30 +02:00
4 changed files with 43 additions and 121 deletions

View File

@@ -1,56 +0,0 @@
name: "CodeQL"
on:
push:
branches: [ master ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ master ]
schedule:
- cron: '16 17 * * 3'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
# Learn more:
# https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
steps:
- name: Checkout repository
uses: actions/checkout@v2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

View File

@@ -15,7 +15,6 @@ from typing import Optional
import numpy as np import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_consistent_length from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.multiclass import check_classification_targets
@@ -180,7 +179,7 @@ class Splitter:
self, self,
clf: SVC = None, clf: SVC = None,
criterion: str = None, criterion: str = None,
feature_select: str = None, splitter_type: str = None,
criteria: str = None, criteria: str = None,
min_samples_split: int = None, min_samples_split: int = None,
random_state=None, random_state=None,
@@ -193,7 +192,7 @@ class Splitter:
self._criterion = criterion self._criterion = criterion
self._min_samples_split = min_samples_split self._min_samples_split = min_samples_split
self._criteria = criteria self._criteria = criteria
self._feature_select = feature_select self._splitter_type = splitter_type
self._normalize = normalize self._normalize = normalize
if clf is None: if clf is None:
@@ -212,10 +211,9 @@ class Splitter:
f"criteria has to be max_samples or impurity; got ({criteria})" f"criteria has to be max_samples or impurity; got ({criteria})"
) )
if feature_select not in ["random", "best"]: if splitter_type not in ["random", "best"]:
raise ValueError( raise ValueError(
"splitter must be either random or best, got " f"splitter must be either random or best, got({splitter_type})"
f"({feature_select})"
) )
self.criterion_function = getattr(self, f"_{self._criterion}") self.criterion_function = getattr(self, f"_{self._criterion}")
self.decision_criteria = getattr(self, f"_{self._criteria}") self.decision_criteria = getattr(self, f"_{self._criteria}")
@@ -332,10 +330,13 @@ class Splitter:
""" """
comb = set() comb = set()
# Generate at most 5 combinations # Generate at most 5 combinations
number = factorial(features) / ( if max_features == features:
factorial(max_features) * factorial(features - max_features) set_length = 1
) else:
set_length = min(5, number) number = factorial(features) / (
factorial(max_features) * factorial(features - max_features)
)
set_length = min(5, number)
while len(comb) < set_length: while len(comb) < set_length:
comb.add( comb.add(
tuple(sorted(random.sample(range(features), max_features))) tuple(sorted(random.sample(range(features), max_features)))
@@ -344,9 +345,9 @@ class Splitter:
def _get_subspaces_set( def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int self, dataset: np.array, labels: np.array, max_features: int
) -> tuple: ) -> np.array:
"""Compute the indices of the features selected by splitter depending """Compute the indices of the features selected by splitter depending
on the self._feature_select hyper parameter on the self._splitter_type hyper parameter
Parameters Parameters
---------- ----------
@@ -360,28 +361,23 @@ class Splitter:
Returns Returns
------- -------
tuple np.array
indices of the features selected indices of the features selected
""" """
if dataset.shape[1] == max_features: features_sets = self._generate_spaces(dataset.shape[1], max_features)
# No feature reduction applies if len(features_sets) > 1:
return tuple(range(dataset.shape[1])) if self._splitter_type == "random":
if self._feature_select == "random": index = random.randint(0, len(features_sets) - 1)
features_sets = self._generate_spaces( return features_sets[index]
dataset.shape[1], max_features else:
) return self._select_best_set(dataset, labels, features_sets)
return self._select_best_set(dataset, labels, features_sets) else:
# Take KBest features return features_sets[0]
return (
SelectKBest(k=max_features)
.fit(dataset, labels)
.get_support(indices=True)
)
def get_subspace( def get_subspace(
self, dataset: np.array, labels: np.array, max_features: int self, dataset: np.array, labels: np.array, max_features: int
) -> tuple: ) -> tuple:
"""Re3turn a subspace of the selected dataset of max_features length. """Return a subspace of the selected dataset of max_features length.
Depending on hyperparameter Depending on hyperparameter
Parameters Parameters
@@ -617,7 +613,7 @@ class Stree(BaseEstimator, ClassifierMixin):
self.splitter_ = Splitter( self.splitter_ = Splitter(
clf=self._build_clf(), clf=self._build_clf(),
criterion=self.criterion, criterion=self.criterion,
feature_select=self.splitter, splitter_type=self.splitter,
criteria=self.split_criteria, criteria=self.split_criteria,
random_state=self.random_state, random_state=self.random_state,
min_samples_split=self.min_samples_split, min_samples_split=self.min_samples_split,

View File

@@ -6,7 +6,6 @@ import numpy as np
from sklearn.svm import SVC from sklearn.svm import SVC
from sklearn.datasets import load_wine, load_iris from sklearn.datasets import load_wine, load_iris
from stree import Splitter from stree import Splitter
from .utils import load_dataset
class Splitter_test(unittest.TestCase): class Splitter_test(unittest.TestCase):
@@ -18,7 +17,7 @@ class Splitter_test(unittest.TestCase):
def build( def build(
clf=SVC, clf=SVC,
min_samples_split=0, min_samples_split=0,
feature_select="random", splitter_type="random",
criterion="gini", criterion="gini",
criteria="max_samples", criteria="max_samples",
random_state=None, random_state=None,
@@ -26,7 +25,7 @@ class Splitter_test(unittest.TestCase):
return Splitter( return Splitter(
clf=clf(random_state=random_state, kernel="rbf"), clf=clf(random_state=random_state, kernel="rbf"),
min_samples_split=min_samples_split, min_samples_split=min_samples_split,
feature_select=feature_select, splitter_type=splitter_type,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
random_state=random_state, random_state=random_state,
@@ -40,20 +39,20 @@ class Splitter_test(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(criterion="duck") self.build(criterion="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(feature_select="duck") self.build(splitter_type="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(criteria="duck") self.build(criteria="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = Splitter(clf=None) _ = Splitter(clf=None)
for feature_select in ["best", "random"]: for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]: for criterion in ["gini", "entropy"]:
for criteria in ["max_samples", "impurity"]: for criteria in ["max_samples", "impurity"]:
tcl = self.build( tcl = self.build(
feature_select=feature_select, splitter_type=splitter_type,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
) )
self.assertEqual(feature_select, tcl._feature_select) self.assertEqual(splitter_type, tcl._splitter_type)
self.assertEqual(criterion, tcl._criterion) self.assertEqual(criterion, tcl._criterion)
self.assertEqual(criteria, tcl._criteria) self.assertEqual(criteria, tcl._criteria)
@@ -178,34 +177,32 @@ class Splitter_test(unittest.TestCase):
def test_best_splitter_few_sets(self): def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1) X = np.delete(X, 3, 1)
tcl = self.build( tcl = self.build(splitter_type="best", random_state=self._random_state)
feature_select="best", random_state=self._random_state
)
dataset, computed = tcl.get_subspace(X, y, max_features=2) dataset, computed = tcl.get_subspace(X, y, max_features=2)
self.assertListEqual([0, 2], list(computed)) self.assertListEqual([0, 2], list(computed))
self.assertListEqual(X[:, computed].tolist(), dataset.tolist()) self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
def test_splitter_parameter(self): def test_splitter_parameter(self):
expected_values = [ expected_values = [
[0, 6, 11, 12], # best entropy max_samples [1, 4, 9, 12], # best entropy max_samples
[0, 6, 11, 12], # best entropy impurity [1, 3, 6, 10], # best entropy impurity
[0, 6, 11, 12], # best gini max_samples [6, 8, 10, 12], # best gini max_samples
[0, 6, 11, 12], # best gini impurity [7, 8, 10, 11], # best gini impurity
[0, 3, 8, 12], # random entropy max_samples [0, 3, 8, 12], # random entropy max_samples
[0, 3, 7, 12], # random entropy impurity [0, 3, 9, 11], # random entropy impurity
[1, 7, 9, 12], # random gini max_samples [0, 4, 7, 12], # random gini max_samples
[1, 5, 8, 12], # random gini impurity [0, 2, 5, 6], # random gini impurity
] ]
X, y = load_wine(return_X_y=True) X, y = load_wine(return_X_y=True)
rn = 0 rn = 0
for feature_select in ["best", "random"]: for splitter_type in ["best", "random"]:
for criterion in ["entropy", "gini"]: for criterion in ["entropy", "gini"]:
for criteria in [ for criteria in [
"max_samples", "max_samples",
"impurity", "impurity",
]: ]:
tcl = self.build( tcl = self.build(
feature_select=feature_select, splitter_type=splitter_type,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
) )
@@ -216,7 +213,7 @@ class Splitter_test(unittest.TestCase):
# print( # print(
# "{}, # {:7s}{:8s}{:15s}".format( # "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), # list(computed),
# feature_select, # splitter_type,
# criterion, # criterion,
# criteria, # criteria,
# ) # )
@@ -225,18 +222,3 @@ class Splitter_test(unittest.TestCase):
self.assertListEqual( self.assertListEqual(
X[:, computed].tolist(), dataset.tolist() X[:, computed].tolist(), dataset.tolist()
) )
def test_get_best_subspaces(self):
results = [
(4, [3, 4, 11, 13]),
(7, [1, 3, 4, 5, 11, 13, 16]),
(9, [1, 3, 4, 5, 7, 10, 11, 13, 16]),
]
X, y = load_dataset(n_features=20)
for k, expected in results:
tcl = self.build(
feature_select="best",
)
Xs, computed = tcl.get_subspace(X, y, k)
self.assertListEqual(expected, list(computed))
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())

View File

@@ -315,7 +315,7 @@ class Stree_test(unittest.TestCase):
X, y = load_dataset(self._random_state) X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2) clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y) clf.fit(X, y)
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y)) self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
def test_bogus_splitter_parameter(self): def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck") clf = Stree(splitter="duck")