Merge branch 'master' into package_doc_#7

This commit is contained in:
Ricardo Montañana Gómez
2021-04-26 09:04:21 +02:00
committed by GitHub
4 changed files with 120 additions and 40 deletions

56
.github/workflows/codeql-analysis.yml vendored Normal file
View File

@@ -0,0 +1,56 @@
name: "CodeQL"
on:
push:
branches: [ master ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ master ]
schedule:
- cron: '16 17 * * 3'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
# Learn more:
# https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
steps:
- name: Checkout repository
uses: actions/checkout@v2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1

View File

@@ -11,6 +11,7 @@ from typing import Optional
import numpy as np import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC from sklearn.svm import SVC, LinearSVC
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.multiclass import check_classification_targets
from sklearn.exceptions import ConvergenceWarning from sklearn.exceptions import ConvergenceWarning
@@ -172,7 +173,7 @@ class Splitter:
self, self,
clf: SVC = None, clf: SVC = None,
criterion: str = None, criterion: str = None,
splitter_type: str = None, feature_select: str = None,
criteria: str = None, criteria: str = None,
min_samples_split: int = None, min_samples_split: int = None,
random_state=None, random_state=None,
@@ -185,7 +186,7 @@ class Splitter:
self._criterion = criterion self._criterion = criterion
self._min_samples_split = min_samples_split self._min_samples_split = min_samples_split
self._criteria = criteria self._criteria = criteria
self._splitter_type = splitter_type self._feature_select = feature_select
self._normalize = normalize self._normalize = normalize
if clf is None: if clf is None:
@@ -204,9 +205,10 @@ class Splitter:
f"criteria has to be max_samples or impurity; got ({criteria})" f"criteria has to be max_samples or impurity; got ({criteria})"
) )
if splitter_type not in ["random", "best"]: if feature_select not in ["random", "best"]:
raise ValueError( raise ValueError(
f"splitter must be either random or best, got({splitter_type})" "splitter must be either random or best, got "
f"({feature_select})"
) )
self.criterion_function = getattr(self, f"_{self._criterion}") self.criterion_function = getattr(self, f"_{self._criterion}")
self.decision_criteria = getattr(self, f"_{self._criteria}") self.decision_criteria = getattr(self, f"_{self._criteria}")
@@ -340,13 +342,10 @@ class Splitter:
""" """
comb = set() comb = set()
# Generate at most 5 combinations # Generate at most 5 combinations
if max_features == features: number = factorial(features) / (
set_length = 1 factorial(max_features) * factorial(features - max_features)
else: )
number = factorial(features) / ( set_length = min(5, number)
factorial(max_features) * factorial(features - max_features)
)
set_length = min(5, number)
while len(comb) < set_length: while len(comb) < set_length:
comb.add( comb.add(
tuple(sorted(random.sample(range(features), max_features))) tuple(sorted(random.sample(range(features), max_features)))
@@ -355,9 +354,9 @@ class Splitter:
def _get_subspaces_set( def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int self, dataset: np.array, labels: np.array, max_features: int
) -> np.array: ) -> tuple:
"""Compute the indices of the features selected by splitter depending """Compute the indices of the features selected by splitter depending
on the self._splitter_type hyper parameter on the self._feature_select hyper parameter
Parameters Parameters
---------- ----------
@@ -371,21 +370,28 @@ class Splitter:
Returns Returns
------- -------
np.array tuple
indices of the features selected indices of the features selected
""" """
features_sets = self._generate_spaces(dataset.shape[1], max_features) if dataset.shape[1] == max_features:
if len(features_sets) > 1: # No feature reduction applies
if self._splitter_type == "random": return tuple(range(dataset.shape[1]))
index = random.randint(0, len(features_sets) - 1) if self._feature_select == "random":
return features_sets[index] features_sets = self._generate_spaces(
dataset.shape[1], max_features
)
return self._select_best_set(dataset, labels, features_sets) return self._select_best_set(dataset, labels, features_sets)
return features_sets[0] # Take KBest features
return (
SelectKBest(k=max_features)
.fit(dataset, labels)
.get_support(indices=True)
)
def get_subspace( def get_subspace(
self, dataset: np.array, labels: np.array, max_features: int self, dataset: np.array, labels: np.array, max_features: int
) -> tuple: ) -> tuple:
"""Return a subspace of the selected dataset of max_features length. """Return a subspace of the selected dataset of max_features length.
Depending on hyperparameter Depending on hyperparameter
Parameters Parameters
@@ -632,7 +638,7 @@ class Stree(BaseEstimator, ClassifierMixin):
self.splitter_ = Splitter( self.splitter_ = Splitter(
clf=self._build_clf(), clf=self._build_clf(),
criterion=self.criterion, criterion=self.criterion,
splitter_type=self.splitter, feature_select=self.splitter,
criteria=self.split_criteria, criteria=self.split_criteria,
random_state=self.random_state, random_state=self.random_state,
min_samples_split=self.min_samples_split, min_samples_split=self.min_samples_split,

View File

@@ -6,6 +6,7 @@ import numpy as np
from sklearn.svm import SVC from sklearn.svm import SVC
from sklearn.datasets import load_wine, load_iris from sklearn.datasets import load_wine, load_iris
from stree import Splitter from stree import Splitter
from .utils import load_dataset
class Splitter_test(unittest.TestCase): class Splitter_test(unittest.TestCase):
@@ -17,7 +18,7 @@ class Splitter_test(unittest.TestCase):
def build( def build(
clf=SVC, clf=SVC,
min_samples_split=0, min_samples_split=0,
splitter_type="random", feature_select="random",
criterion="gini", criterion="gini",
criteria="max_samples", criteria="max_samples",
random_state=None, random_state=None,
@@ -25,7 +26,7 @@ class Splitter_test(unittest.TestCase):
return Splitter( return Splitter(
clf=clf(random_state=random_state, kernel="rbf"), clf=clf(random_state=random_state, kernel="rbf"),
min_samples_split=min_samples_split, min_samples_split=min_samples_split,
splitter_type=splitter_type, feature_select=feature_select,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
random_state=random_state, random_state=random_state,
@@ -39,20 +40,20 @@ class Splitter_test(unittest.TestCase):
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(criterion="duck") self.build(criterion="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(splitter_type="duck") self.build(feature_select="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.build(criteria="duck") self.build(criteria="duck")
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
_ = Splitter(clf=None) _ = Splitter(clf=None)
for splitter_type in ["best", "random"]: for feature_select in ["best", "random"]:
for criterion in ["gini", "entropy"]: for criterion in ["gini", "entropy"]:
for criteria in ["max_samples", "impurity"]: for criteria in ["max_samples", "impurity"]:
tcl = self.build( tcl = self.build(
splitter_type=splitter_type, feature_select=feature_select,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
) )
self.assertEqual(splitter_type, tcl._splitter_type) self.assertEqual(feature_select, tcl._feature_select)
self.assertEqual(criterion, tcl._criterion) self.assertEqual(criterion, tcl._criterion)
self.assertEqual(criteria, tcl._criteria) self.assertEqual(criteria, tcl._criteria)
@@ -177,32 +178,34 @@ class Splitter_test(unittest.TestCase):
def test_best_splitter_few_sets(self): def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True) X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1) X = np.delete(X, 3, 1)
tcl = self.build(splitter_type="best", random_state=self._random_state) tcl = self.build(
feature_select="best", random_state=self._random_state
)
dataset, computed = tcl.get_subspace(X, y, max_features=2) dataset, computed = tcl.get_subspace(X, y, max_features=2)
self.assertListEqual([0, 2], list(computed)) self.assertListEqual([0, 2], list(computed))
self.assertListEqual(X[:, computed].tolist(), dataset.tolist()) self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
def test_splitter_parameter(self): def test_splitter_parameter(self):
expected_values = [ expected_values = [
[1, 4, 9, 12], # best entropy max_samples [0, 6, 11, 12], # best entropy max_samples
[1, 3, 6, 10], # best entropy impurity [0, 6, 11, 12], # best entropy impurity
[6, 8, 10, 12], # best gini max_samples [0, 6, 11, 12], # best gini max_samples
[7, 8, 10, 11], # best gini impurity [0, 6, 11, 12], # best gini impurity
[0, 3, 8, 12], # random entropy max_samples [0, 3, 8, 12], # random entropy max_samples
[0, 3, 9, 11], # random entropy impurity [0, 3, 7, 12], # random entropy impurity
[0, 4, 7, 12], # random gini max_samples [1, 7, 9, 12], # random gini max_samples
[0, 2, 5, 6], # random gini impurity [1, 5, 8, 12], # random gini impurity
] ]
X, y = load_wine(return_X_y=True) X, y = load_wine(return_X_y=True)
rn = 0 rn = 0
for splitter_type in ["best", "random"]: for feature_select in ["best", "random"]:
for criterion in ["entropy", "gini"]: for criterion in ["entropy", "gini"]:
for criteria in [ for criteria in [
"max_samples", "max_samples",
"impurity", "impurity",
]: ]:
tcl = self.build( tcl = self.build(
splitter_type=splitter_type, feature_select=feature_select,
criterion=criterion, criterion=criterion,
criteria=criteria, criteria=criteria,
) )
@@ -213,7 +216,7 @@ class Splitter_test(unittest.TestCase):
# print( # print(
# "{}, # {:7s}{:8s}{:15s}".format( # "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), # list(computed),
# splitter_type, # feature_select,
# criterion, # criterion,
# criteria, # criteria,
# ) # )
@@ -222,3 +225,18 @@ class Splitter_test(unittest.TestCase):
self.assertListEqual( self.assertListEqual(
X[:, computed].tolist(), dataset.tolist() X[:, computed].tolist(), dataset.tolist()
) )
def test_get_best_subspaces(self):
results = [
(4, [3, 4, 11, 13]),
(7, [1, 3, 4, 5, 11, 13, 16]),
(9, [1, 3, 4, 5, 7, 10, 11, 13, 16]),
]
X, y = load_dataset(n_features=20)
for k, expected in results:
tcl = self.build(
feature_select="best",
)
Xs, computed = tcl.get_subspace(X, y, k)
self.assertListEqual(expected, list(computed))
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())

View File

@@ -330,7 +330,7 @@ class Stree_test(unittest.TestCase):
X, y = load_dataset(self._random_state) X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2) clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y) clf.fit(X, y)
self.assertAlmostEqual(0.9246666666666666, clf.score(X, y)) self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
def test_bogus_splitter_parameter(self): def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck") clf = Stree(splitter="duck")