Compare commits

...

3 Commits

Author                   SHA1        Message                                                   Date
Ricardo Montañana Gómez  1b08cb9bdf  Add select KBest features #17                             2021-04-26 01:15:30 +02:00
Ricardo Montañana Gómez  a4aac9d310  Create codeql-analysis.yml (#25)                          2021-04-19 23:34:26 +02:00
Ricardo Montañana Gómez  8a18c998df  Implement hyperparam. context based normalization (#32)  2021-04-18 18:57:39 +02:00
5 changed files with 161 additions and 49 deletions

.github/workflows/codeql-analysis.yml (vendored, new file, +56 lines)

@@ -0,0 +1,56 @@
name: "CodeQL"
on:
push:
branches: [ master ]
pull_request:
# The branches below must be a subset of the branches above
branches: [ master ]
schedule:
- cron: '16 17 * * 3'
jobs:
analyze:
name: Analyze
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
# Learn more:
# https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
steps:
- name: Checkout repository
uses: actions/checkout@v2
# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
# By default, queries listed here will override any specified in a config file.
# Prefix the list here with "+" to use these queries and those in the config file.
# queries: ./path/to/local/query, your-org/your-repo/queries@main
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
# If this step fails, then you should remove it and run the build manually (see below)
- name: Autobuild
uses: github/codeql-action/autobuild@v1
# Command-line programs to run using the OS shell.
# 📚 https://git.io/JvXDl
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
# and modify them (or add more) to build your code if your project
# uses a compiled language
#- run: |
# make bootstrap
# make release
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1
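For reference, the schedule trigger uses standard five-field cron syntax (minute, hour, day-of-month, month, day-of-week), so '16 17 * * 3' fires every Wednesday at 17:16 UTC. A tiny sketch decoding the fields in plain Python (the dict is ours, purely illustrative):

```python
# GitHub Actions cron fields; day_of_week 0 = Sunday, so 3 = Wednesday
fields = dict(
    zip(
        ["minute", "hour", "day_of_month", "month", "day_of_week"],
        "16 17 * * 3".split(),
    )
)
print(fields)
# {'minute': '16', 'hour': '17', 'day_of_month': '*', 'month': '*', 'day_of_week': '3'}
```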

stree module (Snode, Splitter and Stree classes)

@@ -15,6 +15,8 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
+from sklearn.feature_selection import SelectKBest
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_consistent_length
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
@@ -41,6 +43,7 @@ class Snode:
         impurity: float,
         title: str,
         weight: np.ndarray = None,
+        scaler: StandardScaler = None,
     ):
         self._clf = clf
         self._title = title
@@ -58,6 +61,7 @@ class Snode:
         self._features = features
         self._impurity = impurity
         self._partition_column: int = -1
+        self._scaler = scaler

     @classmethod
     def copy(cls, node: "Snode") -> "Snode":
@@ -68,6 +72,8 @@ class Snode:
             node._features,
             node._impurity,
             node._title,
+            node._sample_weight,
+            node._scaler,
         )

     def set_partition_column(self, col: int):
@@ -174,10 +180,11 @@ class Splitter:
         self,
         clf: SVC = None,
         criterion: str = None,
-        splitter_type: str = None,
+        feature_select: str = None,
         criteria: str = None,
         min_samples_split: int = None,
         random_state=None,
+        normalize=False,
     ):
         self._clf = clf
         self._random_state = random_state
@@ -186,7 +193,8 @@ class Splitter:
         self._criterion = criterion
         self._min_samples_split = min_samples_split
         self._criteria = criteria
-        self._splitter_type = splitter_type
+        self._feature_select = feature_select
+        self._normalize = normalize

         if clf is None:
             raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
@@ -204,9 +212,10 @@ class Splitter:
                 f"criteria has to be max_samples or impurity; got ({criteria})"
             )
-        if splitter_type not in ["random", "best"]:
+        if feature_select not in ["random", "best"]:
             raise ValueError(
-                f"splitter must be either random or best, got({splitter_type})"
+                "splitter must be either random or best, got "
+                f"({feature_select})"
             )
         self.criterion_function = getattr(self, f"_{self._criterion}")
         self.decision_criteria = getattr(self, f"_{self._criteria}")
@@ -323,13 +332,10 @@ class Splitter:
         """
         comb = set()
         # Generate at most 5 combinations
-        if max_features == features:
-            set_length = 1
-        else:
-            number = factorial(features) / (
-                factorial(max_features) * factorial(features - max_features)
-            )
-            set_length = min(5, number)
+        number = factorial(features) / (
+            factorial(max_features) * factorial(features - max_features)
+        )
+        set_length = min(5, number)
         while len(comb) < set_length:
             comb.add(
                 tuple(sorted(random.sample(range(features), max_features)))
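The special case removed here is now covered twice over: _get_subspaces_set (below) returns early when no feature reduction applies, and even if this code ran with max_features == features, C(n, n) = 1 would make set_length = min(5, 1) = 1, so exactly one combination is drawn. A standalone sketch of the sampling logic (the free function is ours, not part of the diff):

```python
import random
from math import factorial

def generate_spaces(features: int, max_features: int) -> list:
    # C(features, max_features): how many subsets of that size exist
    number = factorial(features) // (
        factorial(max_features) * factorial(features - max_features)
    )
    # Draw at most 5 distinct combinations; the set deduplicates repeats
    set_length = min(5, number)
    comb = set()
    while len(comb) < set_length:
        comb.add(tuple(sorted(random.sample(range(features), max_features))))
    return list(comb)

print(generate_spaces(13, 4))  # five sorted 4-tuples of column indices
print(generate_spaces(4, 4))   # [(0, 1, 2, 3)] -- C(4, 4) == 1
```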
@@ -338,9 +344,9 @@ class Splitter:
     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
-    ) -> np.array:
+    ) -> tuple:
         """Compute the indices of the features selected by splitter depending
-        on the self._splitter_type hyper parameter
+        on the self._feature_select hyper parameter

         Parameters
         ----------
@@ -354,23 +360,28 @@ class Splitter:
         Returns
         -------
-        np.array
+        tuple
             indices of the features selected
         """
-        features_sets = self._generate_spaces(dataset.shape[1], max_features)
-        if len(features_sets) > 1:
-            if self._splitter_type == "random":
-                index = random.randint(0, len(features_sets) - 1)
-                return features_sets[index]
-            else:
-                return self._select_best_set(dataset, labels, features_sets)
-        else:
-            return features_sets[0]
+        if dataset.shape[1] == max_features:
+            # No feature reduction applies
+            return tuple(range(dataset.shape[1]))
+        if self._feature_select == "random":
+            features_sets = self._generate_spaces(
+                dataset.shape[1], max_features
+            )
+            return self._select_best_set(dataset, labels, features_sets)
+        # Take KBest features
+        return (
+            SelectKBest(k=max_features)
+            .fit(dataset, labels)
+            .get_support(indices=True)
+        )

     def get_subspace(
         self, dataset: np.array, labels: np.array, max_features: int
     ) -> tuple:
         """Return a subspace of the selected dataset of max_features length.
         Depending on hyperparameter

         Parameters
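With feature_select="best", the selection is delegated to sklearn: SelectKBest scores every column independently (ANOVA F-test by default for classification) and keeps the indices of the k highest-scoring features. A self-contained illustration of the exact call chain used above:

```python
from sklearn.datasets import load_wine
from sklearn.feature_selection import SelectKBest

X, y = load_wine(return_X_y=True)  # X.shape == (178, 13)

# Fit the univariate selector, then recover the chosen column indices,
# mirroring SelectKBest(k=max_features).fit(dataset, labels).get_support(...)
indices = SelectKBest(k=4).fit(X, y).get_support(indices=True)

print(indices)              # four column indices, sorted ascending
print(X[:, indices].shape)  # (178, 4)
```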
@@ -486,8 +497,7 @@ class Splitter:
             origin[down] if any(down) else None,
         ]

-    @staticmethod
-    def _distances(node: Snode, data: np.ndarray) -> np.array:
+    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
         """Compute distances of the samples to the hyperplane of the node

         Parameters
@@ -503,7 +513,10 @@ class Splitter:
             array of shape (m, nc) with the distances of every sample to
             the hyperplane of every class. nc = # of classes
         """
-        return node._clf.decision_function(data[:, node._features])
+        X_transformed = data[:, node._features]
+        if self._normalize:
+            X_transformed = node._scaler.transform(X_transformed)
+        return node._clf.decision_function(X_transformed)


 class Stree(BaseEstimator, ClassifierMixin):
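This is the predict-time half of the context-based normalization: _distances loses its @staticmethod decorator so it can consult self._normalize, and it replays the StandardScaler fitted on the node's own training subspace before asking the node's classifier for distances. A sketch of the mechanism in isolation (variable names are ours, purely illustrative):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_node = rng.random((100, 3))        # samples that reached one node
y_node = rng.integers(0, 2, 100)

# Training: fit the scaler on the node's subspace, train on scaled data
scaler = StandardScaler().fit(X_node)
clf = SVC().fit(scaler.transform(X_node), y_node)

# Prediction: the same scaler must be applied before computing the
# signed distances to the node's separating hyperplane
X_new = rng.random((10, 3))
distances = clf.decision_function(scaler.transform(X_new))
```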
@@ -529,6 +542,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         min_samples_split: int = 0,
         max_features=None,
         splitter: str = "random",
+        normalize: bool = False,
     ):
         self.max_iter = max_iter
         self.C = C
@@ -543,6 +557,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.max_features = max_features
         self.criterion = criterion
         self.splitter = splitter
+        self.normalize = normalize

     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -602,10 +617,11 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.splitter_ = Splitter(
             clf=self._build_clf(),
             criterion=self.criterion,
-            splitter_type=self.splitter,
+            feature_select=self.splitter,
             criteria=self.split_criteria,
             random_state=self.random_state,
             min_samples_split=self.min_samples_split,
+            normalize=self.normalize,
         )
         if self.random_state is not None:
             random.seed(self.random_state)
@@ -660,7 +676,8 @@ class Stree(BaseEstimator, ClassifierMixin):
             y = y[~indices_zero]
             sample_weight = sample_weight[~indices_zero]
         self.depth_ = max(depth, self.depth_)
-        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight)
+        scaler = StandardScaler()
+        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight, scaler)
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
             node.set_title(title + ", <pure>")
@@ -668,6 +685,9 @@ class Stree(BaseEstimator, ClassifierMixin):
         # Train the model
         clf = self._build_clf()
         Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
+        if self.normalize:
+            scaler.fit(Xs)
+            Xs = scaler.transform(Xs)
         clf.fit(Xs, y, sample_weight=sample_weight)
         node.set_impurity(self.splitter_.partition_impurity(y))
         node.set_classifier(clf)
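Taken together, normalize=True gives every node its own StandardScaler: fit on that node's feature subspace just before training the node's classifier, then reused by _distances during prediction. Enabling it is a one-parameter change (a usage sketch; the printed score is illustrative, not a fixture from the tests below):

```python
from sklearn.datasets import load_wine
from stree import Stree

X, y = load_wine(return_X_y=True)

# Per-node standardization mainly helps scale-sensitive kernels (rbf, poly)
clf = Stree(kernel="rbf", random_state=0, normalize=True)
print(clf.fit(X, y).score(X, y))
```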

Snode tests (Snode_test)

@@ -1,8 +1,6 @@
 import os
 import unittest
 import numpy as np
 from stree import Stree, Snode
 from .utils import load_dataset
@@ -119,3 +117,5 @@ class Snode_test(unittest.TestCase):
         self.assertEqual("test", computed._title)
         self.assertIsInstance(computed._clf, Stree)
         self.assertEqual(test._partition_column, computed._partition_column)
+        self.assertEqual(test._sample_weight, computed._sample_weight)
+        self.assertEqual(test._scaler, computed._scaler)

Splitter tests (Splitter_test)

@@ -6,6 +6,7 @@ import numpy as np
 from sklearn.svm import SVC
 from sklearn.datasets import load_wine, load_iris
 from stree import Splitter
+from .utils import load_dataset


 class Splitter_test(unittest.TestCase):
@@ -17,7 +18,7 @@ class Splitter_test(unittest.TestCase):
     def build(
         clf=SVC,
         min_samples_split=0,
-        splitter_type="random",
+        feature_select="random",
         criterion="gini",
         criteria="max_samples",
         random_state=None,
@@ -25,7 +26,7 @@ class Splitter_test(unittest.TestCase):
         return Splitter(
             clf=clf(random_state=random_state, kernel="rbf"),
             min_samples_split=min_samples_split,
-            splitter_type=splitter_type,
+            feature_select=feature_select,
             criterion=criterion,
             criteria=criteria,
             random_state=random_state,
@@ -39,20 +40,20 @@ class Splitter_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             self.build(criterion="duck")
         with self.assertRaises(ValueError):
-            self.build(splitter_type="duck")
+            self.build(feature_select="duck")
         with self.assertRaises(ValueError):
             self.build(criteria="duck")
         with self.assertRaises(ValueError):
             _ = Splitter(clf=None)
-        for splitter_type in ["best", "random"]:
+        for feature_select in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
                 for criteria in ["max_samples", "impurity"]:
                     tcl = self.build(
-                        splitter_type=splitter_type,
+                        feature_select=feature_select,
                         criterion=criterion,
                         criteria=criteria,
                     )
-                    self.assertEqual(splitter_type, tcl._splitter_type)
+                    self.assertEqual(feature_select, tcl._feature_select)
                     self.assertEqual(criterion, tcl._criterion)
                     self.assertEqual(criteria, tcl._criteria)
@@ -177,32 +178,34 @@ class Splitter_test(unittest.TestCase):
     def test_best_splitter_few_sets(self):
         X, y = load_iris(return_X_y=True)
         X = np.delete(X, 3, 1)
-        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        tcl = self.build(
+            feature_select="best", random_state=self._random_state
+        )
         dataset, computed = tcl.get_subspace(X, y, max_features=2)
         self.assertListEqual([0, 2], list(computed))
         self.assertListEqual(X[:, computed].tolist(), dataset.tolist())

     def test_splitter_parameter(self):
         expected_values = [
-            [1, 4, 9, 12],  # best entropy max_samples
-            [1, 3, 6, 10],  # best entropy impurity
-            [6, 8, 10, 12],  # best gini max_samples
-            [7, 8, 10, 11],  # best gini impurity
+            [0, 6, 11, 12],  # best entropy max_samples
+            [0, 6, 11, 12],  # best entropy impurity
+            [0, 6, 11, 12],  # best gini max_samples
+            [0, 6, 11, 12],  # best gini impurity
             [0, 3, 8, 12],  # random entropy max_samples
-            [0, 3, 9, 11],  # random entropy impurity
-            [0, 4, 7, 12],  # random gini max_samples
-            [0, 2, 5, 6],  # random gini impurity
+            [0, 3, 7, 12],  # random entropy impurity
+            [1, 7, 9, 12],  # random gini max_samples
+            [1, 5, 8, 12],  # random gini impurity
         ]
         X, y = load_wine(return_X_y=True)
         rn = 0
-        for splitter_type in ["best", "random"]:
+        for feature_select in ["best", "random"]:
             for criterion in ["entropy", "gini"]:
                 for criteria in [
                     "max_samples",
                     "impurity",
                 ]:
                     tcl = self.build(
-                        splitter_type=splitter_type,
+                        feature_select=feature_select,
                         criterion=criterion,
                         criteria=criteria,
                     )
@@ -213,7 +216,7 @@ class Splitter_test(unittest.TestCase):
                     # print(
                     #     "{}, # {:7s}{:8s}{:15s}".format(
                     #         list(computed),
-                    #         splitter_type,
+                    #         feature_select,
                     #         criterion,
                     #         criteria,
                     #     )
@@ -222,3 +225,18 @@ class Splitter_test(unittest.TestCase):
                     self.assertListEqual(
                         X[:, computed].tolist(), dataset.tolist()
                     )
+
+    def test_get_best_subspaces(self):
+        results = [
+            (4, [3, 4, 11, 13]),
+            (7, [1, 3, 4, 5, 11, 13, 16]),
+            (9, [1, 3, 4, 5, 7, 10, 11, 13, 16]),
+        ]
+        X, y = load_dataset(n_features=20)
+        for k, expected in results:
+            tcl = self.build(
+                feature_select="best",
+            )
+            Xs, computed = tcl.get_subspace(X, y, k)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
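A sanity check on those fixtures: since SelectKBest ranks all columns once and keeps the k best, the expected index sets must be nested as k grows, and the three lists above do satisfy top-4 ⊂ top-7 ⊂ top-9:

```python
# Univariate selection always extends the same ranking, so the sets nest
subsets = [
    {3, 4, 11, 13},
    {1, 3, 4, 5, 11, 13, 16},
    {1, 3, 4, 5, 7, 10, 11, 13, 16},
]
assert all(a <= b for a, b in zip(subsets, subsets[1:]))
```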

Stree tests (Stree_test)

@@ -315,7 +315,7 @@ class Stree_test(unittest.TestCase):
         X, y = load_dataset(self._random_state)
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
-        self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
+        self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))

     def test_bogus_splitter_parameter(self):
         clf = Stree(splitter="duck")
@@ -378,9 +378,14 @@ class Stree_test(unittest.TestCase):
             n_samples=500,
         )
         clf = Stree(kernel="rbf", random_state=self._random_state)
+        clf2 = Stree(
+            kernel="rbf", random_state=self._random_state, normalize=True
+        )
         self.assertEqual(0.768, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))

     def test_score_multiclass_poly(self):
         X, y = load_dataset(
@@ -392,9 +397,16 @@ class Stree_test(unittest.TestCase):
         clf = Stree(
             kernel="poly", random_state=self._random_state, C=10, degree=5
         )
+        clf2 = Stree(
+            kernel="poly",
+            random_state=self._random_state,
+            normalize=True,
+        )
         self.assertEqual(0.786, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))

     def test_score_multiclass_linear(self):
         X, y = load_dataset(
@@ -405,8 +417,14 @@ class Stree_test(unittest.TestCase):
         )
         clf = Stree(kernel="linear", random_state=self._random_state)
         self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
+        # Check with context based standardization
+        clf2 = Stree(
+            kernel="linear", random_state=self._random_state, normalize=True
+        )
+        self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))

     def test_zero_all_sample_weights(self):
         X, y = load_dataset(self._random_state)