Mirror of https://github.com/Doctorado-ML/STree.git (synced 2025-08-15 15:36:00 +00:00)
Merge branch 'master' into package_doc_#7
.github/workflows/codeql-analysis.yml (new file, vendored, +56 lines)
@@ -0,0 +1,56 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: [ master ]
+  schedule:
+    - cron: '16 17 * * 3'
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ 'python' ]
+        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
+        # Learn more:
+        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v2
+
+    # Initializes the CodeQL tools for scanning.
+    - name: Initialize CodeQL
+      uses: github/codeql-action/init@v1
+      with:
+        languages: ${{ matrix.language }}
+        # If you wish to specify custom queries, you can do so here or in a config file.
+        # By default, queries listed here will override any specified in a config file.
+        # Prefix the list here with "+" to use these queries and those in the config file.
+        # queries: ./path/to/local/query, your-org/your-repo/queries@main
+
+    # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
+    # If this step fails, then you should remove it and run the build manually (see below)
+    - name: Autobuild
+      uses: github/codeql-action/autobuild@v1
+
+    # ℹ️ Command-line programs to run using the OS shell.
+    # 📚 https://git.io/JvXDl
+
+    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
+    #    and modify them (or add more) to build your code if your project
+    #    uses a compiled language
+
+    #- run: |
+    #   make bootstrap
+    #   make release
+
+    - name: Perform CodeQL Analysis
+      uses: github/codeql-action/analyze@v1
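The workflow above scans on every push and pull request against master, plus a weekly scheduled run (cron '16 17 * * 3', i.e. Wednesdays at 17:16 UTC). A minimal sketch for inspecting the trigger configuration locally, assuming PyYAML is available; note that YAML 1.1 parses the bare `on` key as the boolean True:

import yaml  # PyYAML, assumed installed

with open(".github/workflows/codeql-analysis.yml") as f:
    workflow = yaml.safe_load(f)

# PyYAML may return the `on:` key as the boolean True (YAML 1.1 quirk)
triggers = workflow.get("on", workflow.get(True))
print(sorted(triggers))  # ['pull_request', 'push', 'schedule']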
@@ -11,6 +11,7 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
+from sklearn.feature_selection import SelectKBest
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
@@ -172,7 +173,7 @@ class Splitter:
         self,
         clf: SVC = None,
         criterion: str = None,
-        splitter_type: str = None,
+        feature_select: str = None,
         criteria: str = None,
         min_samples_split: int = None,
         random_state=None,
@@ -185,7 +186,7 @@ class Splitter:
         self._criterion = criterion
         self._min_samples_split = min_samples_split
         self._criteria = criteria
-        self._splitter_type = splitter_type
+        self._feature_select = feature_select
         self._normalize = normalize
 
         if clf is None:
@@ -204,9 +205,10 @@ class Splitter:
                 f"criteria has to be max_samples or impurity; got ({criteria})"
             )
 
-        if splitter_type not in ["random", "best"]:
+        if feature_select not in ["random", "best"]:
             raise ValueError(
-                f"splitter must be either random or best, got({splitter_type})"
+                "splitter must be either random or best, got "
+                f"({feature_select})"
             )
         self.criterion_function = getattr(self, f"_{self._criterion}")
         self.decision_criteria = getattr(self, f"_{self._criteria}")
@@ -340,13 +342,10 @@ class Splitter:
         """
         comb = set()
         # Generate at most 5 combinations
-        if max_features == features:
-            set_length = 1
-        else:
-            number = factorial(features) / (
-                factorial(max_features) * factorial(features - max_features)
-            )
-            set_length = min(5, number)
+        number = factorial(features) / (
+            factorial(max_features) * factorial(features - max_features)
+        )
+        set_length = min(5, number)
         while len(comb) < set_length:
             comb.add(
                 tuple(sorted(random.sample(range(features), max_features)))
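This hunk drops the special case for max_features == features, since the rewritten _get_subspaces_set (two hunks below) now short-circuits that case before this method is ever called. A standalone sketch of the sampling logic, using the same factorial-based count; the function name here is illustrative, not part of the package:

import random
from math import factorial


def generate_spaces(features: int, max_features: int) -> list:
    """Sample at most 5 distinct sorted feature-index combinations."""
    comb = set()
    # C(features, max_features) bounds how many distinct combinations exist
    number = factorial(features) / (
        factorial(max_features) * factorial(features - max_features)
    )
    set_length = min(5, number)
    while len(comb) < set_length:
        comb.add(tuple(sorted(random.sample(range(features), max_features))))
    return list(comb)


print(generate_spaces(13, 4))  # 5 of the C(13, 4) = 715 possible subsets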
@@ -355,9 +354,9 @@ class Splitter:
 
     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
-    ) -> np.array:
+    ) -> tuple:
         """Compute the indices of the features selected by splitter depending
-        on the self._splitter_type hyper parameter
+        on the self._feature_select hyper parameter
 
         Parameters
         ----------
@@ -371,21 +370,28 @@ class Splitter:
 
         Returns
         -------
-        np.array
+        tuple
             indices of the features selected
         """
-        features_sets = self._generate_spaces(dataset.shape[1], max_features)
-        if len(features_sets) > 1:
-            if self._splitter_type == "random":
-                index = random.randint(0, len(features_sets) - 1)
-                return features_sets[index]
+        if dataset.shape[1] == max_features:
+            # No feature reduction applies
+            return tuple(range(dataset.shape[1]))
+        if self._feature_select == "random":
+            features_sets = self._generate_spaces(
+                dataset.shape[1], max_features
+            )
             return self._select_best_set(dataset, labels, features_sets)
-        return features_sets[0]
+        # Take KBest features
+        return (
+            SelectKBest(k=max_features)
+            .fit(dataset, labels)
+            .get_support(indices=True)
+        )
 
     def get_subspace(
         self, dataset: np.array, labels: np.array, max_features: int
     ) -> tuple:
         """Return a subspace of the selected dataset of max_features length.
         Depending on hyperparameter
 
         Parameters
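The rewritten _get_subspaces_set now has three branches: no reduction when max_features equals the column count, random candidate sets scored by _select_best_set when feature_select is "random", and scikit-learn's SelectKBest (ANOVA F-test by default) when it is "best". A sketch of the deterministic branch in isolation; the expected indices match the updated test values further down:

from sklearn.datasets import load_wine
from sklearn.feature_selection import SelectKBest

X, y = load_wine(return_X_y=True)
max_features = 4

if X.shape[1] == max_features:
    # No feature reduction applies
    subspace = tuple(range(X.shape[1]))
else:
    # Deterministic top-k features by univariate score (f_classif default)
    subspace = SelectKBest(k=max_features).fit(X, y).get_support(indices=True)

print(subspace)  # [0, 6, 11, 12] for wine with k=4, per the tests below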
@@ -632,7 +638,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.splitter_ = Splitter(
             clf=self._build_clf(),
             criterion=self.criterion,
-            splitter_type=self.splitter,
+            feature_select=self.splitter,
             criteria=self.split_criteria,
             random_state=self.random_state,
             min_samples_split=self.min_samples_split,
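The rename is internal only: Stree still exposes the hyperparameter as splitter and forwards it to Splitter's feature_select argument. A brief usage sketch, with parameter values taken from the tests below:

from sklearn.datasets import load_iris
from stree import Stree

X, y = load_iris(return_X_y=True)
# "best" maps to feature_select="best" inside the Splitter
clf = Stree(splitter="best", random_state=0)
clf.fit(X, y)
print(clf.score(X, y))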
@@ -6,6 +6,7 @@ import numpy as np
 from sklearn.svm import SVC
 from sklearn.datasets import load_wine, load_iris
 from stree import Splitter
+from .utils import load_dataset
 
 
 class Splitter_test(unittest.TestCase):
@@ -17,7 +18,7 @@ class Splitter_test(unittest.TestCase):
     def build(
         clf=SVC,
         min_samples_split=0,
-        splitter_type="random",
+        feature_select="random",
         criterion="gini",
         criteria="max_samples",
         random_state=None,
@@ -25,7 +26,7 @@ class Splitter_test(unittest.TestCase):
         return Splitter(
             clf=clf(random_state=random_state, kernel="rbf"),
             min_samples_split=min_samples_split,
-            splitter_type=splitter_type,
+            feature_select=feature_select,
             criterion=criterion,
             criteria=criteria,
             random_state=random_state,
@@ -39,20 +40,20 @@ class Splitter_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             self.build(criterion="duck")
         with self.assertRaises(ValueError):
-            self.build(splitter_type="duck")
+            self.build(feature_select="duck")
         with self.assertRaises(ValueError):
             self.build(criteria="duck")
         with self.assertRaises(ValueError):
             _ = Splitter(clf=None)
-        for splitter_type in ["best", "random"]:
+        for feature_select in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
                 for criteria in ["max_samples", "impurity"]:
                     tcl = self.build(
-                        splitter_type=splitter_type,
+                        feature_select=feature_select,
                         criterion=criterion,
                         criteria=criteria,
                     )
-                    self.assertEqual(splitter_type, tcl._splitter_type)
+                    self.assertEqual(feature_select, tcl._feature_select)
                     self.assertEqual(criterion, tcl._criterion)
                     self.assertEqual(criteria, tcl._criteria)
 
@@ -177,32 +178,34 @@ class Splitter_test(unittest.TestCase):
     def test_best_splitter_few_sets(self):
         X, y = load_iris(return_X_y=True)
         X = np.delete(X, 3, 1)
-        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        tcl = self.build(
+            feature_select="best", random_state=self._random_state
+        )
         dataset, computed = tcl.get_subspace(X, y, max_features=2)
         self.assertListEqual([0, 2], list(computed))
         self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
 
     def test_splitter_parameter(self):
         expected_values = [
-            [1, 4, 9, 12],  # best   entropy max_samples
-            [1, 3, 6, 10],  # best   entropy impurity
-            [6, 8, 10, 12],  # best   gini    max_samples
-            [7, 8, 10, 11],  # best   gini    impurity
+            [0, 6, 11, 12],  # best   entropy max_samples
+            [0, 6, 11, 12],  # best   entropy impurity
+            [0, 6, 11, 12],  # best   gini    max_samples
+            [0, 6, 11, 12],  # best   gini    impurity
             [0, 3, 8, 12],  # random entropy max_samples
-            [0, 3, 9, 11],  # random entropy impurity
-            [0, 4, 7, 12],  # random gini    max_samples
-            [0, 2, 5, 6],  # random gini    impurity
+            [0, 3, 7, 12],  # random entropy impurity
+            [1, 7, 9, 12],  # random gini    max_samples
+            [1, 5, 8, 12],  # random gini    impurity
         ]
         X, y = load_wine(return_X_y=True)
         rn = 0
-        for splitter_type in ["best", "random"]:
+        for feature_select in ["best", "random"]:
             for criterion in ["entropy", "gini"]:
                 for criteria in [
                     "max_samples",
                     "impurity",
                 ]:
                     tcl = self.build(
-                        splitter_type=splitter_type,
+                        feature_select=feature_select,
                         criterion=criterion,
                         criteria=criteria,
                     )
@@ -213,7 +216,7 @@ class Splitter_test(unittest.TestCase):
                     # print(
                     #     "{}, # {:7s}{:8s}{:15s}".format(
                     #         list(computed),
-                    #         splitter_type,
+                    #         feature_select,
                     #         criterion,
                     #         criteria,
                     #     )
@@ -222,3 +225,18 @@ class Splitter_test(unittest.TestCase):
                     self.assertListEqual(
                         X[:, computed].tolist(), dataset.tolist()
                     )
+
+    def test_get_best_subspaces(self):
+        results = [
+            (4, [3, 4, 11, 13]),
+            (7, [1, 3, 4, 5, 11, 13, 16]),
+            (9, [1, 3, 4, 5, 7, 10, 11, 13, 16]),
+        ]
+        X, y = load_dataset(n_features=20)
+        for k, expected in results:
+            tcl = self.build(
+                feature_select="best",
+            )
+            Xs, computed = tcl.get_subspace(X, y, k)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
@@ -330,7 +330,7 @@ class Stree_test(unittest.TestCase):
         X, y = load_dataset(self._random_state)
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
-        self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
+        self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
 
     def test_bogus_splitter_parameter(self):
         clf = Stree(splitter="duck")