mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
32 Commits
package_do
...
v1.3.0
Author | SHA1 | Date | |
---|---|---|---|
|
c37f044e3a | ||
|
2f6ae648a1 | ||
|
93be8a89a8 | ||
82838fa3e0
|
|||
f0b2ce3c7b
|
|||
00ed57c015
|
|||
|
08222f109e | ||
cc931d8547
|
|||
b044a057df
|
|||
fc48bc8ba4
|
|||
|
8251f07674 | ||
|
0b15a5af11 | ||
|
28d905368b | ||
e5d49132ec
|
|||
8daecc4726
|
|||
|
bf678df159 | ||
|
36b08b1bcf | ||
36ff3da26d
|
|||
|
6b281ebcc8 | ||
|
3aaddd096f | ||
|
15a5a4c407 | ||
|
0afe14a447 | ||
|
fc9b7b5c92 | ||
|
3f79d2877f | ||
ecc2800705
|
|||
0524d47d64
|
|||
d46f544466
|
|||
79190ef2e1
|
|||
|
4f04e72670 | ||
5cef0f4875
|
|||
28c7558f01
|
|||
|
e19d10f6a7 |
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -12,8 +12,8 @@ jobs:
|
|||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
os: [macos-latest, ubuntu-latest]
|
os: [macos-latest, ubuntu-latest, windows-latest]
|
||||||
python: [3.8]
|
python: [3.8, "3.10"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
37
CITATION.cff
Normal file
37
CITATION.cff
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
cff-version: 1.2.0
|
||||||
|
message: "If you use this software, please cite it as below."
|
||||||
|
authors:
|
||||||
|
- family-names: "Montañana"
|
||||||
|
given-names: "Ricardo"
|
||||||
|
orcid: "https://orcid.org/0000-0003-3242-5452"
|
||||||
|
- family-names: "Gámez"
|
||||||
|
given-names: "José A."
|
||||||
|
orcid: "https://orcid.org/0000-0003-1188-1117"
|
||||||
|
- family-names: "Puerta"
|
||||||
|
given-names: "José M."
|
||||||
|
orcid: "https://orcid.org/0000-0002-9164-5191"
|
||||||
|
title: "STree"
|
||||||
|
version: 1.2.3
|
||||||
|
doi: 10.5281/zenodo.5504083
|
||||||
|
date-released: 2021-11-02
|
||||||
|
url: "https://github.com/Doctorado-ML/STree"
|
||||||
|
preferred-citation:
|
||||||
|
type: article
|
||||||
|
authors:
|
||||||
|
- family-names: "Montañana"
|
||||||
|
given-names: "Ricardo"
|
||||||
|
orcid: "https://orcid.org/0000-0003-3242-5452"
|
||||||
|
- family-names: "Gámez"
|
||||||
|
given-names: "José A."
|
||||||
|
orcid: "https://orcid.org/0000-0003-1188-1117"
|
||||||
|
- family-names: "Puerta"
|
||||||
|
given-names: "José M."
|
||||||
|
orcid: "https://orcid.org/0000-0002-9164-5191"
|
||||||
|
doi: "10.1007/978-3-030-85713-4_6"
|
||||||
|
journal: "Lecture Notes in Computer Science"
|
||||||
|
month: 9
|
||||||
|
start: 54
|
||||||
|
end: 64
|
||||||
|
title: "STree: A Single Multi-class Oblique Decision Tree Based on Support Vector Machines"
|
||||||
|
volume: 12882
|
||||||
|
year: 2021
|
19
Makefile
19
Makefile
@@ -1,6 +1,6 @@
|
|||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
.PHONY: coverage deps help lint push test
|
.PHONY: coverage deps help lint push test doc build
|
||||||
|
|
||||||
coverage: ## Run tests with coverage
|
coverage: ## Run tests with coverage
|
||||||
coverage erase
|
coverage erase
|
||||||
@@ -10,6 +10,9 @@ coverage: ## Run tests with coverage
|
|||||||
deps: ## Install dependencies
|
deps: ## Install dependencies
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
devdeps: ## Install development dependencies
|
||||||
|
pip install black pip-audit flake8 mypy coverage
|
||||||
|
|
||||||
lint: ## Lint and static-check
|
lint: ## Lint and static-check
|
||||||
black stree
|
black stree
|
||||||
flake8 stree
|
flake8 stree
|
||||||
@@ -21,6 +24,20 @@ push: ## Push code with tags
|
|||||||
test: ## Run tests
|
test: ## Run tests
|
||||||
python -m unittest -v stree.tests
|
python -m unittest -v stree.tests
|
||||||
|
|
||||||
|
doc: ## Update documentation
|
||||||
|
make -C docs --makefile=Makefile html
|
||||||
|
|
||||||
|
build: ## Build package
|
||||||
|
rm -fr dist/*
|
||||||
|
rm -fr build/*
|
||||||
|
python setup.py sdist bdist_wheel
|
||||||
|
|
||||||
|
doc-clean: ## Clean generated documentation
|
||||||
|
make -C docs --makefile=Makefile clean
|
||||||
|
|
||||||
|
audit: ## Audit pip
|
||||||
|
pip-audit
|
||||||
|
|
||||||
help: ## Show help message
|
help: ## Show help message
|
||||||
@IFS=$$'\n' ; \
|
@IFS=$$'\n' ; \
|
||||||
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||||
|
26
README.md
26
README.md
@@ -1,8 +1,12 @@
|
|||||||

|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
[](https://badge.fury.io/py/STree)
|
||||||
|

|
||||||
|
[](https://zenodo.org/badge/latestdoi/262658230)
|
||||||
|
|
||||||
# Stree
|
# STree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
@@ -16,14 +20,12 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
||||||
Can be found in
|
Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
@@ -35,21 +37,23 @@ Can be found in
|
|||||||
## Hyperparameters
|
## Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | -------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features).
|
||||||
|
Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **“trandom”**: The algorithm generates only one random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
@@ -70,3 +74,7 @@ python -m unittest -v stree.tests
|
|||||||
## License
|
## License
|
||||||
|
|
||||||
STree is [MIT](https://github.com/doctorado-ml/stree/blob/master/LICENSE) licensed
|
STree is [MIT](https://github.com/doctorado-ml/stree/blob/master/LICENSE) licensed
|
||||||
|
|
||||||
|
## Reference
|
||||||
|
|
||||||
|
R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class oblique decision tree based on support vector machines.", 2021 LNAI 12882, pg. 54-64
|
||||||
|
@@ -1,3 +1,4 @@
|
|||||||
sphinx
|
sphinx
|
||||||
sphinx-rtd-theme
|
sphinx-rtd-theme
|
||||||
myst-parser
|
myst-parser
|
||||||
|
mufs
|
@@ -1,7 +1,7 @@
|
|||||||
Siterator
|
Siterator
|
||||||
=========
|
=========
|
||||||
|
|
||||||
.. automodule:: stree
|
.. automodule:: Splitter
|
||||||
.. autoclass:: Siterator
|
.. autoclass:: Siterator
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
Snode
|
Snode
|
||||||
=====
|
=====
|
||||||
|
|
||||||
.. automodule:: stree
|
.. automodule:: Splitter
|
||||||
.. autoclass:: Snode
|
.. autoclass:: Snode
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
Splitter
|
Splitter
|
||||||
========
|
========
|
||||||
|
|
||||||
.. automodule:: stree
|
.. automodule:: Splitter
|
||||||
.. autoclass:: Splitter
|
.. autoclass:: Splitter
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -6,6 +6,6 @@ API index
|
|||||||
:caption: Contents:
|
:caption: Contents:
|
||||||
|
|
||||||
Stree
|
Stree
|
||||||
Splitter
|
|
||||||
Snode
|
|
||||||
Siterator
|
Siterator
|
||||||
|
Snode
|
||||||
|
Splitter
|
||||||
|
@@ -12,18 +12,19 @@
|
|||||||
#
|
#
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
from stree._version import __version__
|
||||||
|
|
||||||
sys.path.insert(0, os.path.abspath("../../stree/"))
|
sys.path.insert(0, os.path.abspath("../../stree/"))
|
||||||
|
|
||||||
|
|
||||||
# -- Project information -----------------------------------------------------
|
# -- Project information -----------------------------------------------------
|
||||||
|
|
||||||
project = "STree"
|
project = "STree"
|
||||||
copyright = "2020 - 2021, Ricardo Montañana Gómez"
|
copyright = "2020 - 2022, Ricardo Montañana Gómez"
|
||||||
author = "Ricardo Montañana Gómez"
|
author = "Ricardo Montañana Gómez"
|
||||||
|
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = "1.0"
|
version = __version__
|
||||||
|
release = version
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
@@ -52,4 +53,4 @@ html_theme = "sphinx_rtd_theme"
|
|||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
html_static_path = ["_static"]
|
html_static_path = []
|
||||||
|
@@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
## Notebooks
|
## Notebooks
|
||||||
|
|
||||||
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
|
@@ -1,21 +1,22 @@
|
|||||||
# Hyperparameters
|
# Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | -------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’.<br>liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*.<br>max_samples is incompatible with 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).<br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).<br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features).<br>Supported strategies are:<br>**“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features.<br>**“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them.<br>**“trandom”**: The algorithm generates only one random combination.<br>**"mutual"**: Chooses the best features w.r.t. their mutual info with the label.<br>**"cfs"**: Apply Correlation-based Feature Selection.<br>**"fcbf"**: Apply Fast Correlation-Based Filter.<br>**"iwss"**: IWSS based algorithm |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets:<br>**"ovo"**: one versus one.<br>**"ovr"**: one versus rest |
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,8 +1,12 @@
|
|||||||
# Stree
|
# STree
|
||||||
|
|
||||||
[](https://app.codeship.com/projects/399170)
|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
[](https://badge.fury.io/py/STree)
|
||||||
|

|
||||||
|
[](https://zenodo.org/badge/latestdoi/262658230)
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
|
@@ -178,7 +178,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Stree\n",
|
"# Stree\n",
|
||||||
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
|
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3, kernel=\"liblinear\", multiclass_strategy=\"ovr\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@@ -1 +1,2 @@
|
|||||||
scikit-learn>0.24
|
scikit-learn>0.24
|
||||||
|
mufs
|
39
setup.py
39
setup.py
@@ -1,5 +1,5 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
import stree
|
import os
|
||||||
|
|
||||||
|
|
||||||
def readme():
|
def readme():
|
||||||
@@ -7,29 +7,50 @@ def readme():
|
|||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
VERSION = stree.__version__
|
def get_data(field, file_name="__init__.py"):
|
||||||
|
item = ""
|
||||||
|
with open(os.path.join("stree", file_name)) as f:
|
||||||
|
for line in f.readlines():
|
||||||
|
if line.startswith(f"__{field}__"):
|
||||||
|
delim = '"' if '"' in line else "'"
|
||||||
|
item = line.split(delim)[1]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unable to find {field} string.")
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
def get_requirements():
|
||||||
|
with open("requirements.txt") as f:
|
||||||
|
return f.read().splitlines()
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="STree",
|
name="STree",
|
||||||
version=stree.__version__,
|
version=get_data("version", "_version.py"),
|
||||||
license=stree.__license__,
|
license=get_data("license"),
|
||||||
description="Oblique decision tree with svm nodes",
|
description="Oblique decision tree with svm nodes",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
url=stree.__url__,
|
url="https://github.com/Doctorado-ML/STree#stree",
|
||||||
author=stree.__author__,
|
project_urls={
|
||||||
author_email=stree.__author_email__,
|
"Code": "https://github.com/Doctorado-ML/STree",
|
||||||
|
"Documentation": "https://stree.readthedocs.io/en/latest/index.html",
|
||||||
|
},
|
||||||
|
author=get_data("author"),
|
||||||
|
author_email=get_data("author_email"),
|
||||||
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
||||||
tree svm svc",
|
tree svm svc",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Development Status :: 5 - Production/Stable",
|
"Development Status :: 5 - Production/Stable",
|
||||||
"License :: OSI Approved :: " + stree.__license__,
|
"License :: OSI Approved :: " + get_data("license"),
|
||||||
"Programming Language :: Python :: 3.8",
|
"Programming Language :: Python :: 3.8",
|
||||||
"Natural Language :: English",
|
"Natural Language :: English",
|
||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
],
|
],
|
||||||
install_requires=["scikit-learn", "numpy", "ipympl"],
|
install_requires=get_requirements(),
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
10
stree/.readthedocs.yaml
Normal file
10
stree/.readthedocs.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/source/conf.py
|
||||||
|
|
||||||
|
python:
|
||||||
|
version: 3.8
|
||||||
|
install:
|
||||||
|
- requirements: requirements.txt
|
||||||
|
- requirements: docs/requirements.txt
|
809
stree/Splitter.py
Normal file
809
stree/Splitter.py
Normal file
@@ -0,0 +1,809 @@
|
|||||||
|
"""
|
||||||
|
Oblique decision tree classifier based on SVM nodes
|
||||||
|
Splitter class
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
import random
|
||||||
|
from math import log, factorial
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from mufs import MUFS
|
||||||
|
|
||||||
|
|
||||||
|
class Snode:
    """
    Node of the tree. It keeps the svm classifier fitted for the node, the
    labels that reached it and, only while testing, the samples and sample
    weights assigned to it

    Parameters
    ----------
    clf : SVC
        Classifier used
    X : np.ndarray
        input dataset in train time (only stored when the TESTING
        environment variable holds a value other than "NS")
    y : np.ndarray
        input labels in train time
    features : np.array
        features used to compute hyperplane
    impurity : float
        impurity of the node
    title : str
        label describing the route to the node
    weight : np.ndarray, optional
        weights applied to input dataset in train time (stored under the
        same condition as X), by default None
    scaler : StandardScaler, optional
        scaler used if any, by default None
    """

    def __init__(
        self,
        clf: "SVC",
        X: np.ndarray,
        y: np.ndarray,
        features: np.array,
        impurity: float,
        title: str,
        weight: np.ndarray = None,
        scaler: "StandardScaler" = None,
    ):
        # Samples and weights are bulky, keep them only when testing
        testing = os.environ.get("TESTING", "NS") != "NS"
        self._X = X if testing else None
        self._sample_weight = weight if testing else None
        self._y = y
        # Model of the node
        self._clf = clf
        self._scaler = scaler
        self._features = features
        self._feature = None
        self._partition_column: int = -1
        # Descriptive data
        self._title = title
        self._impurity = impurity
        # Links to the children, both None while the node is a leaf
        self._up = None
        self._down = None
        # Prediction data, filled in by make_predictor
        self._class = None
        self._belief = 0.0
        self._proba = None

    @classmethod
    def copy(cls, node: "Snode") -> "Snode":
        """Build a new node from the constructor arguments kept in node.
        Children, partition column and prediction data are not copied
        """
        arguments = (
            node._clf,
            node._X,
            node._y,
            node._features,
            node._impurity,
            node._title,
            node._sample_weight,
            node._scaler,
        )
        return cls(*arguments)

    # ---- setters ----

    def set_partition_column(self, col: int):
        self._partition_column = col

    def set_up(self, son):
        self._up = son

    def set_down(self, son):
        self._down = son

    def set_title(self, title):
        self._title = title

    def set_classifier(self, clf):
        self._clf = clf

    def set_features(self, features):
        self._features = features

    def set_impurity(self, impurity):
        self._impurity = impurity

    # ---- getters ----

    def get_partition_column(self) -> int:
        return self._partition_column

    def get_up(self) -> "Snode":
        return self._up

    def get_down(self) -> "Snode":
        return self._down

    def get_title(self) -> str:
        return self._title

    def get_classifier(self) -> "SVC":
        return self._clf

    def get_features(self) -> np.array:
        return self._features

    def get_impurity(self) -> float:
        return self._impurity

    def is_leaf(self) -> bool:
        """A node is a leaf when it has neither up nor down child"""
        return self._up is None and self._down is None

    def make_predictor(self, num_classes: int) -> None:
        """Compute the class of the predictor, its belief and the number of
        samples of every class, based on the labels of the node. Nothing is
        done if the node is not a leaf

        Parameters
        ----------
        num_classes : int
            length of the vector of counts per class; labels are used as
            indices of that vector
        """
        if not self.is_leaf():
            return
        labels, counts = np.unique(self._y, return_counts=True)
        self._proba = np.zeros((num_classes,), dtype=np.int64)
        for label, count in zip(labels, counts):
            self._proba[label] = count
        if counts.size == 0:
            # no labels reached the node, there is no class to predict
            self._class = None
            return
        # argmax keeps the first label in case of a tie
        winner = np.argmax(counts)
        self._class = labels[winner]
        self._belief = counts[winner] / np.sum(counts)

    def graph(self):
        """
        Return a string representing the node in graphviz format
        """
        ident = f"N{id(self)}"
        if self.is_leaf():
            return (
                f'{ident} [shape=box style=filled label="'
                f"class={self._class} impurity={self._impurity:.3f} "
                f'counts={self._proba}"];\n'
            )
        classes, samples = np.unique(self._y, return_counts=True)
        pieces = [
            f'{ident} [label="#features={len(self._features)} '
            f"classes={classes} samples={samples} "
            f'({sum(samples)})" fontcolor=black];\n',
            # edges to both children
            f"{ident} -> N{id(self.get_up())} [color=black];\n",
            f"{ident} -> N{id(self.get_down())} [color=black];\n",
        ]
        return "".join(pieces)

    def __str__(self) -> str:
        count_values = np.unique(self._y, return_counts=True)
        tail = f"impurity={self._impurity:.4f} counts={count_values}"
        if self.is_leaf():
            head = (
                f"{self._title} - Leaf class={self._class} "
                f"belief={self._belief: .6f} "
            )
        else:
            head = f"{self._title} feaures={self._features} "
        return head + tail
|
||||||
|
|
||||||
|
|
||||||
|
class Siterator:
    """Stree preorder iterator

    Nodes are kept in a stack: every node returned is replaced in the stack
    by its children (up first, then down), so the down branch of a node is
    visited before its up branch
    """

    def __init__(self, tree: "Snode"):
        self._stack = []
        self._push(tree)

    def __iter__(self):
        # To complete the iterator interface
        return self

    def _push(self, node: "Snode"):
        # children that don't exist (None) are never stacked
        if node is None:
            return
        self._stack.append(node)

    def __next__(self) -> "Snode":
        try:
            current = self._stack.pop()
        except IndexError:
            # the stack is exhausted, every node has been visited
            raise StopIteration() from None
        for child in (current.get_up(), current.get_down()):
            self._push(child)
        return current
|
||||||
|
|
||||||
|
|
||||||
|
class Splitter:
    """
    Splits a dataset in two based on different criteria

    Parameters
    ----------
    clf : SVC, optional
        classifier, it is mandatory even though the default is None
    criterion : str, optional
        The function to measure the quality of a split (only used if
        max_features != num_features). Supported criteria are "gini" for the
        Gini impurity and "entropy" for the information gain, by default
        None
    feature_select : str, optional
        The strategy used to choose the feature set at each node (only used
        if max_features < num_features). Supported strategies are: "best":
        sklearn SelectKBest algorithm is used in every node to choose the
        max_features best features. "random": The algorithm generates 5
        candidates and choose the best (max. info. gain) of them. "trandom":
        The algorithm generates only one random combination. "mutual":
        Chooses the best features w.r.t. their mutual info with the label.
        "cfs": Apply Correlation-based Feature Selection. "fcbf": Apply Fast
        Correlation-Based Filter. "iwss": Apply the iwss algorithm of mufs,
        by default None
    criteria : str, optional
        Decides (just in case of a multi class classification) which column
        (class) use to split the dataset in a node, "max_samples" or
        "impurity". max_samples is incompatible with 'ovo'
        multiclass_strategy, by default None
    min_samples_split : int, optional
        The minimum number of samples required to split an internal node.
        None and 0 both mean that any number of samples is enough, by
        default None
    random_state : optional
        If not None it is used to seed the random module, that generates
        the random feature sets, and it is passed on to mutual_info_classif,
        by default None
    normalize : bool, optional
        If standardization of features should be applied on each node with
        the samples that reach it, by default False

    Raises
    ------
    ValueError
        clf has to be a sklearn estimator
    ValueError
        criterion must be gini or entropy
    ValueError
        criteria has to be max_samples or impurity
    ValueError
        splitter must be in {random, trandom, best, mutual, cfs, fcbf, iwss}
    """

    def __init__(
        self,
        clf: "SVC" = None,
        criterion: str = None,
        feature_select: str = None,
        criteria: str = None,
        min_samples_split: int = None,
        random_state=None,
        normalize=False,
    ):

        self._clf = clf
        self._random_state = random_state
        if random_state is not None:
            random.seed(random_state)
        self._criterion = criterion
        self._min_samples_split = min_samples_split
        self._criteria = criteria
        self._feature_select = feature_select
        self._normalize = normalize

        if clf is None:
            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")

        if criterion not in ["gini", "entropy"]:
            raise ValueError(
                f"criterion must be gini or entropy got({criterion})"
            )

        if criteria not in [
            "max_samples",
            "impurity",
        ]:
            raise ValueError(
                f"criteria has to be max_samples or impurity; got ({criteria})"
            )

        if feature_select not in [
            "random",
            "trandom",
            "best",
            "mutual",
            "cfs",
            "fcbf",
            "iwss",
        ]:
            raise ValueError(
                "splitter must be in {random, trandom, best, mutual, cfs, "
                "fcbf, iwss} "
                f"got ({feature_select})"
            )
        # Every hyperparameter is mapped to the method that implements it
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")
        self.fs_function = getattr(self, f"_fs_{self._feature_select}")

    def _fs_random(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the best of five random feature set combinations

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        # Random feature reduction
        n_features = dataset.shape[1]
        features_sets = self._generate_spaces(n_features, max_features)
        return self._select_best_set(dataset, labels, features_sets)

    @staticmethod
    def _fs_trandom(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a random feature set combination

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset (not used)
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            sorted indices of the features selected
        """
        # Random feature reduction
        n_features = dataset.shape[1]
        return tuple(sorted(random.sample(range(n_features), max_features)))

    @staticmethod
    def _fs_best(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the variables selected by SelectKBest with its default
        score function

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected. NOTE: what is actually
            returned is the array of indices given by get_support
        """
        return (
            SelectKBest(k=max_features)
            .fit(dataset, labels)
            .get_support(indices=True)
        )

    def _fs_mutual(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the best features with mutual information with labels

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected, in increasing order of mutual
            information with the labels
        """
        # return best features with mutual info with the label
        feature_list = mutual_info_classif(
            dataset, labels, random_state=self._random_state
        )
        return tuple(
            sorted(
                range(len(feature_list)), key=lambda sub: feature_list[sub]
            )[-max_features:]
        )

    @staticmethod
    def _fs_cfs(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.cfs(dataset, labels).get_results()

    @staticmethod
    def _fs_fcbf(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Fast Correlation-based Filter algorithm with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        # NOTE(review): 5e-4 is presumably the threshold of the filter,
        # confirm against the mufs documentation
        return mufs.fcbf(dataset, labels, 5e-4).get_results()

    @staticmethod
    def _fs_iwss(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection based on iwss with
        max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        # NOTE(review): 0.25 is presumably the threshold of the algorithm,
        # confirm against the mufs documentation
        return mufs.iwss(dataset, labels, 0.25).get_results()

    def partition_impurity(self, y: np.array) -> np.array:
        """Impurity of a set of labels computed with the criterion function
        chosen in the constructor (gini or entropy)
        """
        return self.criterion_function(y)

    @staticmethod
    def _gini(y: np.array) -> float:
        """Compute the Gini impurity of a labels set

        Parameters
        ----------
        y : np.array
            set of labels

        Returns
        -------
        float
            1 - sum of the squared proportions of every label
        """
        _, count = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(count / np.sum(count)))

    @staticmethod
    def _entropy(y: np.array) -> float:
        """Compute entropy of a labels set

        Parameters
        ----------
        y : np.array
            set of labels, they have to be non negative integers as they are
            counted with np.bincount

        Returns
        -------
        float
            entropy, computed with logarithms in base number of classes
            present in y; 0 if there are less than two samples or classes
        """
        n_labels = len(y)
        if n_labels <= 1:
            return 0
        counts = np.bincount(y)
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            return 0
        entropy = 0.0
        # Compute standard entropy.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log(prop, n_classes)
        return entropy

    def information_gain(
        self, labels: np.array, labels_up: np.array, labels_dn: np.array
    ) -> float:
        """Compute information gain of a split candidate

        Parameters
        ----------
        labels : np.array
            labels of the dataset
        labels_up : np.array
            labels of one side, None if that side is empty
        labels_dn : np.array
            labels on the other side, None if that side is empty

        Returns
        -------
        float
            information gain: impurity of labels minus the impurities of
            both sides weighted by their share of samples. 0.0 if both sides
            are None
        """
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
            card_up = labels_up.shape[0]
            imp_up = self.criterion_function(labels_up)
        if labels_dn is not None:
            card_dn = labels_dn.shape[0]
            imp_dn = self.criterion_function(labels_dn)
        samples = card_up + card_dn
        if samples == 0:
            return 0.0
        return (
            imp_prev
            - (card_up / samples) * imp_up
            - (card_dn / samples) * imp_dn
        )

    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
        """Return the best set of features among feature_sets, the criterion
        is the information gain

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            array of labels
        features_sets : list
            list of features sets to check

        Returns
        -------
        list
            best feature set, the last one checked if none of them produces
            information gain

        Raises
        ------
        ValueError
            features_sets is empty
        """
        if not features_sets:
            raise ValueError("features_sets has to contain at least one set")
        max_gain = 0
        selected = None
        # Convergence warnings are silenced only while the candidates are
        # evaluated, the warning filters of the caller are restored on exit
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            for feature_set in features_sets:
                # the classifier of the splitter is refitted for every
                # candidate with the columns of the candidate
                self._clf.fit(dataset[:, feature_set], labels)
                node = Snode(
                    self._clf, dataset, labels, feature_set, 0.0, "subset"
                )
                self.partition(dataset, node, train=True)
                y1, y2 = self.part(labels)
                gain = self.information_gain(labels, y1, y2)
                if gain > max_gain:
                    max_gain = gain
                    selected = feature_set
        return selected if selected is not None else feature_set

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
        """Generate at most 5 feature random combinations

        Parameters
        ----------
        features : int
            number of features in dataset
        max_features : int
            number of features in each combination

        Returns
        -------
        list
            list with up to 5 combination of features randomly selected,
            every combination is a sorted tuple of indices
        """
        comb = set()
        # Number of different combinations available. Integer division is
        # exact here and, unlike true division, can't overflow a float when
        # the dataset has a large number of features
        number = factorial(features) // (
            factorial(max_features) * factorial(features - max_features)
        )
        # Generate at most 5 combinations
        set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
            )
        return list(comb)

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Compute the indices of the features selected by splitter depending
        on the self._feature_select hyper parameter

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (<= number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        # No feature reduction
        n_features = dataset.shape[1]
        if n_features == max_features:
            return tuple(range(n_features))
        # select features as selected in constructor
        return self.fs_function(dataset, labels, max_features)

    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a subspace of the selected dataset of max_features length.
        Depending on hyperparameter

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            labels of the dataset
        max_features : int
            number of features to form the subspace

        Returns
        -------
        tuple
            tuple with the dataset with only the features selected and the
            indices of the features selected
        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices

    def _impurity(self, data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            index of the column that produces the highest information gain
            splitting the labels by the sign of the distances, -1 if no
            column produces information gain
        """
        max_gain = 0
        selected = -1
        for col in range(data.shape[1]):
            tup = y[data[:, col] > 0]
            tdn = y[data[:, col] <= 0]
            info_gain = self.information_gain(y, tup, tdn)
            if info_gain > max_gain:
                selected = col
                max_gain = info_gain
        return selected

    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class (not used)
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            position, among the sorted distinct labels of y, of the label
            with the highest number of samples
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        return np.argmax(samples)

    def partition(self, samples: np.array, node: "Snode", train: bool):
        """Set the criteria to split arrays. Compute the indices of the
        samples that should go to one side of the tree (up) and store them
        as a boolean mask in self._up

        Parameters
        ----------
        samples : np.array
            array of samples (# samples, # features)
        node : Snode
            Node of the tree where partition is going to be made
        train : bool
            Train time - True / Test time - False
        """
        # data contains the distances of every sample to every class hyperplane
        # array of (m, nc) nc = # classes
        data = self._distances(node, samples)
        # min_samples_split is None by default, that means no minimum
        min_samples = self._min_samples_split or 0
        if data.shape[0] < min_samples:
            # there aren't enough samples to split
            self._up = np.ones((data.shape[0]), dtype=bool)
            return
        if data.ndim > 1:
            # split criteria for multiclass
            # Convert data to a (m, 1) array selecting values for samples
            if train:
                # in train time we have to compute the column to take into
                # account to split the dataset
                col = self.decision_criteria(data, node._y)
                node.set_partition_column(col)
            else:
                # in predict time just use the column computed in train time
                # is taking the classifier of class <col>
                col = node.get_partition_column()
            if col == -1:
                # No partition is producing information gain
                data = np.ones(data.shape)
            data = data[:, col]
        self._up = data > 0

    def part(self, origin: np.array) -> list:
        """Split an array in two based on indices (self._up) and its
        complement partition has to be called first to establish up indices

        Parameters
        ----------
        origin : np.array
            dataset to split

        Returns
        -------
        list
            list with two splits of the array, a split without any sample
            is returned as None
        """
        down = ~self._up
        return [
            origin[self._up] if self._up.any() else None,
            origin[down] if down.any() else None,
        ]

    def _distances(self, node: "Snode", data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

        Parameters
        ----------
        node : Snode
            node containing the svm classifier
        data : np.ndarray
            samples to compute distance to hyperplane

        Returns
        -------
        np.array
            array of shape (m, nc) with the distances of every sample to
            the hyperplane of every class. nc = # of classes
        """
        X_transformed = data[:, node._features]
        # Nodes built to evaluate candidate feature sets don't have a scaler
        # (their classifier is fitted with raw samples), so there is
        # nothing to transform in that case
        if self._normalize and node._scaler is not None:
            X_transformed = node._scaler.transform(X_transformed)
        return node._clf.decision_function(X_transformed)
|
831
stree/Strees.py
831
stree/Strees.py
@@ -2,548 +2,137 @@
|
|||||||
Oblique decision tree classifier based on SVM nodes
|
Oblique decision tree classifier based on SVM nodes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
|
||||||
import numbers
|
import numbers
|
||||||
import random
|
import random
|
||||||
import warnings
|
|
||||||
from math import log, factorial
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import SVC, LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
from sklearn.feature_selection import SelectKBest
|
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
|
||||||
from sklearn.utils.validation import (
|
from sklearn.utils.validation import (
|
||||||
check_X_y,
|
check_X_y,
|
||||||
check_array,
|
check_array,
|
||||||
check_is_fitted,
|
check_is_fitted,
|
||||||
_check_sample_weight,
|
_check_sample_weight,
|
||||||
)
|
)
|
||||||
|
from .Splitter import Splitter, Snode, Siterator
|
||||||
|
from ._version import __version__
|
||||||
class Snode:
    """Node of the tree. It keeps the svm classifier fitted for the node,
    the labels that reached it and, only while testing, the samples and
    sample weights assigned to it
    """

    def __init__(
        self,
        clf: "SVC",
        X: np.ndarray,
        y: np.ndarray,
        features: np.array,
        impurity: float,
        title: str,
        weight: np.ndarray = None,
        scaler: "StandardScaler" = None,
    ):
        # Samples and weights are stored only when the TESTING environment
        # variable holds a value other than "NS"
        testing = os.environ.get("TESTING", "NS") != "NS"
        self._X = X if testing else None
        self._sample_weight = weight if testing else None
        self._y = y
        # Model of the node
        self._clf = clf
        self._scaler = scaler
        self._features = features
        self._feature = None
        self._partition_column: int = -1
        # Descriptive data
        self._title = title
        self._impurity = impurity
        # Links to the children, both None while the node is a leaf
        self._up = None
        self._down = None
        # Prediction data, filled in by make_predictor
        self._class = None
        self._belief = 0.0

    @classmethod
    def copy(cls, node: "Snode") -> "Snode":
        """Build a new node from the constructor arguments kept in node.
        Children, partition column and prediction data are not copied
        """
        arguments = (
            node._clf,
            node._X,
            node._y,
            node._features,
            node._impurity,
            node._title,
            node._sample_weight,
            node._scaler,
        )
        return cls(*arguments)

    # ---- setters ----

    def set_partition_column(self, col: int):
        self._partition_column = col

    def set_up(self, son):
        self._up = son

    def set_down(self, son):
        self._down = son

    def set_title(self, title):
        self._title = title

    def set_classifier(self, clf):
        self._clf = clf

    def set_features(self, features):
        self._features = features

    def set_impurity(self, impurity):
        self._impurity = impurity

    # ---- getters ----

    def get_partition_column(self) -> int:
        return self._partition_column

    def get_up(self) -> "Snode":
        return self._up

    def get_down(self) -> "Snode":
        return self._down

    def get_title(self) -> str:
        return self._title

    def get_classifier(self) -> "SVC":
        return self._clf

    def get_features(self) -> np.array:
        return self._features

    def get_impurity(self) -> float:
        return self._impurity

    def is_leaf(self) -> bool:
        """A node is a leaf when it has neither up nor down child"""
        return self._up is None and self._down is None

    def make_predictor(self):
        """Compute the class of the predictor and its belief based on the
        subdataset of the node only if it is a leaf
        """
        if not self.is_leaf():
            return
        labels, counts = np.unique(self._y, return_counts=True)
        if len(labels) > 1:
            # most frequent label, argmax keeps the first one in a tie
            winner = np.argmax(counts)
            self._class = labels[winner]
            self._belief = counts[winner] / np.sum(counts)
            return
        # one label at most: full belief, no class if there are no labels
        self._belief = 1
        self._class = labels[0] if len(labels) > 0 else None

    def __str__(self) -> str:
        count_values = np.unique(self._y, return_counts=True)
        tail = f"impurity={self._impurity:.4f} counts={count_values}"
        if self.is_leaf():
            head = (
                f"{self._title} - Leaf class={self._class} "
                f"belief={self._belief: .6f} "
            )
        else:
            head = f"{self._title} feaures={self._features} "
        return head + tail
|
|
||||||
|
|
||||||
|
|
||||||
class Siterator:
    """Stree preorder iterator

    Every node is returned before its children; the subtree of the down
    child is traversed before the subtree of the up child. The nodes only
    need to supply get_up() and get_down().

    Parameters
    ----------
    tree : Snode
        root of the (sub)tree to traverse, None produces an empty iteration
    """

    def __init__(self, tree: "Snode"):
        # Nodes pending to be visited, the next one is at the end
        self._stack = []
        self._push(tree)

    def __iter__(self) -> "Siterator":
        # Complete the iterator protocol so the object can be used directly
        # in for loops, list(), etc.
        return self

    def _push(self, node: "Snode"):
        """Add a node to the pending stack, missing children (None) are
        ignored"""
        if node is not None:
            self._stack.append(node)

    def __next__(self) -> "Snode":
        """Return the next node in preorder

        Raises
        ------
        StopIteration
            when there are no more nodes to visit
        """
        if not self._stack:
            raise StopIteration()
        node = self._stack.pop()
        # down is pushed last so it is the first one to be visited
        self._push(node.get_up())
        self._push(node.get_down())
        return node
|
|
||||||
|
|
||||||
|
|
||||||
class Splitter:
    """Decide how the samples that reach a node are split in two

    The splitter selects the subspace of features used in a node, computes
    the distances of the samples to the hyperplane(s) of the classifier of
    the node and derives from them the boolean mask (self._up) of the
    samples that go to the "up" branch; the rest of the samples go "down".

    Parameters
    ----------
    clf : SVC
        classifier used to evaluate the candidate sets of features
    criterion : str
        impurity measure, "gini" or "entropy"
    feature_select : str
        strategy to select the subspace of features, "random" or "best"
    criteria : str
        rule to choose the column of distances used to split multiclass
        datasets, "max_samples" or "impurity"
    min_samples_split : int
        minimum number of samples needed to split a node
    random_state : optional
        if not None it is used to seed the random module
    normalize : bool
        if True the scaler of the node is applied to the samples before
        computing distances

    Raises
    ------
    ValueError
        if clf is None or criterion, criteria or feature_select have a
        value not supported
    """

    def __init__(
        self,
        clf: "SVC" = None,
        criterion: str = None,
        feature_select: str = None,
        criteria: str = None,
        min_samples_split: int = None,
        random_state=None,
        normalize=False,
    ):
        self._clf = clf
        self._random_state = random_state
        if random_state is not None:
            random.seed(random_state)
        self._criterion = criterion
        self._min_samples_split = min_samples_split
        self._criteria = criteria
        self._feature_select = feature_select
        self._normalize = normalize

        if clf is None:
            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")

        if criterion not in ["gini", "entropy"]:
            raise ValueError(
                f"criterion must be gini or entropy got({criterion})"
            )

        if criteria not in [
            "max_samples",
            "impurity",
        ]:
            raise ValueError(
                f"criteria has to be max_samples or impurity; got ({criteria})"
            )

        if feature_select not in ["random", "best"]:
            raise ValueError(
                "splitter must be either random or best, got "
                f"({feature_select})"
            )
        # The validated names map directly to the private methods
        # _gini/_entropy and _max_samples/_impurity
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")

    def partition_impurity(self, y: np.array) -> np.array:
        """Compute the impurity of a set of labels with the criterion
        selected (gini or entropy)

        Parameters
        ----------
        y : np.array
            set of labels

        Returns
        -------
        float
            impurity of the labels
        """
        return self.criterion_function(y)

    @staticmethod
    def _gini(y: np.array) -> float:
        """Compute the gini impurity of a labels set

        Parameters
        ----------
        y : np.array
            set of labels

        Returns
        -------
        float
            1 - sum of the squared proportions of every label
        """
        _, count = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(count / np.sum(count)))

    @staticmethod
    def _entropy(y: np.array) -> float:
        """Compute entropy of a labels set

        The logarithm base is the number of labels present, so the result
        is 1 when all the labels present are equally frequent.

        Parameters
        ----------
        y : np.array
            set of labels (non negative integers, np.bincount is used)

        Returns
        -------
        float
            entropy, 0 if there is at most one sample or one label
        """
        n_labels = len(y)
        if n_labels <= 1:
            return 0
        counts = np.bincount(y)
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            return 0
        entropy = 0.0
        # Compute standard entropy.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log(prop, n_classes)
        return entropy

    def information_gain(
        self, labels: np.array, labels_up: np.array, labels_dn: np.array
    ) -> float:
        """Compute information gain of a split candidate

        Parameters
        ----------
        labels : np.array
            labels of the dataset
        labels_up : np.array
            labels of one side, None if that side is empty
        labels_dn : np.array
            labels on the other side, None if that side is empty

        Returns
        -------
        float
            information gain, 0.0 if both sides are empty
        """
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
            card_up = labels_up.shape[0]
            imp_up = self.criterion_function(labels_up)
        if labels_dn is not None:
            card_dn = labels_dn.shape[0]
            imp_dn = self.criterion_function(labels_dn)
        samples = card_up + card_dn
        if samples == 0:
            return 0.0
        # impurity before the split minus the weighted impurity of the sides
        return (
            imp_prev
            - (card_up / samples) * imp_up
            - (card_dn / samples) * imp_dn
        )

    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
        """Return the best set of features among feature_sets, the criterion is
        the information gain

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            array of labels
        features_sets : list
            list of features sets to check, it can't be empty

        Returns
        -------
        list
            best feature set, if none of them produces a positive gain the
            last candidate is returned
        """
        max_gain = 0
        selected = None
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        for feature_set in features_sets:
            # the shared classifier is refitted with every candidate
            self._clf.fit(dataset[:, feature_set], labels)
            node = Snode(
                self._clf, dataset, labels, feature_set, 0.0, "subset"
            )
            self.partition(dataset, node, train=True)
            y1, y2 = self.part(labels)
            gain = self.information_gain(labels, y1, y2)
            if gain > max_gain:
                max_gain = gain
                selected = feature_set
        return selected if selected is not None else feature_set

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
        """Generate at most 5 feature random combinations

        Parameters
        ----------
        features : int
            number of features in dataset
        max_features : int
            number of features in each combination (<= features)

        Returns
        -------
        list
            list with up to 5 combination of features randomly selected,
            every combination is a sorted tuple of feature indices
        """
        comb = set()
        # Number of different combinations available. Integer division is
        # exact here and, unlike float division, can't overflow with a big
        # number of features
        number = factorial(features) // (
            factorial(max_features) * factorial(features - max_features)
        )
        # Generate at most 5 combinations
        set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
            )
        return list(comb)

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Compute the indices of the features selected by splitter depending
        on the self._feature_select hyper parameter

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (<= number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        if dataset.shape[1] == max_features:
            # No feature reduction applies
            return tuple(range(dataset.shape[1]))
        if self._feature_select == "random":
            features_sets = self._generate_spaces(
                dataset.shape[1], max_features
            )
            return self._select_best_set(dataset, labels, features_sets)
        # Take KBest features
        return (
            SelectKBest(k=max_features)
            .fit(dataset, labels)
            .get_support(indices=True)
        )

    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a subspace of the selected dataset of max_features length.
        The features are chosen depending on the feature_select
        hyperparameter

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            labels of the dataset
        max_features : int
            number of features to form the subspace

        Returns
        -------
        tuple
            tuple with the dataset with only the features selected and the
            indices of the features selected
        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices

    def _impurity(self, data: np.array, y: np.array) -> int:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class
        y : np.array
            vector of labels (classes)

        Returns
        -------
        int
            index of the column that produces the highest information gain
            splitting by the sign of the distance, -1 if no column produces
            a positive gain
        """
        max_gain = 0
        selected = -1
        for col in range(data.shape[1]):
            tup = y[data[:, col] > 0]
            tdn = y[data[:, col] <= 0]
            info_gain = self.information_gain(y, tup, tdn)
            if info_gain > max_gain:
                selected = col
                max_gain = info_gain
        return selected

    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> int:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class (not used)
        y : np.array
            vector of labels (classes)

        Returns
        -------
        int
            position, among the sorted labels present in y, of the label
            with more samples
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        return np.argmax(samples)

    def partition(self, samples: np.array, node: "Snode", train: bool):
        """Set the criteria to split arrays. Compute the indices of the samples
        that should go to one side of the tree (up)

        The result is stored in self._up as a boolean mask, part() has to
        be called afterwards to actually split an array.

        Parameters
        ----------
        samples : np.array
            array of samples (# samples, # features)
        node : Snode
            Node of the tree where partition is going to be made
        train : bool
            Train time - True / Test time - False
        """
        # data contains the distances of every sample to every class hyperplane
        # array of (m, nc) nc = # classes
        data = self._distances(node, samples)
        if data.shape[0] < self._min_samples_split:
            # there aren't enough samples to split
            self._up = np.ones((data.shape[0]), dtype=bool)
            return
        if data.ndim > 1:
            # split criteria for multiclass
            # Convert data to a (m, 1) array selecting values for samples
            if train:
                # in train time we have to compute the column to take into
                # account to split the dataset
                col = self.decision_criteria(data, node._y)
                node.set_partition_column(col)
            else:
                # in predict time just use the column computed in train time
                # is taking the classifier of class <col>
                col = node.get_partition_column()
            if col == -1:
                # No partition is producing information gain
                data = np.ones(data.shape)
            data = data[:, col]
        self._up = data > 0

    def part(self, origin: np.array) -> list:
        """Split an array in two based on indices (self._up) and its complement
        partition has to be called first to establish up indices

        Parameters
        ----------
        origin : np.array
            dataset to split

        Returns
        -------
        list
            list with two splits of the array, a split without elements is
            returned as None
        """
        down = ~self._up
        return [
            origin[self._up] if self._up.any() else None,
            origin[down] if down.any() else None,
        ]

    def _distances(self, node: "Snode", data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

        Parameters
        ----------
        node : Snode
            node containing the svm classifier
        data : np.ndarray
            samples to compute distance to hyperplane

        Returns
        -------
        np.array
            array of shape (m, nc) with the distances of every sample to
            the hyperplane of every class. nc = # of classes
        """
        # only the features used to train the classifier of the node
        X_transformed = data[:, node._features]
        if self._normalize:
            X_transformed = node._scaler.transform(X_transformed)
        return node._clf.decision_function(X_transformed)
|
|
||||||
|
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
"""Estimator that is based on binary trees of svm nodes
|
"""
|
||||||
|
Estimator that is based on binary trees of svm nodes
|
||||||
can deal with sample_weights in predict, used in boosting sklearn methods
|
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||||
inheriting from BaseEstimator implements get_params and set_params methods
|
inheriting from BaseEstimator implements get_params and set_params methods
|
||||||
inheriting from ClassifierMixin implement the attribute _estimator_type
|
inheriting from ClassifierMixin implement the attribute _estimator_type
|
||||||
with "classifier" as value
|
with "classifier" as value
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
C : float, optional
|
||||||
|
Regularization parameter. The strength of the regularization is
|
||||||
|
inversely proportional to C. Must be strictly positive., by default 1.0
|
||||||
|
kernel : str, optional
|
||||||
|
Specifies the kernel type to be used in the algorithm. It must be one
|
||||||
|
of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses
|
||||||
|
[liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and
|
||||||
|
the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/)
|
||||||
|
library through scikit-learn library, by default "linear"
|
||||||
|
max_iter : int, optional
|
||||||
|
Hard limit on iterations within solver, or -1 for no limit., by default
|
||||||
|
1e5
|
||||||
|
random_state : int, optional
|
||||||
|
Controls the pseudo random number generation for shuffling the data for
|
||||||
|
probability estimates. Ignored when probability is False.Pass an int
|
||||||
|
for reproducible output across multiple function calls, by
|
||||||
|
default None
|
||||||
|
max_depth : int, optional
|
||||||
|
Specifies the maximum depth of the tree, by default None
|
||||||
|
tol : float, optional
|
||||||
|
Tolerance for stopping, by default 1e-4
|
||||||
|
degree : int, optional
|
||||||
|
Degree of the polynomial kernel function (‘poly’). Ignored by all other
|
||||||
|
kernels., by default 3
|
||||||
|
gamma : str, optional
|
||||||
|
Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.if gamma='scale'
|
||||||
|
(default) is passed then it uses 1 / (n_features * X.var()) as value
|
||||||
|
of gamma,if ‘auto’, uses 1 / n_features., by default "scale"
|
||||||
|
split_criteria : str, optional
|
||||||
|
Decides (just in case of a multi class classification) which column
|
||||||
|
(class) use to split the dataset in a node. max_samples is
|
||||||
|
incompatible with 'ovo' multiclass_strategy, by default "impurity"
|
||||||
|
criterion : str, optional
|
||||||
|
The function to measure the quality of a split (only used if
|
||||||
|
max_features != num_features). Supported criteria are “gini” for the
|
||||||
|
Gini impurity and “entropy” for the information gain., by default
|
||||||
|
"entropy"
|
||||||
|
min_samples_split : int, optional
|
||||||
|
The minimum number of samples required to split an internal node. 0
|
||||||
|
(default) for any, by default 0
|
||||||
|
max_features : optional
|
||||||
|
The number of features to consider when looking for the split: If int,
|
||||||
|
then consider max_features features at each split. If float, then
|
||||||
|
max_features is a fraction and int(max_features * n_features) features
|
||||||
|
are considered at each split. If “auto”, then max_features=
|
||||||
|
sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If
|
||||||
|
“log2”, then max_features=log2(n_features). If None, then max_features=
|
||||||
|
n_features., by default None
|
||||||
|
splitter : str, optional
|
||||||
|
The strategy used to choose the feature set at each node (only used if
|
||||||
|
max_features < num_features). Supported strategies are: “best”: sklearn
|
||||||
|
SelectKBest algorithm is used in every node to choose the max_features
|
||||||
|
best features. “random”: The algorithm generates 5 candidates and
|
||||||
|
choose the best (max. info. gain) of them. “trandom”: The algorithm
|
||||||
|
generates only one random combination. "mutual": Chooses the best
|
||||||
|
features w.r.t. their mutual info with the label. "cfs": Apply
|
||||||
|
Correlation-based Feature Selection. "fcbf": Apply Fast Correlation-
|
||||||
|
Based , by default "random"
|
||||||
|
multiclass_strategy : str, optional
|
||||||
|
Strategy to use with multiclass datasets, "ovo": one versus one. "ovr":
|
||||||
|
one versus rest, by default "ovo"
|
||||||
|
normalize : bool, optional
|
||||||
|
If standardization of features should be applied on each node with the
|
||||||
|
samples that reach it , by default False
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
classes_ : ndarray of shape (n_classes,)
|
||||||
|
The classes labels.
|
||||||
|
|
||||||
|
n_classes_ : int
|
||||||
|
The number of classes
|
||||||
|
|
||||||
|
n_iter_ : int
|
||||||
|
Max number of iterations in classifier
|
||||||
|
|
||||||
|
depth_ : int
|
||||||
|
Max depht of the tree
|
||||||
|
|
||||||
|
n_features_ : int
|
||||||
|
The number of features when ``fit`` is performed.
|
||||||
|
|
||||||
|
n_features_in_ : int
|
||||||
|
Number of features seen during :term:`fit`.
|
||||||
|
|
||||||
|
max_features_ : int
|
||||||
|
Number of features to use in hyperplane computation
|
||||||
|
|
||||||
|
tree_ : Node
|
||||||
|
root of the tree
|
||||||
|
|
||||||
|
X_ : ndarray
|
||||||
|
points to the input dataset
|
||||||
|
|
||||||
|
y_ : ndarray
|
||||||
|
points to the input labels
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class
|
||||||
|
oblique decision tree based on support vector machines.", 2021 LNAI 12882
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -561,8 +150,10 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
min_samples_split: int = 0,
|
min_samples_split: int = 0,
|
||||||
max_features=None,
|
max_features=None,
|
||||||
splitter: str = "random",
|
splitter: str = "random",
|
||||||
|
multiclass_strategy: str = "ovo",
|
||||||
normalize: bool = False,
|
normalize: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
self.C = C
|
self.C = C
|
||||||
self.kernel = kernel
|
self.kernel = kernel
|
||||||
@@ -577,6 +168,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.criterion = criterion
|
self.criterion = criterion
|
||||||
self.splitter = splitter
|
self.splitter = splitter
|
||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
|
self.multiclass_strategy = multiclass_strategy
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def version() -> str:
|
||||||
|
"""Return the version of the package."""
|
||||||
|
return __version__
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
def _more_tags(self) -> dict:
|
||||||
"""Required by sklearn to supply features of the classifier
|
"""Required by sklearn to supply features of the classifier
|
||||||
@@ -621,7 +218,23 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||||
{self.max_depth})"
|
{self.max_depth})"
|
||||||
)
|
)
|
||||||
kernels = ["linear", "rbf", "poly", "sigmoid"]
|
if self.multiclass_strategy not in ["ovr", "ovo"]:
|
||||||
|
raise ValueError(
|
||||||
|
"mutliclass_strategy has to be either ovr or ovo"
|
||||||
|
f" but got {self.multiclass_strategy}"
|
||||||
|
)
|
||||||
|
if self.multiclass_strategy == "ovo":
|
||||||
|
if self.kernel == "liblinear":
|
||||||
|
raise ValueError(
|
||||||
|
"The kernel liblinear is incompatible with ovo "
|
||||||
|
"multiclass_strategy"
|
||||||
|
)
|
||||||
|
if self.split_criteria == "max_samples":
|
||||||
|
raise ValueError(
|
||||||
|
"The multiclass_strategy 'ovo' is incompatible with "
|
||||||
|
"split_criteria 'max_samples'"
|
||||||
|
)
|
||||||
|
kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
||||||
if self.kernel not in kernels:
|
if self.kernel not in kernels:
|
||||||
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
@@ -653,12 +266,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.n_features_ = X.shape[1]
|
self.n_features_ = X.shape[1]
|
||||||
self.n_features_in_ = X.shape[1]
|
self.n_features_in_ = X.shape[1]
|
||||||
self.max_features_ = self._initialize_max_features()
|
self.max_features_ = self._initialize_max_features()
|
||||||
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
self.tree_ = self._train(X, y, sample_weight, 1, "root")
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def train(
|
def _train(
|
||||||
self,
|
self,
|
||||||
X: np.ndarray,
|
X: np.ndarray,
|
||||||
y: np.ndarray,
|
y: np.ndarray,
|
||||||
@@ -701,7 +314,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
if np.unique(y).shape[0] == 1:
|
if np.unique(y).shape[0] == 1:
|
||||||
# only 1 class => pure dataset
|
# only 1 class => pure dataset
|
||||||
node.set_title(title + ", <pure>")
|
node.set_title(title + ", <pure>")
|
||||||
node.make_predictor()
|
node.make_predictor(self.n_classes_)
|
||||||
return node
|
return node
|
||||||
# Train the model
|
# Train the model
|
||||||
clf = self._build_clf()
|
clf = self._build_clf()
|
||||||
@@ -720,13 +333,13 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
if X_U is None or X_D is None:
|
if X_U is None or X_D is None:
|
||||||
# didn't part anything
|
# didn't part anything
|
||||||
node.set_title(title + ", <cgaf>")
|
node.set_title(title + ", <cgaf>")
|
||||||
node.make_predictor()
|
node.make_predictor(self.n_classes_)
|
||||||
return node
|
return node
|
||||||
node.set_up(
|
node.set_up(
|
||||||
self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
||||||
)
|
)
|
||||||
node.set_down(
|
node.set_down(
|
||||||
self.train(
|
self._train(
|
||||||
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -741,7 +354,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
C=self.C,
|
C=self.C,
|
||||||
tol=self.tol,
|
tol=self.tol,
|
||||||
)
|
)
|
||||||
if self.kernel == "linear"
|
if self.kernel == "liblinear"
|
||||||
else SVC(
|
else SVC(
|
||||||
kernel=self.kernel,
|
kernel=self.kernel,
|
||||||
max_iter=self.max_iter,
|
max_iter=self.max_iter,
|
||||||
@@ -750,31 +363,104 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
gamma=self.gamma,
|
gamma=self.gamma,
|
||||||
degree=self.degree,
|
degree=self.degree,
|
||||||
random_state=self.random_state,
|
random_state=self.random_state,
|
||||||
|
decision_function_shape=self.multiclass_strategy,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
def __predict_class(self, X: np.array) -> np.array:
|
||||||
def _reorder_results(y: np.array, indices: np.array) -> np.array:
|
"""Compute the predicted class for the samples in X. Returns the number
|
||||||
"""Reorder an array based on the array of indices passed
|
of samples of each class in the corresponding leaf node.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
y : np.array
|
X : np.array
|
||||||
data untidy
|
Array of samples
|
||||||
indices : np.array
|
|
||||||
indices used to set order
|
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
np.array
|
np.array
|
||||||
array y ordered
|
Array of shape (n_samples, n_classes) with the number of samples
|
||||||
|
of each class in the corresponding leaf node
|
||||||
"""
|
"""
|
||||||
# return array of same type given in y
|
|
||||||
y_ordered = y.copy()
|
def compute_prediction(xp, indices, node):
|
||||||
indices = indices.astype(int)
|
if xp is None:
|
||||||
for i, index in enumerate(indices):
|
return
|
||||||
y_ordered[index] = y[i]
|
if node.is_leaf():
|
||||||
return y_ordered
|
# set a class for indices
|
||||||
|
result[indices] = node._proba
|
||||||
|
return
|
||||||
|
self.splitter_.partition(xp, node, train=False)
|
||||||
|
x_u, x_d = self.splitter_.part(xp)
|
||||||
|
i_u, i_d = self.splitter_.part(indices)
|
||||||
|
compute_prediction(x_u, i_u, node.get_up())
|
||||||
|
compute_prediction(x_d, i_d, node.get_down())
|
||||||
|
|
||||||
|
# setup prediction & make it happen
|
||||||
|
result = np.zeros((X.shape[0], self.n_classes_))
|
||||||
|
indices = np.arange(X.shape[0])
|
||||||
|
compute_prediction(X, indices, self.tree_)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def check_predict(self, X) -> np.array:
|
||||||
|
"""Checks predict and predict_proba preconditions. If input X is not an
|
||||||
|
np.array convert it to one.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : np.ndarray
|
||||||
|
Array of samples
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
np.array
|
||||||
|
Array of samples
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
If number of features of X is different of the number of features
|
||||||
|
in training data
|
||||||
|
"""
|
||||||
|
check_is_fitted(self, ["tree_"])
|
||||||
|
# Input validation
|
||||||
|
X = check_array(X)
|
||||||
|
if X.shape[1] != self.n_features_:
|
||||||
|
raise ValueError(
|
||||||
|
f"Expected {self.n_features_} features but got "
|
||||||
|
f"({X.shape[1]})"
|
||||||
|
)
|
||||||
|
return X
|
||||||
|
|
||||||
|
def predict_proba(self, X: np.array) -> np.array:
|
||||||
|
"""Predict class probabilities of the input samples X.
|
||||||
|
|
||||||
|
The predicted class probability is the fraction of samples of the same
|
||||||
|
class in a leaf.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : dataset of samples.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
proba : array of shape (n_samples, n_classes)
|
||||||
|
The class probabilities of the input samples.
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
if dataset with inconsistent number of features
|
||||||
|
NotFittedError
|
||||||
|
if model is not fitted
|
||||||
|
"""
|
||||||
|
|
||||||
|
X = self.check_predict(X)
|
||||||
|
# return # of samples of each class in leaf node
|
||||||
|
values = self.__predict_class(X)
|
||||||
|
normalizer = values.sum(axis=1)[:, np.newaxis]
|
||||||
|
normalizer[normalizer == 0.0] = 1.0
|
||||||
|
return values / normalizer
|
||||||
|
|
||||||
def predict(self, X: np.array) -> np.array:
|
def predict(self, X: np.array) -> np.array:
|
||||||
"""Predict labels for each sample in dataset passed
|
"""Predict labels for each sample in dataset passed
|
||||||
@@ -796,40 +482,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
NotFittedError
|
NotFittedError
|
||||||
if model is not fitted
|
if model is not fitted
|
||||||
"""
|
"""
|
||||||
|
X = self.check_predict(X)
|
||||||
def predict_class(
|
return self.classes_[np.argmax(self.__predict_class(X), axis=1)]
|
||||||
xp: np.array, indices: np.array, node: Snode
|
|
||||||
) -> np.array:
|
|
||||||
if xp is None:
|
|
||||||
return [], []
|
|
||||||
if node.is_leaf():
|
|
||||||
# set a class for every sample in dataset
|
|
||||||
prediction = np.full((xp.shape[0], 1), node._class)
|
|
||||||
return prediction, indices
|
|
||||||
self.splitter_.partition(xp, node, train=False)
|
|
||||||
x_u, x_d = self.splitter_.part(xp)
|
|
||||||
i_u, i_d = self.splitter_.part(indices)
|
|
||||||
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
|
|
||||||
prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
|
|
||||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
|
||||||
|
|
||||||
# sklearn check
|
|
||||||
check_is_fitted(self, ["tree_"])
|
|
||||||
# Input validation
|
|
||||||
X = check_array(X)
|
|
||||||
if X.shape[1] != self.n_features_:
|
|
||||||
raise ValueError(
|
|
||||||
f"Expected {self.n_features_} features but got "
|
|
||||||
f"({X.shape[1]})"
|
|
||||||
)
|
|
||||||
# setup prediction & make it happen
|
|
||||||
indices = np.arange(X.shape[0])
|
|
||||||
result = (
|
|
||||||
self._reorder_results(*predict_class(X, indices, self.tree_))
|
|
||||||
.astype(int)
|
|
||||||
.ravel()
|
|
||||||
)
|
|
||||||
return self.classes_[result]
|
|
||||||
|
|
||||||
def nodes_leaves(self) -> tuple:
|
def nodes_leaves(self) -> tuple:
|
||||||
"""Compute the number of nodes and leaves in the built tree
|
"""Compute the number of nodes and leaves in the built tree
|
||||||
@@ -862,6 +516,23 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
tree = None
|
tree = None
|
||||||
return Siterator(tree)
|
return Siterator(tree)
|
||||||
|
|
||||||
|
def graph(self, title="") -> str:
|
||||||
|
"""Graphviz code representing the tree
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
str
|
||||||
|
graphviz code
|
||||||
|
"""
|
||||||
|
output = (
|
||||||
|
"digraph STree {\nlabel=<STree "
|
||||||
|
f"{title}>\nfontsize=30\nfontcolor=blue\nlabelloc=t\n"
|
||||||
|
)
|
||||||
|
for node in self:
|
||||||
|
output += node.graph()
|
||||||
|
output += "}\n"
|
||||||
|
return output
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
"""String representation of the tree
|
"""String representation of the tree
|
||||||
|
|
||||||
@@ -892,6 +563,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
elif self.max_features is None:
|
elif self.max_features is None:
|
||||||
max_features = self.n_features_
|
max_features = self.n_features_
|
||||||
elif isinstance(self.max_features, numbers.Integral):
|
elif isinstance(self.max_features, numbers.Integral):
|
||||||
|
if self.max_features > self.n_features_:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid value for max_features. "
|
||||||
|
"It can not be greater than number of features "
|
||||||
|
f"({self.n_features_})"
|
||||||
|
)
|
||||||
max_features = self.max_features
|
max_features = self.max_features
|
||||||
else: # float
|
else: # float
|
||||||
if self.max_features > 0.0:
|
if self.max_features > 0.0:
|
||||||
|
@@ -1,11 +1,8 @@
|
|||||||
from .Strees import Stree, Snode, Siterator, Splitter
|
from .Strees import Stree, Siterator
|
||||||
|
|
||||||
__version__ = "1.0"
|
|
||||||
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT License"
|
__license__ = "MIT License"
|
||||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||||
__url__ = "https://github.com/doctorado-ml/stree"
|
|
||||||
|
|
||||||
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
__all__ = ["Stree", "Siterator"]
|
||||||
|
1
stree/_version.py
Normal file
1
stree/_version.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
__version__ = "1.3.0"
|
@@ -1,14 +1,19 @@
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from stree import Stree, Snode
|
from stree import Stree
|
||||||
|
from stree.Splitter import Snode
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
class Snode_test(unittest.TestCase):
|
class Snode_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree(random_state=self._random_state)
|
self._clf = Stree(
|
||||||
|
random_state=self._random_state,
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
)
|
||||||
self._clf.fit(*load_dataset(self._random_state))
|
self._clf.fit(*load_dataset(self._random_state))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@@ -62,10 +67,28 @@ class Snode_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_make_predictor_on_leaf(self):
|
def test_make_predictor_on_leaf(self):
|
||||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||||
test.make_predictor()
|
test.make_predictor(2)
|
||||||
self.assertEqual(1, test._class)
|
self.assertEqual(1, test._class)
|
||||||
self.assertEqual(0.75, test._belief)
|
self.assertEqual(0.75, test._belief)
|
||||||
self.assertEqual(-1, test._partition_column)
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
self.assertListEqual([1, 3], test._proba.tolist())
|
||||||
|
|
||||||
|
def test_make_predictor_on_not_leaf(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||||
|
test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
|
||||||
|
test.make_predictor(2)
|
||||||
|
self.assertIsNone(test._class)
|
||||||
|
self.assertEqual(0, test._belief)
|
||||||
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
self.assertEqual(-1, test.get_up()._partition_column)
|
||||||
|
self.assertIsNone(test._proba)
|
||||||
|
|
||||||
|
def test_make_predictor_on_leaf_bogus_data(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
|
||||||
|
test.make_predictor(2)
|
||||||
|
self.assertIsNone(test._class)
|
||||||
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
self.assertListEqual([0, 0], test._proba.tolist())
|
||||||
|
|
||||||
def test_set_title(self):
|
def test_set_title(self):
|
||||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||||
@@ -92,21 +115,6 @@ class Snode_test(unittest.TestCase):
|
|||||||
test.set_features([1, 2])
|
test.set_features([1, 2])
|
||||||
self.assertListEqual([1, 2], test.get_features())
|
self.assertListEqual([1, 2], test.get_features())
|
||||||
|
|
||||||
def test_make_predictor_on_not_leaf(self):
|
|
||||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
|
||||||
test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
|
|
||||||
test.make_predictor()
|
|
||||||
self.assertIsNone(test._class)
|
|
||||||
self.assertEqual(0, test._belief)
|
|
||||||
self.assertEqual(-1, test._partition_column)
|
|
||||||
self.assertEqual(-1, test.get_up()._partition_column)
|
|
||||||
|
|
||||||
def test_make_predictor_on_leaf_bogus_data(self):
|
|
||||||
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
|
|
||||||
test.make_predictor()
|
|
||||||
self.assertIsNone(test._class)
|
|
||||||
self.assertEqual(-1, test._partition_column)
|
|
||||||
|
|
||||||
def test_copy_node(self):
|
def test_copy_node(self):
|
||||||
px = [1, 2, 3, 4]
|
px = [1, 2, 3, 4]
|
||||||
py = [1]
|
py = [1]
|
||||||
|
@@ -5,8 +5,8 @@ import random
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from sklearn.datasets import load_wine, load_iris
|
from sklearn.datasets import load_wine, load_iris
|
||||||
from stree import Splitter
|
from stree.Splitter import Splitter
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset, load_disc_dataset
|
||||||
|
|
||||||
|
|
||||||
class Splitter_test(unittest.TestCase):
|
class Splitter_test(unittest.TestCase):
|
||||||
@@ -195,10 +195,14 @@ class Splitter_test(unittest.TestCase):
|
|||||||
[0, 3, 7, 12], # random entropy impurity
|
[0, 3, 7, 12], # random entropy impurity
|
||||||
[1, 7, 9, 12], # random gini max_samples
|
[1, 7, 9, 12], # random gini max_samples
|
||||||
[1, 5, 8, 12], # random gini impurity
|
[1, 5, 8, 12], # random gini impurity
|
||||||
|
[6, 9, 11, 12], # mutual entropy max_samples
|
||||||
|
[6, 9, 11, 12], # mutual entropy impurity
|
||||||
|
[6, 9, 11, 12], # mutual gini max_samples
|
||||||
|
[6, 9, 11, 12], # mutual gini impurity
|
||||||
]
|
]
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
rn = 0
|
rn = 0
|
||||||
for feature_select in ["best", "random"]:
|
for feature_select in ["best", "random", "mutual"]:
|
||||||
for criterion in ["entropy", "gini"]:
|
for criterion in ["entropy", "gini"]:
|
||||||
for criteria in [
|
for criteria in [
|
||||||
"max_samples",
|
"max_samples",
|
||||||
@@ -221,7 +225,7 @@ class Splitter_test(unittest.TestCase):
|
|||||||
# criteria,
|
# criteria,
|
||||||
# )
|
# )
|
||||||
# )
|
# )
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, sorted(list(computed)))
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
X[:, computed].tolist(), dataset.tolist()
|
X[:, computed].tolist(), dataset.tolist()
|
||||||
)
|
)
|
||||||
@@ -240,3 +244,69 @@ class Splitter_test(unittest.TestCase):
|
|||||||
Xs, computed = tcl.get_subspace(X, y, k)
|
Xs, computed = tcl.get_subspace(X, y, k)
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_best_subspaces_discrete(self):
|
||||||
|
results = [
|
||||||
|
(4, [0, 3, 16, 18]),
|
||||||
|
(7, [0, 3, 13, 14, 16, 18, 19]),
|
||||||
|
(9, [0, 3, 7, 13, 14, 15, 16, 18, 19]),
|
||||||
|
]
|
||||||
|
X, y = load_disc_dataset(n_features=20)
|
||||||
|
for k, expected in results:
|
||||||
|
tcl = self.build(
|
||||||
|
feature_select="best",
|
||||||
|
)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, k)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_cfs_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [1, 5, 9, 12]),
|
||||||
|
(6, [1, 5, 9, 12, 4, 2]),
|
||||||
|
(7, [1, 5, 9, 12, 4, 2, 3]),
|
||||||
|
]
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
for k, expected in results:
|
||||||
|
tcl = self.build(feature_select="cfs")
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, k)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_fcbf_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [1, 5, 9, 12]),
|
||||||
|
(6, [1, 5, 9, 12, 4, 2]),
|
||||||
|
(7, [1, 5, 9, 12, 4, 2, 16]),
|
||||||
|
]
|
||||||
|
for rs, expected in results:
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
tcl = self.build(feature_select="fcbf", random_state=rs)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_iwss_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [1, 5, 9, 12]),
|
||||||
|
(6, [1, 5, 9, 12, 4, 15]),
|
||||||
|
]
|
||||||
|
for rs, expected in results:
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
tcl = self.build(feature_select="iwss", random_state=rs)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_trandom_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [3, 7, 9, 12]),
|
||||||
|
(6, [0, 1, 2, 8, 15, 18]),
|
||||||
|
(7, [1, 2, 4, 8, 10, 12, 13]),
|
||||||
|
]
|
||||||
|
for rs, expected in results:
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
tcl = self.build(feature_select="trandom", random_state=rs)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
@@ -7,14 +7,16 @@ from sklearn.datasets import load_iris, load_wine
|
|||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
from stree import Stree, Snode
|
from stree import Stree
|
||||||
|
from stree.Splitter import Snode
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset
|
||||||
|
from .._version import __version__
|
||||||
|
|
||||||
|
|
||||||
class Stree_test(unittest.TestCase):
|
class Stree_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._kernels = ["linear", "rbf", "poly"]
|
self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -22,10 +24,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
os.environ["TESTING"] = "1"
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
def test_valid_kernels(self):
|
def test_valid_kernels(self):
|
||||||
valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
|
|
||||||
X, y = load_dataset()
|
X, y = load_dataset()
|
||||||
for kernel in valid_kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel)
|
clf = Stree(kernel=kernel, multiclass_strategy="ovr")
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertIsNotNone(clf.tree_)
|
self.assertIsNotNone(clf.tree_)
|
||||||
|
|
||||||
@@ -55,14 +56,19 @@ class Stree_test(unittest.TestCase):
|
|||||||
# i.e. The partition algorithm didn't forget any sample
|
# i.e. The partition algorithm didn't forget any sample
|
||||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||||
_, count_d = np.unique(y_down, return_counts=True)
|
labels_d, count_d = np.unique(y_down, return_counts=True)
|
||||||
_, count_u = np.unique(y_up, return_counts=True)
|
labels_u, count_u = np.unique(y_up, return_counts=True)
|
||||||
|
dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
|
||||||
|
dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
|
||||||
#
|
#
|
||||||
for i in unique_y:
|
for i in unique_y:
|
||||||
number_up = count_u[i]
|
|
||||||
try:
|
try:
|
||||||
number_down = count_d[i]
|
number_up = dict_u[i]
|
||||||
except IndexError:
|
except KeyError:
|
||||||
|
number_up = 0
|
||||||
|
try:
|
||||||
|
number_down = dict_d[i]
|
||||||
|
except KeyError:
|
||||||
number_down = 0
|
number_down = 0
|
||||||
self.assertEqual(count_y[i], number_down + number_up)
|
self.assertEqual(count_y[i], number_down + number_up)
|
||||||
# Is the partition made the same as the prediction?
|
# Is the partition made the same as the prediction?
|
||||||
@@ -77,14 +83,22 @@ class Stree_test(unittest.TestCase):
|
|||||||
"""Check if the tree is built the same way as predictions of models"""
|
"""Check if the tree is built the same way as predictions of models"""
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="sigmoid",
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
self._check_tree(clf.tree_)
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
def test_single_prediction(self):
|
def test_single_prediction(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||||
self.assertEqual(yp[0], y[0])
|
self.assertEqual(yp[0], y[0])
|
||||||
|
|
||||||
@@ -92,18 +106,58 @@ class Stree_test(unittest.TestCase):
|
|||||||
# First 27 elements the predictions are the same as the truth
|
# First 27 elements the predictions are the same as the truth
|
||||||
num = 27
|
num = 27
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in ["liblinear", "linear", "rbf", "poly"]:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
yp = clf.fit(X, y).predict(X[:num, :])
|
yp = clf.fit(X, y).predict(X[:num, :])
|
||||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||||
|
|
||||||
|
def test_multiple_predict_proba(self):
|
||||||
|
expected = {
|
||||||
|
"liblinear": {
|
||||||
|
0: [0.02401129943502825, 0.9759887005649718],
|
||||||
|
17: [0.9282970550576184, 0.07170294494238157],
|
||||||
|
},
|
||||||
|
"linear": {
|
||||||
|
0: [0.029329608938547486, 0.9706703910614525],
|
||||||
|
17: [0.9298469387755102, 0.07015306122448979],
|
||||||
|
},
|
||||||
|
"rbf": {
|
||||||
|
0: [0.023448275862068966, 0.976551724137931],
|
||||||
|
17: [0.9458064516129032, 0.05419354838709677],
|
||||||
|
},
|
||||||
|
"poly": {
|
||||||
|
0: [0.01601164483260553, 0.9839883551673945],
|
||||||
|
17: [0.9089790897908979, 0.0910209102091021],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
indices = [0, 17]
|
||||||
|
X, y = load_dataset(self._random_state)
|
||||||
|
for kernel in ["liblinear", "linear", "rbf", "poly"]:
|
||||||
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
|
yp = clf.fit(X, y).predict_proba(X)
|
||||||
|
for index in indices:
|
||||||
|
for exp, comp in zip(expected[kernel][index], yp[index]):
|
||||||
|
self.assertAlmostEqual(exp, comp)
|
||||||
|
|
||||||
def test_single_vs_multiple_prediction(self):
|
def test_single_vs_multiple_prediction(self):
|
||||||
"""Check if predicting sample by sample gives the same result as
|
"""Check if predicting sample by sample gives the same result as
|
||||||
predicting all samples at once
|
predicting all samples at once
|
||||||
"""
|
"""
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
# Compute prediction line by line
|
# Compute prediction line by line
|
||||||
yp_line = np.array([], dtype=int)
|
yp_line = np.array([], dtype=int)
|
||||||
@@ -135,9 +189,13 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
computed = []
|
computed = []
|
||||||
expected_string = ""
|
expected_string = ""
|
||||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
for node in clf:
|
for node in iter(clf):
|
||||||
computed.append(str(node))
|
computed.append(str(node))
|
||||||
expected_string += str(node) + "\n"
|
expected_string += str(node) + "\n"
|
||||||
self.assertListEqual(expected, computed)
|
self.assertListEqual(expected, computed)
|
||||||
@@ -173,7 +231,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_check_max_depth(self):
|
def test_check_max_depth(self):
|
||||||
depths = (3, 4)
|
depths = (3, 4)
|
||||||
for depth in depths:
|
for depth in depths:
|
||||||
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
tcl = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
max_depth=depth,
|
||||||
|
)
|
||||||
tcl.fit(*load_dataset(self._random_state))
|
tcl.fit(*load_dataset(self._random_state))
|
||||||
self.assertEqual(depth, tcl.depth_)
|
self.assertEqual(depth, tcl.depth_)
|
||||||
|
|
||||||
@@ -194,7 +257,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
split_criteria="max_samples",
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
px = [[1, 2], [5, 6], [9, 10]]
|
px = [[1, 2], [5, 6], [9, 10]]
|
||||||
@@ -205,26 +268,36 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertListEqual(py, clf.classes_.tolist())
|
self.assertListEqual(py, clf.classes_.tolist())
|
||||||
|
|
||||||
def test_muticlass_dataset(self):
|
def test_muticlass_dataset(self):
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
datasets = {
|
datasets = {
|
||||||
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
||||||
"Iris": load_wine(return_X_y=True),
|
"Iris": load_wine(return_X_y=True),
|
||||||
}
|
}
|
||||||
outcomes = {
|
outcomes = {
|
||||||
"Synt": {
|
"Synt": {
|
||||||
"max_samples linear": 0.9606666666666667,
|
"max_samples liblinear": 0.9493333333333334,
|
||||||
"max_samples rbf": 0.7133333333333334,
|
"max_samples linear": 0.9426666666666667,
|
||||||
"max_samples poly": 0.618,
|
"max_samples rbf": 0.9606666666666667,
|
||||||
"impurity linear": 0.9606666666666667,
|
"max_samples poly": 0.9373333333333334,
|
||||||
"impurity rbf": 0.7133333333333334,
|
"max_samples sigmoid": 0.824,
|
||||||
"impurity poly": 0.618,
|
"impurity liblinear": 0.9493333333333334,
|
||||||
|
"impurity linear": 0.9426666666666667,
|
||||||
|
"impurity rbf": 0.9606666666666667,
|
||||||
|
"impurity poly": 0.9373333333333334,
|
||||||
|
"impurity sigmoid": 0.824,
|
||||||
},
|
},
|
||||||
"Iris": {
|
"Iris": {
|
||||||
|
"max_samples liblinear": 0.9550561797752809,
|
||||||
"max_samples linear": 1.0,
|
"max_samples linear": 1.0,
|
||||||
"max_samples rbf": 0.6910112359550562,
|
"max_samples rbf": 0.6685393258426966,
|
||||||
"max_samples poly": 0.6966292134831461,
|
"max_samples poly": 0.6853932584269663,
|
||||||
"impurity linear": 1,
|
"max_samples sigmoid": 0.6404494382022472,
|
||||||
"impurity rbf": 0.6910112359550562,
|
"impurity liblinear": 0.9550561797752809,
|
||||||
"impurity poly": 0.6966292134831461,
|
"impurity linear": 1.0,
|
||||||
|
"impurity rbf": 0.6685393258426966,
|
||||||
|
"impurity poly": 0.6853932584269663,
|
||||||
|
"impurity sigmoid": 0.6404494382022472,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,18 +306,22 @@ class Stree_test(unittest.TestCase):
|
|||||||
for criteria in ["max_samples", "impurity"]:
|
for criteria in ["max_samples", "impurity"]:
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
C=55,
|
max_iter=1e4,
|
||||||
max_iter=1e5,
|
multiclass_strategy="ovr"
|
||||||
|
if kernel == "liblinear"
|
||||||
|
else "ovo",
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
clf.fit(px, py)
|
clf.fit(px, py)
|
||||||
outcome = outcomes[name][f"{criteria} {kernel}"]
|
outcome = outcomes[name][f"{criteria} {kernel}"]
|
||||||
# print(
|
# print(f'"{criteria} {kernel}": {clf.score(px, py)},')
|
||||||
# f"{name} {criteria} {kernel} {outcome} {clf.score(px"
|
self.assertAlmostEqual(
|
||||||
# ", py)}"
|
outcome,
|
||||||
# )
|
clf.score(px, py),
|
||||||
self.assertAlmostEqual(outcome, clf.score(px, py))
|
5,
|
||||||
|
f"{name} - {criteria} - {kernel}",
|
||||||
|
)
|
||||||
|
|
||||||
def test_max_features(self):
|
def test_max_features(self):
|
||||||
n_features = 16
|
n_features = 16
|
||||||
@@ -269,6 +346,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = clf._initialize_max_features()
|
_ = clf._initialize_max_features()
|
||||||
|
|
||||||
|
def test_wrong_max_features(self):
|
||||||
|
X, y = load_dataset(n_features=15)
|
||||||
|
clf = Stree(max_features=16)
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.fit(X, y)
|
||||||
|
|
||||||
def test_get_subspaces(self):
|
def test_get_subspaces(self):
|
||||||
dataset = np.random.random((10, 16))
|
dataset = np.random.random((10, 16))
|
||||||
y = np.random.randint(0, 2, 10)
|
y = np.random.randint(0, 2, 10)
|
||||||
@@ -306,17 +389,20 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf.predict(X[:, :3])
|
clf.predict(X[:, :3])
|
||||||
|
|
||||||
# Tests of score
|
# Tests of score
|
||||||
|
|
||||||
def test_score_binary(self):
|
def test_score_binary(self):
|
||||||
|
"""Check score for binary classification."""
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
accuracies = [
|
accuracies = [
|
||||||
0.9506666666666667,
|
0.9506666666666667,
|
||||||
|
0.9493333333333334,
|
||||||
0.9606666666666667,
|
0.9606666666666667,
|
||||||
0.9433333333333334,
|
0.9433333333333334,
|
||||||
|
0.9153333333333333,
|
||||||
]
|
]
|
||||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
)
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
@@ -327,12 +413,19 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||||
|
|
||||||
def test_score_max_features(self):
|
def test_score_max_features(self):
|
||||||
|
"""Check score using max_features."""
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
clf = Stree(random_state=self._random_state, max_features=2)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
max_features=2,
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
||||||
|
|
||||||
def test_bogus_splitter_parameter(self):
|
def test_bogus_splitter_parameter(self):
|
||||||
|
"""Check that bogus splitter parameter raises exception."""
|
||||||
clf = Stree(splitter="duck")
|
clf = Stree(splitter="duck")
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
clf.fit(*load_dataset())
|
clf.fit(*load_dataset())
|
||||||
@@ -340,7 +433,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_multiclass_classifier_integrity(self):
|
def test_multiclass_classifier_integrity(self):
|
||||||
"""Checks if the multiclass operation is done right"""
|
"""Checks if the multiclass operation is done right"""
|
||||||
X, y = load_iris(return_X_y=True)
|
X, y = load_iris(return_X_y=True)
|
||||||
clf = Stree(random_state=0)
|
clf = Stree(
|
||||||
|
kernel="liblinear", multiclass_strategy="ovr", random_state=0
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
score = clf.score(X, y)
|
score = clf.score(X, y)
|
||||||
# Check accuracy of the whole model
|
# Check accuracy of the whole model
|
||||||
@@ -386,6 +481,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertListEqual([47], resdn[1].tolist())
|
self.assertListEqual([47], resdn[1].tolist())
|
||||||
|
|
||||||
def test_score_multiclass_rbf(self):
|
def test_score_multiclass_rbf(self):
|
||||||
|
"""Test score for multiclass classification with rbf kernel."""
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
@@ -396,13 +492,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="rbf", random_state=self._random_state, normalize=True
|
kernel="rbf", random_state=self._random_state, normalize=True
|
||||||
)
|
)
|
||||||
self.assertEqual(0.768, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.966, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.964, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_poly(self):
|
def test_score_multiclass_poly(self):
|
||||||
|
"""Test score for multiclass classification with poly kernel."""
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
@@ -417,24 +514,81 @@ class Stree_test(unittest.TestCase):
|
|||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.946, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.972, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_liblinear(self):
|
||||||
|
"""Test score for multiclass classification with liblinear kernel."""
|
||||||
|
X, y = load_dataset(
|
||||||
|
random_state=self._random_state,
|
||||||
|
n_classes=3,
|
||||||
|
n_features=5,
|
||||||
|
n_samples=500,
|
||||||
|
)
|
||||||
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
C=10,
|
||||||
|
)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
|
)
|
||||||
|
self.assertEqual(0.968, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.97, clf2.fit(X, y).score(X, y))
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
self.assertEqual(1.0, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_sigmoid(self):
|
||||||
|
"""Test score for multiclass classification with sigmoid kernel."""
|
||||||
|
X, y = load_dataset(
|
||||||
|
random_state=self._random_state,
|
||||||
|
n_classes=3,
|
||||||
|
n_features=5,
|
||||||
|
n_samples=500,
|
||||||
|
)
|
||||||
|
clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="sigmoid",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
|
C=10,
|
||||||
|
)
|
||||||
|
self.assertEqual(0.796, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.952, clf2.fit(X, y).score(X, y))
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_linear(self):
|
def test_score_multiclass_linear(self):
|
||||||
|
"""Test score for multiclass classification with linear kernel."""
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
n_features=5,
|
n_features=5,
|
||||||
n_samples=1500,
|
n_samples=1500,
|
||||||
)
|
)
|
||||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
||||||
# Check with context based standardization
|
# Check with context based standardization
|
||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="linear", random_state=self._random_state, normalize=True
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
@@ -442,11 +596,13 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_zero_all_sample_weights(self):
    """Check that fitting with all-zero sample weights raises ValueError."""
    features, labels = load_dataset(self._random_state)
    with self.assertRaises(ValueError):
        # One zero weight per sample.
        null_weights = np.zeros(len(labels))
        Stree().fit(features, labels, null_weights)
|
||||||
|
|
||||||
def test_mask_samples_weighted_zero(self):
|
def test_mask_samples_weighted_zero(self):
|
||||||
|
"""Check that the weighted zero samples are masked."""
|
||||||
X = np.array(
|
X = np.array(
|
||||||
[
|
[
|
||||||
[1, 1],
|
[1, 1],
|
||||||
@@ -461,7 +617,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
||||||
yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5])
|
yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
|
||||||
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
||||||
model1 = Stree().fit(X, y)
|
model1 = Stree().fit(X, y)
|
||||||
model2 = Stree().fit(X, y, w)
|
model2 = Stree().fit(X, y, w)
|
||||||
@@ -474,6 +630,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertEqual(model2.score(X, y, w), 1)
|
self.assertEqual(model2.score(X, y, w), 1)
|
||||||
|
|
||||||
def test_depth(self):
|
def test_depth(self):
|
||||||
|
"""Check depth of the tree."""
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
@@ -489,6 +646,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertEqual(4, clf.depth_)
|
self.assertEqual(4, clf.depth_)
|
||||||
|
|
||||||
def test_nodes_leaves(self):
|
def test_nodes_leaves(self):
|
||||||
|
"""Check number of nodes and leaves."""
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
@@ -498,16 +656,17 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(25, nodes)
|
self.assertEqual(31, nodes)
|
||||||
self.assertEqual(13, leaves)
|
self.assertEqual(16, leaves)
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(9, nodes)
|
self.assertEqual(11, nodes)
|
||||||
self.assertEqual(5, leaves)
|
self.assertEqual(6, leaves)
|
||||||
|
|
||||||
def test_nodes_leaves_artificial(self):
|
def test_nodes_leaves_artificial(self):
|
||||||
|
"""Check leaves of artificial dataset."""
|
||||||
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
||||||
n2 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test2")
|
n2 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test2")
|
||||||
n3 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test3")
|
n3 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test3")
|
||||||
@@ -524,3 +683,77 @@ class Stree_test(unittest.TestCase):
|
|||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(6, nodes)
|
self.assertEqual(6, nodes)
|
||||||
self.assertEqual(2, leaves)
|
self.assertEqual(2, leaves)
|
||||||
|
|
||||||
|
def test_bogus_multiclass_strategy(self):
    """Check that an unknown multiclass strategy makes fit raise ValueError."""
    model = Stree(multiclass_strategy="other")
    features, labels = load_wine(return_X_y=True)
    # Callable form of assertRaises: fit is invoked with the given data.
    self.assertRaises(ValueError, model.fit, features, labels)
|
||||||
|
|
||||||
|
def test_multiclass_strategy(self):
    """Check training scores on wine for the "ovo" and "ovr" strategies."""
    features, labels = load_wine(return_X_y=True)
    # Build both classifiers first, then fit and score them in order.
    models = {
        strategy: Stree(multiclass_strategy=strategy)
        for strategy in ("ovo", "ovr")
    }
    scores = {
        strategy: model.fit(features, labels).score(features, labels)
        for strategy, model in models.items()
    }
    self.assertEqual(1.0, scores["ovo"])
    self.assertEqual(0.9269662921348315, scores["ovr"])
|
||||||
|
|
||||||
|
def test_incompatible_hyperparameters(self):
    """Check that fit raises ValueError for two hyperparameter combinations.

    The combinations exercised are kernel="liblinear" with
    multiclass_strategy="ovo", and multiclass_strategy="ovo" with
    split_criteria="max_samples".
    """
    features, labels = load_wine(return_X_y=True)
    rejected = (
        {"kernel": "liblinear", "multiclass_strategy": "ovo"},
        {"multiclass_strategy": "ovo", "split_criteria": "max_samples"},
    )
    for hyperparams in rejected:
        model = Stree(**hyperparams)
        with self.assertRaises(ValueError):
            model.fit(features, labels)
|
||||||
|
|
||||||
|
def test_version(self):
    """Check that Stree.version() returns the imported __version__."""
    reported = Stree().version()
    self.assertEqual(__version__, reported)
|
||||||
|
|
||||||
|
def test_graph(self):
    """Check the graphviz text returned by graph() when no title is given.

    Before fitting, the output must be exactly the header followed by the
    closing brace. After fitting on the wine data, the output must start
    with that header and end with the expected tail.
    """
    header = (
        "digraph STree {\nlabel=<STree >\nfontsize=30\n"
        "fontcolor=blue\nlabelloc=t\n"
    )
    footer = (
        ' [shape=box style=filled label="class=1 impurity=0.000 '
        'counts=[0 1 0]"];\n}\n'
    )
    model = Stree(random_state=self._random_state)
    self.assertEqual(model.graph(), header + "}\n")
    features, labels = load_wine(return_X_y=True)
    model.fit(features, labels)
    dot_source = model.graph()
    # Compare only the leading and trailing slices of the output.
    self.assertEqual(dot_source[: len(header)], header)
    self.assertEqual(dot_source[-len(footer) :], footer)
|
||||||
|
|
||||||
|
def test_graph_title(self):
    """Check the graphviz text returned by graph() when a title is given.

    The title "Sample title" must appear in the label of the header, both
    for the unfitted classifier and after fitting on the wine data; the
    fitted output must also end with the expected tail.
    """
    title = "Sample title"
    header = (
        "digraph STree {\nlabel=<STree Sample title>\nfontsize=30\n"
        "fontcolor=blue\nlabelloc=t\n"
    )
    footer = (
        ' [shape=box style=filled label="class=1 impurity=0.000 '
        'counts=[0 1 0]"];\n}\n'
    )
    model = Stree(random_state=self._random_state)
    self.assertEqual(model.graph(title), header + "}\n")
    features, labels = load_wine(return_X_y=True)
    model.fit(features, labels)
    dot_source = model.graph(title)
    # Compare only the leading and trailing slices of the output.
    self.assertEqual(dot_source[: len(header)], header)
    self.assertEqual(dot_source[-len(footer) :], footer)
|
||||||
|
@@ -1,11 +1,14 @@
|
|||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
|
def load_dataset(
|
||||||
|
random_state=0, n_classes=2, n_features=3, n_samples=1500, n_informative=3
|
||||||
|
):
|
||||||
X, y = make_classification(
|
X, y = make_classification(
|
||||||
n_samples=n_samples,
|
n_samples=n_samples,
|
||||||
n_features=n_features,
|
n_features=n_features,
|
||||||
n_informative=3,
|
n_informative=n_informative,
|
||||||
n_redundant=0,
|
n_redundant=0,
|
||||||
n_repeated=0,
|
n_repeated=0,
|
||||||
n_classes=n_classes,
|
n_classes=n_classes,
|
||||||
@@ -15,3 +18,12 @@ def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
|
|||||||
random_state=random_state,
|
random_state=random_state,
|
||||||
)
|
)
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
|
||||||
|
def load_disc_dataset(
    random_state=0, n_classes=2, n_features=3, n_samples=1500
):
    """Build a random dataset with integer-valued features.

    Parameters
    ----------
    random_state : int
        Seed for the pseudo-random generator.
    n_classes : int
        Number of distinct labels; labels are drawn from [0, n_classes).
    n_features : int
        Number of columns of X.
    n_samples : int
        Number of rows of X and length of y.

    Returns
    -------
    X : numpy.ndarray of float, shape (n_samples, n_features)
        Integer values in [1, 16] stored as floats.
    y : numpy.ndarray of int, shape (n_samples,)
        Random labels, drawn independently of X.
    """
    # A private RandomState seeded with random_state produces the same
    # stream as np.random.seed(random_state) followed by np.random.randint,
    # but leaves NumPy's global generator untouched for the caller.
    rng = np.random.RandomState(random_state)
    X = rng.randint(1, 17, size=(n_samples, n_features)).astype(float)
    y = rng.randint(low=0, high=n_classes, size=(n_samples), dtype=int)
    return X, y
|
||||||
|
Reference in New Issue
Block a user