mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
10 Commits
entropy_fu
...
package_do
Author | SHA1 | Date | |
---|---|---|---|
|
4370433d4d | ||
8fe5fdff2b
|
|||
881777c38c
|
|||
3af7864278
|
|||
a2df31628d
|
|||
fec094a75f
|
|||
045e2fd446
|
|||
2d6921f9a5
|
|||
9eb06a9169
|
|||
951f1cfaa7
|
13
Makefile
13
Makefile
@@ -1,6 +1,6 @@
|
|||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
.PHONY: coverage deps help lint push test doc build
|
.PHONY: coverage deps help lint push test
|
||||||
|
|
||||||
coverage: ## Run tests with coverage
|
coverage: ## Run tests with coverage
|
||||||
coverage erase
|
coverage erase
|
||||||
@@ -21,17 +21,6 @@ push: ## Push code with tags
|
|||||||
test: ## Run tests
|
test: ## Run tests
|
||||||
python -m unittest -v stree.tests
|
python -m unittest -v stree.tests
|
||||||
|
|
||||||
doc: ## Update documentation
|
|
||||||
make -C docs --makefile=Makefile html
|
|
||||||
|
|
||||||
build: ## Build package
|
|
||||||
rm -fr dist/*
|
|
||||||
rm -fr build/*
|
|
||||||
python setup.py sdist bdist_wheel
|
|
||||||
|
|
||||||
doc-clean: ## Clean generated documentation
|
|
||||||
make -C docs --makefile=Makefile clean
|
|
||||||
|
|
||||||
help: ## Show help message
|
help: ## Show help message
|
||||||
@IFS=$$'\n' ; \
|
@IFS=$$'\n' ; \
|
||||||
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||||
|
21
README.md
21
README.md
@@ -1,12 +1,8 @@
|
|||||||

|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
|
||||||
[](https://badge.fury.io/py/STree)
|
|
||||||

|
|
||||||
[](https://zenodo.org/badge/latestdoi/262658230)
|
|
||||||
|
|
||||||
# STree
|
# Stree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
@@ -20,12 +16,14 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
||||||
Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
|
Can be found in
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
|
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
@@ -37,22 +35,21 @@ Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
|
|||||||
## Hyperparameters
|
## Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------- | ------------------------------------------------------ | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through the scikit-learn library |
|
| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **“trandom”**: The algorithm generates a true random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
|
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,4 +1,3 @@
|
|||||||
sphinx
|
sphinx
|
||||||
sphinx-rtd-theme
|
sphinx-rtd-theme
|
||||||
myst-parser
|
myst-parser
|
||||||
mufs
|
|
@@ -1,7 +1,7 @@
|
|||||||
Siterator
|
Siterator
|
||||||
=========
|
=========
|
||||||
|
|
||||||
.. automodule:: Splitter
|
.. automodule:: stree
|
||||||
.. autoclass:: Siterator
|
.. autoclass:: Siterator
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
Snode
|
Snode
|
||||||
=====
|
=====
|
||||||
|
|
||||||
.. automodule:: Splitter
|
.. automodule:: stree
|
||||||
.. autoclass:: Snode
|
.. autoclass:: Snode
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
Splitter
|
Splitter
|
||||||
========
|
========
|
||||||
|
|
||||||
.. automodule:: Splitter
|
.. automodule:: stree
|
||||||
.. autoclass:: Splitter
|
.. autoclass:: Splitter
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
|
@@ -6,6 +6,6 @@ API index
|
|||||||
:caption: Contents:
|
:caption: Contents:
|
||||||
|
|
||||||
Stree
|
Stree
|
||||||
Siterator
|
|
||||||
Snode
|
|
||||||
Splitter
|
Splitter
|
||||||
|
Snode
|
||||||
|
Siterator
|
||||||
|
@@ -12,7 +12,6 @@
|
|||||||
#
|
#
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import stree
|
|
||||||
|
|
||||||
sys.path.insert(0, os.path.abspath("../../stree/"))
|
sys.path.insert(0, os.path.abspath("../../stree/"))
|
||||||
|
|
||||||
@@ -24,8 +23,7 @@ copyright = "2020 - 2021, Ricardo Montañana Gómez"
|
|||||||
author = "Ricardo Montañana Gómez"
|
author = "Ricardo Montañana Gómez"
|
||||||
|
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
version = stree.__version__
|
release = "1.0"
|
||||||
release = version
|
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
@@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
## Notebooks
|
## Notebooks
|
||||||
|
|
||||||
|
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
|
@@ -1,22 +1,21 @@
|
|||||||
# Hyperparameters
|
# Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------- | ------------------------------------------------------ | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through the scikit-learn library |
|
| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **“trandom”**: The algorithm generates a true random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
|
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,12 +1,8 @@
|
|||||||
# STree
|
# Stree
|
||||||
|
|
||||||

|
[](https://app.codeship.com/projects/399170)
|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
|
||||||
[](https://badge.fury.io/py/STree)
|
|
||||||

|
|
||||||
[](https://zenodo.org/badge/latestdoi/262658230)
|
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
|
@@ -178,7 +178,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Stree\n",
|
"# Stree\n",
|
||||||
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3, kernel=\"liblinear\", multiclass_strategy=\"ovr\")"
|
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@@ -1,2 +1 @@
|
|||||||
scikit-learn>0.24
|
scikit-learn>0.24
|
||||||
mufs
|
|
33
setup.py
33
setup.py
@@ -1,4 +1,5 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
|
import stree
|
||||||
|
|
||||||
|
|
||||||
def readme():
|
def readme():
|
||||||
@@ -6,45 +7,29 @@ def readme():
|
|||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
def get_data(field):
|
VERSION = stree.__version__
|
||||||
item = ""
|
|
||||||
with open("stree/__init__.py") as f:
|
|
||||||
for line in f.readlines():
|
|
||||||
if line.startswith(f"__{field}__"):
|
|
||||||
delim = '"' if '"' in line else "'"
|
|
||||||
item = line.split(delim)[1]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Unable to find {field} string.")
|
|
||||||
return item
|
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="STree",
|
name="STree",
|
||||||
version=get_data("version"),
|
version=stree.__version__,
|
||||||
license=get_data("license"),
|
license=stree.__license__,
|
||||||
description="Oblique decision tree with svm nodes",
|
description="Oblique decision tree with svm nodes",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
url="https://github.com/Doctorado-ML/STree#stree",
|
url=stree.__url__,
|
||||||
project_urls={
|
author=stree.__author__,
|
||||||
"Code": "https://github.com/Doctorado-ML/STree",
|
author_email=stree.__author_email__,
|
||||||
"Documentation": "https://stree.readthedocs.io/en/latest/index.html",
|
|
||||||
},
|
|
||||||
author=get_data("author"),
|
|
||||||
author_email=get_data("author_email"),
|
|
||||||
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
||||||
tree svm svc",
|
tree svm svc",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Development Status :: 5 - Production/Stable",
|
"Development Status :: 5 - Production/Stable",
|
||||||
"License :: OSI Approved :: " + get_data("license"),
|
"License :: OSI Approved :: " + stree.__license__,
|
||||||
"Programming Language :: Python :: 3.8",
|
"Programming Language :: Python :: 3.8",
|
||||||
"Natural Language :: English",
|
"Natural Language :: English",
|
||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
],
|
],
|
||||||
install_requires=["scikit-learn", "mufs"],
|
install_requires=["scikit-learn", "numpy", "ipympl"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
@@ -1,10 +0,0 @@
|
|||||||
version: 2
|
|
||||||
|
|
||||||
sphinx:
|
|
||||||
configuration: docs/source/conf.py
|
|
||||||
|
|
||||||
python:
|
|
||||||
version: 3.8
|
|
||||||
install:
|
|
||||||
- requirements: requirements.txt
|
|
||||||
- requirements: docs/requirements.txt
|
|
@@ -1,770 +0,0 @@
|
|||||||
"""
|
|
||||||
Oblique decision tree classifier based on SVM nodes
|
|
||||||
Splitter class
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import warnings
|
|
||||||
import random
|
|
||||||
from math import log, factorial
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
|
||||||
from sklearn.preprocessing import StandardScaler
|
|
||||||
from sklearn.svm import SVC
|
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
|
||||||
from mufs import MUFS
|
|
||||||
|
|
||||||
|
|
||||||
class Snode:
|
|
||||||
"""
|
|
||||||
Node of the tree that keeps the SVM classifier and, only when testing,
|
|
||||||
the dataset assigned to it
|
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
clf : SVC
|
|
||||||
Classifier used
|
|
||||||
X : np.ndarray
|
|
||||||
input dataset in train time (only in testing)
|
|
||||||
y : np.ndarray
|
|
||||||
input labels in train time
|
|
||||||
features : np.array
|
|
||||||
features used to compute hyperplane
|
|
||||||
impurity : float
|
|
||||||
impurity of the node
|
|
||||||
title : str
|
|
||||||
label describing the route to the node
|
|
||||||
weight : np.ndarray, optional
|
|
||||||
weights applied to input dataset in train time, by default None
|
|
||||||
scaler : StandardScaler, optional
|
|
||||||
scaler used if any, by default None
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
clf: SVC,
|
|
||||||
X: np.ndarray,
|
|
||||||
y: np.ndarray,
|
|
||||||
features: np.array,
|
|
||||||
impurity: float,
|
|
||||||
title: str,
|
|
||||||
weight: np.ndarray = None,
|
|
||||||
scaler: StandardScaler = None,
|
|
||||||
):
|
|
||||||
self._clf = clf
|
|
||||||
self._title = title
|
|
||||||
self._belief = 0.0
|
|
||||||
# Only store dataset in Testing
|
|
||||||
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
|
|
||||||
self._y = y
|
|
||||||
self._down = None
|
|
||||||
self._up = None
|
|
||||||
self._class = None
|
|
||||||
self._feature = None
|
|
||||||
self._sample_weight = (
|
|
||||||
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
|
||||||
)
|
|
||||||
self._features = features
|
|
||||||
self._impurity = impurity
|
|
||||||
self._partition_column: int = -1
|
|
||||||
self._scaler = scaler
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def copy(cls, node: "Snode") -> "Snode":
|
|
||||||
return cls(
|
|
||||||
node._clf,
|
|
||||||
node._X,
|
|
||||||
node._y,
|
|
||||||
node._features,
|
|
||||||
node._impurity,
|
|
||||||
node._title,
|
|
||||||
node._sample_weight,
|
|
||||||
node._scaler,
|
|
||||||
)
|
|
||||||
|
|
||||||
def set_partition_column(self, col: int):
|
|
||||||
self._partition_column = col
|
|
||||||
|
|
||||||
def get_partition_column(self) -> int:
|
|
||||||
return self._partition_column
|
|
||||||
|
|
||||||
def set_down(self, son):
|
|
||||||
self._down = son
|
|
||||||
|
|
||||||
def set_title(self, title):
|
|
||||||
self._title = title
|
|
||||||
|
|
||||||
def set_classifier(self, clf):
|
|
||||||
self._clf = clf
|
|
||||||
|
|
||||||
def set_features(self, features):
|
|
||||||
self._features = features
|
|
||||||
|
|
||||||
def set_impurity(self, impurity):
|
|
||||||
self._impurity = impurity
|
|
||||||
|
|
||||||
def get_title(self) -> str:
|
|
||||||
return self._title
|
|
||||||
|
|
||||||
def get_classifier(self) -> SVC:
|
|
||||||
return self._clf
|
|
||||||
|
|
||||||
def get_impurity(self) -> float:
|
|
||||||
return self._impurity
|
|
||||||
|
|
||||||
def get_features(self) -> np.array:
|
|
||||||
return self._features
|
|
||||||
|
|
||||||
def set_up(self, son):
|
|
||||||
self._up = son
|
|
||||||
|
|
||||||
def is_leaf(self) -> bool:
|
|
||||||
return self._up is None and self._down is None
|
|
||||||
|
|
||||||
def get_down(self) -> "Snode":
|
|
||||||
return self._down
|
|
||||||
|
|
||||||
def get_up(self) -> "Snode":
|
|
||||||
return self._up
|
|
||||||
|
|
||||||
def make_predictor(self):
|
|
||||||
"""Compute the class of the predictor and its belief based on the
|
|
||||||
subdataset of the node only if it is a leaf
|
|
||||||
"""
|
|
||||||
if not self.is_leaf():
|
|
||||||
return
|
|
||||||
classes, card = np.unique(self._y, return_counts=True)
|
|
||||||
if len(classes) > 1:
|
|
||||||
max_card = max(card)
|
|
||||||
self._class = classes[card == max_card][0]
|
|
||||||
self._belief = max_card / np.sum(card)
|
|
||||||
else:
|
|
||||||
self._belief = 1
|
|
||||||
try:
|
|
||||||
self._class = classes[0]
|
|
||||||
except IndexError:
|
|
||||||
self._class = None
|
|
||||||
|
|
||||||
def __str__(self) -> str:
|
|
||||||
count_values = np.unique(self._y, return_counts=True)
|
|
||||||
if self.is_leaf():
|
|
||||||
return (
|
|
||||||
f"{self._title} - Leaf class={self._class} belief="
|
|
||||||
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
|
||||||
f"counts={count_values}"
|
|
||||||
)
|
|
||||||
return (
|
|
||||||
f"{self._title} feaures={self._features} impurity="
|
|
||||||
f"{self._impurity:.4f} "
|
|
||||||
f"counts={count_values}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class Siterator:
    """Stree preorder iterator

    Nodes are kept in a stack; every node is returned before its children
    and the "down" child is visited before the "up" child.
    """

    def __init__(self, tree: Snode):
        self._stack = []
        self._push(tree)

    def __iter__(self):
        # An iterator is its own iterable
        return self

    def _push(self, node: Snode):
        """Put node on the stack, ignoring missing (None) children"""
        if node is None:
            return
        self._stack.append(node)

    def __next__(self) -> Snode:
        if not self._stack:
            raise StopIteration()
        current = self._stack.pop()
        # "down" is pushed last so it is the next node to be popped
        self._push(current.get_up())
        self._push(current.get_down())
        return current
|
|
||||||
|
|
||||||
|
|
||||||
class Splitter:
    """
    Splits a dataset in two based on different criteria

    Parameters
    ----------
    clf : SVC, optional
        classifier, by default None (None is rejected with ValueError)
    criterion : str, optional
        The function to measure the quality of a split (only used if
        max_features != num_features). Supported criteria are "gini" for the
        Gini impurity and "entropy" for the information gain, by default
        None (None is rejected with ValueError)
    feature_select : str, optional
        The strategy used to choose the feature set at each node (only used if
        max_features < num_features). Supported strategies are: "best": sklearn
        SelectKBest algorithm is used in every node to choose the max_features
        best features. "random": The algorithm generates up to 5 candidates
        and chooses the best (max. info. gain) of them. "trandom": A single
        random feature set is used. "mutual": Chooses the best features
        w.r.t. their mutual info with the label. "cfs": Apply
        Correlation-based Feature Selection. "fcbf": Apply Fast
        Correlation-Based Filter. "iwss": Apply the iwss algorithm of MUFS,
        by default None (None is rejected with ValueError)
    criteria : str, optional
        Decides (just in case of a multi class classification) which column
        (class) use to split the dataset in a node. max_samples is
        incompatible with 'ovo' multiclass_strategy, by default None (None is
        rejected with ValueError)
    min_samples_split : int, optional
        The minimum number of samples required to split an internal node. 0
        for any, by default None
    random_state : optional
        When it is not None it is used to seed the random module, which is
        the source of the random feature sets. Pass an int for reproducible
        output across multiple function calls, by default None
    normalize : bool, optional
        If standardization of features should be applied on each node with the
        samples that reach it, by default False

    Raises
    ------
    ValueError
        clf has to be a sklearn estimator
    ValueError
        criterion must be gini or entropy
    ValueError
        criteria has to be max_samples or impurity
    ValueError
        splitter must be in {random, trandom, best, mutual, cfs, fcbf, iwss}
    """

    def __init__(
        self,
        clf: SVC = None,
        criterion: str = None,
        feature_select: str = None,
        criteria: str = None,
        min_samples_split: int = None,
        random_state=None,
        normalize=False,
    ):

        self._clf = clf
        self._random_state = random_state
        if random_state is not None:
            random.seed(random_state)
        self._criterion = criterion
        self._min_samples_split = min_samples_split
        self._criteria = criteria
        self._feature_select = feature_select
        self._normalize = normalize

        if clf is None:
            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")

        if criterion not in ["gini", "entropy"]:
            raise ValueError(
                f"criterion must be gini or entropy got({criterion})"
            )

        if criteria not in [
            "max_samples",
            "impurity",
        ]:
            raise ValueError(
                f"criteria has to be max_samples or impurity; got ({criteria})"
            )

        if feature_select not in [
            "random",
            "trandom",
            "best",
            "mutual",
            "cfs",
            "fcbf",
            "iwss",
        ]:
            raise ValueError(
                "splitter must be in {random, trandom, best, mutual, cfs, "
                "fcbf, iwss} "
                f"got ({feature_select})"
            )
        # Bind the strategies once, they are resolved by name:
        # _gini/_entropy, _max_samples/_impurity and _fs_<feature_select>
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")
        self.fs_function = getattr(self, f"_fs_{self._feature_select}")

    def _fs_random(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the best of (at most) five random feature set combinations

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        # Random feature reduction
        n_features = dataset.shape[1]
        features_sets = self._generate_spaces(n_features, max_features)
        return self._select_best_set(dataset, labels, features_sets)

    @staticmethod
    def _fs_trandom(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a single random feature set combination

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset (not used)
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            sorted indices of the features selected
        """
        # Random feature reduction
        n_features = dataset.shape[1]
        return tuple(sorted(random.sample(range(n_features), max_features)))

    @staticmethod
    def _fs_best(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the variables selected by SelectKBest with its default
        score function

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected, as returned by
            SelectKBest.get_support(indices=True)
        """
        return (
            SelectKBest(k=max_features)
            .fit(dataset, labels)
            .get_support(indices=True)
        )

    @staticmethod
    def _fs_mutual(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return the best features with mutual information with labels

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected, ordered by increasing mutual
            information
        """
        # return best features with mutual info with the label
        feature_list = mutual_info_classif(dataset, labels)
        return tuple(
            sorted(
                range(len(feature_list)), key=lambda sub: feature_list[sub]
            )[-max_features:]
        )

    @staticmethod
    def _fs_cfs(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.cfs(dataset, labels).get_results()

    @staticmethod
    def _fs_fcbf(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Fast Correlation-based Filter algorithm with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.fcbf(dataset, labels, 5e-4).get_results()

    @staticmethod
    def _fs_iwss(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection based on iwss with max_features
        limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.iwss(dataset, labels, 0.25).get_results()

    def partition_impurity(self, y: np.array) -> np.array:
        """Return the impurity of the labels y computed with the criterion
        chosen in the constructor
        """
        return self.criterion_function(y)

    @staticmethod
    def _gini(y: np.array) -> float:
        """Compute the Gini impurity of a labels set

        Parameters
        ----------
        y : np.array
            set of labels

        Returns
        -------
        float
            1 - sum of the squared class proportions
        """
        _, count = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(count / np.sum(count)))

    @staticmethod
    def _entropy(y: np.array) -> float:
        """Compute entropy of a labels set, using the number of classes
        present in y as the base of the logarithm

        Parameters
        ----------
        y : np.array
            set of labels, non negative integers as np.bincount requires

        Returns
        -------
        float
            entropy, 0.0 if there is at most one sample or one class
        """
        n_labels = len(y)
        if n_labels <= 1:
            return 0.0
        counts = np.bincount(y)
        # The entropy is a function of the class proportions, not of the
        # label values themselves; absent classes (count 0) add nothing
        proportions = counts[counts > 0] / n_labels
        n_classes = proportions.shape[0]
        if n_classes <= 1:
            return 0.0
        return float(
            -np.sum(proportions * np.log(proportions)) / np.log(n_classes)
        )

    def information_gain(
        self, labels: np.array, labels_up: np.array, labels_dn: np.array
    ) -> float:
        """Compute information gain of a split candidate

        Parameters
        ----------
        labels : np.array
            labels of the dataset
        labels_up : np.array
            labels of one side, None if that side is empty
        labels_dn : np.array
            labels on the other side, None if that side is empty

        Returns
        -------
        float
            information gain, 0.0 if both sides are empty
        """
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
            card_up = labels_up.shape[0]
            imp_up = self.criterion_function(labels_up)
        if labels_dn is not None:
            card_dn = labels_dn.shape[0]
            imp_dn = self.criterion_function(labels_dn)
        samples = card_up + card_dn
        if samples == 0:
            return 0.0
        # impurity before the split minus the weighted impurity of the sides
        return (
            imp_prev
            - (card_up / samples) * imp_up
            - (card_dn / samples) * imp_dn
        )

    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
        """Return the best set of features among feature_sets, the criterion is
        the information gain

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            array of labels
        features_sets : list
            list of features sets to check

        Returns
        -------
        list
            best feature set, the last one checked if none of them produces
            a positive information gain
        """
        max_gain = 0
        selected = None
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        for feature_set in features_sets:
            # The classifier of the splitter is refitted for every candidate
            self._clf.fit(dataset[:, feature_set], labels)
            node = Snode(
                self._clf, dataset, labels, feature_set, 0.0, "subset"
            )
            self.partition(dataset, node, train=True)
            y1, y2 = self.part(labels)
            gain = self.information_gain(labels, y1, y2)
            if gain > max_gain:
                max_gain = gain
                selected = feature_set
        return selected if selected is not None else feature_set

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
        """Generate at most 5 feature random combinations

        Parameters
        ----------
        features : int
            number of features in dataset
        max_features : int
            number of features in each combination

        Returns
        -------
        list
            list with up to 5 combination of features randomly selected
        """
        comb = set()
        # Generate at most 5 combinations. The number of different
        # combinations is computed with integer arithmetic so it is exact
        # and can not overflow a float
        number = factorial(features) // (
            factorial(max_features) * factorial(features - max_features)
        )
        set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
            )
        return list(comb)

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Compute the indices of the features selected by splitter depending
        on the self._feature_select hyper parameter

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (<= number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        # No feature reduction
        n_features = dataset.shape[1]
        if n_features == max_features:
            return tuple(range(n_features))
        # select features as selected in constructor
        return self.fs_function(dataset, labels, max_features)

    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a subspace of the selected dataset of max_features length.
        Depending on hyperparameter

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            labels of the dataset
        max_features : int
            number of features to form the subspace

        Returns
        -------
        tuple
            tuple with the dataset with only the features selected and the
            indices of the features selected
        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices

    def _impurity(self, data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            column of dataset to be taken into account to split dataset,
            -1 if no column produces a positive information gain
        """
        max_gain = 0
        selected = -1
        for col in range(data.shape[1]):
            tup = y[data[:, col] > 0]
            tdn = y[data[:, col] <= 0]
            info_gain = self.information_gain(y, tup, tdn)
            if info_gain > max_gain:
                selected = col
                max_gain = info_gain
        return selected

    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

        Parameters
        ----------
        data : np.array
            distances to hyper plane of every class (not used)
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            position, in the sorted unique labels of y, of the label with
            most samples
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        return np.argmax(samples)

    def partition(self, samples: np.array, node: Snode, train: bool):
        """Set the criteria to split arrays. Compute the indices of the samples
        that should go to one side of the tree (up)

        Parameters
        ----------
        samples : np.array
            array of samples (# samples, # features)
        node : Snode
            Node of the tree where partition is going to be made
        train : bool
            Train time - True / Test time - False
        """
        # data contains the distances of every sample to every class hyperplane
        # array of (m, nc) nc = # classes
        data = self._distances(node, samples)
        if data.shape[0] < self._min_samples_split:
            # there aren't enough samples to split
            self._up = np.ones((data.shape[0]), dtype=bool)
            return
        if data.ndim > 1:
            # split criteria for multiclass
            # Convert data to a (m, 1) array selecting values for samples
            if train:
                # in train time we have to compute the column to take into
                # account to split the dataset
                col = self.decision_criteria(data, node._y)
                node.set_partition_column(col)
            else:
                # in predict time just use the column computed in train time
                # is taking the classifier of class <col>
                col = node.get_partition_column()
            if col == -1:
                # No partition is producing information gain
                data = np.ones(data.shape)
            data = data[:, col]
        self._up = data > 0

    def part(self, origin: np.array) -> list:
        """Split an array in two based on indices (self._up) and its complement
        partition has to be called first to establish up indices

        Parameters
        ----------
        origin : np.array
            dataset to split

        Returns
        -------
        list
            list with two splits of the array, None replaces an empty split
        """
        down = ~self._up
        return [
            origin[self._up] if self._up.any() else None,
            origin[down] if down.any() else None,
        ]

    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

        Parameters
        ----------
        node : Snode
            node containing the svm classifier
        data : np.ndarray
            samples to compute distance to hyperplane

        Returns
        -------
        np.array
            array of shape (m, nc) with the distances of every sample to
            the hyperplane of every class. nc = # of classes
        """
        X_transformed = data[:, node._features]
        if self._normalize:
            X_transformed = node._scaler.transform(X_transformed)
        return node._clf.decision_function(X_transformed)
|
|
667
stree/Strees.py
667
stree/Strees.py
@@ -2,135 +2,548 @@
|
|||||||
Oblique decision tree classifier based on SVM nodes
|
Oblique decision tree classifier based on SVM nodes
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
import numbers
|
import numbers
|
||||||
import random
|
import random
|
||||||
|
import warnings
|
||||||
|
from math import log, factorial
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import SVC, LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
|
from sklearn.feature_selection import SelectKBest
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
from sklearn.utils.validation import (
|
from sklearn.utils.validation import (
|
||||||
check_X_y,
|
check_X_y,
|
||||||
check_array,
|
check_array,
|
||||||
check_is_fitted,
|
check_is_fitted,
|
||||||
_check_sample_weight,
|
_check_sample_weight,
|
||||||
)
|
)
|
||||||
from .Splitter import Splitter, Snode, Siterator
|
|
||||||
|
|
||||||
|
class Snode:
    """Nodes of the tree that keeps the svm classifier and if testing the
    dataset assigned to it
    """

    def __init__(
        self,
        clf: SVC,
        X: np.ndarray,
        y: np.ndarray,
        features: np.array,
        impurity: float,
        title: str,
        weight: np.ndarray = None,
        scaler: StandardScaler = None,
    ):
        # Samples and weights are only kept when the TESTING environment
        # variable is set to something different from "NS"
        testing = os.environ.get("TESTING", "NS") != "NS"
        self._X = X if testing else None
        self._sample_weight = weight if testing else None
        self._y = y
        self._clf = clf
        self._scaler = scaler
        self._title = title
        self._features = features
        self._impurity = impurity
        self._feature = None
        self._partition_column: int = -1
        # Tree links, a node is a leaf until a child is attached
        self._up = None
        self._down = None
        # Prediction data, computed by make_predictor
        self._class = None
        self._belief = 0.0

    @classmethod
    def copy(cls, node: "Snode") -> "Snode":
        """Build a new node from the constructor arguments stored in node;
        children, class, belief and partition column are not copied
        """
        arguments = (
            node._clf,
            node._X,
            node._y,
            node._features,
            node._impurity,
            node._title,
            node._sample_weight,
            node._scaler,
        )
        return cls(*arguments)

    def set_partition_column(self, col: int):
        """Store col as the partition column of the node"""
        self._partition_column = col

    def get_partition_column(self) -> int:
        """Return the partition column of the node (-1 until it is set)"""
        return self._partition_column

    def set_down(self, son):
        """Attach son as the "down" child"""
        self._down = son

    def set_title(self, title):
        """Replace the title of the node"""
        self._title = title

    def set_classifier(self, clf):
        """Replace the classifier of the node"""
        self._clf = clf

    def set_features(self, features):
        """Replace the features of the node"""
        self._features = features

    def set_impurity(self, impurity):
        """Replace the impurity of the node"""
        self._impurity = impurity

    def get_title(self) -> str:
        """Return the title of the node"""
        return self._title

    def get_classifier(self) -> SVC:
        """Return the classifier of the node"""
        return self._clf

    def get_impurity(self) -> float:
        """Return the impurity of the node"""
        return self._impurity

    def get_features(self) -> np.array:
        """Return the features of the node"""
        return self._features

    def set_up(self, son):
        """Attach son as the "up" child"""
        self._up = son

    def is_leaf(self) -> bool:
        """Return True if the node has no children"""
        return all(child is None for child in (self._up, self._down))

    def get_down(self) -> "Snode":
        """Return the "down" child, None if there is none"""
        return self._down

    def get_up(self) -> "Snode":
        """Return the "up" child, None if there is none"""
        return self._up

    def make_predictor(self):
        """Compute the class of the predictor and its belief based on the
        subdataset of the node only if it is a leaf
        """
        if not self.is_leaf():
            return
        labels, counts = np.unique(self._y, return_counts=True)
        if len(labels) <= 1:
            self._belief = 1
            try:
                self._class = labels[0]
            except IndexError:
                # no samples reached this node
                self._class = None
            return
        # argmax returns the first maximum, i.e. the lowest label on ties
        winner = np.argmax(counts)
        self._class = labels[winner]
        self._belief = counts[winner] / np.sum(counts)

    def __str__(self) -> str:
        counts_text = f"counts={np.unique(self._y, return_counts=True)}"
        if not self.is_leaf():
            return (
                f"{self._title} feaures={self._features} impurity="
                f"{self._impurity:.4f} " + counts_text
            )
        return (
            f"{self._title} - Leaf class={self._class} belief="
            f"{self._belief: .6f} impurity={self._impurity:.4f} "
            + counts_text
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Siterator:
    """Stree preorder iterator

    Nodes are kept in a stack; every node is returned before its children
    and the "down" child is visited before the "up" child.
    """

    def __init__(self, tree: Snode):
        self._stack = []
        self._push(tree)

    def __iter__(self):
        # To complete the iterator interface, so that an instance can be
        # used directly in a for loop or passed to iter()
        return self

    def _push(self, node: Snode):
        """Put node on the stack, missing (None) children are ignored"""
        if node is not None:
            self._stack.append(node)

    def __next__(self) -> Snode:
        if not self._stack:
            raise StopIteration()
        node = self._stack.pop()
        # "down" is pushed last so it is the next node to be popped
        self._push(node.get_up())
        self._push(node.get_down())
        return node
|
||||||
|
|
||||||
|
|
||||||
|
class Splitter:
|
||||||
|
def __init__(
    self,
    clf: SVC = None,
    criterion: str = None,
    feature_select: str = None,
    criteria: str = None,
    min_samples_split: int = None,
    random_state=None,
    normalize=False,
):
    """Store the hyper parameters, seed the random module when random_state
    is given and bind the impurity and column selection functions.

    Raises
    ------
    ValueError
        if clf is None, criterion is not gini or entropy, criteria is not
        max_samples or impurity or feature_select is not random or best
    """
    # The attributes are stored (and the generator seeded) before any
    # validation takes place
    self._random_state = random_state
    if random_state is not None:
        random.seed(random_state)
    self._clf = clf
    self._criterion = criterion
    self._criteria = criteria
    self._feature_select = feature_select
    self._min_samples_split = min_samples_split
    self._normalize = normalize

    if clf is None:
        raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
    if criterion not in ("gini", "entropy"):
        raise ValueError(f"criterion must be gini or entropy got({criterion})")
    if criteria not in ("max_samples", "impurity"):
        raise ValueError(
            f"criteria has to be max_samples or impurity; got ({criteria})"
        )
    if feature_select not in ("random", "best"):
        raise ValueError(
            "splitter must be either random or best, got "
            f"({feature_select})"
        )
    # Strategies are resolved by name: _gini/_entropy, _max_samples/_impurity
    self.criterion_function = getattr(self, "_" + self._criterion)
    self.decision_criteria = getattr(self, "_" + self._criteria)
|
||||||
|
|
||||||
|
def partition_impurity(self, y: np.array) -> np.array:
    """Return the impurity of the labels y computed with the criterion
    function bound in the constructor
    """
    impurity = self.criterion_function(y)
    return impurity
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _gini(y: np.array) -> float:
|
||||||
|
_, count = np.unique(y, return_counts=True)
|
||||||
|
return 1 - np.sum(np.square(count / np.sum(count)))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _entropy(y: np.array) -> float:
|
||||||
|
"""Compute entropy of a labels set
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
y : np.array
|
||||||
|
set of labels
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
entropy
|
||||||
|
"""
|
||||||
|
n_labels = len(y)
|
||||||
|
if n_labels <= 1:
|
||||||
|
return 0
|
||||||
|
counts = np.bincount(y)
|
||||||
|
proportions = counts / n_labels
|
||||||
|
n_classes = np.count_nonzero(proportions)
|
||||||
|
if n_classes <= 1:
|
||||||
|
return 0
|
||||||
|
entropy = 0.0
|
||||||
|
# Compute standard entropy.
|
||||||
|
for prop in proportions:
|
||||||
|
if prop != 0.0:
|
||||||
|
entropy -= prop * log(prop, n_classes)
|
||||||
|
return entropy
|
||||||
|
|
||||||
|
def information_gain(
    self, labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
    """Compute information gain of a split candidate

    Parameters
    ----------
    labels : np.array
        labels of the dataset
    labels_up : np.array
        labels of one side, None if that side is empty
    labels_dn : np.array
        labels on the other side, None if that side is empty

    Returns
    -------
    float
        impurity of labels minus the weighted impurity of both sides,
        0.0 if both sides are empty
    """
    impurity_before = self.criterion_function(labels)
    n_up = n_dn = impurity_up = impurity_dn = 0
    if labels_up is not None:
        n_up = labels_up.shape[0]
        impurity_up = self.criterion_function(labels_up)
    if labels_dn is not None:
        n_dn = labels_dn.shape[0]
        impurity_dn = self.criterion_function(labels_dn)
    total = n_up + n_dn
    if total == 0:
        return 0.0
    return (
        impurity_before
        - (n_up / total) * impurity_up
        - (n_dn / total) * impurity_dn
    )
|
||||||
|
|
||||||
|
def _select_best_set(
    self, dataset: np.array, labels: np.array, features_sets: list
) -> list:
    """Return the best set of features among features_sets, the criterion
    is the information gain

    For every candidate the classifier ``self._clf`` is fitted with the
    columns of the candidate, the samples are partitioned with the
    resulting node and the information gain of that partition is computed.

    Parameters
    ----------
    dataset : np.array
        array of samples (# samples, # features)
    labels : np.array
        array of labels
    features_sets : list
        list of features sets to check

    Returns
    -------
    list
        best feature set, the last candidate if none of them produces a
        positive information gain

    Raises
    ------
    ValueError
        if features_sets is empty
    """
    if len(features_sets) == 0:
        # without candidates there is nothing to fall back to
        raise ValueError("features_sets can not be empty")
    max_gain = 0
    selected = None
    # NOTE: filterwarnings installs the filter in the global warnings
    # configuration, so it remains active after this method returns
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    for feature_set in features_sets:
        self._clf.fit(dataset[:, feature_set], labels)
        node = Snode(self._clf, dataset, labels, feature_set, 0.0, "subset")
        self.partition(dataset, node, train=True)
        y1, y2 = self.part(labels)
        gain = self.information_gain(labels, y1, y2)
        if gain > max_gain:
            max_gain = gain
            selected = feature_set
    if selected is None:
        # no candidate improved the gain, keep the last one checked
        return features_sets[-1]
    return selected
|
||||||
|
|
||||||
|
@staticmethod
def _generate_spaces(features: int, max_features: int) -> list:
    """Generate at most 5 feature random combinations

    Parameters
    ----------
    features : int
        number of features in dataset
    max_features : int
        number of features in each combination

    Returns
    -------
    list
        list with up to 5 combination of features randomly selected, each
        one is a sorted tuple of feature indices
    """
    comb = set()
    # Number of different combinations available. Integer division keeps
    # the result exact and avoids the OverflowError that float division
    # raises when the number of combinations does not fit in a float
    number = factorial(features) // (
        factorial(max_features) * factorial(features - max_features)
    )
    # Generate at most 5 combinations
    set_length = min(5, number)
    while len(comb) < set_length:
        # sorting makes equal combinations collapse in the set
        comb.add(
            tuple(sorted(random.sample(range(features), max_features)))
        )
    return list(comb)
|
||||||
|
|
||||||
|
def _get_subspaces_set(
    self, dataset: np.array, labels: np.array, max_features: int
) -> tuple:
    """Compute the indices of the features chosen by the splitter, the
    strategy depends on the self._feature_select hyper parameter

    Parameters
    ----------
    dataset : np.array
        array of samples
    labels : np.array
        labels of the dataset
    max_features : int
        number of features of the subspace
        (<= number of features in dataset)

    Returns
    -------
    tuple
        indices of the features selected
    """
    n_features = dataset.shape[1]
    if n_features == max_features:
        # every feature is used, there is nothing to select
        return tuple(range(n_features))
    if self._feature_select != "random":
        # any other strategy takes the KBest features
        selector = SelectKBest(k=max_features).fit(dataset, labels)
        return selector.get_support(indices=True)
    # random: build some candidates and keep the best of them
    candidates = self._generate_spaces(n_features, max_features)
    return self._select_best_set(dataset, labels, candidates)
|
||||||
|
|
||||||
|
def get_subspace(
    self, dataset: np.array, labels: np.array, max_features: int
) -> tuple:
    """Return a subspace of the dataset with max_features features. The
    features are chosen by self._get_subspaces_set

    Parameters
    ----------
    dataset : np.array
        array of samples (# samples, # features)
    labels : np.array
        labels of the dataset
    max_features : int
        number of features to form the subspace

    Returns
    -------
    tuple
        tuple with the dataset restricted to the selected features and
        the indices of those features
    """
    selected = self._get_subspaces_set(dataset, labels, max_features)
    subspace = dataset[:, selected]
    return subspace, selected
|
||||||
|
|
||||||
|
def _impurity(self, data: np.array, y: np.array) -> np.array:
    """Return the column of data that yields the highest information gain
    when the labels are split by the sign of its values

    Parameters
    ----------
    data : np.array
        distances to hyper plane of every class
    y : np.array
        vector of labels (classes)

    Returns
    -------
    np.array
        index of the column to be taken into account to split dataset,
        -1 if no column produces a positive information gain
    """
    best_col, best_gain = -1, 0
    for col in range(data.shape[1]):
        distances = data[:, col]
        gain = self.information_gain(
            y, y[distances > 0], y[distances <= 0]
        )
        # strict comparison: the first column wins in case of a tie
        if gain > best_gain:
            best_col, best_gain = col, gain
    return best_col
|
||||||
|
|
||||||
|
@staticmethod
def _max_samples(data: np.array, y: np.array) -> np.array:
    """Return the position of the most frequent label, to be used as the
    column to split the dataset

    Parameters
    ----------
    data : np.array
        distances to hyper plane of every class (not used here)
    y : np.array
        vector of labels (classes)

    Returns
    -------
    np.array
        position, within the sorted unique labels of y, of the label with
        the highest number of samples
    """
    # only the counts matter, np.unique returns them sorted by label
    counts = np.unique(y, return_counts=True)[1]
    return np.argmax(counts)
|
||||||
|
|
||||||
|
def partition(self, samples: np.array, node: Snode, train: bool):
    """Set the criteria to split arrays. Compute the mask of the samples
    that should go to one side of the tree (up) and store it in self._up

    Parameters
    ----------
    samples : np.array
        array of samples (# samples, # features)
    node : Snode
        Node of the tree where partition is going to be made
    train : bool
        Train time - True / Test time - False
    """
    # distances of every sample to every class hyperplane
    # array of (m, nc) nc = # classes
    distances = self._distances(node, samples)
    n_samples = distances.shape[0]
    if n_samples < self._min_samples_split:
        # there aren't enough samples to split, all of them go up
        self._up = np.ones(n_samples, dtype=bool)
        return
    if distances.ndim <= 1:
        # a single column of distances, the sign decides the side
        self._up = distances > 0
        return
    # split criteria for multiclass: pick one column of distances
    if train:
        # in train time we have to compute the column to take into
        # account to split the dataset and store it in the node
        column = self.decision_criteria(distances, node._y)
        node.set_partition_column(column)
    else:
        # in predict time just use the column computed in train time
        column = node.get_partition_column()
    if column == -1:
        # No partition is producing information gain, send every sample up
        distances = np.ones(distances.shape)
    self._up = distances[:, column] > 0
|
||||||
|
|
||||||
|
def part(self, origin: np.array) -> list:
    """Split an array in two using the mask self._up and its complement.
    partition has to be called first to establish that mask

    Parameters
    ----------
    origin : np.array
        dataset to split

    Returns
    -------
    list
        list with the two splits of the array (up first, down second), a
        split without elements is returned as None
    """
    up_mask = self._up
    down_mask = ~up_mask
    upper = origin[up_mask] if any(up_mask) else None
    lower = origin[down_mask] if any(down_mask) else None
    return [upper, lower]
|
||||||
|
|
||||||
|
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
    """Compute distances of the samples to the hyperplane of the node

    Only the features stored in the node are used and, if self._normalize
    is set, they are transformed with the scaler of the node first.

    Parameters
    ----------
    node : Snode
        node containing the svm classifier
    data : np.ndarray
        samples to compute distance to hyperplane

    Returns
    -------
    np.array
        result of the decision function of the classifier of the node,
        array of shape (m, nc) with the distances of every sample to
        the hyperplane of every class. nc = # of classes
    """
    features = data[:, node._features]
    features = (
        node._scaler.transform(features) if self._normalize else features
    )
    return node._clf.decision_function(features)
|
||||||
|
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
"""
|
"""Estimator that is based on binary trees of svm nodes
|
||||||
Estimator that is based on binary trees of svm nodes
|
|
||||||
can deal with sample_weights in predict, used in boosting sklearn methods
|
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||||
inheriting from BaseEstimator implements get_params and set_params methods
|
inheriting from BaseEstimator implements get_params and set_params methods
|
||||||
inheriting from ClassifierMixin implement the attribute _estimator_type
|
inheriting from ClassifierMixin implement the attribute _estimator_type
|
||||||
with "classifier" as value
|
with "classifier" as value
|
||||||
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
C : float, optional
|
|
||||||
Regularization parameter. The strength of the regularization is
|
|
||||||
inversely proportional to C. Must be strictly positive., by default 1.0
|
|
||||||
kernel : str, optional
|
|
||||||
Specifies the kernel type to be used in the algorithm. It must be one
|
|
||||||
of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses
|
|
||||||
[liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and
|
|
||||||
the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/)
|
|
||||||
library through scikit-learn library, by default "linear"
|
|
||||||
max_iter : int, optional
|
|
||||||
Hard limit on iterations within solver, or -1 for no limit., by default
|
|
||||||
1e5
|
|
||||||
random_state : int, optional
|
|
||||||
Controls the pseudo random number generation for shuffling the data for
|
|
||||||
probability estimates. Ignored when probability is False.Pass an int
|
|
||||||
for reproducible output across multiple function calls, by
|
|
||||||
default None
|
|
||||||
max_depth : int, optional
|
|
||||||
Specifies the maximum depth of the tree, by default None
|
|
||||||
tol : float, optional
|
|
||||||
Tolerance for stopping, by default 1e-4
|
|
||||||
degree : int, optional
|
|
||||||
Degree of the polynomial kernel function (‘poly’). Ignored by all other
|
|
||||||
kernels., by default 3
|
|
||||||
gamma : str, optional
|
|
||||||
Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.if gamma='scale'
|
|
||||||
(default) is passed then it uses 1 / (n_features * X.var()) as value
|
|
||||||
of gamma,if ‘auto’, uses 1 / n_features., by default "scale"
|
|
||||||
split_criteria : str, optional
|
|
||||||
Decides (just in case of a multi class classification) which column
|
|
||||||
(class) use to split the dataset in a node. max_samples is
|
|
||||||
incompatible with 'ovo' multiclass_strategy, by default "impurity"
|
|
||||||
criterion : str, optional
|
|
||||||
The function to measure the quality of a split (only used if
|
|
||||||
max_features != num_features). Supported criteria are “gini” for the
|
|
||||||
Gini impurity and “entropy” for the information gain., by default
|
|
||||||
"entropy"
|
|
||||||
min_samples_split : int, optional
|
|
||||||
The minimum number of samples required to split an internal node. 0
|
|
||||||
(default) for any, by default 0
|
|
||||||
max_features : optional
|
|
||||||
The number of features to consider when looking for the split: If int,
|
|
||||||
then consider max_features features at each split. If float, then
|
|
||||||
max_features is a fraction and int(max_features * n_features) features
|
|
||||||
are considered at each split. If “auto”, then max_features=
|
|
||||||
sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If
|
|
||||||
“log2”, then max_features=log2(n_features). If None, then max_features=
|
|
||||||
n_features., by default None
|
|
||||||
splitter : str, optional
|
|
||||||
The strategy used to choose the feature set at each node (only used if
|
|
||||||
max_features < num_features). Supported strategies are: “best”: sklearn
|
|
||||||
SelectKBest algorithm is used in every node to choose the max_features
|
|
||||||
best features. “random”: The algorithm generates 5 candidates and
|
|
||||||
choose the best (max. info. gain) of them. "mutual": Chooses the best
|
|
||||||
features w.r.t. their mutual info with the label. "cfs": Apply
|
|
||||||
Correlation-based Feature Selection. "fcbf": Apply Fast Correlation-
|
|
||||||
Based , by default "random"
|
|
||||||
multiclass_strategy : str, optional
|
|
||||||
Strategy to use with multiclass datasets, "ovo": one versus one. "ovr":
|
|
||||||
one versus rest, by default "ovo"
|
|
||||||
normalize : bool, optional
|
|
||||||
If standardization of features should be applied on each node with the
|
|
||||||
samples that reach it , by default False
|
|
||||||
|
|
||||||
Attributes
|
|
||||||
----------
|
|
||||||
classes_ : ndarray of shape (n_classes,)
|
|
||||||
The classes labels.
|
|
||||||
|
|
||||||
n_classes_ : int
|
|
||||||
The number of classes
|
|
||||||
|
|
||||||
n_iter_ : int
|
|
||||||
Max number of iterations in classifier
|
|
||||||
|
|
||||||
depth_ : int
|
|
||||||
Max depth of the tree
|
|
||||||
|
|
||||||
n_features_ : int
|
|
||||||
The number of features when ``fit`` is performed.
|
|
||||||
|
|
||||||
n_features_in_ : int
|
|
||||||
Number of features seen during :term:`fit`.
|
|
||||||
|
|
||||||
max_features_ : int
|
|
||||||
Number of features to use in hyperplane computation
|
|
||||||
|
|
||||||
tree_ : Node
|
|
||||||
root of the tree
|
|
||||||
|
|
||||||
X_ : ndarray
|
|
||||||
points to the input dataset
|
|
||||||
|
|
||||||
y_ : ndarray
|
|
||||||
points to the input labels
|
|
||||||
|
|
||||||
References
|
|
||||||
----------
|
|
||||||
R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class
|
|
||||||
oblique decision tree based on support vector machines.", 2021 LNAI...
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -148,10 +561,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
min_samples_split: int = 0,
|
min_samples_split: int = 0,
|
||||||
max_features=None,
|
max_features=None,
|
||||||
splitter: str = "random",
|
splitter: str = "random",
|
||||||
multiclass_strategy: str = "ovo",
|
|
||||||
normalize: bool = False,
|
normalize: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
self.C = C
|
self.C = C
|
||||||
self.kernel = kernel
|
self.kernel = kernel
|
||||||
@@ -166,7 +577,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.criterion = criterion
|
self.criterion = criterion
|
||||||
self.splitter = splitter
|
self.splitter = splitter
|
||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
self.multiclass_strategy = multiclass_strategy
|
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
def _more_tags(self) -> dict:
|
||||||
"""Required by sklearn to supply features of the classifier
|
"""Required by sklearn to supply features of the classifier
|
||||||
@@ -211,23 +621,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||||
{self.max_depth})"
|
{self.max_depth})"
|
||||||
)
|
)
|
||||||
if self.multiclass_strategy not in ["ovr", "ovo"]:
|
kernels = ["linear", "rbf", "poly", "sigmoid"]
|
||||||
raise ValueError(
|
|
||||||
"mutliclass_strategy has to be either ovr or ovo"
|
|
||||||
f" but got {self.multiclass_strategy}"
|
|
||||||
)
|
|
||||||
if self.multiclass_strategy == "ovo":
|
|
||||||
if self.kernel == "liblinear":
|
|
||||||
raise ValueError(
|
|
||||||
"The kernel liblinear is incompatible with ovo "
|
|
||||||
"multiclass_strategy"
|
|
||||||
)
|
|
||||||
if self.split_criteria == "max_samples":
|
|
||||||
raise ValueError(
|
|
||||||
"The multiclass_strategy 'ovo' is incompatible with "
|
|
||||||
"split_criteria 'max_samples'"
|
|
||||||
)
|
|
||||||
kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
|
||||||
if self.kernel not in kernels:
|
if self.kernel not in kernels:
|
||||||
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
@@ -259,12 +653,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.n_features_ = X.shape[1]
|
self.n_features_ = X.shape[1]
|
||||||
self.n_features_in_ = X.shape[1]
|
self.n_features_in_ = X.shape[1]
|
||||||
self.max_features_ = self._initialize_max_features()
|
self.max_features_ = self._initialize_max_features()
|
||||||
self.tree_ = self._train(X, y, sample_weight, 1, "root")
|
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _train(
|
def train(
|
||||||
self,
|
self,
|
||||||
X: np.ndarray,
|
X: np.ndarray,
|
||||||
y: np.ndarray,
|
y: np.ndarray,
|
||||||
@@ -329,10 +723,10 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
node.make_predictor()
|
node.make_predictor()
|
||||||
return node
|
return node
|
||||||
node.set_up(
|
node.set_up(
|
||||||
self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
||||||
)
|
)
|
||||||
node.set_down(
|
node.set_down(
|
||||||
self._train(
|
self.train(
|
||||||
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -347,7 +741,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
C=self.C,
|
C=self.C,
|
||||||
tol=self.tol,
|
tol=self.tol,
|
||||||
)
|
)
|
||||||
if self.kernel == "liblinear"
|
if self.kernel == "linear"
|
||||||
else SVC(
|
else SVC(
|
||||||
kernel=self.kernel,
|
kernel=self.kernel,
|
||||||
max_iter=self.max_iter,
|
max_iter=self.max_iter,
|
||||||
@@ -356,7 +750,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
gamma=self.gamma,
|
gamma=self.gamma,
|
||||||
degree=self.degree,
|
degree=self.degree,
|
||||||
random_state=self.random_state,
|
random_state=self.random_state,
|
||||||
decision_function_shape=self.multiclass_strategy,
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -499,12 +892,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
elif self.max_features is None:
|
elif self.max_features is None:
|
||||||
max_features = self.n_features_
|
max_features = self.n_features_
|
||||||
elif isinstance(self.max_features, numbers.Integral):
|
elif isinstance(self.max_features, numbers.Integral):
|
||||||
if self.max_features > self.n_features_:
|
|
||||||
raise ValueError(
|
|
||||||
"Invalid value for max_features. "
|
|
||||||
"It can not be greater than number of features "
|
|
||||||
f"({self.n_features_})"
|
|
||||||
)
|
|
||||||
max_features = self.max_features
|
max_features = self.max_features
|
||||||
else: # float
|
else: # float
|
||||||
if self.max_features > 0.0:
|
if self.max_features > 0.0:
|
||||||
|
@@ -1,10 +1,11 @@
|
|||||||
from .Strees import Stree, Siterator
|
from .Strees import Stree, Snode, Siterator, Splitter
|
||||||
|
|
||||||
__version__ = "1.2.1"
|
__version__ = "1.0"
|
||||||
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT License"
|
__license__ = "MIT License"
|
||||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||||
|
__url__ = "https://github.com/doctorado-ml/stree"
|
||||||
|
|
||||||
__all__ = ["Stree", "Siterator"]
|
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||||
|
@@ -1,19 +1,14 @@
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from stree import Stree
|
from stree import Stree, Snode
|
||||||
from stree.Splitter import Snode
|
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
class Snode_test(unittest.TestCase):
|
class Snode_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree(
|
self._clf = Stree(random_state=self._random_state)
|
||||||
random_state=self._random_state,
|
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
)
|
|
||||||
self._clf.fit(*load_dataset(self._random_state))
|
self._clf.fit(*load_dataset(self._random_state))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
@@ -5,8 +5,8 @@ import random
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from sklearn.datasets import load_wine, load_iris
|
from sklearn.datasets import load_wine, load_iris
|
||||||
from stree.Splitter import Splitter
|
from stree import Splitter
|
||||||
from .utils import load_dataset, load_disc_dataset
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
class Splitter_test(unittest.TestCase):
|
class Splitter_test(unittest.TestCase):
|
||||||
@@ -195,14 +195,10 @@ class Splitter_test(unittest.TestCase):
|
|||||||
[0, 3, 7, 12], # random entropy impurity
|
[0, 3, 7, 12], # random entropy impurity
|
||||||
[1, 7, 9, 12], # random gini max_samples
|
[1, 7, 9, 12], # random gini max_samples
|
||||||
[1, 5, 8, 12], # random gini impurity
|
[1, 5, 8, 12], # random gini impurity
|
||||||
[6, 9, 11, 12], # mutual entropy max_samples
|
|
||||||
[6, 9, 11, 12], # mutual entropy impurity
|
|
||||||
[6, 9, 11, 12], # mutual gini max_samples
|
|
||||||
[6, 9, 11, 12], # mutual gini impurity
|
|
||||||
]
|
]
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
rn = 0
|
rn = 0
|
||||||
for feature_select in ["best", "random", "mutual"]:
|
for feature_select in ["best", "random"]:
|
||||||
for criterion in ["entropy", "gini"]:
|
for criterion in ["entropy", "gini"]:
|
||||||
for criteria in [
|
for criteria in [
|
||||||
"max_samples",
|
"max_samples",
|
||||||
@@ -225,7 +221,7 @@ class Splitter_test(unittest.TestCase):
|
|||||||
# criteria,
|
# criteria,
|
||||||
# )
|
# )
|
||||||
# )
|
# )
|
||||||
self.assertListEqual(expected, sorted(list(computed)))
|
self.assertListEqual(expected, list(computed))
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
X[:, computed].tolist(), dataset.tolist()
|
X[:, computed].tolist(), dataset.tolist()
|
||||||
)
|
)
|
||||||
@@ -244,69 +240,3 @@ class Splitter_test(unittest.TestCase):
|
|||||||
Xs, computed = tcl.get_subspace(X, y, k)
|
Xs, computed = tcl.get_subspace(X, y, k)
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
def test_get_best_subspaces_discrete(self):
|
|
||||||
results = [
|
|
||||||
(4, [0, 3, 16, 18]),
|
|
||||||
(7, [0, 3, 13, 14, 16, 18, 19]),
|
|
||||||
(9, [0, 3, 7, 13, 14, 15, 16, 18, 19]),
|
|
||||||
]
|
|
||||||
X, y = load_disc_dataset(n_features=20)
|
|
||||||
for k, expected in results:
|
|
||||||
tcl = self.build(
|
|
||||||
feature_select="best",
|
|
||||||
)
|
|
||||||
Xs, computed = tcl.get_subspace(X, y, k)
|
|
||||||
self.assertListEqual(expected, list(computed))
|
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
|
||||||
|
|
||||||
def test_get_cfs_subspaces(self):
|
|
||||||
results = [
|
|
||||||
(4, [1, 5, 9, 12]),
|
|
||||||
(6, [1, 5, 9, 12, 4, 2]),
|
|
||||||
(7, [1, 5, 9, 12, 4, 2, 3]),
|
|
||||||
]
|
|
||||||
X, y = load_dataset(n_features=20, n_informative=7)
|
|
||||||
for k, expected in results:
|
|
||||||
tcl = self.build(feature_select="cfs")
|
|
||||||
Xs, computed = tcl.get_subspace(X, y, k)
|
|
||||||
self.assertListEqual(expected, list(computed))
|
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
|
||||||
|
|
||||||
def test_get_fcbf_subspaces(self):
|
|
||||||
results = [
|
|
||||||
(4, [1, 5, 9, 12]),
|
|
||||||
(6, [1, 5, 9, 12, 4, 2]),
|
|
||||||
(7, [1, 5, 9, 12, 4, 2, 16]),
|
|
||||||
]
|
|
||||||
for rs, expected in results:
|
|
||||||
X, y = load_dataset(n_features=20, n_informative=7)
|
|
||||||
tcl = self.build(feature_select="fcbf", random_state=rs)
|
|
||||||
Xs, computed = tcl.get_subspace(X, y, rs)
|
|
||||||
self.assertListEqual(expected, list(computed))
|
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
|
||||||
|
|
||||||
def test_get_iwss_subspaces(self):
|
|
||||||
results = [
|
|
||||||
(4, [1, 5, 9, 12]),
|
|
||||||
(6, [1, 5, 9, 12, 4, 15]),
|
|
||||||
]
|
|
||||||
for rs, expected in results:
|
|
||||||
X, y = load_dataset(n_features=20, n_informative=7)
|
|
||||||
tcl = self.build(feature_select="iwss", random_state=rs)
|
|
||||||
Xs, computed = tcl.get_subspace(X, y, rs)
|
|
||||||
self.assertListEqual(expected, list(computed))
|
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
|
||||||
|
|
||||||
def test_get_trandom_subspaces(self):
|
|
||||||
results = [
|
|
||||||
(4, [3, 7, 9, 12]),
|
|
||||||
(6, [0, 1, 2, 8, 15, 18]),
|
|
||||||
(7, [1, 2, 4, 8, 10, 12, 13]),
|
|
||||||
]
|
|
||||||
for rs, expected in results:
|
|
||||||
X, y = load_dataset(n_features=20, n_informative=7)
|
|
||||||
tcl = self.build(feature_select="trandom", random_state=rs)
|
|
||||||
Xs, computed = tcl.get_subspace(X, y, rs)
|
|
||||||
self.assertListEqual(expected, list(computed))
|
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
|
||||||
|
@@ -7,15 +7,14 @@ from sklearn.datasets import load_iris, load_wine
|
|||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
from stree import Stree
|
from stree import Stree, Snode
|
||||||
from stree.Splitter import Snode
|
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
class Stree_test(unittest.TestCase):
|
class Stree_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
self._kernels = ["linear", "rbf", "poly"]
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -23,9 +22,10 @@ class Stree_test(unittest.TestCase):
|
|||||||
os.environ["TESTING"] = "1"
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
def test_valid_kernels(self):
|
def test_valid_kernels(self):
|
||||||
|
valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
|
||||||
X, y = load_dataset()
|
X, y = load_dataset()
|
||||||
for kernel in self._kernels:
|
for kernel in valid_kernels:
|
||||||
clf = Stree(kernel=kernel, multiclass_strategy="ovr")
|
clf = Stree(kernel=kernel)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertIsNotNone(clf.tree_)
|
self.assertIsNotNone(clf.tree_)
|
||||||
|
|
||||||
@@ -55,19 +55,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
# i.e. The partition algorithm didn't forget any sample
|
# i.e. The partition algorithm didn't forget any sample
|
||||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||||
labels_d, count_d = np.unique(y_down, return_counts=True)
|
_, count_d = np.unique(y_down, return_counts=True)
|
||||||
labels_u, count_u = np.unique(y_up, return_counts=True)
|
_, count_u = np.unique(y_up, return_counts=True)
|
||||||
dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
|
|
||||||
dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
|
|
||||||
#
|
#
|
||||||
for i in unique_y:
|
for i in unique_y:
|
||||||
|
number_up = count_u[i]
|
||||||
try:
|
try:
|
||||||
number_up = dict_u[i]
|
number_down = count_d[i]
|
||||||
except KeyError:
|
except IndexError:
|
||||||
number_up = 0
|
|
||||||
try:
|
|
||||||
number_down = dict_d[i]
|
|
||||||
except KeyError:
|
|
||||||
number_down = 0
|
number_down = 0
|
||||||
self.assertEqual(count_y[i], number_down + number_up)
|
self.assertEqual(count_y[i], number_down + number_up)
|
||||||
# Is the partition made the same as the prediction?
|
# Is the partition made the same as the prediction?
|
||||||
@@ -82,22 +77,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
"""Check if the tree is built the same way as predictions of models"""
|
"""Check if the tree is built the same way as predictions of models"""
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
kernel="sigmoid",
|
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
self._check_tree(clf.tree_)
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
def test_single_prediction(self):
|
def test_single_prediction(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
kernel=kernel,
|
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||||
self.assertEqual(yp[0], y[0])
|
self.assertEqual(yp[0], y[0])
|
||||||
|
|
||||||
@@ -105,12 +92,8 @@ class Stree_test(unittest.TestCase):
|
|||||||
# First 27 elements the predictions are the same as the truth
|
# First 27 elements the predictions are the same as the truth
|
||||||
num = 27
|
num = 27
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in ["liblinear", "linear", "rbf", "poly"]:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
kernel=kernel,
|
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
yp = clf.fit(X, y).predict(X[:num, :])
|
yp = clf.fit(X, y).predict(X[:num, :])
|
||||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||||
|
|
||||||
@@ -120,11 +103,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
"""
|
"""
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
kernel=kernel,
|
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
# Compute prediction line by line
|
# Compute prediction line by line
|
||||||
yp_line = np.array([], dtype=int)
|
yp_line = np.array([], dtype=int)
|
||||||
@@ -156,13 +135,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
computed = []
|
computed = []
|
||||||
expected_string = ""
|
expected_string = ""
|
||||||
clf = Stree(
|
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
for node in iter(clf):
|
for node in clf:
|
||||||
computed.append(str(node))
|
computed.append(str(node))
|
||||||
expected_string += str(node) + "\n"
|
expected_string += str(node) + "\n"
|
||||||
self.assertListEqual(expected, computed)
|
self.assertListEqual(expected, computed)
|
||||||
@@ -198,12 +173,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_check_max_depth(self):
|
def test_check_max_depth(self):
|
||||||
depths = (3, 4)
|
depths = (3, 4)
|
||||||
for depth in depths:
|
for depth in depths:
|
||||||
tcl = Stree(
|
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
max_depth=depth,
|
|
||||||
)
|
|
||||||
tcl.fit(*load_dataset(self._random_state))
|
tcl.fit(*load_dataset(self._random_state))
|
||||||
self.assertEqual(depth, tcl.depth_)
|
self.assertEqual(depth, tcl.depth_)
|
||||||
|
|
||||||
@@ -224,7 +194,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
split_criteria="max_samples",
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
px = [[1, 2], [5, 6], [9, 10]]
|
px = [[1, 2], [5, 6], [9, 10]]
|
||||||
@@ -235,36 +205,26 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertListEqual(py, clf.classes_.tolist())
|
self.assertListEqual(py, clf.classes_.tolist())
|
||||||
|
|
||||||
def test_muticlass_dataset(self):
|
def test_muticlass_dataset(self):
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
|
||||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
|
||||||
datasets = {
|
datasets = {
|
||||||
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
||||||
"Iris": load_wine(return_X_y=True),
|
"Iris": load_wine(return_X_y=True),
|
||||||
}
|
}
|
||||||
outcomes = {
|
outcomes = {
|
||||||
"Synt": {
|
"Synt": {
|
||||||
"max_samples liblinear": 0.9493333333333334,
|
"max_samples linear": 0.9606666666666667,
|
||||||
"max_samples linear": 0.9426666666666667,
|
"max_samples rbf": 0.7133333333333334,
|
||||||
"max_samples rbf": 0.9606666666666667,
|
"max_samples poly": 0.618,
|
||||||
"max_samples poly": 0.9373333333333334,
|
"impurity linear": 0.9606666666666667,
|
||||||
"max_samples sigmoid": 0.824,
|
"impurity rbf": 0.7133333333333334,
|
||||||
"impurity liblinear": 0.9493333333333334,
|
"impurity poly": 0.618,
|
||||||
"impurity linear": 0.9426666666666667,
|
|
||||||
"impurity rbf": 0.9606666666666667,
|
|
||||||
"impurity poly": 0.9373333333333334,
|
|
||||||
"impurity sigmoid": 0.824,
|
|
||||||
},
|
},
|
||||||
"Iris": {
|
"Iris": {
|
||||||
"max_samples liblinear": 0.9550561797752809,
|
|
||||||
"max_samples linear": 1.0,
|
"max_samples linear": 1.0,
|
||||||
"max_samples rbf": 0.6685393258426966,
|
"max_samples rbf": 0.6910112359550562,
|
||||||
"max_samples poly": 0.6853932584269663,
|
"max_samples poly": 0.6966292134831461,
|
||||||
"max_samples sigmoid": 0.6404494382022472,
|
"impurity linear": 1,
|
||||||
"impurity liblinear": 0.9550561797752809,
|
"impurity rbf": 0.6910112359550562,
|
||||||
"impurity linear": 1.0,
|
"impurity poly": 0.6966292134831461,
|
||||||
"impurity rbf": 0.6685393258426966,
|
|
||||||
"impurity poly": 0.6853932584269663,
|
|
||||||
"impurity sigmoid": 0.6404494382022472,
|
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -273,22 +233,18 @@ class Stree_test(unittest.TestCase):
|
|||||||
for criteria in ["max_samples", "impurity"]:
|
for criteria in ["max_samples", "impurity"]:
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
max_iter=1e4,
|
C=55,
|
||||||
multiclass_strategy="ovr"
|
max_iter=1e5,
|
||||||
if kernel == "liblinear"
|
|
||||||
else "ovo",
|
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
clf.fit(px, py)
|
clf.fit(px, py)
|
||||||
outcome = outcomes[name][f"{criteria} {kernel}"]
|
outcome = outcomes[name][f"{criteria} {kernel}"]
|
||||||
# print(f'"{criteria} {kernel}": {clf.score(px, py)},')
|
# print(
|
||||||
self.assertAlmostEqual(
|
# f"{name} {criteria} {kernel} {outcome} {clf.score(px"
|
||||||
outcome,
|
# ", py)}"
|
||||||
clf.score(px, py),
|
# )
|
||||||
5,
|
self.assertAlmostEqual(outcome, clf.score(px, py))
|
||||||
f"{name} - {criteria} - {kernel}",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_max_features(self):
|
def test_max_features(self):
|
||||||
n_features = 16
|
n_features = 16
|
||||||
@@ -313,12 +269,6 @@ class Stree_test(unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = clf._initialize_max_features()
|
_ = clf._initialize_max_features()
|
||||||
|
|
||||||
def test_wrong_max_features(self):
|
|
||||||
X, y = load_dataset(n_features=15)
|
|
||||||
clf = Stree(max_features=16)
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
clf.fit(X, y)
|
|
||||||
|
|
||||||
def test_get_subspaces(self):
|
def test_get_subspaces(self):
|
||||||
dataset = np.random.random((10, 16))
|
dataset = np.random.random((10, 16))
|
||||||
y = np.random.randint(0, 2, 10)
|
y = np.random.randint(0, 2, 10)
|
||||||
@@ -356,19 +306,17 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf.predict(X[:, :3])
|
clf.predict(X[:, :3])
|
||||||
|
|
||||||
# Tests of score
|
# Tests of score
|
||||||
|
|
||||||
def test_score_binary(self):
|
def test_score_binary(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
accuracies = [
|
accuracies = [
|
||||||
0.9506666666666667,
|
0.9506666666666667,
|
||||||
0.9493333333333334,
|
|
||||||
0.9606666666666667,
|
0.9606666666666667,
|
||||||
0.9433333333333334,
|
0.9433333333333334,
|
||||||
0.9153333333333333,
|
|
||||||
]
|
]
|
||||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
)
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
@@ -380,12 +328,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_score_max_features(self):
|
def test_score_max_features(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
clf = Stree(
|
clf = Stree(random_state=self._random_state, max_features=2)
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
max_features=2,
|
|
||||||
)
|
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
||||||
|
|
||||||
@@ -397,9 +340,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_multiclass_classifier_integrity(self):
|
def test_multiclass_classifier_integrity(self):
|
||||||
"""Checks if the multiclass operation is done right"""
|
"""Checks if the multiclass operation is done right"""
|
||||||
X, y = load_iris(return_X_y=True)
|
X, y = load_iris(return_X_y=True)
|
||||||
clf = Stree(
|
clf = Stree(random_state=0)
|
||||||
kernel="liblinear", multiclass_strategy="ovr", random_state=0
|
|
||||||
)
|
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
score = clf.score(X, y)
|
score = clf.score(X, y)
|
||||||
# Check accuracy of the whole model
|
# Check accuracy of the whole model
|
||||||
@@ -455,10 +396,10 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="rbf", random_state=self._random_state, normalize=True
|
kernel="rbf", random_state=self._random_state, normalize=True
|
||||||
)
|
)
|
||||||
self.assertEqual(0.966, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.768, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.964, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_poly(self):
|
def test_score_multiclass_poly(self):
|
||||||
@@ -476,78 +417,24 @@ class Stree_test(unittest.TestCase):
|
|||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(0.946, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.972, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_liblinear(self):
|
|
||||||
X, y = load_dataset(
|
|
||||||
random_state=self._random_state,
|
|
||||||
n_classes=3,
|
|
||||||
n_features=5,
|
|
||||||
n_samples=500,
|
|
||||||
)
|
|
||||||
clf = Stree(
|
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
C=10,
|
|
||||||
)
|
|
||||||
clf2 = Stree(
|
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
normalize=True,
|
|
||||||
)
|
|
||||||
self.assertEqual(0.968, clf.fit(X, y).score(X, y))
|
|
||||||
self.assertEqual(0.97, clf2.fit(X, y).score(X, y))
|
|
||||||
X, y = load_wine(return_X_y=True)
|
|
||||||
self.assertEqual(1.0, clf.fit(X, y).score(X, y))
|
|
||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
|
||||||
|
|
||||||
def test_score_multiclass_sigmoid(self):
|
|
||||||
X, y = load_dataset(
|
|
||||||
random_state=self._random_state,
|
|
||||||
n_classes=3,
|
|
||||||
n_features=5,
|
|
||||||
n_samples=500,
|
|
||||||
)
|
|
||||||
clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10)
|
|
||||||
clf2 = Stree(
|
|
||||||
kernel="sigmoid",
|
|
||||||
random_state=self._random_state,
|
|
||||||
normalize=True,
|
|
||||||
C=10,
|
|
||||||
)
|
|
||||||
self.assertEqual(0.796, clf.fit(X, y).score(X, y))
|
|
||||||
self.assertEqual(0.952, clf2.fit(X, y).score(X, y))
|
|
||||||
X, y = load_wine(return_X_y=True)
|
|
||||||
self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y))
|
|
||||||
self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y))
|
|
||||||
|
|
||||||
def test_score_multiclass_linear(self):
|
def test_score_multiclass_linear(self):
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
|
||||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
n_features=5,
|
n_features=5,
|
||||||
n_samples=1500,
|
n_samples=1500,
|
||||||
)
|
)
|
||||||
clf = Stree(
|
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||||
kernel="liblinear",
|
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
)
|
|
||||||
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
||||||
# Check with context based standardization
|
# Check with context based standardization
|
||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="liblinear",
|
kernel="linear", random_state=self._random_state, normalize=True
|
||||||
multiclass_strategy="ovr",
|
|
||||||
random_state=self._random_state,
|
|
||||||
normalize=True,
|
|
||||||
)
|
)
|
||||||
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
@@ -574,7 +461,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
||||||
yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
|
yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5])
|
||||||
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
||||||
model1 = Stree().fit(X, y)
|
model1 = Stree().fit(X, y)
|
||||||
model2 = Stree().fit(X, y, w)
|
model2 = Stree().fit(X, y, w)
|
||||||
@@ -611,14 +498,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(31, nodes)
|
self.assertEqual(25, nodes)
|
||||||
self.assertEqual(16, leaves)
|
self.assertEqual(13, leaves)
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(11, nodes)
|
self.assertEqual(9, nodes)
|
||||||
self.assertEqual(6, leaves)
|
self.assertEqual(5, leaves)
|
||||||
|
|
||||||
def test_nodes_leaves_artificial(self):
|
def test_nodes_leaves_artificial(self):
|
||||||
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
||||||
@@ -637,27 +524,3 @@ class Stree_test(unittest.TestCase):
|
|||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(6, nodes)
|
self.assertEqual(6, nodes)
|
||||||
self.assertEqual(2, leaves)
|
self.assertEqual(2, leaves)
|
||||||
|
|
||||||
def test_bogus_multiclass_strategy(self):
|
|
||||||
clf = Stree(multiclass_strategy="other")
|
|
||||||
X, y = load_wine(return_X_y=True)
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
clf.fit(X, y)
|
|
||||||
|
|
||||||
def test_multiclass_strategy(self):
|
|
||||||
X, y = load_wine(return_X_y=True)
|
|
||||||
clf_o = Stree(multiclass_strategy="ovo")
|
|
||||||
clf_r = Stree(multiclass_strategy="ovr")
|
|
||||||
score_o = clf_o.fit(X, y).score(X, y)
|
|
||||||
score_r = clf_r.fit(X, y).score(X, y)
|
|
||||||
self.assertEqual(1.0, score_o)
|
|
||||||
self.assertEqual(0.9269662921348315, score_r)
|
|
||||||
|
|
||||||
def test_incompatible_hyperparameters(self):
|
|
||||||
X, y = load_wine(return_X_y=True)
|
|
||||||
clf = Stree(kernel="liblinear", multiclass_strategy="ovo")
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
clf.fit(X, y)
|
|
||||||
clf = Stree(multiclass_strategy="ovo", split_criteria="max_samples")
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
clf.fit(X, y)
|
|
||||||
|
@@ -1,14 +1,11 @@
|
|||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
def load_dataset(
|
def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
|
||||||
random_state=0, n_classes=2, n_features=3, n_samples=1500, n_informative=3
|
|
||||||
):
|
|
||||||
X, y = make_classification(
|
X, y = make_classification(
|
||||||
n_samples=n_samples,
|
n_samples=n_samples,
|
||||||
n_features=n_features,
|
n_features=n_features,
|
||||||
n_informative=n_informative,
|
n_informative=3,
|
||||||
n_redundant=0,
|
n_redundant=0,
|
||||||
n_repeated=0,
|
n_repeated=0,
|
||||||
n_classes=n_classes,
|
n_classes=n_classes,
|
||||||
@@ -18,12 +15,3 @@ def load_dataset(
|
|||||||
random_state=random_state,
|
random_state=random_state,
|
||||||
)
|
)
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
|
||||||
def load_disc_dataset(
|
|
||||||
random_state=0, n_classes=2, n_features=3, n_samples=1500
|
|
||||||
):
|
|
||||||
np.random.seed(random_state)
|
|
||||||
X = np.random.randint(1, 17, size=(n_samples, n_features)).astype(float)
|
|
||||||
y = np.random.randint(low=0, high=n_classes, size=(n_samples), dtype=int)
|
|
||||||
return X, y
|
|
||||||
|
Reference in New Issue
Block a user