mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
15 Commits
Author | SHA1 | Date | |
---|---|---|---|
cc931d8547
|
|||
b044a057df
|
|||
fc48bc8ba4
|
|||
|
8251f07674 | ||
|
0b15a5af11 | ||
|
28d905368b | ||
e5d49132ec
|
|||
8daecc4726
|
|||
|
bf678df159 | ||
|
36b08b1bcf | ||
36ff3da26d
|
|||
|
6b281ebcc8 | ||
|
3aaddd096f | ||
|
15a5a4c407 | ||
|
0afe14a447 |
3
.github/workflows/main.yml
vendored
3
.github/workflows/main.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
|||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
os: [macos-latest, ubuntu-latest]
|
os: [macos-latest, ubuntu-latest, windows-latest]
|
||||||
python: [3.8]
|
python: [3.8]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
@@ -26,7 +26,6 @@ jobs:
|
|||||||
pip install -q --upgrade pip
|
pip install -q --upgrade pip
|
||||||
pip install -q -r requirements.txt
|
pip install -q -r requirements.txt
|
||||||
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
||||||
pip install -q git+https://github.com/doctorado-ml/mfs
|
|
||||||
- name: Lint
|
- name: Lint
|
||||||
run: |
|
run: |
|
||||||
black --check --diff stree
|
black --check --diff stree
|
||||||
|
37
CITATION.cff
Normal file
37
CITATION.cff
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
cff-version: 1.2.0
|
||||||
|
message: "If you use this software, please cite it as below."
|
||||||
|
authors:
|
||||||
|
- family-names: "Montañana"
|
||||||
|
given-names: "Ricardo"
|
||||||
|
orcid: "https://orcid.org/0000-0003-3242-5452"
|
||||||
|
- family-names: "Gámez"
|
||||||
|
given-names: "José A."
|
||||||
|
orcid: "https://orcid.org/0000-0003-1188-1117"
|
||||||
|
- family-names: "Puerta"
|
||||||
|
given-names: "José M."
|
||||||
|
orcid: "https://orcid.org/0000-0002-9164-5191"
|
||||||
|
title: "STree"
|
||||||
|
version: 1.0.2
|
||||||
|
doi: 10.5281/zenodo.5504083
|
||||||
|
date-released: 2021-11-02
|
||||||
|
url: "https://github.com/Doctorado-ML/STree"
|
||||||
|
preferred-citation:
|
||||||
|
type: article
|
||||||
|
authors:
|
||||||
|
- family-names: "Montañana"
|
||||||
|
given-names: "Ricardo"
|
||||||
|
orcid: "https://orcid.org/0000-0003-3242-5452"
|
||||||
|
- family-names: "Gámez"
|
||||||
|
given-names: "José A."
|
||||||
|
orcid: "https://orcid.org/0000-0003-1188-1117"
|
||||||
|
- family-names: "Puerta"
|
||||||
|
given-names: "José M."
|
||||||
|
orcid: "https://orcid.org/0000-0002-9164-5191"
|
||||||
|
doi: "10.1007/978-3-030-85713-4_6"
|
||||||
|
journal: "Lecture Notes in Computer Science"
|
||||||
|
month: 9
|
||||||
|
start: 54
|
||||||
|
end: 64
|
||||||
|
title: "STree: A Single Multi-class Oblique Decision Tree Based on Support Vector Machines"
|
||||||
|
volume: 12882
|
||||||
|
year: 2021
|
1
Makefile
1
Makefile
@@ -26,6 +26,7 @@ doc: ## Update documentation
|
|||||||
|
|
||||||
build: ## Build package
|
build: ## Build package
|
||||||
rm -fr dist/*
|
rm -fr dist/*
|
||||||
|
rm -fr build/*
|
||||||
python setup.py sdist bdist_wheel
|
python setup.py sdist bdist_wheel
|
||||||
|
|
||||||
doc-clean: ## Update documentation
|
doc-clean: ## Update documentation
|
||||||
|
44
README.md
44
README.md
@@ -2,6 +2,9 @@
|
|||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
[](https://badge.fury.io/py/STree)
|
||||||
|

|
||||||
|
[](https://zenodo.org/badge/latestdoi/262658230)
|
||||||
|
|
||||||
# STree
|
# STree
|
||||||
|
|
||||||
@@ -23,8 +26,6 @@ Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
|
|||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
@@ -35,24 +36,23 @@ Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)
|
|||||||
|
|
||||||
## Hyperparameters
|
## Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------- | ------------------------------------------------------ | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | -------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn library |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn library |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random", "mutual", "cfs", "fcbf"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter |
|
| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **“trandom”**: The algorithm generates only one random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
||||||
|
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
@@ -73,3 +73,7 @@ python -m unittest -v stree.tests
|
|||||||
## License
|
## License
|
||||||
|
|
||||||
STree is [MIT](https://github.com/doctorado-ml/stree/blob/master/LICENSE) licensed
|
STree is [MIT](https://github.com/doctorado-ml/stree/blob/master/LICENSE) licensed
|
||||||
|
|
||||||
|
## Reference
|
||||||
|
|
||||||
|
R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class oblique decision tree based on support vector machines.", 2021 LNAI 12882, pg. 54-64
|
||||||
|
@@ -1,4 +1,4 @@
|
|||||||
sphinx
|
sphinx
|
||||||
sphinx-rtd-theme
|
sphinx-rtd-theme
|
||||||
myst-parser
|
myst-parser
|
||||||
git+https://github.com/doctorado-ml/stree
|
mufs
|
@@ -54,4 +54,4 @@ html_theme = "sphinx_rtd_theme"
|
|||||||
# Add any paths that contain custom static files (such as style sheets) here,
|
# Add any paths that contain custom static files (such as style sheets) here,
|
||||||
# relative to this directory. They are copied after the builtin static files,
|
# relative to this directory. They are copied after the builtin static files,
|
||||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||||
html_static_path = ["_static"]
|
html_static_path = []
|
||||||
|
@@ -2,8 +2,6 @@
|
|||||||
|
|
||||||
## Notebooks
|
## Notebooks
|
||||||
|
|
||||||
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
|
||||||
|
@@ -1,22 +1,22 @@
|
|||||||
# Hyperparameters
|
# Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------- | ------------------------------------------------------ | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | -------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn library |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn library |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random", "mutual", "cfs", "fcbf"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter |
|
| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and choose the best (max. info. gain) of them. **“trandom”**: The algorithm generates only one random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,9 +1,12 @@
|
|||||||
# STree
|
# STree
|
||||||
|
|
||||||
[](https://app.codeship.com/projects/399170)
|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
[](https://badge.fury.io/py/STree)
|
||||||
|

|
||||||
|
[](https://zenodo.org/badge/latestdoi/262658230)
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
|
@@ -178,7 +178,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Stree\n",
|
"# Stree\n",
|
||||||
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
|
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3, kernel=\"liblinear\", multiclass_strategy=\"ovr\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -368,4 +368,4 @@
|
|||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 4
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
@@ -1,2 +1,2 @@
|
|||||||
scikit-learn>0.24
|
scikit-learn>0.24
|
||||||
mfs
|
mufs
|
2
setup.py
2
setup.py
@@ -44,7 +44,7 @@ setuptools.setup(
|
|||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
],
|
],
|
||||||
install_requires=["scikit-learn", "numpy", "mfs"],
|
install_requires=["scikit-learn", "mufs"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
@@ -12,12 +12,32 @@ from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
|||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
from mfs import MFS
|
from mufs import MUFS
|
||||||
|
|
||||||
|
|
||||||
class Snode:
|
class Snode:
|
||||||
"""Nodes of the tree that keeps the svm classifier and if testing the
|
"""
|
||||||
|
Nodes of the tree that keeps the svm classifier and if testing the
|
||||||
dataset assigned to it
|
dataset assigned to it
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
clf : SVC
|
||||||
|
Classifier used
|
||||||
|
X : np.ndarray
|
||||||
|
input dataset in train time (only in testing)
|
||||||
|
y : np.ndarray
|
||||||
|
input labels in train time
|
||||||
|
features : np.array
|
||||||
|
features used to compute hyperplane
|
||||||
|
impurity : float
|
||||||
|
impurity of the node
|
||||||
|
title : str
|
||||||
|
label describing the route to the node
|
||||||
|
weight : np.ndarray, optional
|
||||||
|
weights applied to input dataset in train time, by default None
|
||||||
|
scaler : StandardScaler, optional
|
||||||
|
scaler used if any, by default None
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -165,6 +185,56 @@ class Siterator:
|
|||||||
|
|
||||||
|
|
||||||
class Splitter:
|
class Splitter:
|
||||||
|
"""
|
||||||
|
Splits a dataset in two based on different criteria
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
clf : SVC, optional
|
||||||
|
classifier, by default None
|
||||||
|
criterion : str, optional
|
||||||
|
The function to measure the quality of a split (only used if
|
||||||
|
max_features != num_features). Supported criteria are “gini” for the
|
||||||
|
Gini impurity and “entropy” for the information gain., by default
|
||||||
|
"entropy", by default None
|
||||||
|
feature_select : str, optional
|
||||||
|
The strategy used to choose the feature set at each node (only used if
|
||||||
|
max_features < num_features). Supported strategies are: “best”: sklearn
|
||||||
|
SelectKBest algorithm is used in every node to choose the max_features
|
||||||
|
best features. “random”: The algorithm generates 5 candidates and
|
||||||
|
choose the best (max. info. gain) of them. “trandom”: The algorithm
|
||||||
|
generates only one random combination. "mutual": Chooses the best
|
||||||
|
features w.r.t. their mutual info with the label. "cfs": Apply
|
||||||
|
Correlation-based Feature Selection. "fcbf": Apply Fast Correlation-
|
||||||
|
Based Filter, by default None
|
||||||
|
criteria : str, optional
|
||||||
|
Decides (just in case of a multi class classification) which column
|
||||||
|
(class) use to split the dataset in a node. max_samples is
|
||||||
|
incompatible with 'ovo' multiclass_strategy, by default None
|
||||||
|
min_samples_split : int, optional
|
||||||
|
The minimum number of samples required to split an internal node. 0
|
||||||
|
(default) for any, by default None
|
||||||
|
random_state : optional
|
||||||
|
Controls the pseudo random number generation for shuffling the data for
|
||||||
|
probability estimates. Ignored when probability is False. Pass an int
|
||||||
|
for reproducible output across multiple function calls, by
|
||||||
|
default None
|
||||||
|
normalize : bool, optional
|
||||||
|
If standardization of features should be applied on each node with the
|
||||||
|
samples that reach it, by default False
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
clf has to be a sklearn estimator
|
||||||
|
ValueError
|
||||||
|
criterion must be gini or entropy
|
||||||
|
ValueError
|
||||||
|
criteria has to be max_samples or impurity
|
||||||
|
ValueError
|
||||||
|
splitter must be in {random, trandom, best, mutual, cfs, fcbf, iwss}
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
clf: SVC = None,
|
clf: SVC = None,
|
||||||
@@ -175,6 +245,7 @@ class Splitter:
|
|||||||
random_state=None,
|
random_state=None,
|
||||||
normalize=False,
|
normalize=False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._random_state = random_state
|
self._random_state = random_state
|
||||||
if random_state is not None:
|
if random_state is not None:
|
||||||
@@ -201,10 +272,19 @@ class Splitter:
|
|||||||
f"criteria has to be max_samples or impurity; got ({criteria})"
|
f"criteria has to be max_samples or impurity; got ({criteria})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if feature_select not in ["random", "best", "mutual", "cfs", "fcbf"]:
|
if feature_select not in [
|
||||||
|
"random",
|
||||||
|
"trandom",
|
||||||
|
"best",
|
||||||
|
"mutual",
|
||||||
|
"cfs",
|
||||||
|
"fcbf",
|
||||||
|
"iwss",
|
||||||
|
]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"splitter must be in {random, best, mutual, cfs, fcbf} got "
|
"splitter must be in {random, trandom, best, mutual, cfs, "
|
||||||
f"({feature_select})"
|
"fcbf, iwss} "
|
||||||
|
f"got ({feature_select})"
|
||||||
)
|
)
|
||||||
self.criterion_function = getattr(self, f"_{self._criterion}")
|
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||||
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
||||||
@@ -235,6 +315,31 @@ class Splitter:
|
|||||||
features_sets = self._generate_spaces(n_features, max_features)
|
features_sets = self._generate_spaces(n_features, max_features)
|
||||||
return self._select_best_set(dataset, labels, features_sets)
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fs_trandom(
|
||||||
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return a random feature set combination
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
# Random feature reduction
|
||||||
|
n_features = dataset.shape[1]
|
||||||
|
return tuple(sorted(random.sample(range(n_features), max_features)))
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _fs_best(
|
def _fs_best(
|
||||||
dataset: np.array, labels: np.array, max_features: int
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
@@ -262,9 +367,8 @@ class Splitter:
|
|||||||
.get_support(indices=True)
|
.get_support(indices=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _fs_mutual(
|
def _fs_mutual(
|
||||||
dataset: np.array, labels: np.array, max_features: int
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
"""Return the best features with mutual information with labels
|
"""Return the best features with mutual information with labels
|
||||||
|
|
||||||
@@ -284,7 +388,9 @@ class Splitter:
|
|||||||
indices of the features selected
|
indices of the features selected
|
||||||
"""
|
"""
|
||||||
# return best features with mutual info with the label
|
# return best features with mutual info with the label
|
||||||
feature_list = mutual_info_classif(dataset, labels)
|
feature_list = mutual_info_classif(
|
||||||
|
dataset, labels, random_state=self._random_state
|
||||||
|
)
|
||||||
return tuple(
|
return tuple(
|
||||||
sorted(
|
sorted(
|
||||||
range(len(feature_list)), key=lambda sub: feature_list[sub]
|
range(len(feature_list)), key=lambda sub: feature_list[sub]
|
||||||
@@ -312,8 +418,8 @@ class Splitter:
|
|||||||
tuple
|
tuple
|
||||||
indices of the features selected
|
indices of the features selected
|
||||||
"""
|
"""
|
||||||
mfs = MFS(max_features=max_features, discrete=False)
|
mufs = MUFS(max_features=max_features, discrete=False)
|
||||||
return mfs.cfs(dataset, labels).get_results()
|
return mufs.cfs(dataset, labels).get_results()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _fs_fcbf(
|
def _fs_fcbf(
|
||||||
@@ -336,8 +442,33 @@ class Splitter:
|
|||||||
tuple
|
tuple
|
||||||
indices of the features selected
|
indices of the features selected
|
||||||
"""
|
"""
|
||||||
mfs = MFS(max_features=max_features, discrete=False)
|
mufs = MUFS(max_features=max_features, discrete=False)
|
||||||
return mfs.fcbf(dataset, labels, 5e-4).get_results()
|
return mufs.fcbf(dataset, labels, 5e-4).get_results()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fs_iwss(
|
||||||
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Correlation-based feature selection based on iwss with max_features
|
||||||
|
limit
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
mufs = MUFS(max_features=max_features, discrete=False)
|
||||||
|
return mufs.iwss(dataset, labels, 0.25).get_results()
|
||||||
|
|
||||||
def partition_impurity(self, y: np.array) -> np.array:
|
def partition_impurity(self, y: np.array) -> np.array:
|
||||||
return self.criterion_function(y)
|
return self.criterion_function(y)
|
||||||
|
110
stree/Strees.py
110
stree/Strees.py
@@ -20,11 +20,118 @@ from .Splitter import Splitter, Snode, Siterator
|
|||||||
|
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
"""Estimator that is based on binary trees of svm nodes
|
"""
|
||||||
|
Estimator that is based on binary trees of svm nodes
|
||||||
can deal with sample_weights in predict, used in boosting sklearn methods
|
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||||
inheriting from BaseEstimator implements get_params and set_params methods
|
inheriting from BaseEstimator implements get_params and set_params methods
|
||||||
inheriting from ClassifierMixin implement the attribute _estimator_type
|
inheriting from ClassifierMixin implement the attribute _estimator_type
|
||||||
with "classifier" as value
|
with "classifier" as value
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
C : float, optional
|
||||||
|
Regularization parameter. The strength of the regularization is
|
||||||
|
inversely proportional to C. Must be strictly positive., by default 1.0
|
||||||
|
kernel : str, optional
|
||||||
|
Specifies the kernel type to be used in the algorithm. It must be one
|
||||||
|
of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses
|
||||||
|
[liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and
|
||||||
|
the rest uses [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/)
|
||||||
|
library through scikit-learn library, by default "linear"
|
||||||
|
max_iter : int, optional
|
||||||
|
Hard limit on iterations within solver, or -1 for no limit., by default
|
||||||
|
1e5
|
||||||
|
random_state : int, optional
|
||||||
|
Controls the pseudo random number generation for shuffling the data for
|
||||||
|
probability estimates. Ignored when probability is False. Pass an int
|
||||||
|
for reproducible output across multiple function calls, by
|
||||||
|
default None
|
||||||
|
max_depth : int, optional
|
||||||
|
Specifies the maximum depth of the tree, by default None
|
||||||
|
tol : float, optional
|
||||||
|
Tolerance for stopping, by default 1e-4
|
||||||
|
degree : int, optional
|
||||||
|
Degree of the polynomial kernel function (‘poly’). Ignored by all other
|
||||||
|
kernels., by default 3
|
||||||
|
gamma : str, optional
|
||||||
|
Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. If gamma='scale'
|
||||||
|
(default) is passed then it uses 1 / (n_features * X.var()) as value
|
||||||
|
of gamma; if ‘auto’, uses 1 / n_features, by default "scale"
|
||||||
|
split_criteria : str, optional
|
||||||
|
Decides (just in case of a multi class classification) which column
|
||||||
|
(class) use to split the dataset in a node. max_samples is
|
||||||
|
incompatible with 'ovo' multiclass_strategy, by default "impurity"
|
||||||
|
criterion : str, optional
|
||||||
|
The function to measure the quality of a split (only used if
|
||||||
|
max_features != num_features). Supported criteria are “gini” for the
|
||||||
|
Gini impurity and “entropy” for the information gain., by default
|
||||||
|
"entropy"
|
||||||
|
min_samples_split : int, optional
|
||||||
|
The minimum number of samples required to split an internal node. 0
|
||||||
|
(default) for any, by default 0
|
||||||
|
max_features : optional
|
||||||
|
The number of features to consider when looking for the split: If int,
|
||||||
|
then consider max_features features at each split. If float, then
|
||||||
|
max_features is a fraction and int(max_features * n_features) features
|
||||||
|
are considered at each split. If “auto”, then max_features=
|
||||||
|
sqrt(n_features). If “sqrt”, then max_features=sqrt(n_features). If
|
||||||
|
“log2”, then max_features=log2(n_features). If None, then max_features=
|
||||||
|
n_features., by default None
|
||||||
|
splitter : str, optional
|
||||||
|
The strategy used to choose the feature set at each node (only used if
|
||||||
|
max_features < num_features). Supported strategies are: “best”: sklearn
|
||||||
|
SelectKBest algorithm is used in every node to choose the max_features
|
||||||
|
best features. “random”: The algorithm generates 5 candidates and
|
||||||
|
choose the best (max. info. gain) of them. “trandom”: The algorithm
|
||||||
|
generates only one random combination. "mutual": Chooses the best
|
||||||
|
features w.r.t. their mutual info with the label. "cfs": Apply
|
||||||
|
Correlation-based Feature Selection. "fcbf": Apply Fast Correlation-
|
||||||
|
Based Filter. "iwss": IWSS based selection, by default "random"
|
||||||
|
multiclass_strategy : str, optional
|
||||||
|
Strategy to use with multiclass datasets, "ovo": one versus one. "ovr":
|
||||||
|
one versus rest, by default "ovo"
|
||||||
|
normalize : bool, optional
|
||||||
|
If standardization of features should be applied on each node with the
|
||||||
|
samples that reach it, by default False
|
||||||
|
|
||||||
|
Attributes
|
||||||
|
----------
|
||||||
|
classes_ : ndarray of shape (n_classes,)
|
||||||
|
The classes labels.
|
||||||
|
|
||||||
|
n_classes_ : int
|
||||||
|
The number of classes
|
||||||
|
|
||||||
|
n_iter_ : int
|
||||||
|
Max number of iterations in classifier
|
||||||
|
|
||||||
|
depth_ : int
|
||||||
|
Max depth of the tree
|
||||||
|
|
||||||
|
n_features_ : int
|
||||||
|
The number of features when ``fit`` is performed.
|
||||||
|
|
||||||
|
n_features_in_ : int
|
||||||
|
Number of features seen during :term:`fit`.
|
||||||
|
|
||||||
|
max_features_ : int
|
||||||
|
Number of features to use in hyperplane computation
|
||||||
|
|
||||||
|
tree_ : Node
|
||||||
|
root of the tree
|
||||||
|
|
||||||
|
X_ : ndarray
|
||||||
|
points to the input dataset
|
||||||
|
|
||||||
|
y_ : ndarray
|
||||||
|
points to the input labels
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class
|
||||||
|
oblique decision tree based on support vector machines.", 2021 LNAI 12882
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -45,6 +152,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
multiclass_strategy: str = "ovo",
|
multiclass_strategy: str = "ovo",
|
||||||
normalize: bool = False,
|
normalize: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
self.C = C
|
self.C = C
|
||||||
self.kernel = kernel
|
self.kernel = kernel
|
||||||
|
@@ -1,6 +1,6 @@
|
|||||||
from .Strees import Stree, Siterator
|
from .Strees import Stree, Siterator
|
||||||
|
|
||||||
__version__ = "1.2"
|
__version__ = "1.2.3"
|
||||||
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
||||||
|
@@ -285,3 +285,28 @@ class Splitter_test(unittest.TestCase):
|
|||||||
Xs, computed = tcl.get_subspace(X, y, rs)
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_iwss_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [1, 5, 9, 12]),
|
||||||
|
(6, [1, 5, 9, 12, 4, 15]),
|
||||||
|
]
|
||||||
|
for rs, expected in results:
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
tcl = self.build(feature_select="iwss", random_state=rs)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
|
||||||
|
def test_get_trandom_subspaces(self):
|
||||||
|
results = [
|
||||||
|
(4, [3, 7, 9, 12]),
|
||||||
|
(6, [0, 1, 2, 8, 15, 18]),
|
||||||
|
(7, [1, 2, 4, 8, 10, 12, 13]),
|
||||||
|
]
|
||||||
|
for rs, expected in results:
|
||||||
|
X, y = load_dataset(n_features=20, n_informative=7)
|
||||||
|
tcl = self.build(feature_select="trandom", random_state=rs)
|
||||||
|
Xs, computed = tcl.get_subspace(X, y, rs)
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
|
||||||
|
Reference in New Issue
Block a user