Mirror of https://github.com/Doctorado-ML/STree.git (synced 2025-08-17 16:36:01 +00:00)

Compare commits: package_do ... v1.4.0 (46 commits)
Commits (SHA1):

941c2ff5e0
2ebf48145d
7fbfd3622e
bc839a80d6
ba15ea2cc0
85b56785c8
b627bb7531
5f8ca8f3bb
fb8b9b344f
036d1ba2a7
4de74973b8
28dd04b95a
542bbce7db
5b791bc5bf
c37f044e3a
2f6ae648a1
93be8a89a8
82838fa3e0
f0b2ce3c7b
00ed57c015
08222f109e
cc931d8547
b044a057df
fc48bc8ba4
8251f07674
0b15a5af11
28d905368b
e5d49132ec
8daecc4726
bf678df159
36b08b1bcf
36ff3da26d
6b281ebcc8
3aaddd096f
15a5a4c407
0afe14a447
fc9b7b5c92
3f79d2877f
ecc2800705
0524d47d64
d46f544466
79190ef2e1
4f04e72670
5cef0f4875
28c7558f01
e19d10f6a7
.github/workflows/codeql-analysis.yml (vendored), 14 changed lines
@@ -2,12 +2,12 @@ name: "CodeQL"

 on:
   push:
-    branches: [ master ]
+    branches: [master]
   pull_request:
     # The branches below must be a subset of the branches above
-    branches: [ master ]
+    branches: [master]
   schedule:
-    - cron: '16 17 * * 3'
+    - cron: "16 17 * * 3"

 jobs:
   analyze:
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        language: [ 'python' ]
+        language: ["python"]
        # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
        # Learn more:
        # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
@@ -28,7 +28,7 @@ jobs:

     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v1
+      uses: github/codeql-action/init@v2
       with:
         languages: ${{ matrix.language }}
       # If you wish to specify custom queries, you can do so here or in a config file.
@@ -39,7 +39,7 @@ jobs:
     # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v1
+      uses: github/codeql-action/autobuild@v2

     # ℹ️ Command-line programs to run using the OS shell.
     # 📚 https://git.io/JvXDl
@@ -53,4 +53,4 @@ jobs:
     # make release

     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v1
+      uses: github/codeql-action/analyze@v2
.github/workflows/main.yml (vendored), 12 changed lines
@@ -12,13 +12,13 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [macos-latest, ubuntu-latest]
-        python: [3.8]
+        os: [macos-latest, ubuntu-latest, windows-latest]
+        python: [3.11, 3.12]

     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python }}
       - name: Install dependencies
@@ -28,14 +28,14 @@ jobs:
           pip install -q --upgrade codecov coverage black flake8 codacy-coverage
       - name: Lint
         run: |
-          black --check --diff stree
+          # black --check --diff stree
           flake8 --count stree
       - name: Tests
         run: |
           coverage run -m unittest -v stree.tests
           coverage xml
       - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v1
+        uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
           files: ./coverage.xml
CITATION.cff (new file), 37 lines
@@ -0,0 +1,37 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
  - family-names: "Montañana"
    given-names: "Ricardo"
    orcid: "https://orcid.org/0000-0003-3242-5452"
  - family-names: "Gámez"
    given-names: "José A."
    orcid: "https://orcid.org/0000-0003-1188-1117"
  - family-names: "Puerta"
    given-names: "José M."
    orcid: "https://orcid.org/0000-0002-9164-5191"
title: "STree"
version: 1.2.3
doi: 10.5281/zenodo.5504083
date-released: 2021-11-02
url: "https://github.com/Doctorado-ML/STree"
preferred-citation:
  type: article
  authors:
    - family-names: "Montañana"
      given-names: "Ricardo"
      orcid: "https://orcid.org/0000-0003-3242-5452"
    - family-names: "Gámez"
      given-names: "José A."
      orcid: "https://orcid.org/0000-0003-1188-1117"
    - family-names: "Puerta"
      given-names: "José M."
      orcid: "https://orcid.org/0000-0002-9164-5191"
  doi: "10.1007/978-3-030-85713-4_6"
  journal: "Lecture Notes in Computer Science"
  month: 9
  start: 54
  end: 64
  title: "STree: A Single Multi-class Oblique Decision Tree Based on Support Vector Machines"
  volume: 12882
  year: 2021
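CITATION.cff is plain YAML, so the metadata above can also be consumed programmatically; a minimal sketch, assuming PyYAML is installed (it is not a dependency of this repository):

```python
# Sketch only: load the citation metadata from the CITATION.cff shown above.
import yaml  # PyYAML, assumed available

with open("CITATION.cff") as f:
    cff = yaml.safe_load(f)

print(cff["title"], cff["version"], cff["doi"])  # package-level citation
print(cff["preferred-citation"]["title"])        # the LNCS article
```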
MANIFEST.in (new file), 1 line
@@ -0,0 +1 @@
include README.md LICENSE
Makefile, 41 changed lines
@@ -1,29 +1,36 @@
 SHELL := /bin/bash
 .DEFAULT_GOAL := help
-.PHONY: coverage deps help lint push test
+.PHONY: audit coverage help lint test doc doc-clean build

 coverage: ## Run tests with coverage
-	coverage erase
-	coverage run -m unittest -v stree.tests
-	coverage report -m
+	@coverage erase
+	@coverage run -m unittest -v stree.tests
+	@coverage report -m

-deps: ## Install dependencies
-	pip install -r requirements.txt
-
-lint: ## Lint and static-check
-	black stree
-	flake8 stree
-	mypy stree
-
-push: ## Push code with tags
-	git push && git push --tags
+lint: ## Lint source files
+	@black stree
+	@flake8 stree

 test: ## Run tests
-	python -m unittest -v stree.tests
+	@python -m unittest -v stree.tests

-help: ## Show help message
+doc: ## Update documentation
+	@make -C docs --makefile=Makefile html
+
+build: ## Build package
+	@rm -fr dist/*
+	@rm -fr build/*
+	@hatch build
+
+doc-clean: ## Clean documentation folders
+	@make -C docs --makefile=Makefile clean
+
+audit: ## Audit pip
+	@pip-audit
+
+help: ## Show this help message
 	@IFS=$$'\n' ; \
-	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
+	help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
 	printf "%s\n\n" "Usage: make [task]"; \
 	printf "%-20s %s\n" "task" "help" ; \
 	printf "%-20s %s\n" "------" "----" ; \
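The `help` target above builds its task listing by grepping the Makefile itself for `##` comments. A minimal Python sketch of the same extraction (the regex and formatting are illustrative only, not part of the repository):

```python
# Sketch: reproduce what the Makefile `help` target's grep/sed pipeline
# extracts, i.e. one "target  description" pair per line that carries "##".
import re

with open("Makefile") as f:
    for line in f:
        m = re.match(r"^([a-zA-Z_-]+):.*?## (.*)$", line)
        if m:
            print(f"{m.group(1):<20} {m.group(2)}")
```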
README.md, 28 changed lines
@@ -1,8 +1,12 @@
 ![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg)
+[![CodeQL](https://github.com/Doctorado-ML/STree/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/Doctorado-ML/STree/actions/workflows/codeql-analysis.yml)
 [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
+[![PyPI version](https://badge.fury.io/py/STree.svg)](https://badge.fury.io/py/STree)
+![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen)
+[![DOI](https://zenodo.org/badge/262658230.svg)](https://zenodo.org/badge/latestdoi/262658230)

-# Stree
+# STree

 Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
@@ -11,19 +15,17 @@ Oblique Tree classifier based on SVM nodes
 ## Installation

 ```bash
-pip install git+https://github.com/doctorado-ml/stree
+pip install Stree
 ```

 ## Documentation

-Can be found in
+Can be found in [stree.readthedocs.io](https://stree.readthedocs.io/en/stable/)

 ## Examples

 ### Jupyter notebooks

-- [![binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
-
 - [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark

 - [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
@@ -35,21 +37,23 @@ Can be found in
 ## Hyperparameters

 | | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
 | --- | --- | --- | --- | --- |
 | \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
 | \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
 | \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
 | | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
 | \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
 | \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
-| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
+| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
-| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
+| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi-class classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with 'ovo' multiclass_strategy |
 | | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).<br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
 | | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
 | | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features).<br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.<br>The algorithm generates 5 candidates at most to choose from in both strategies. |
+| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and chooses the best (max. info. gain) of them. **“trandom”**: The algorithm generates only one random combination. **"mutual"**: Chooses the best features w.r.t. their mutual info with the label. **"cfs"**: Apply Correlation-based Feature Selection. **"fcbf"**: Apply Fast Correlation-Based Filter. **"iwss"**: IWSS based algorithm |
 | | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |

 \* Hyperparameter used by the support vector classifier of every node
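A minimal usage sketch tying some of these hyperparameters together (toy data; the values are illustrative only, not recommendations):

```python
# Sketch: fit an Stree with a few of the hyperparameters documented above.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from stree import Stree

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = Stree(
    C=1.0,                      # SVC regularization at every node
    kernel="linear",            # or liblinear / poly / rbf / sigmoid
    max_depth=5,                # cap on tree depth
    splitter="best",            # per-node feature-set strategy
    multiclass_strategy="ovo",  # one-versus-one on multiclass data
    random_state=0,
)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
```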
@@ -70,3 +74,7 @@ python -m unittest -v stree.tests
 ## License

 STree is [MIT](https://github.com/doctorado-ml/stree/blob/master/LICENSE) licensed
+
+## Reference
+
+R. Montañana, J. A. Gámez, J. M. Puerta, "STree: a single multi-class oblique decision tree based on support vector machines.", 2021 LNAI 12882, pp. 54-64
@@ -1,3 +1,4 @@
 sphinx
 sphinx-rtd-theme
 myst-parser
+mufs
@@ -1,7 +1,7 @@
 Siterator
 =========

-.. automodule:: stree
+.. automodule:: Splitter
 .. autoclass:: Siterator
    :members:
    :undoc-members:
@@ -1,7 +1,7 @@
 Snode
 =====

-.. automodule:: stree
+.. automodule:: Splitter
 .. autoclass:: Snode
    :members:
    :undoc-members:
@@ -1,7 +1,7 @@
 Splitter
 ========

-.. automodule:: stree
+.. automodule:: Splitter
 .. autoclass:: Splitter
    :members:
    :undoc-members:
@@ -6,6 +6,6 @@ API index
    :caption: Contents:

    Stree
-   Splitter
-   Snode
    Siterator
+   Snode
+   Splitter
@@ -12,18 +12,19 @@
 #
 import os
 import sys
+from stree._version import __version__

 sys.path.insert(0, os.path.abspath("../../stree/"))


 # -- Project information -----------------------------------------------------

 project = "STree"
-copyright = "2020 - 2021, Ricardo Montañana Gómez"
+copyright = "2020 - 2022, Ricardo Montañana Gómez"
 author = "Ricardo Montañana Gómez"

 # The full version, including alpha/beta/rc tags
-release = "1.0"
+version = __version__
+release = version


 # -- General configuration ---------------------------------------------------
@@ -52,4 +53,4 @@ html_theme = "sphinx_rtd_theme"
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
+html_static_path = []
@@ -2,8 +2,6 @@

 ## Notebooks

-- [![binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
-
 - [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark

 - [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Some features
@@ -1,21 +1,22 @@
 # Hyperparameters

 | | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
 | --- | --- | --- | --- | --- |
 | \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’ or ‘rbf’.<br>liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
 | \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
 | \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
 | | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
 | \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
 | \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
-| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
+| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
-| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
+| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi-class classification) which column (class) to use to split the dataset in a node\*\*.<br>max_samples is incompatible with 'ovo' multiclass_strategy |
 | | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).<br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
 | | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
 | | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features).<br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.<br>The algorithm generates 5 candidates at most to choose from in both strategies. |
+| | splitter | {"best", "random", "trandom", "mutual", "cfs", "fcbf", "iwss"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features).<br>Supported strategies are:<br>**“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features.<br>**“random”**: The algorithm generates 5 candidates and chooses the best (max. info. gain) of them.<br>**“trandom”**: The algorithm generates only one random combination.<br>**"mutual"**: Chooses the best features w.r.t. their mutual info with the label.<br>**"cfs"**: Apply Correlation-based Feature Selection.<br>**"fcbf"**: Apply Fast Correlation-Based Filter.<br>**"iwss"**: IWSS based algorithm |
 | | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets:<br>**"ovo"**: one versus one.<br>**"ovr"**: one versus rest |

 \* Hyperparameter used by the support vector classifier of every node
@@ -1,8 +1,12 @@
-# Stree
+# STree

-[![Codeship Status for Doctorado-ML/STree](https://app.codeship.com/projects/8b2bd350-8a1b-0138-5f2c-3ad36f3eb318/status?branch=master)](https://app.codeship.com/projects/399170)
+![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg)
 [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
+[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/STree.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
+[![PyPI version](https://badge.fury.io/py/STree.svg)](https://badge.fury.io/py/STree)
+![https://img.shields.io/badge/python-3.11%2B-blue](https://img.shields.io/badge/python-3.11%2B-brightgreen)
+[![DOI](https://zenodo.org/badge/262658230.svg)](https://zenodo.org/badge/latestdoi/262658230)

 Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
@@ -178,7 +178,7 @@
    "outputs": [],
    "source": [
     "# Stree\n",
-    "stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
+    "stree = Stree(random_state=random_state, C=.01, max_iter=1000, kernel=\"liblinear\", multiclass_strategy=\"ovr\")"
    ]
   },
   {
@@ -198,7 +198,7 @@
    "outputs": [],
    "source": [
     "# SVC (linear)\n",
-    "svc = LinearSVC(random_state=random_state, C=.01, max_iter=1e3)"
+    "svc = LinearSVC(random_state=random_state, C=.01, max_iter=1000)"
    ]
   },
   {
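The same comparison can be run outside the notebook; a self-contained sketch on synthetic data (the benchmark's real dataset and surrounding cells are not shown here):

```python
# Sketch: linear Stree (liblinear nodes, one-vs-rest) next to a plain
# LinearSVC, mirroring the two notebook cells above.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from stree import Stree

X, y = make_classification(n_samples=500, n_classes=3, n_informative=5,
                           random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

models = [
    Stree(random_state=1, C=0.01, max_iter=1000, kernel="liblinear",
          multiclass_strategy="ovr"),
    LinearSVC(random_state=1, C=0.01, max_iter=1000),
]
for model in models:
    model.fit(X_train, y_train)
    print(type(model).__name__, model.score(X_test, y_test))
```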
@@ -133,33 +133,33 @@
    " 'base_estimator': [Stree(random_state=random_state)],\n",
    " 'n_estimators': [10, 25],\n",
    " 'learning_rate': [.5, 1],\n",
-   " 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
-   " 'base_estimator__tol': [.1, 1e-02],\n",
-   " 'base_estimator__max_depth': [3, 5, 7],\n",
-   " 'base_estimator__C': [1, 7, 55],\n",
-   " 'base_estimator__kernel': ['linear']\n",
+   " 'estimator__split_criteria': ['max_samples', 'impurity'],\n",
+   " 'estimator__tol': [.1, 1e-02],\n",
+   " 'estimator__max_depth': [3, 5, 7],\n",
+   " 'estimator__C': [1, 7, 55],\n",
+   " 'estimator__kernel': ['linear']\n",
    "},\n",
    "{\n",
    " 'base_estimator': [Stree(random_state=random_state)],\n",
    " 'n_estimators': [10, 25],\n",
    " 'learning_rate': [.5, 1],\n",
-   " 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
-   " 'base_estimator__tol': [.1, 1e-02],\n",
-   " 'base_estimator__max_depth': [3, 5, 7],\n",
-   " 'base_estimator__C': [1, 7, 55],\n",
-   " 'base_estimator__degree': [3, 5, 7],\n",
-   " 'base_estimator__kernel': ['poly']\n",
+   " 'estimator__split_criteria': ['max_samples', 'impurity'],\n",
+   " 'estimator__tol': [.1, 1e-02],\n",
+   " 'estimator__max_depth': [3, 5, 7],\n",
+   " 'estimator__C': [1, 7, 55],\n",
+   " 'estimator__degree': [3, 5, 7],\n",
+   " 'estimator__kernel': ['poly']\n",
    "},\n",
    "{\n",
    " 'base_estimator': [Stree(random_state=random_state)],\n",
    " 'n_estimators': [10, 25],\n",
    " 'learning_rate': [.5, 1],\n",
-   " 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
-   " 'base_estimator__tol': [.1, 1e-02],\n",
-   " 'base_estimator__max_depth': [3, 5, 7],\n",
-   " 'base_estimator__C': [1, 7, 55],\n",
-   " 'base_estimator__gamma': [.1, 1, 10],\n",
-   " 'base_estimator__kernel': ['rbf']\n",
+   " 'estimator__split_criteria': ['max_samples', 'impurity'],\n",
+   " 'estimator__tol': [.1, 1e-02],\n",
+   " 'estimator__max_depth': [3, 5, 7],\n",
+   " 'estimator__C': [1, 7, 55],\n",
+   " 'estimator__gamma': [.1, 1, 10],\n",
+   " 'estimator__kernel': ['rbf']\n",
    "}]"
   ]
  },
@@ -214,7 +214,7 @@
    "  base_estimator=Stree(C=55, max_depth=7, random_state=1,\n",
    "                       split_criteria='max_samples', tol=0.1),\n",
    "  learning_rate=0.5, n_estimators=25, random_state=1)\n",
-   "Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=7, random_state=1, split_criteria='max_samples', tol=0.1), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 7, 'base_estimator__split_criteria': 'max_samples', 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 25}"
+   "Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=7, random_state=1, split_criteria='max_samples', tol=0.1), 'estimator__C': 55, 'estimator__kernel': 'linear', 'estimator__max_depth': 7, 'estimator__split_criteria': 'max_samples', 'estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 25}"
   ]
  },
 {
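The `base_estimator__*` to `estimator__*` renames above track scikit-learn's deprecation of AdaBoost's `base_estimator` in favor of `estimator`. A reduced, self-contained sketch of the same kind of grid search, assuming scikit-learn >= 1.2 and an installed stree package:

```python
# Sketch: grid-search an AdaBoost ensemble whose weak learner is an Stree,
# with the nested Stree parameters addressed through the estimator__ prefix.
from sklearn.datasets import load_wine
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from stree import Stree

X, y = load_wine(return_X_y=True)
parameters = {
    "estimator": [Stree(random_state=1)],
    "n_estimators": [10, 25],
    "learning_rate": [0.5, 1],
    "estimator__split_criteria": ["max_samples", "impurity"],
    "estimator__max_depth": [3, 5],
    "estimator__C": [1, 7, 55],
    "estimator__kernel": ["linear"],
}
grid = GridSearchCV(AdaBoostClassifier(random_state=1), parameters, n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_)
```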
@@ -1,5 +1,65 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "STree"
+dependencies = ["scikit-learn>1.0", "mufs"]
+license = { file = "LICENSE" }
+description = "Oblique decision tree with svm nodes."
+readme = "README.md"
+authors = [
+    { name = "Ricardo Montañana", email = "ricardo.montanana@alu.uclm.es" },
+]
+dynamic = ['version']
+requires-python = ">=3.11"
+keywords = [
+    "scikit-learn",
+    "oblique-classifier",
+    "oblique-decision-tree",
+    "decision-tree",
+    "svm",
+    "svc",
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "Topic :: Software Development",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: MIT License",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+[project.optional-dependencies]
+dev = ["black", "flake8", "coverage", "hatch", "pip-audit"]
+doc = ["sphinx", "myst-parser", "sphinx_rtd_theme", "sphinx-autodoc-typehints"]
+
+[project.urls]
+Code = "https://github.com/Doctorado-ML/STree"
+Documentation = "https://stree.readthedocs.io/en/latest/index.html"
+
+[tool.hatch.version]
+path = "stree/_version.py"
+
+[tool.hatch.build.targets.sdist]
+include = ["/stree"]
+
+[tool.coverage.run]
+branch = true
+source = ["stree"]
+command_line = "-m unittest discover -s stree.tests"
+
+[tool.coverage.report]
+show_missing = true
+fail_under = 100
+
 [tool.black]
 line-length = 79
+target-version = ["py311"]
 include = '\.pyi?$'
 exclude = '''
 /(
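`dynamic = ['version']` together with `[tool.hatch.version] path = "stree/_version.py"` means hatch reads the package version from a module-level attribute instead of a hard-coded string. A sketch of what such a file contains (the actual version number is not captured in this diff):

```python
# stree/_version.py (sketch only; the real version string is not shown above)
__version__ = "X.Y.Z"  # placeholder
```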
@@ -1 +1,2 @@
 scikit-learn>0.24
+mufs
@@ -1 +0,0 @@
-python-3.8
setup.py (deleted), 35 lines
@@ -1,35 +0,0 @@
-import setuptools
-import stree
-
-
-def readme():
-    with open("README.md") as f:
-        return f.read()
-
-
-VERSION = stree.__version__
-setuptools.setup(
-    name="STree",
-    version=stree.__version__,
-    license=stree.__license__,
-    description="Oblique decision tree with svm nodes",
-    long_description=readme(),
-    long_description_content_type="text/markdown",
-    packages=setuptools.find_packages(),
-    url=stree.__url__,
-    author=stree.__author__,
-    author_email=stree.__author_email__,
-    keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
-tree svm svc",
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "License :: OSI Approved :: " + stree.__license__,
-        "Programming Language :: Python :: 3.8",
-        "Natural Language :: English",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-        "Intended Audience :: Science/Research",
-    ],
-    install_requires=["scikit-learn", "numpy", "ipympl"],
-    test_suite="stree.tests",
-    zip_safe=False,
-)
stree/.readthedocs.yaml (new file), 10 lines
@@ -0,0 +1,10 @@
version: 2

sphinx:
  configuration: docs/source/conf.py

python:
  version: 3.8
  install:
    - requirements: requirements.txt
    - requirements: docs/requirements.txt
stree/Splitter.py (new file), 812 lines
@@ -0,0 +1,812 @@
|
|||||||
|
"""
|
||||||
|
Oblique decision tree classifier based on SVM nodes
|
||||||
|
Splitter class
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import warnings
|
||||||
|
import random
|
||||||
|
from math import log, factorial
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from mufs import MUFS
|
||||||
|
|
||||||
|
|
||||||
|
class Snode:
|
||||||
|
"""
|
||||||
|
Nodes of the tree that keeps the svm classifier and if testing the
|
||||||
|
dataset assigned to it
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
clf : SVC
|
||||||
|
Classifier used
|
||||||
|
X : np.ndarray
|
||||||
|
input dataset in train time (only in testing)
|
||||||
|
y : np.ndarray
|
||||||
|
input labes in train time
|
||||||
|
features : np.array
|
||||||
|
features used to compute hyperplane
|
||||||
|
impurity : float
|
||||||
|
impurity of the node
|
||||||
|
title : str
|
||||||
|
label describing the route to the node
|
||||||
|
weight : np.ndarray, optional
|
||||||
|
weights applied to input dataset in train time, by default None
|
||||||
|
scaler : StandardScaler, optional
|
||||||
|
scaler used if any, by default None
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
clf: SVC,
|
||||||
|
X: np.ndarray,
|
||||||
|
y: np.ndarray,
|
||||||
|
features: np.array,
|
||||||
|
impurity: float,
|
||||||
|
title: str,
|
||||||
|
weight: np.ndarray = None,
|
||||||
|
scaler: StandardScaler = None,
|
||||||
|
):
|
||||||
|
self._clf = clf
|
||||||
|
self._title = title
|
||||||
|
self._belief = 0.0
|
||||||
|
# Only store dataset in Testing
|
||||||
|
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
|
self._y = y
|
||||||
|
self._down = None
|
||||||
|
self._up = None
|
||||||
|
self._class = None
|
||||||
|
self._feature = None
|
||||||
|
self._sample_weight = (
|
||||||
|
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
|
)
|
||||||
|
self._features = features
|
||||||
|
self._impurity = impurity
|
||||||
|
self._partition_column: int = -1
|
||||||
|
self._scaler = scaler
|
||||||
|
self._proba = None
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def copy(cls, node: "Snode") -> "Snode":
|
||||||
|
return cls(
|
||||||
|
node._clf,
|
||||||
|
node._X,
|
||||||
|
node._y,
|
||||||
|
node._features,
|
||||||
|
node._impurity,
|
||||||
|
node._title,
|
||||||
|
node._sample_weight,
|
||||||
|
node._scaler,
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_partition_column(self, col: int):
|
||||||
|
self._partition_column = col
|
||||||
|
|
||||||
|
def get_partition_column(self) -> int:
|
||||||
|
return self._partition_column
|
||||||
|
|
||||||
|
def set_down(self, son):
|
||||||
|
self._down = son
|
||||||
|
|
||||||
|
def set_title(self, title):
|
||||||
|
self._title = title
|
||||||
|
|
||||||
|
def set_classifier(self, clf):
|
||||||
|
self._clf = clf
|
||||||
|
|
||||||
|
def set_features(self, features):
|
||||||
|
self._features = features
|
||||||
|
|
||||||
|
def set_impurity(self, impurity):
|
||||||
|
self._impurity = impurity
|
||||||
|
|
||||||
|
def get_title(self) -> str:
|
||||||
|
return self._title
|
||||||
|
|
||||||
|
def get_classifier(self) -> SVC:
|
||||||
|
return self._clf
|
||||||
|
|
||||||
|
def get_impurity(self) -> float:
|
||||||
|
return self._impurity
|
||||||
|
|
||||||
|
def get_features(self) -> np.array:
|
||||||
|
return self._features
|
||||||
|
|
||||||
|
def set_up(self, son):
|
||||||
|
self._up = son
|
||||||
|
|
||||||
|
def is_leaf(self) -> bool:
|
||||||
|
return self._up is None and self._down is None
|
||||||
|
|
||||||
|
def get_down(self) -> "Snode":
|
||||||
|
return self._down
|
||||||
|
|
||||||
|
def get_up(self) -> "Snode":
|
||||||
|
return self._up
|
||||||
|
|
||||||
|
def make_predictor(self, num_classes: int) -> None:
|
||||||
|
"""Compute the class of the predictor and its belief based on the
|
||||||
|
subdataset of the node only if it is a leaf
|
||||||
|
"""
|
||||||
|
if not self.is_leaf():
|
||||||
|
return
|
||||||
|
classes, card = np.unique(self._y, return_counts=True)
|
||||||
|
self._proba = np.zeros((num_classes,), dtype=np.int64)
|
||||||
|
for c, n in zip(classes, card):
|
||||||
|
self._proba[c] = n
|
||||||
|
try:
|
||||||
|
max_card = max(card)
|
||||||
|
self._class = classes[card == max_card][0]
|
||||||
|
self._belief = max_card / np.sum(card)
|
||||||
|
except ValueError:
|
||||||
|
self._class = None
|
||||||
|
|
||||||
|
def graph(self):
|
||||||
|
"""
|
||||||
|
Return a string representing the node in graphviz format
|
||||||
|
"""
|
||||||
|
output = ""
|
||||||
|
count_values = np.unique(self._y, return_counts=True)
|
||||||
|
if self.is_leaf():
|
||||||
|
output += (
|
||||||
|
f'N{id(self)} [shape=box style=filled label="'
|
||||||
|
f"class={self._class} impurity={self._impurity:.3f} "
|
||||||
|
f'counts={self._proba}"];\n'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
output += (
|
||||||
|
f'N{id(self)} [label="#features={len(self._features)} '
|
||||||
|
f"classes={count_values[0]} samples={count_values[1]} "
|
||||||
|
f'({sum(count_values[1])})" fontcolor=black];\n'
|
||||||
|
)
|
||||||
|
output += f"N{id(self)} -> N{id(self.get_up())} [color=black];\n"
|
||||||
|
output += f"N{id(self)} -> N{id(self.get_down())} [color=black];\n"
|
||||||
|
return output
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
count_values = np.unique(self._y, return_counts=True)
|
||||||
|
if self.is_leaf():
|
||||||
|
return (
|
||||||
|
f"{self._title} - Leaf class={self._class} belief="
|
||||||
|
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
||||||
|
f"counts={count_values}"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
f"{self._title} feaures={self._features} impurity="
|
||||||
|
f"{self._impurity:.4f} "
|
||||||
|
f"counts={count_values}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Siterator:
|
||||||
|
"""Stree preorder iterator"""
|
||||||
|
|
||||||
|
def __init__(self, tree: Snode):
|
||||||
|
self._stack = []
|
||||||
|
self._push(tree)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
# To complete the iterator interface
|
||||||
|
return self
|
||||||
|
|
||||||
|
def _push(self, node: Snode):
|
||||||
|
if node is not None:
|
||||||
|
self._stack.append(node)
|
||||||
|
|
||||||
|
def __next__(self) -> Snode:
|
||||||
|
if len(self._stack) == 0:
|
||||||
|
raise StopIteration()
|
||||||
|
node = self._stack.pop()
|
||||||
|
self._push(node.get_up())
|
||||||
|
self._push(node.get_down())
|
||||||
|
return node
|
||||||
|
|
||||||
|
|
||||||
|
class Splitter:
|
||||||
|
"""
|
||||||
|
Splits a dataset in two based on different criteria
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
clf : SVC, optional
|
||||||
|
classifier, by default None
|
||||||
|
criterion : str, optional
|
||||||
|
The function to measure the quality of a split (only used if
|
||||||
|
max_features != num_features). Supported criteria are “gini” for the
|
||||||
|
Gini impurity and “entropy” for the information gain., by default
|
||||||
|
"entropy", by default None
|
||||||
|
feature_select : str, optional
|
||||||
|
The strategy used to choose the feature set at each node (only used if
|
||||||
|
max_features < num_features). Supported strategies are: “best”: sklearn
|
||||||
|
SelectKBest algorithm is used in every node to choose the max_features
|
||||||
|
best features. “random”: The algorithm generates 5 candidates and
|
||||||
|
choose the best (max. info. gain) of them. “trandom”: The algorithm
|
||||||
|
generates only one random combination. "mutual": Chooses the best
|
||||||
|
features w.r.t. their mutual info with the label. "cfs": Apply
|
||||||
|
Correlation-based Feature Selection. "fcbf": Apply Fast Correlation-
|
||||||
|
Based, by default None
|
||||||
|
criteria : str, optional
|
||||||
|
ecides (just in case of a multi class classification) which column
|
||||||
|
(class) use to split the dataset in a node. max_samples is
|
||||||
|
incompatible with 'ovo' multiclass_strategy, by default None
|
||||||
|
min_samples_split : int, optional
|
||||||
|
The minimum number of samples required to split an internal node. 0
|
||||||
|
(default) for any, by default None
|
||||||
|
random_state : optional
|
||||||
|
Controls the pseudo random number generation for shuffling the data for
|
||||||
|
probability estimates. Ignored when probability is False.Pass an int
|
||||||
|
for reproducible output across multiple function calls, by
|
||||||
|
default None
|
||||||
|
normalize : bool, optional
|
||||||
|
If standardization of features should be applied on each node with the
|
||||||
|
samples that reach it , by default False
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
clf has to be a sklearn estimator
|
||||||
|
ValueError
|
||||||
|
criterion must be gini or entropy
|
||||||
|
ValueError
|
||||||
|
criteria has to be max_samples or impurity
|
||||||
|
ValueError
|
||||||
|
splitter must be in {random, best, mutual, cfs, fcbf}
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
clf: SVC = None,
|
||||||
|
criterion: str = None,
|
||||||
|
feature_select: str = None,
|
||||||
|
criteria: str = None,
|
||||||
|
min_samples_split: int = None,
|
||||||
|
random_state=None,
|
||||||
|
normalize=False,
|
||||||
|
):
|
||||||
|
self._clf = clf
|
||||||
|
self._random_state = random_state
|
||||||
|
if random_state is not None:
|
||||||
|
random.seed(random_state)
|
||||||
|
self._criterion = criterion
|
||||||
|
self._min_samples_split = min_samples_split
|
||||||
|
self._criteria = criteria
|
||||||
|
self._feature_select = feature_select
|
||||||
|
self._normalize = normalize
|
||||||
|
|
||||||
|
if clf is None:
|
||||||
|
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
||||||
|
|
||||||
|
if criterion not in ["gini", "entropy"]:
|
||||||
|
raise ValueError(
|
||||||
|
f"criterion must be gini or entropy got({criterion})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if criteria not in [
|
||||||
|
"max_samples",
|
||||||
|
"impurity",
|
||||||
|
]:
|
||||||
|
raise ValueError(
|
||||||
|
f"criteria has to be max_samples or impurity; got ({criteria})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if feature_select not in [
|
||||||
|
"random",
|
||||||
|
"trandom",
|
||||||
|
"best",
|
||||||
|
"mutual",
|
||||||
|
"cfs",
|
||||||
|
"fcbf",
|
||||||
|
"iwss",
|
||||||
|
]:
|
||||||
|
raise ValueError(
|
||||||
|
"splitter must be in {random, trandom, best, mutual, cfs, "
|
||||||
|
"fcbf, iwss} "
|
||||||
|
f"got ({feature_select})"
|
||||||
|
)
|
||||||
|
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||||
|
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
||||||
|
self.fs_function = getattr(self, f"_fs_{self._feature_select}")
|
||||||
|
|
||||||
|
def _fs_random(
|
||||||
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return the best of five random feature set combinations
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
# Random feature reduction
|
||||||
|
n_features = dataset.shape[1]
|
||||||
|
features_sets = self._generate_spaces(n_features, max_features)
|
||||||
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fs_trandom(
|
||||||
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return the a random feature set combination
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
# Random feature reduction
|
||||||
|
n_features = dataset.shape[1]
|
||||||
|
return tuple(sorted(random.sample(range(n_features), max_features)))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fs_best(
|
||||||
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return the variabes with higher f-score
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
SelectKBest(k=max_features)
|
||||||
|
.fit(dataset, labels)
|
||||||
|
.get_support(indices=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
def _fs_mutual(
|
||||||
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return the best features with mutual information with labels
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
dataset : np.array
|
||||||
|
array of samples
|
||||||
|
labels : np.array
|
||||||
|
labels of the dataset
|
||||||
|
max_features : int
|
||||||
|
number of features of the subspace
|
||||||
|
(< number of features in dataset)
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
tuple
|
||||||
|
indices of the features selected
|
||||||
|
"""
|
||||||
|
# return best features with mutual info with the label
|
||||||
|
feature_list = mutual_info_classif(
|
||||||
|
dataset, labels, random_state=self._random_state
|
||||||
|
)
|
||||||
|
return tuple(
|
||||||
|
sorted(
|
||||||
|
range(len(feature_list)),
|
||||||
|
key=lambda sub: feature_list[sub],
|
||||||
|
)[-max_features:]
|
||||||
|
)

    @staticmethod
    def _fs_cfs(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.cfs(dataset, labels).get_results()

    @staticmethod
    def _fs_fcbf(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Fast Correlation-based Filter algorithm with max_features limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.fcbf(dataset, labels, 5e-4).get_results()
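    # The third positional argument (5e-4) is presumably the FCBF relevance
    # threshold (delta in the original algorithm): features whose correlation
    # with the labels falls below it are discarded.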

    @staticmethod
    def _fs_iwss(
        dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Correlation-based feature selection based on iwss with max_features
        limit

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (< number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        mufs = MUFS(max_features=max_features, discrete=False)
        return mufs.iwss(dataset, labels, 0.25).get_results()

    def partition_impurity(self, y: np.array) -> np.array:
        return self.criterion_function(y)

    @staticmethod
    def _gini(y: np.array) -> float:
        _, count = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(count / np.sum(count)))
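    # Example: y = [0, 0, 1, 1] gives counts [2, 2], proportions [0.5, 0.5]
    # and a Gini impurity of 1 - (0.25 + 0.25) = 0.5, the maximum possible
    # for two balanced classes.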

    @staticmethod
    def _entropy(y: np.array) -> float:
        """Compute entropy of a labels set

        Parameters
        ----------
        y : np.array
            set of labels

        Returns
        -------
        float
            entropy
        """
        n_labels = len(y)
        if n_labels <= 1:
            return 0
        counts = np.bincount(y)
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            return 0
        entropy = 0.0
        # Compute standard entropy.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log(prop, n_classes)
        return entropy
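    # Because the logarithm base is the number of distinct classes present,
    # the entropy is normalized to [0, 1]. Example: y = [0, 0, 1, 1] gives
    # proportions [0.5, 0.5] and entropy -2 * 0.5 * log2(0.5) = 1.0.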

    def information_gain(
        self,
        labels: np.array,
        labels_up: np.array,
        labels_dn: np.array,
    ) -> float:
        """Compute information gain of a split candidate

        Parameters
        ----------
        labels : np.array
            labels of the dataset
        labels_up : np.array
            labels of one side
        labels_dn : np.array
            labels on the other side

        Returns
        -------
        float
            information gain
        """
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
            card_up = labels_up.shape[0]
            imp_up = self.criterion_function(labels_up)
        if labels_dn is not None:
            card_dn = labels_dn.shape[0]
            imp_dn = self.criterion_function(labels_dn)
        samples = card_up + card_dn
        if samples == 0:
            return 0.0
        else:
            result = (
                imp_prev
                - (card_up / samples) * imp_up
                - (card_dn / samples) * imp_dn
            )
            return result
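    # Worked example with entropy: labels = [0, 0, 1, 1] split into
    # labels_up = [0, 0] and labels_dn = [1, 1] yields
    # 1.0 - (2/4) * 0.0 - (2/4) * 0.0 = 1.0, i.e. a perfect split.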

    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
        """Return the best set of features among feature_sets, the criterion
        is the information gain

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            array of labels
        features_sets : list
            list of features sets to check

        Returns
        -------
        list
            best feature set
        """
        max_gain = 0
        selected = None
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        for feature_set in features_sets:
            self._clf.fit(dataset[:, feature_set], labels)
            node = Snode(
                self._clf, dataset, labels, feature_set, 0.0, "subset"
            )
            self.partition(dataset, node, train=True)
            y1, y2 = self.part(labels)
            gain = self.information_gain(labels, y1, y2)
            if gain > max_gain:
                max_gain = gain
                selected = feature_set
        return selected if selected is not None else feature_set

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
        """Generate at most 5 random feature combinations

        Parameters
        ----------
        features : int
            number of features in the dataset
        max_features : int
            number of features in each combination

        Returns
        -------
        list
            list with up to 5 combinations of features randomly selected
        """
        comb = set()
        # Generate at most 5 combinations
        number = factorial(features) / (
            factorial(max_features) * factorial(features - max_features)
        )
        set_length = min(5, number)
        while len(comb) < set_length:
            comb.add(
                tuple(sorted(random.sample(range(features), max_features)))
            )
        return list(comb)
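    # Example: features=4 and max_features=2 allow C(4, 2) = 6 subspaces, so
    # set_length = min(5, 6) = 5 distinct pairs are drawn; accumulating them
    # in a set discards duplicates produced by repeated sampling.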

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Compute the indices of the features selected by the splitter
        depending on the self._feature_select hyperparameter

        Parameters
        ----------
        dataset : np.array
            array of samples
        labels : np.array
            labels of the dataset
        max_features : int
            number of features of the subspace
            (<= number of features in dataset)

        Returns
        -------
        tuple
            indices of the features selected
        """
        # No feature reduction
        n_features = dataset.shape[1]
        if n_features == max_features:
            return tuple(range(n_features))
        # select features as selected in constructor
        return self.fs_function(dataset, labels, max_features)

    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
        """Return a subspace of the selected dataset of max_features length,
        chosen according to the feature_select hyperparameter

        Parameters
        ----------
        dataset : np.array
            array of samples (# samples, # features)
        labels : np.array
            labels of the dataset
        max_features : int
            number of features to form the subspace

        Returns
        -------
        tuple
            tuple with the dataset with only the features selected and the
            indices of the features selected
        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices
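    # Usage sketch: given X of shape (n, 16), get_subspace(X, y, 3) returns
    # (X[:, idx], idx) where idx holds three column indices, so the first
    # element can be fed straight to the node classifier.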

    def _impurity(self, data: np.array, y: np.array) -> np.array:
        """Return the column of the dataset to be taken into account to split
        the dataset

        Parameters
        ----------
        data : np.array
            distances to the hyperplane of every class
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            column of the dataset to be taken into account to split the
            dataset
        """
        max_gain = 0
        selected = -1
        for col in range(data.shape[1]):
            tup = y[data[:, col] > 0]
            tdn = y[data[:, col] <= 0]
            info_gain = self.information_gain(y, tup, tdn)
            if info_gain > max_gain:
                selected = col
                max_gain = info_gain
        return selected

    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """Return the column of the dataset to be taken into account to split
        the dataset

        Parameters
        ----------
        data : np.array
            distances to the hyperplane of every class
        y : np.array
            vector of labels (classes)

        Returns
        -------
        np.array
            column of the dataset to be taken into account to split the
            dataset
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        return np.argmax(samples)
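    # Example: y = [0, 1, 1, 1, 2] gives per-class counts [1, 3, 1], so
    # np.argmax picks column 1: the split uses the hyperplane of the class
    # holding the most samples.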

    def partition(self, samples: np.array, node: Snode, train: bool):
        """Set the criteria to split arrays. Compute the indices of the
        samples that should go to one side of the tree (up)

        Parameters
        ----------
        samples : np.array
            array of samples (# samples, # features)
        node : Snode
            Node of the tree where partition is going to be made
        train : bool
            Train time - True / Test time - False
        """
        # data contains the distances of every sample to every class
        # hyperplane: array of (m, nc) nc = # classes
        data = self._distances(node, samples)
        if data.shape[0] < self._min_samples_split:
            # there aren't enough samples to split
            self._up = np.ones((data.shape[0]), dtype=bool)
            return
        if data.ndim > 1:
            # split criteria for multiclass
            # Convert data to a (m, 1) array selecting values for samples
            if train:
                # in train time we have to compute the column to take into
                # account to split the dataset
                col = self.decision_criteria(data, node._y)
                node.set_partition_column(col)
            else:
                # in predict time just use the column computed in train time
                # i.e. take the classifier of class <col>
                col = node.get_partition_column()
            if col == -1:
                # No partition is producing information gain
                data = np.ones(data.shape)
            data = data[:, col]
        self._up = data > 0

    def part(self, origin: np.array) -> list:
        """Split an array in two based on indices (self._up) and its
        complement; partition has to be called first to establish up indices

        Parameters
        ----------
        origin : np.array
            dataset to split

        Returns
        -------
        list
            list with two splits of the array
        """
        down = ~self._up
        return [
            origin[self._up] if any(self._up) else None,
            origin[down] if any(down) else None,
        ]
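    # Example: with self._up = np.array([True, False, True]), calling
    # part(np.array([10, 20, 30])) returns [array([10, 30]), array([20])];
    # a side with no samples comes back as None, which information_gain
    # handles explicitly.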

    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

        Parameters
        ----------
        node : Snode
            node containing the svm classifier
        data : np.ndarray
            samples to compute distance to hyperplane

        Returns
        -------
        np.array
            array of shape (m, nc) with the distances of every sample to
            the hyperplane of every class. nc = # of classes
        """
        X_transformed = data[:, node._features]
        if self._normalize:
            X_transformed = node._scaler.transform(X_transformed)
        return node._clf.decision_function(X_transformed)
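The Splitter above can be driven on its own. A minimal sketch, assuming the constructor keywords the test suite uses (clf, criterion, feature_select, criteria, min_samples_split, random_state, normalize); treat it as an illustration rather than the package's documented API:

import numpy as np
from sklearn.svm import LinearSVC
from stree.Splitter import Splitter

X = np.random.random((100, 8))
y = np.random.randint(0, 2, 100)
splitter = Splitter(
    clf=LinearSVC(random_state=0),  # classifier fitted on each candidate subspace
    criterion="entropy",            # impurity measure: "gini" or "entropy"
    feature_select="best",          # best/random/trandom/mutual/cfs/fcbf/iwss
    criteria="max_samples",         # multiclass column choice, or "impurity"
    min_samples_split=2,
    random_state=0,
    normalize=False,
)
Xs, features = splitter.get_subspace(X, y, max_features=3)
print(Xs.shape)  # (100, 3): samples restricted to the three selected columns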
stree/Strees.py: 879 changes (file diff suppressed because it is too large).
stree/__init__.py
@@ -1,11 +1,9 @@
-from .Strees import Stree, Snode, Siterator, Splitter
+from .Strees import Stree, Siterator
+from ._version import __version__

-__version__ = "1.0"
-
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
 __license__ = "MIT License"
 __author_email__ = "ricardo.montanana@alu.uclm.es"
 __url__ = "https://github.com/doctorado-ml/stree"

-__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
+__all__ = ["__version__", "Stree", "Siterator"]
stree/_version.py (new file)
@@ -0,0 +1 @@
+__version__ = "1.4.0"
stree/tests/Snode_test.py
@@ -1,14 +1,19 @@
 import os
 import unittest
 import numpy as np

-from stree import Stree, Snode
+from stree import Stree
+from stree.Splitter import Snode
 from .utils import load_dataset


 class Snode_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._clf = Stree(random_state=self._random_state)
+        self._clf = Stree(
+            random_state=self._random_state,
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+        )
         self._clf.fit(*load_dataset(self._random_state))
         super().__init__(*args, **kwargs)
@@ -62,10 +67,28 @@ class Snode_test(unittest.TestCase):

     def test_make_predictor_on_leaf(self):
         test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
-        test.make_predictor()
+        test.make_predictor(2)
         self.assertEqual(1, test._class)
         self.assertEqual(0.75, test._belief)
         self.assertEqual(-1, test._partition_column)
+        self.assertListEqual([1, 3], test._proba.tolist())
+
+    def test_make_predictor_on_not_leaf(self):
+        test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
+        test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
+        test.make_predictor(2)
+        self.assertIsNone(test._class)
+        self.assertEqual(0, test._belief)
+        self.assertEqual(-1, test._partition_column)
+        self.assertEqual(-1, test.get_up()._partition_column)
+        self.assertIsNone(test._proba)
+
+    def test_make_predictor_on_leaf_bogus_data(self):
+        test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
+        test.make_predictor(2)
+        self.assertIsNone(test._class)
+        self.assertEqual(-1, test._partition_column)
+        self.assertListEqual([0, 0], test._proba.tolist())

     def test_set_title(self):
         test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
@@ -92,21 +115,6 @@ class Snode_test(unittest.TestCase):
         test.set_features([1, 2])
         self.assertListEqual([1, 2], test.get_features())

-    def test_make_predictor_on_not_leaf(self):
-        test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
-        test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
-        test.make_predictor()
-        self.assertIsNone(test._class)
-        self.assertEqual(0, test._belief)
-        self.assertEqual(-1, test._partition_column)
-        self.assertEqual(-1, test.get_up()._partition_column)
-
-    def test_make_predictor_on_leaf_bogus_data(self):
-        test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
-        test.make_predictor()
-        self.assertIsNone(test._class)
-        self.assertEqual(-1, test._partition_column)
-
     def test_copy_node(self):
         px = [1, 2, 3, 4]
         py = [1]
stree/tests/Splitter_test.py
@@ -5,8 +5,8 @@ import random
 import numpy as np
 from sklearn.svm import SVC
 from sklearn.datasets import load_wine, load_iris
-from stree import Splitter
-from .utils import load_dataset
+from stree.Splitter import Splitter
+from .utils import load_dataset, load_disc_dataset


 class Splitter_test(unittest.TestCase):
@@ -195,10 +195,14 @@ class Splitter_test(unittest.TestCase):
             [0, 3, 7, 12],  # random entropy impurity
             [1, 7, 9, 12],  # random gini max_samples
             [1, 5, 8, 12],  # random gini impurity
+            [6, 9, 11, 12],  # mutual entropy max_samples
+            [6, 9, 11, 12],  # mutual entropy impurity
+            [6, 9, 11, 12],  # mutual gini max_samples
+            [6, 9, 11, 12],  # mutual gini impurity
         ]
         X, y = load_wine(return_X_y=True)
         rn = 0
-        for feature_select in ["best", "random"]:
+        for feature_select in ["best", "random", "mutual"]:
             for criterion in ["entropy", "gini"]:
                 for criteria in [
                     "max_samples",
@@ -221,7 +225,7 @@ class Splitter_test(unittest.TestCase):
                     #         criteria,
                     #     )
                     # )
-                    self.assertListEqual(expected, list(computed))
+                    self.assertListEqual(expected, sorted(list(computed)))
                     self.assertListEqual(
                         X[:, computed].tolist(), dataset.tolist()
                     )
@@ -240,3 +244,69 @@ class Splitter_test(unittest.TestCase):
             Xs, computed = tcl.get_subspace(X, y, k)
             self.assertListEqual(expected, list(computed))
             self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_best_subspaces_discrete(self):
+        results = [
+            (4, [0, 3, 16, 18]),
+            (7, [0, 3, 13, 14, 16, 18, 19]),
+            (9, [0, 3, 7, 13, 14, 15, 16, 18, 19]),
+        ]
+        X, y = load_disc_dataset(n_features=20)
+        for k, expected in results:
+            tcl = self.build(
+                feature_select="best",
+            )
+            Xs, computed = tcl.get_subspace(X, y, k)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_cfs_subspaces(self):
+        results = [
+            (4, [1, 5, 9, 12]),
+            (6, [1, 5, 9, 12, 4, 2]),
+            (7, [1, 5, 9, 12, 4, 2, 3]),
+        ]
+        X, y = load_dataset(n_features=20, n_informative=7)
+        for k, expected in results:
+            tcl = self.build(feature_select="cfs")
+            Xs, computed = tcl.get_subspace(X, y, k)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_fcbf_subspaces(self):
+        results = [
+            (4, [1, 5, 9, 12]),
+            (6, [1, 5, 9, 12, 4, 2]),
+            (7, [1, 5, 9, 12, 4, 2, 16]),
+        ]
+        for rs, expected in results:
+            X, y = load_dataset(n_features=20, n_informative=7)
+            tcl = self.build(feature_select="fcbf", random_state=rs)
+            Xs, computed = tcl.get_subspace(X, y, rs)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_iwss_subspaces(self):
+        results = [
+            (4, [1, 5, 9, 12]),
+            (6, [1, 5, 9, 12, 4, 15]),
+        ]
+        for rs, expected in results:
+            X, y = load_dataset(n_features=20, n_informative=7)
+            tcl = self.build(feature_select="iwss", random_state=rs)
+            Xs, computed = tcl.get_subspace(X, y, rs)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
+
+    def test_get_trandom_subspaces(self):
+        results = [
+            (4, [3, 7, 9, 12]),
+            (6, [0, 1, 2, 8, 15, 18]),
+            (7, [1, 2, 4, 8, 10, 12, 13]),
+        ]
+        for rs, expected in results:
+            X, y = load_dataset(n_features=20, n_informative=7)
+            tcl = self.build(feature_select="trandom", random_state=rs)
+            Xs, computed = tcl.get_subspace(X, y, rs)
+            self.assertListEqual(expected, list(computed))
+            self.assertListEqual(X[:, expected].tolist(), Xs.tolist())
stree/tests/Stree_test.py
@@ -7,14 +7,16 @@ from sklearn.datasets import load_iris, load_wine
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.svm import LinearSVC

-from stree import Stree, Snode
+from stree import Stree
+from stree.Splitter import Snode
 from .utils import load_dataset
+from .._version import __version__


 class Stree_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._kernels = ["linear", "rbf", "poly"]
+        self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
         super().__init__(*args, **kwargs)

     @classmethod
@@ -22,10 +24,9 @@ class Stree_test(unittest.TestCase):
         os.environ["TESTING"] = "1"

     def test_valid_kernels(self):
-        valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
         X, y = load_dataset()
-        for kernel in valid_kernels:
-            clf = Stree(kernel=kernel)
+        for kernel in self._kernels:
+            clf = Stree(kernel=kernel, multiclass_strategy="ovr")
             clf.fit(X, y)
             self.assertIsNotNone(clf.tree_)

@@ -55,14 +56,19 @@ class Stree_test(unittest.TestCase):
         # i.e. The partition algorithm didn't forget any sample
         self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
         unique_y, count_y = np.unique(node._y, return_counts=True)
-        _, count_d = np.unique(y_down, return_counts=True)
-        _, count_u = np.unique(y_up, return_counts=True)
+        labels_d, count_d = np.unique(y_down, return_counts=True)
+        labels_u, count_u = np.unique(y_up, return_counts=True)
+        dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
+        dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
         #
         for i in unique_y:
-            number_up = count_u[i]
             try:
-                number_down = count_d[i]
-            except IndexError:
+                number_up = dict_u[i]
+            except KeyError:
+                number_up = 0
+            try:
+                number_down = dict_d[i]
+            except KeyError:
                 number_down = 0
             self.assertEqual(count_y[i], number_down + number_up)
         # Is the partition made the same as the prediction?
@@ -77,14 +83,22 @@ class Stree_test(unittest.TestCase):
         """Check if the tree is built the same way as predictions of models"""
         warnings.filterwarnings("ignore")
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel="sigmoid",
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(*load_dataset(self._random_state))
             self._check_tree(clf.tree_)

     def test_single_prediction(self):
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
             self.assertEqual(yp[0], y[0])

@@ -92,18 +106,58 @@ class Stree_test(unittest.TestCase):
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = load_dataset(self._random_state)
-        for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+        for kernel in ["liblinear", "linear", "rbf", "poly"]:
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict(X[:num, :])
             self.assertListEqual(y[:num].tolist(), yp.tolist())

+    def test_multiple_predict_proba(self):
+        expected = {
+            "liblinear": {
+                0: [0.02401129943502825, 0.9759887005649718],
+                17: [0.9282970550576184, 0.07170294494238157],
+            },
+            "linear": {
+                0: [0.029329608938547486, 0.9706703910614525],
+                17: [0.9298469387755102, 0.07015306122448979],
+            },
+            "rbf": {
+                0: [0.023448275862068966, 0.976551724137931],
+                17: [0.9458064516129032, 0.05419354838709677],
+            },
+            "poly": {
+                0: [0.01601164483260553, 0.9839883551673945],
+                17: [0.9089790897908979, 0.0910209102091021],
+            },
+        }
+        indices = [0, 17]
+        X, y = load_dataset(self._random_state)
+        for kernel in ["liblinear", "linear", "rbf", "poly"]:
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
+            yp = clf.fit(X, y).predict_proba(X)
+            for index in indices:
+                for exp, comp in zip(expected[kernel][index], yp[index]):
+                    self.assertAlmostEqual(exp, comp)
+
     def test_single_vs_multiple_prediction(self):
         """Check if predicting sample by sample gives the same result as
         predicting all samples at once
         """
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(X, y)
             # Compute prediction line by line
             yp_line = np.array([], dtype=int)
@@ -135,9 +189,13 @@ class Stree_test(unittest.TestCase):
         ]
         computed = []
         expected_string = ""
-        clf = Stree(kernel="linear", random_state=self._random_state)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+        )
         clf.fit(*load_dataset(self._random_state))
-        for node in clf:
+        for node in iter(clf):
             computed.append(str(node))
             expected_string += str(node) + "\n"
         self.assertListEqual(expected, computed)
@@ -173,9 +231,15 @@ class Stree_test(unittest.TestCase):
     def test_check_max_depth(self):
         depths = (3, 4)
         for depth in depths:
-            tcl = Stree(random_state=self._random_state, max_depth=depth)
+            tcl = Stree(
+                kernel="liblinear",
+                multiclass_strategy="ovr",
+                random_state=self._random_state,
+                max_depth=depth,
+            )
             tcl.fit(*load_dataset(self._random_state))
             self.assertEqual(depth, tcl.depth_)
+            self.assertEqual(depth, tcl.get_depth())

     def test_unfitted_tree_is_iterable(self):
         tcl = Stree()
@@ -194,7 +258,7 @@ class Stree_test(unittest.TestCase):
         for kernel in self._kernels:
             clf = Stree(
                 kernel=kernel,
-                split_criteria="max_samples",
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
                 random_state=self._random_state,
             )
             px = [[1, 2], [5, 6], [9, 10]]
@@ -205,26 +269,36 @@ class Stree_test(unittest.TestCase):
         self.assertListEqual(py, clf.classes_.tolist())

     def test_muticlass_dataset(self):
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         datasets = {
             "Synt": load_dataset(random_state=self._random_state, n_classes=3),
             "Iris": load_wine(return_X_y=True),
         }
         outcomes = {
             "Synt": {
-                "max_samples linear": 0.9606666666666667,
-                "max_samples rbf": 0.7133333333333334,
-                "max_samples poly": 0.618,
-                "impurity linear": 0.9606666666666667,
-                "impurity rbf": 0.7133333333333334,
-                "impurity poly": 0.618,
+                "max_samples liblinear": 0.9493333333333334,
+                "max_samples linear": 0.9426666666666667,
+                "max_samples rbf": 0.9606666666666667,
+                "max_samples poly": 0.9373333333333334,
+                "max_samples sigmoid": 0.824,
+                "impurity liblinear": 0.9493333333333334,
+                "impurity linear": 0.9426666666666667,
+                "impurity rbf": 0.9606666666666667,
+                "impurity poly": 0.9373333333333334,
+                "impurity sigmoid": 0.824,
             },
             "Iris": {
+                "max_samples liblinear": 0.9887640449438202,
                 "max_samples linear": 1.0,
-                "max_samples rbf": 0.6910112359550562,
-                "max_samples poly": 0.6966292134831461,
-                "impurity linear": 1,
-                "impurity rbf": 0.6910112359550562,
-                "impurity poly": 0.6966292134831461,
+                "max_samples rbf": 0.6685393258426966,
+                "max_samples poly": 0.6853932584269663,
+                "max_samples sigmoid": 0.6404494382022472,
+                "impurity liblinear": 0.9887640449438202,
+                "impurity linear": 1.0,
+                "impurity rbf": 0.6685393258426966,
+                "impurity poly": 0.6853932584269663,
+                "impurity sigmoid": 0.6404494382022472,
             },
         }

@@ -233,18 +307,22 @@ class Stree_test(unittest.TestCase):
         for criteria in ["max_samples", "impurity"]:
             for kernel in self._kernels:
                 clf = Stree(
-                    C=55,
-                    max_iter=1e5,
+                    max_iter=int(1e4),
+                    multiclass_strategy=(
+                        "ovr" if kernel == "liblinear" else "ovo"
+                    ),
                     kernel=kernel,
                     random_state=self._random_state,
                 )
                 clf.fit(px, py)
                 outcome = outcomes[name][f"{criteria} {kernel}"]
-                # print(
-                #     f"{name} {criteria} {kernel} {outcome} {clf.score(px"
-                #     ", py)}"
-                # )
-                self.assertAlmostEqual(outcome, clf.score(px, py))
+                # print(f'"{criteria} {kernel}": {clf.score(px, py)},')
+                self.assertAlmostEqual(
+                    outcome,
+                    clf.score(px, py),
+                    5,
+                    f"{name} - {criteria} - {kernel}",
+                )

     def test_max_features(self):
         n_features = 16
@@ -269,6 +347,12 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             _ = clf._initialize_max_features()

+    def test_wrong_max_features(self):
+        X, y = load_dataset(n_features=15)
+        clf = Stree(max_features=16)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
     def test_get_subspaces(self):
         dataset = np.random.random((10, 16))
         y = np.random.randint(0, 2, 10)
@@ -306,17 +390,20 @@ class Stree_test(unittest.TestCase):
             clf.predict(X[:, :3])

     # Tests of score

     def test_score_binary(self):
+        """Check score for binary classification."""
         X, y = load_dataset(self._random_state)
         accuracies = [
             0.9506666666666667,
+            0.9493333333333334,
             0.9606666666666667,
             0.9433333333333334,
+            0.9153333333333333,
         ]
         for kernel, accuracy_expected in zip(self._kernels, accuracies):
             clf = Stree(
                 random_state=self._random_state,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
                 kernel=kernel,
             )
             clf.fit(X, y)
@@ -327,12 +414,19 @@ class Stree_test(unittest.TestCase):
         self.assertAlmostEqual(accuracy_expected, accuracy_score)

     def test_score_max_features(self):
+        """Check score using max_features."""
         X, y = load_dataset(self._random_state)
-        clf = Stree(random_state=self._random_state, max_features=2)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            max_features=2,
+        )
         clf.fit(X, y)
         self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))

     def test_bogus_splitter_parameter(self):
+        """Check that bogus splitter parameter raises exception."""
         clf = Stree(splitter="duck")
         with self.assertRaises(ValueError):
             clf.fit(*load_dataset())
@@ -340,14 +434,16 @@ class Stree_test(unittest.TestCase):
     def test_multiclass_classifier_integrity(self):
         """Checks if the multiclass operation is done right"""
         X, y = load_iris(return_X_y=True)
-        clf = Stree(random_state=0)
+        clf = Stree(
+            kernel="liblinear", multiclass_strategy="ovr", random_state=0
+        )
         clf.fit(X, y)
         score = clf.score(X, y)
         # Check accuracy of the whole model
-        self.assertAlmostEquals(0.98, score, 5)
+        self.assertAlmostEqual(0.98, score, 5)
         svm = LinearSVC(random_state=0)
         svm.fit(X, y)
-        self.assertAlmostEquals(0.9666666666666667, svm.score(X, y), 5)
+        self.assertAlmostEqual(0.9666666666666667, svm.score(X, y), 5)
         data = svm.decision_function(X)
         expected = [
             0.4444444444444444,
@@ -359,7 +455,7 @@ class Stree_test(unittest.TestCase):
         ty[data > 0] = 1
         ty = ty.astype(int)
         for i in range(3):
-            self.assertAlmostEquals(
+            self.assertAlmostEqual(
                 expected[i],
                 clf.splitter_._gini(ty[:, i]),
             )
@@ -386,6 +482,7 @@ class Stree_test(unittest.TestCase):
         self.assertListEqual([47], resdn[1].tolist())

     def test_score_multiclass_rbf(self):
+        """Test score for multiclass classification with rbf kernel."""
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
@@ -396,13 +493,14 @@ class Stree_test(unittest.TestCase):
         clf2 = Stree(
             kernel="rbf", random_state=self._random_state, normalize=True
         )
-        self.assertEqual(0.768, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.966, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.964, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
-        self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y))
         self.assertEqual(1.0, clf2.fit(X, y).score(X, y))

     def test_score_multiclass_poly(self):
+        """Test score for multiclass classification with poly kernel."""
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
@@ -417,36 +515,95 @@ class Stree_test(unittest.TestCase):
             random_state=self._random_state,
             normalize=True,
         )
-        self.assertEqual(0.786, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.946, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.972, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
-        self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
+
+    def test_score_multiclass_liblinear(self):
+        """Test score for multiclass classification with liblinear kernel."""
+        X, y = load_dataset(
+            random_state=self._random_state,
+            n_classes=3,
+            n_features=5,
+            n_samples=500,
+        )
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            C=10,
+        )
+        clf2 = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            normalize=True,
+        )
+        self.assertEqual(0.968, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.97, clf2.fit(X, y).score(X, y))
+        X, y = load_wine(return_X_y=True)
+        self.assertEqual(1.0, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
+
+    def test_score_multiclass_sigmoid(self):
+        """Test score for multiclass classification with sigmoid kernel."""
+        X, y = load_dataset(
+            random_state=self._random_state,
+            n_classes=3,
+            n_features=5,
+            n_samples=500,
+        )
+        clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10)
+        clf2 = Stree(
+            kernel="sigmoid",
+            random_state=self._random_state,
+            normalize=True,
+            C=10,
+        )
+        self.assertEqual(0.796, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.952, clf2.fit(X, y).score(X, y))
+        X, y = load_wine(return_X_y=True)
+        self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y))

     def test_score_multiclass_linear(self):
+        """Test score for multiclass classification with linear kernel."""
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
             n_features=5,
             n_samples=1500,
         )
-        clf = Stree(kernel="linear", random_state=self._random_state)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+        )
         self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
         # Check with context based standardization
         clf2 = Stree(
-            kernel="linear", random_state=self._random_state, normalize=True
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            normalize=True,
         )
         self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
-        self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.9887640449438202, clf.fit(X, y).score(X, y))
         self.assertEqual(1.0, clf2.fit(X, y).score(X, y))

     def test_zero_all_sample_weights(self):
+        """Test exception raises when all sample weights are zero."""
         X, y = load_dataset(self._random_state)
         with self.assertRaises(ValueError):
             Stree().fit(X, y, np.zeros(len(y)))

     def test_mask_samples_weighted_zero(self):
+        """Check that the weighted zero samples are masked."""
         X = np.array(
             [
                 [1, 1],
@@ -461,7 +618,7 @@ class Stree_test(unittest.TestCase):
             ]
         )
         y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
-        yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5])
+        yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
         w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
         model1 = Stree().fit(X, y)
         model2 = Stree().fit(X, y, w)
@@ -474,6 +631,7 @@ class Stree_test(unittest.TestCase):
         self.assertEqual(model2.score(X, y, w), 1)

     def test_depth(self):
+        """Check depth of the tree."""
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
@@ -483,12 +641,15 @@ class Stree_test(unittest.TestCase):
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         self.assertEqual(6, clf.depth_)
+        self.assertEqual(6, clf.get_depth())
         X, y = load_wine(return_X_y=True)
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         self.assertEqual(4, clf.depth_)
+        self.assertEqual(4, clf.get_depth())

     def test_nodes_leaves(self):
+        """Check number of nodes and leaves."""
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
@@ -498,16 +659,21 @@ class Stree_test(unittest.TestCase):
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         nodes, leaves = clf.nodes_leaves()
-        self.assertEqual(25, nodes)
-        self.assertEqual(13, leaves)
+        self.assertEqual(31, nodes)
+        self.assertEqual(31, clf.get_nodes())
+        self.assertEqual(16, leaves)
+        self.assertEqual(16, clf.get_leaves())
         X, y = load_wine(return_X_y=True)
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         nodes, leaves = clf.nodes_leaves()
-        self.assertEqual(9, nodes)
-        self.assertEqual(5, leaves)
+        self.assertEqual(11, nodes)
+        self.assertEqual(11, clf.get_nodes())
+        self.assertEqual(6, leaves)
+        self.assertEqual(6, clf.get_leaves())

     def test_nodes_leaves_artificial(self):
+        """Check leaves of artificial dataset."""
         n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
         n2 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test2")
         n3 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test3")
@@ -523,4 +689,85 @@ class Stree_test(unittest.TestCase):
         clf.tree_ = n1
         nodes, leaves = clf.nodes_leaves()
         self.assertEqual(6, nodes)
+        self.assertEqual(6, clf.get_nodes())
         self.assertEqual(2, leaves)
+        self.assertEqual(2, clf.get_leaves())
+
+    def test_bogus_multiclass_strategy(self):
+        """Check invalid multiclass strategy."""
+        clf = Stree(multiclass_strategy="other")
+        X, y = load_wine(return_X_y=True)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
+    def test_multiclass_strategy(self):
+        """Check multiclass strategy."""
+        X, y = load_wine(return_X_y=True)
+        clf_o = Stree(multiclass_strategy="ovo")
+        clf_r = Stree(multiclass_strategy="ovr")
+        score_o = clf_o.fit(X, y).score(X, y)
+        score_r = clf_r.fit(X, y).score(X, y)
+        self.assertEqual(1.0, score_o)
+        self.assertEqual(0.9269662921348315, score_r)
+
+    def test_incompatible_hyperparameters(self):
+        """Check incompatible hyperparameters."""
+        X, y = load_wine(return_X_y=True)
+        clf = Stree(kernel="liblinear", multiclass_strategy="ovo")
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+        clf = Stree(multiclass_strategy="ovo", split_criteria="max_samples")
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
+    def test_version(self):
+        """Check STree version."""
+        clf = Stree()
+        self.assertEqual(__version__, clf.version())
+
+    def test_call(self) -> None:
+        """Check call method."""
+        clf = Stree()
+        self.assertEqual(__version__, clf())
+
+    def test_graph(self):
+        """Check graphviz representation of the tree."""
+        X, y = load_wine(return_X_y=True)
+        clf = Stree(random_state=self._random_state)
+
+        expected_head = (
+            "digraph STree {\nlabel=<STree >\nfontsize=30\n"
+            "fontcolor=blue\nlabelloc=t\n"
+        )
+        expected_tail = (
+            ' [shape=box style=filled label="class=1 impurity=0.000 '
+            'counts=[0 1 0]"];\n}\n'
+        )
+        self.assertEqual(clf.graph(), expected_head + "}\n")
+        clf.fit(X, y)
+        computed = clf.graph()
+        computed_head = computed[: len(expected_head)]
+        num = -len(expected_tail)
+        computed_tail = computed[num:]
+        self.assertEqual(computed_head, expected_head)
+        self.assertEqual(computed_tail, expected_tail)
+
+    def test_graph_title(self):
+        X, y = load_wine(return_X_y=True)
+        clf = Stree(random_state=self._random_state)
+        expected_head = (
+            "digraph STree {\nlabel=<STree Sample title>\nfontsize=30\n"
+            "fontcolor=blue\nlabelloc=t\n"
+        )
+        expected_tail = (
+            ' [shape=box style=filled label="class=1 impurity=0.000 '
+            'counts=[0 1 0]"];\n}\n'
+        )
+        self.assertEqual(clf.graph("Sample title"), expected_head + "}\n")
+        clf.fit(X, y)
+        computed = clf.graph("Sample title")
+        computed_head = computed[: len(expected_head)]
+        num = -len(expected_tail)
+        computed_tail = computed[num:]
+        self.assertEqual(computed_head, expected_head)
+        self.assertEqual(computed_tail, expected_tail)
stree/tests/utils.py
@@ -1,11 +1,14 @@
 from sklearn.datasets import make_classification
+import numpy as np


-def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
+def load_dataset(
+    random_state=0, n_classes=2, n_features=3, n_samples=1500, n_informative=3
+):
     X, y = make_classification(
         n_samples=n_samples,
         n_features=n_features,
-        n_informative=3,
+        n_informative=n_informative,
         n_redundant=0,
         n_repeated=0,
         n_classes=n_classes,
@@ -15,3 +18,12 @@ def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
         random_state=random_state,
     )
     return X, y
+
+
+def load_disc_dataset(
+    random_state=0, n_classes=2, n_features=3, n_samples=1500
+):
+    np.random.seed(random_state)
+    X = np.random.randint(1, 17, size=(n_samples, n_features)).astype(float)
+    y = np.random.randint(low=0, high=n_classes, size=(n_samples), dtype=int)
+    return X, y
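The new load_disc_dataset helper feeds the discrete feature-selection test added to Splitter_test. A quick sanity check of its contract (illustrative; the import path assumes the repository layout):

from stree.tests.utils import load_disc_dataset

X, y = load_disc_dataset(random_state=0, n_classes=3, n_features=20)
print(X.shape, X.dtype)  # (1500, 20) float64, integer values 1..16 cast to float
print(sorted(set(y)))    # [0, 1, 2]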