mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
6 Commits
package_do
...
1.1
Author | SHA1 | Date | |
---|---|---|---|
d46f544466
|
|||
79190ef2e1
|
|||
|
4f04e72670 | ||
5cef0f4875
|
|||
28c7558f01
|
|||
|
e19d10f6a7 |
12
Makefile
12
Makefile
@@ -1,6 +1,6 @@
|
|||||||
SHELL := /bin/bash
|
SHELL := /bin/bash
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
.PHONY: coverage deps help lint push test
|
.PHONY: coverage deps help lint push test doc build
|
||||||
|
|
||||||
coverage: ## Run tests with coverage
|
coverage: ## Run tests with coverage
|
||||||
coverage erase
|
coverage erase
|
||||||
@@ -21,6 +21,16 @@ push: ## Push code with tags
|
|||||||
test: ## Run tests
|
test: ## Run tests
|
||||||
python -m unittest -v stree.tests
|
python -m unittest -v stree.tests
|
||||||
|
|
||||||
|
doc: ## Update documentation
|
||||||
|
make -C docs --makefile=Makefile html
|
||||||
|
|
||||||
|
build: ## Build package
|
||||||
|
rm -fr dist/*
|
||||||
|
python setup.py sdist bdist_wheel
|
||||||
|
|
||||||
|
doc-clean: ## Clean documentation build files
|
||||||
|
make -C docs --makefile=Makefile clean
|
||||||
|
|
||||||
help: ## Show help message
|
help: ## Show help message
|
||||||
@IFS=$$'\n' ; \
|
@IFS=$$'\n' ; \
|
||||||
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||||
|
12
README.md
12
README.md
@@ -1,8 +1,9 @@
|
|||||||

|

|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
|
||||||
# Stree
|
# STree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
@@ -35,21 +36,22 @@ Can be found in
|
|||||||
## Hyperparameters
|
## Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. ‘liblinear’ uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library, both through scikit-learn |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
| | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 random candidate feature sets and chooses the best one. **"mutual"**: Chooses the max_features features with the highest mutual information with the label |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,3 +1,4 @@
|
|||||||
sphinx
|
sphinx
|
||||||
sphinx-rtd-theme
|
sphinx-rtd-theme
|
||||||
myst-parser
|
myst-parser
|
||||||
|
git+https://github.com/doctorado-ml/stree
|
||||||
|
@@ -12,6 +12,7 @@
|
|||||||
#
|
#
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import stree
|
||||||
|
|
||||||
sys.path.insert(0, os.path.abspath("../../stree/"))
|
sys.path.insert(0, os.path.abspath("../../stree/"))
|
||||||
|
|
||||||
@@ -23,7 +24,8 @@ copyright = "2020 - 2021, Ricardo Montañana Gómez"
|
|||||||
author = "Ricardo Montañana Gómez"
|
author = "Ricardo Montañana Gómez"
|
||||||
|
|
||||||
# The full version, including alpha/beta/rc tags
|
# The full version, including alpha/beta/rc tags
|
||||||
release = "1.0"
|
version = stree.__version__
|
||||||
|
release = version
|
||||||
|
|
||||||
|
|
||||||
# -- General configuration ---------------------------------------------------
|
# -- General configuration ---------------------------------------------------
|
||||||
|
@@ -1,21 +1,22 @@
|
|||||||
# Hyperparameters
|
## Hyperparameters
|
||||||
|
|
||||||
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
|
||||||
| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --- | ------------------- | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
|
||||||
| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
|
| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. ‘liblinear’ uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library, both through scikit-learn |
|
||||||
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
|
||||||
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
|
||||||
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
|
||||||
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
|
||||||
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
|
||||||
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if ‘auto’, uses 1 / n_features. |
|
||||||
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
|
| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the 'ovo' multiclass_strategy |
|
||||||
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
|
||||||
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
|
||||||
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
| | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
|
||||||
| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
|
| | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 random candidate feature sets and chooses the best one. **"mutual"**: Chooses the max_features features with the highest mutual information with the label |
|
||||||
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
|
||||||
|
| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets, **"ovo"**: one versus one. **"ovr"**: one versus rest |
|
||||||
|
|
||||||
\* Hyperparameter used by the support vector classifier of every node
|
\* Hyperparameter used by the support vector classifier of every node
|
||||||
|
|
||||||
|
@@ -1,8 +1,9 @@
|
|||||||
# Stree
|
# STree
|
||||||
|
|
||||||
[](https://app.codeship.com/projects/399170)
|
[](https://app.codeship.com/projects/399170)
|
||||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
[](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||
|
33
setup.py
33
setup.py
@@ -1,5 +1,4 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
import stree
|
|
||||||
|
|
||||||
|
|
||||||
def readme():
|
def readme():
|
||||||
@@ -7,29 +6,45 @@ def readme():
|
|||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
VERSION = stree.__version__
|
def get_data(field):
|
||||||
|
item = ""
|
||||||
|
with open("stree/__init__.py") as f:
|
||||||
|
for line in f.readlines():
|
||||||
|
if line.startswith(f"__{field}__"):
|
||||||
|
delim = '"' if '"' in line else "'"
|
||||||
|
item = line.split(delim)[1]
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"Unable to find {field} string.")
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="STree",
|
name="STree",
|
||||||
version=stree.__version__,
|
version=get_data("version"),
|
||||||
license=stree.__license__,
|
license=get_data("license"),
|
||||||
description="Oblique decision tree with svm nodes",
|
description="Oblique decision tree with svm nodes",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
url=stree.__url__,
|
url="https://github.com/Doctorado-ML/STree#stree",
|
||||||
author=stree.__author__,
|
project_urls={
|
||||||
author_email=stree.__author_email__,
|
"Code": "https://github.com/Doctorado-ML/STree",
|
||||||
|
"Documentation": "https://stree.readthedocs.io/en/latest/index.html",
|
||||||
|
},
|
||||||
|
author=get_data("author"),
|
||||||
|
author_email=get_data("author_email"),
|
||||||
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
||||||
tree svm svc",
|
tree svm svc",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Development Status :: 5 - Production/Stable",
|
"Development Status :: 5 - Production/Stable",
|
||||||
"License :: OSI Approved :: " + stree.__license__,
|
"License :: OSI Approved :: " + get_data("license"),
|
||||||
"Programming Language :: Python :: 3.8",
|
"Programming Language :: Python :: 3.8",
|
||||||
"Natural Language :: English",
|
"Natural Language :: English",
|
||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
],
|
],
|
||||||
install_requires=["scikit-learn", "numpy", "ipympl"],
|
install_requires=["scikit-learn", "numpy"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
10
stree/.readthedocs.yaml
Normal file
10
stree/.readthedocs.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
version: 2
|
||||||
|
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/source/conf.py
|
||||||
|
|
||||||
|
python:
|
||||||
|
version: 3.8
|
||||||
|
install:
|
||||||
|
- requirements: requirements.txt
|
||||||
|
- requirements: docs/requirements.txt
|
@@ -11,7 +11,7 @@ from typing import Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import SVC, LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
from sklearn.feature_selection import SelectKBest
|
from sklearn.feature_selection import SelectKBest, mutual_info_classif
|
||||||
from sklearn.preprocessing import StandardScaler
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
@@ -155,6 +155,10 @@ class Siterator:
|
|||||||
self._stack = []
|
self._stack = []
|
||||||
self._push(tree)
|
self._push(tree)
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
# To complete the iterator interface
|
||||||
|
return self
|
||||||
|
|
||||||
def _push(self, node: Snode):
|
def _push(self, node: Snode):
|
||||||
if node is not None:
|
if node is not None:
|
||||||
self._stack.append(node)
|
self._stack.append(node)
|
||||||
@@ -205,9 +209,9 @@ class Splitter:
|
|||||||
f"criteria has to be max_samples or impurity; got ({criteria})"
|
f"criteria has to be max_samples or impurity; got ({criteria})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if feature_select not in ["random", "best"]:
|
if feature_select not in ["random", "best", "mutual"]:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"splitter must be either random or best, got "
|
"splitter must be in {random, best, mutual} got "
|
||||||
f"({feature_select})"
|
f"({feature_select})"
|
||||||
)
|
)
|
||||||
self.criterion_function = getattr(self, f"_{self._criterion}")
|
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||||
@@ -373,20 +377,29 @@ class Splitter:
|
|||||||
tuple
|
tuple
|
||||||
indices of the features selected
|
indices of the features selected
|
||||||
"""
|
"""
|
||||||
|
# No feature reduction
|
||||||
if dataset.shape[1] == max_features:
|
if dataset.shape[1] == max_features:
|
||||||
# No feature reduction applies
|
|
||||||
return tuple(range(dataset.shape[1]))
|
return tuple(range(dataset.shape[1]))
|
||||||
|
# Random feature reduction
|
||||||
if self._feature_select == "random":
|
if self._feature_select == "random":
|
||||||
features_sets = self._generate_spaces(
|
features_sets = self._generate_spaces(
|
||||||
dataset.shape[1], max_features
|
dataset.shape[1], max_features
|
||||||
)
|
)
|
||||||
return self._select_best_set(dataset, labels, features_sets)
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
# Take KBest features
|
# return the KBest features
|
||||||
|
if self._feature_select == "best":
|
||||||
return (
|
return (
|
||||||
SelectKBest(k=max_features)
|
SelectKBest(k=max_features)
|
||||||
.fit(dataset, labels)
|
.fit(dataset, labels)
|
||||||
.get_support(indices=True)
|
.get_support(indices=True)
|
||||||
)
|
)
|
||||||
|
# return best features with mutual info with the label
|
||||||
|
feature_list = mutual_info_classif(dataset, labels)
|
||||||
|
return tuple(
|
||||||
|
sorted(
|
||||||
|
range(len(feature_list)), key=lambda sub: feature_list[sub]
|
||||||
|
)[-max_features:]
|
||||||
|
)
|
||||||
|
|
||||||
def get_subspace(
|
def get_subspace(
|
||||||
self, dataset: np.array, labels: np.array, max_features: int
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
@@ -561,6 +574,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
min_samples_split: int = 0,
|
min_samples_split: int = 0,
|
||||||
max_features=None,
|
max_features=None,
|
||||||
splitter: str = "random",
|
splitter: str = "random",
|
||||||
|
multiclass_strategy: str = "ovo",
|
||||||
normalize: bool = False,
|
normalize: bool = False,
|
||||||
):
|
):
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
@@ -577,6 +591,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.criterion = criterion
|
self.criterion = criterion
|
||||||
self.splitter = splitter
|
self.splitter = splitter
|
||||||
self.normalize = normalize
|
self.normalize = normalize
|
||||||
|
self.multiclass_strategy = multiclass_strategy
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
def _more_tags(self) -> dict:
|
||||||
"""Required by sklearn to supply features of the classifier
|
"""Required by sklearn to supply features of the classifier
|
||||||
@@ -621,7 +636,23 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||||
{self.max_depth})"
|
{self.max_depth})"
|
||||||
)
|
)
|
||||||
kernels = ["linear", "rbf", "poly", "sigmoid"]
|
if self.multiclass_strategy not in ["ovr", "ovo"]:
|
||||||
|
raise ValueError(
|
||||||
|
"mutliclass_strategy has to be either ovr or ovo"
|
||||||
|
f" but got {self.multiclass_strategy}"
|
||||||
|
)
|
||||||
|
if self.multiclass_strategy == "ovo":
|
||||||
|
if self.kernel == "liblinear":
|
||||||
|
raise ValueError(
|
||||||
|
"The kernel liblinear is incompatible with ovo "
|
||||||
|
"multiclass_strategy"
|
||||||
|
)
|
||||||
|
if self.split_criteria == "max_samples":
|
||||||
|
raise ValueError(
|
||||||
|
"The multiclass_strategy 'ovo' is incompatible with "
|
||||||
|
"split_criteria 'max_samples'"
|
||||||
|
)
|
||||||
|
kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
||||||
if self.kernel not in kernels:
|
if self.kernel not in kernels:
|
||||||
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
@@ -653,12 +684,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.n_features_ = X.shape[1]
|
self.n_features_ = X.shape[1]
|
||||||
self.n_features_in_ = X.shape[1]
|
self.n_features_in_ = X.shape[1]
|
||||||
self.max_features_ = self._initialize_max_features()
|
self.max_features_ = self._initialize_max_features()
|
||||||
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
self.tree_ = self._train(X, y, sample_weight, 1, "root")
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def train(
|
def _train(
|
||||||
self,
|
self,
|
||||||
X: np.ndarray,
|
X: np.ndarray,
|
||||||
y: np.ndarray,
|
y: np.ndarray,
|
||||||
@@ -723,10 +754,10 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
node.make_predictor()
|
node.make_predictor()
|
||||||
return node
|
return node
|
||||||
node.set_up(
|
node.set_up(
|
||||||
self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
||||||
)
|
)
|
||||||
node.set_down(
|
node.set_down(
|
||||||
self.train(
|
self._train(
|
||||||
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
@@ -741,7 +772,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
C=self.C,
|
C=self.C,
|
||||||
tol=self.tol,
|
tol=self.tol,
|
||||||
)
|
)
|
||||||
if self.kernel == "linear"
|
if self.kernel == "liblinear"
|
||||||
else SVC(
|
else SVC(
|
||||||
kernel=self.kernel,
|
kernel=self.kernel,
|
||||||
max_iter=self.max_iter,
|
max_iter=self.max_iter,
|
||||||
@@ -750,6 +781,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
gamma=self.gamma,
|
gamma=self.gamma,
|
||||||
degree=self.degree,
|
degree=self.degree,
|
||||||
random_state=self.random_state,
|
random_state=self.random_state,
|
||||||
|
decision_function_shape=self.multiclass_strategy,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -892,6 +924,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
elif self.max_features is None:
|
elif self.max_features is None:
|
||||||
max_features = self.n_features_
|
max_features = self.n_features_
|
||||||
elif isinstance(self.max_features, numbers.Integral):
|
elif isinstance(self.max_features, numbers.Integral):
|
||||||
|
if self.max_features > self.n_features_:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid value for max_features. "
|
||||||
|
"It can not be greater than number of features "
|
||||||
|
f"({self.n_features_})"
|
||||||
|
)
|
||||||
max_features = self.max_features
|
max_features = self.max_features
|
||||||
else: # float
|
else: # float
|
||||||
if self.max_features > 0.0:
|
if self.max_features > 0.0:
|
||||||
|
@@ -1,11 +1,10 @@
|
|||||||
from .Strees import Stree, Snode, Siterator, Splitter
|
from .Strees import Stree, Snode, Siterator, Splitter
|
||||||
|
|
||||||
__version__ = "1.0"
|
__version__ = "1.1"
|
||||||
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT License"
|
__license__ = "MIT License"
|
||||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||||
__url__ = "https://github.com/doctorado-ml/stree"
|
|
||||||
|
|
||||||
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||||
|
@@ -8,7 +8,11 @@ from .utils import load_dataset
|
|||||||
class Snode_test(unittest.TestCase):
|
class Snode_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree(random_state=self._random_state)
|
self._clf = Stree(
|
||||||
|
random_state=self._random_state,
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
)
|
||||||
self._clf.fit(*load_dataset(self._random_state))
|
self._clf.fit(*load_dataset(self._random_state))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
@@ -195,10 +195,14 @@ class Splitter_test(unittest.TestCase):
|
|||||||
[0, 3, 7, 12], # random entropy impurity
|
[0, 3, 7, 12], # random entropy impurity
|
||||||
[1, 7, 9, 12], # random gini max_samples
|
[1, 7, 9, 12], # random gini max_samples
|
||||||
[1, 5, 8, 12], # random gini impurity
|
[1, 5, 8, 12], # random gini impurity
|
||||||
|
[6, 9, 11, 12], # mutual entropy max_samples
|
||||||
|
[6, 9, 11, 12], # mutual entropy impurity
|
||||||
|
[6, 9, 11, 12], # mutual gini max_samples
|
||||||
|
[6, 9, 11, 12], # mutual gini impurity
|
||||||
]
|
]
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
rn = 0
|
rn = 0
|
||||||
for feature_select in ["best", "random"]:
|
for feature_select in ["best", "random", "mutual"]:
|
||||||
for criterion in ["entropy", "gini"]:
|
for criterion in ["entropy", "gini"]:
|
||||||
for criteria in [
|
for criteria in [
|
||||||
"max_samples",
|
"max_samples",
|
||||||
@@ -221,7 +225,7 @@ class Splitter_test(unittest.TestCase):
|
|||||||
# criteria,
|
# criteria,
|
||||||
# )
|
# )
|
||||||
# )
|
# )
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, sorted(list(computed)))
|
||||||
self.assertListEqual(
|
self.assertListEqual(
|
||||||
X[:, computed].tolist(), dataset.tolist()
|
X[:, computed].tolist(), dataset.tolist()
|
||||||
)
|
)
|
||||||
|
@@ -14,7 +14,7 @@ from .utils import load_dataset
|
|||||||
class Stree_test(unittest.TestCase):
|
class Stree_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._kernels = ["linear", "rbf", "poly"]
|
self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -22,10 +22,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
os.environ["TESTING"] = "1"
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
def test_valid_kernels(self):
|
def test_valid_kernels(self):
|
||||||
valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
|
|
||||||
X, y = load_dataset()
|
X, y = load_dataset()
|
||||||
for kernel in valid_kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel)
|
clf = Stree(kernel=kernel, multiclass_strategy="ovr")
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertIsNotNone(clf.tree_)
|
self.assertIsNotNone(clf.tree_)
|
||||||
|
|
||||||
@@ -55,14 +54,19 @@ class Stree_test(unittest.TestCase):
|
|||||||
# i.e. The partition algorithm didn't forget any sample
|
# i.e. The partition algorithm didn't forget any sample
|
||||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||||
_, count_d = np.unique(y_down, return_counts=True)
|
labels_d, count_d = np.unique(y_down, return_counts=True)
|
||||||
_, count_u = np.unique(y_up, return_counts=True)
|
labels_u, count_u = np.unique(y_up, return_counts=True)
|
||||||
|
dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
|
||||||
|
dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
|
||||||
#
|
#
|
||||||
for i in unique_y:
|
for i in unique_y:
|
||||||
number_up = count_u[i]
|
|
||||||
try:
|
try:
|
||||||
number_down = count_d[i]
|
number_up = dict_u[i]
|
||||||
except IndexError:
|
except KeyError:
|
||||||
|
number_up = 0
|
||||||
|
try:
|
||||||
|
number_down = dict_d[i]
|
||||||
|
except KeyError:
|
||||||
number_down = 0
|
number_down = 0
|
||||||
self.assertEqual(count_y[i], number_down + number_up)
|
self.assertEqual(count_y[i], number_down + number_up)
|
||||||
# Is the partition made the same as the prediction?
|
# Is the partition made the same as the prediction?
|
||||||
@@ -77,14 +81,22 @@ class Stree_test(unittest.TestCase):
|
|||||||
"""Check if the tree is built the same way as predictions of models"""
|
"""Check if the tree is built the same way as predictions of models"""
|
||||||
warnings.filterwarnings("ignore")
|
warnings.filterwarnings("ignore")
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="sigmoid",
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
self._check_tree(clf.tree_)
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
def test_single_prediction(self):
|
def test_single_prediction(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||||
self.assertEqual(yp[0], y[0])
|
self.assertEqual(yp[0], y[0])
|
||||||
|
|
||||||
@@ -92,8 +104,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
# First 27 elements the predictions are the same as the truth
|
# First 27 elements the predictions are the same as the truth
|
||||||
num = 27
|
num = 27
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in ["liblinear", "linear", "rbf", "poly"]:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
yp = clf.fit(X, y).predict(X[:num, :])
|
yp = clf.fit(X, y).predict(X[:num, :])
|
||||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||||
|
|
||||||
@@ -103,7 +119,11 @@ class Stree_test(unittest.TestCase):
|
|||||||
"""
|
"""
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel=kernel,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
# Compute prediction line by line
|
# Compute prediction line by line
|
||||||
yp_line = np.array([], dtype=int)
|
yp_line = np.array([], dtype=int)
|
||||||
@@ -135,9 +155,13 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
computed = []
|
computed = []
|
||||||
expected_string = ""
|
expected_string = ""
|
||||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
clf.fit(*load_dataset(self._random_state))
|
clf.fit(*load_dataset(self._random_state))
|
||||||
for node in clf:
|
for node in iter(clf):
|
||||||
computed.append(str(node))
|
computed.append(str(node))
|
||||||
expected_string += str(node) + "\n"
|
expected_string += str(node) + "\n"
|
||||||
self.assertListEqual(expected, computed)
|
self.assertListEqual(expected, computed)
|
||||||
@@ -173,7 +197,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_check_max_depth(self):
|
def test_check_max_depth(self):
|
||||||
depths = (3, 4)
|
depths = (3, 4)
|
||||||
for depth in depths:
|
for depth in depths:
|
||||||
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
tcl = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
max_depth=depth,
|
||||||
|
)
|
||||||
tcl.fit(*load_dataset(self._random_state))
|
tcl.fit(*load_dataset(self._random_state))
|
||||||
self.assertEqual(depth, tcl.depth_)
|
self.assertEqual(depth, tcl.depth_)
|
||||||
|
|
||||||
@@ -194,7 +223,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
split_criteria="max_samples",
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
px = [[1, 2], [5, 6], [9, 10]]
|
px = [[1, 2], [5, 6], [9, 10]]
|
||||||
@@ -205,26 +234,36 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertListEqual(py, clf.classes_.tolist())
|
self.assertListEqual(py, clf.classes_.tolist())
|
||||||
|
|
||||||
def test_muticlass_dataset(self):
|
def test_muticlass_dataset(self):
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
datasets = {
|
datasets = {
|
||||||
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
||||||
"Iris": load_wine(return_X_y=True),
|
"Iris": load_wine(return_X_y=True),
|
||||||
}
|
}
|
||||||
outcomes = {
|
outcomes = {
|
||||||
"Synt": {
|
"Synt": {
|
||||||
"max_samples linear": 0.9606666666666667,
|
"max_samples liblinear": 0.9493333333333334,
|
||||||
"max_samples rbf": 0.7133333333333334,
|
"max_samples linear": 0.9426666666666667,
|
||||||
"max_samples poly": 0.618,
|
"max_samples rbf": 0.9606666666666667,
|
||||||
"impurity linear": 0.9606666666666667,
|
"max_samples poly": 0.9373333333333334,
|
||||||
"impurity rbf": 0.7133333333333334,
|
"max_samples sigmoid": 0.824,
|
||||||
"impurity poly": 0.618,
|
"impurity liblinear": 0.9493333333333334,
|
||||||
|
"impurity linear": 0.9426666666666667,
|
||||||
|
"impurity rbf": 0.9606666666666667,
|
||||||
|
"impurity poly": 0.9373333333333334,
|
||||||
|
"impurity sigmoid": 0.824,
|
||||||
},
|
},
|
||||||
"Iris": {
|
"Iris": {
|
||||||
|
"max_samples liblinear": 0.9550561797752809,
|
||||||
"max_samples linear": 1.0,
|
"max_samples linear": 1.0,
|
||||||
"max_samples rbf": 0.6910112359550562,
|
"max_samples rbf": 0.6685393258426966,
|
||||||
"max_samples poly": 0.6966292134831461,
|
"max_samples poly": 0.6853932584269663,
|
||||||
"impurity linear": 1,
|
"max_samples sigmoid": 0.6404494382022472,
|
||||||
"impurity rbf": 0.6910112359550562,
|
"impurity liblinear": 0.9550561797752809,
|
||||||
"impurity poly": 0.6966292134831461,
|
"impurity linear": 1.0,
|
||||||
|
"impurity rbf": 0.6685393258426966,
|
||||||
|
"impurity poly": 0.6853932584269663,
|
||||||
|
"impurity sigmoid": 0.6404494382022472,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,18 +272,22 @@ class Stree_test(unittest.TestCase):
|
|||||||
for criteria in ["max_samples", "impurity"]:
|
for criteria in ["max_samples", "impurity"]:
|
||||||
for kernel in self._kernels:
|
for kernel in self._kernels:
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
C=55,
|
max_iter=1e4,
|
||||||
max_iter=1e5,
|
multiclass_strategy="ovr"
|
||||||
|
if kernel == "liblinear"
|
||||||
|
else "ovo",
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
)
|
)
|
||||||
clf.fit(px, py)
|
clf.fit(px, py)
|
||||||
outcome = outcomes[name][f"{criteria} {kernel}"]
|
outcome = outcomes[name][f"{criteria} {kernel}"]
|
||||||
# print(
|
# print(f'"{criteria} {kernel}": {clf.score(px, py)},')
|
||||||
# f"{name} {criteria} {kernel} {outcome} {clf.score(px"
|
self.assertAlmostEqual(
|
||||||
# ", py)}"
|
outcome,
|
||||||
# )
|
clf.score(px, py),
|
||||||
self.assertAlmostEqual(outcome, clf.score(px, py))
|
5,
|
||||||
|
f"{name} - {criteria} - {kernel}",
|
||||||
|
)
|
||||||
|
|
||||||
def test_max_features(self):
|
def test_max_features(self):
|
||||||
n_features = 16
|
n_features = 16
|
||||||
@@ -269,6 +312,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
_ = clf._initialize_max_features()
|
_ = clf._initialize_max_features()
|
||||||
|
|
||||||
|
def test_wrong_max_features(self):
|
||||||
|
X, y = load_dataset(n_features=15)
|
||||||
|
clf = Stree(max_features=16)
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.fit(X, y)
|
||||||
|
|
||||||
def test_get_subspaces(self):
|
def test_get_subspaces(self):
|
||||||
dataset = np.random.random((10, 16))
|
dataset = np.random.random((10, 16))
|
||||||
y = np.random.randint(0, 2, 10)
|
y = np.random.randint(0, 2, 10)
|
||||||
@@ -306,17 +355,19 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf.predict(X[:, :3])
|
clf.predict(X[:, :3])
|
||||||
|
|
||||||
# Tests of score
|
# Tests of score
|
||||||
|
|
||||||
def test_score_binary(self):
|
def test_score_binary(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
accuracies = [
|
accuracies = [
|
||||||
0.9506666666666667,
|
0.9506666666666667,
|
||||||
|
0.9493333333333334,
|
||||||
0.9606666666666667,
|
0.9606666666666667,
|
||||||
0.9433333333333334,
|
0.9433333333333334,
|
||||||
|
0.9153333333333333,
|
||||||
]
|
]
|
||||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||||
clf = Stree(
|
clf = Stree(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
|
multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
|
||||||
kernel=kernel,
|
kernel=kernel,
|
||||||
)
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
@@ -328,7 +379,12 @@ class Stree_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_score_max_features(self):
|
def test_score_max_features(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
clf = Stree(random_state=self._random_state, max_features=2)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
max_features=2,
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
|
||||||
|
|
||||||
@@ -340,7 +396,9 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_multiclass_classifier_integrity(self):
|
def test_multiclass_classifier_integrity(self):
|
||||||
"""Checks if the multiclass operation is done right"""
|
"""Checks if the multiclass operation is done right"""
|
||||||
X, y = load_iris(return_X_y=True)
|
X, y = load_iris(return_X_y=True)
|
||||||
clf = Stree(random_state=0)
|
clf = Stree(
|
||||||
|
kernel="liblinear", multiclass_strategy="ovr", random_state=0
|
||||||
|
)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
score = clf.score(X, y)
|
score = clf.score(X, y)
|
||||||
# Check accuracy of the whole model
|
# Check accuracy of the whole model
|
||||||
@@ -396,10 +454,10 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="rbf", random_state=self._random_state, normalize=True
|
kernel="rbf", random_state=self._random_state, normalize=True
|
||||||
)
|
)
|
||||||
self.assertEqual(0.768, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.966, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.964, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_poly(self):
|
def test_score_multiclass_poly(self):
|
||||||
@@ -417,24 +475,78 @@ class Stree_test(unittest.TestCase):
|
|||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
normalize=True,
|
normalize=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.946, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.972, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y))
|
||||||
self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_liblinear(self):
|
||||||
|
X, y = load_dataset(
|
||||||
|
random_state=self._random_state,
|
||||||
|
n_classes=3,
|
||||||
|
n_features=5,
|
||||||
|
n_samples=500,
|
||||||
|
)
|
||||||
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
C=10,
|
||||||
|
)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
|
)
|
||||||
|
self.assertEqual(0.968, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.97, clf2.fit(X, y).score(X, y))
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
self.assertEqual(1.0, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_sigmoid(self):
|
||||||
|
X, y = load_dataset(
|
||||||
|
random_state=self._random_state,
|
||||||
|
n_classes=3,
|
||||||
|
n_features=5,
|
||||||
|
n_samples=500,
|
||||||
|
)
|
||||||
|
clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="sigmoid",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
|
C=10,
|
||||||
|
)
|
||||||
|
self.assertEqual(0.796, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.952, clf2.fit(X, y).score(X, y))
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_linear(self):
|
def test_score_multiclass_linear(self):
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_classes=3,
|
n_classes=3,
|
||||||
n_features=5,
|
n_features=5,
|
||||||
n_samples=1500,
|
n_samples=1500,
|
||||||
)
|
)
|
||||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
clf = Stree(
|
||||||
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
)
|
||||||
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
||||||
# Check with context based standardization
|
# Check with context based standardization
|
||||||
clf2 = Stree(
|
clf2 = Stree(
|
||||||
kernel="linear", random_state=self._random_state, normalize=True
|
kernel="liblinear",
|
||||||
|
multiclass_strategy="ovr",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
)
|
)
|
||||||
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
@@ -461,7 +573,7 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
)
|
)
|
||||||
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
|
||||||
yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5])
|
yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
|
||||||
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
|
||||||
model1 = Stree().fit(X, y)
|
model1 = Stree().fit(X, y)
|
||||||
model2 = Stree().fit(X, y, w)
|
model2 = Stree().fit(X, y, w)
|
||||||
@@ -498,14 +610,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(25, nodes)
|
self.assertEqual(31, nodes)
|
||||||
self.assertEqual(13, leaves)
|
self.assertEqual(16, leaves)
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
clf = Stree(random_state=self._random_state)
|
clf = Stree(random_state=self._random_state)
|
||||||
clf.fit(X, y)
|
clf.fit(X, y)
|
||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(9, nodes)
|
self.assertEqual(11, nodes)
|
||||||
self.assertEqual(5, leaves)
|
self.assertEqual(6, leaves)
|
||||||
|
|
||||||
def test_nodes_leaves_artificial(self):
|
def test_nodes_leaves_artificial(self):
|
||||||
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
||||||
@@ -524,3 +636,27 @@ class Stree_test(unittest.TestCase):
|
|||||||
nodes, leaves = clf.nodes_leaves()
|
nodes, leaves = clf.nodes_leaves()
|
||||||
self.assertEqual(6, nodes)
|
self.assertEqual(6, nodes)
|
||||||
self.assertEqual(2, leaves)
|
self.assertEqual(2, leaves)
|
||||||
|
|
||||||
|
def test_bogus_multiclass_strategy(self):
|
||||||
|
clf = Stree(multiclass_strategy="other")
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.fit(X, y)
|
||||||
|
|
||||||
|
def test_multiclass_strategy(self):
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
clf_o = Stree(multiclass_strategy="ovo")
|
||||||
|
clf_r = Stree(multiclass_strategy="ovr")
|
||||||
|
score_o = clf_o.fit(X, y).score(X, y)
|
||||||
|
score_r = clf_r.fit(X, y).score(X, y)
|
||||||
|
self.assertEqual(1.0, score_o)
|
||||||
|
self.assertEqual(0.9269662921348315, score_r)
|
||||||
|
|
||||||
|
def test_incompatible_hyperparameters(self):
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
clf = Stree(kernel="liblinear", multiclass_strategy="ovo")
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.fit(X, y)
|
||||||
|
clf = Stree(multiclass_strategy="ovo", split_criteria="max_samples")
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
clf.fit(X, y)
|
||||||
|
Reference in New Issue
Block a user