Compare commits

6 Commits

d46f544466  2021-05-11 19:11:03 +02:00
    Add docs config
    Update setup: remove the ipympl dependency
    Update project name
    Add build target to Makefile

79190ef2e1  2021-05-11 09:03:26 +02:00
    Add doc-clean target and lgtm badge

4f04e72670  2021-05-10 12:16:53 +02:00  Ricardo Montañana Gómez
    Implement ovo strategy (#37)
    * Implement ovo strategy
    * Set ovo strategy as default
    * Add kernel liblinear with LinearSVC classifier
    * Fix weak test

5cef0f4875  2021-05-01 23:38:34 +02:00
    Implement splitter type mutual info

28c7558f01  2021-04-27 23:15:21 +02:00
    Update Readme
    Add max_features > n_features test
    Add make doc

e19d10f6a7  2021-04-26 09:10:01 +02:00  Ricardo Montañana Gómez
    Package doc #7 (#34)
    * Add first doc info to sources
    * Update doc to separate classes in api
    * Refactor build_predictor
    * Fix random_state issue in non linear kernels
    * Refactor score method using base class implementation
    * Some quality refactoring
    * Fix codecov config
    * Add sigmoid kernel
    * Refactor setup and add Makefile
13 changed files with 342 additions and 119 deletions


@@ -1,6 +1,6 @@
 SHELL := /bin/bash
 .DEFAULT_GOAL := help
-.PHONY: coverage deps help lint push test
+.PHONY: coverage deps help lint push test doc build
 coverage: ## Run tests with coverage
 	coverage erase
@@ -21,6 +21,16 @@ push: ## Push code with tags
 test: ## Run tests
 	python -m unittest -v stree.tests
+
+doc: ## Update documentation
+	make -C docs --makefile=Makefile html
+
+build: ## Build package
+	rm -fr dist/*
+	python setup.py sdist bdist_wheel
+
+doc-clean: ## Clean documentation
+	make -C docs --makefile=Makefile clean
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \


@@ -1,8 +1,9 @@
 ![CI](https://github.com/Doctorado-ML/STree/workflows/CI/badge.svg)
 [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
+[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/STree.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
 
-# Stree
+# STree
 
 Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
@@ -34,22 +35,23 @@ Can be found in
 ## Hyperparameters
 
 |     | **Hyperparameter**  | **Type/Values** | **Default** | **Meaning** |
 | --- | --- | --- | --- | --- |
 | \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of linear, poly or rbf. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of liblinear, linear, poly, rbf or sigmoid. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
 | \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
 | \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
 |     | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
 | \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
 | \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (poly). Ignored by all other kernels. |
 | \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for rbf and poly.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if auto, uses 1 / n_features. |
-|     | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
+|     | split_criteria | {"impurity", "max_samples"} | impurity | Decides (only in multiclass classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the "ovo" multiclass_strategy |
 |     | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
 |     | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
 |     | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-|     | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
+|     | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses one randomly. **"mutual"**: chooses the best features w.r.t. their mutual info with the label |
 |     | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets. **"ovo"**: one versus one. **"ovr"**: one versus rest |
 
 \* Hyperparameter used by the support vector classifier of every node
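As a quick orientation for the options introduced in this comparison, here is a minimal usage sketch; the wine dataset is illustrative, only the hyperparameter names and values come from the table above:

```python
from sklearn.datasets import load_wine
from stree import Stree

X, y = load_wine(return_X_y=True)

# Default multiclass handling: one-versus-one SVC nodes
ovo = Stree(kernel="rbf", multiclass_strategy="ovo", random_state=0)

# The liblinear kernel builds LinearSVC nodes and requires "ovr"
ovr = Stree(kernel="liblinear", multiclass_strategy="ovr", random_state=0)

# The new "mutual" splitter ranks features by mutual information per node
mut = Stree(splitter="mutual", max_features="sqrt", random_state=0)

for clf in (ovo, ovr, mut):
    print(clf.fit(X, y).score(X, y))
```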


@@ -1,3 +1,4 @@
 sphinx
 sphinx-rtd-theme
 myst-parser
+git+https://github.com/doctorado-ml/stree


@@ -12,6 +12,7 @@
 #
 import os
 import sys
+import stree
 
 sys.path.insert(0, os.path.abspath("../../stree/"))
@@ -23,7 +24,8 @@ copyright = "2020 - 2021, Ricardo Montañana Gómez"
 author = "Ricardo Montañana Gómez"
 
 # The full version, including alpha/beta/rc tags
-release = "1.0"
+version = stree.__version__
+release = version
 
 # -- General configuration ---------------------------------------------------


@@ -1,21 +1,22 @@
-# Hyperparameters
+## Hyperparameters
 
 |     | **Hyperparameter**  | **Type/Values** | **Default** | **Meaning** |
 | --- | --- | --- | --- | --- |
 | \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of linear, poly or rbf. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of liblinear, linear, poly, rbf or sigmoid. liblinear uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library and the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
 | \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
 | \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.<br>Pass an int for reproducible output across multiple function calls |
 |     | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
 | \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
 | \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (poly). Ignored by all other kernels. |
 | \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for rbf and poly.<br>if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,<br>if auto, uses 1 / n_features. |
-|     | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* |
+|     | split_criteria | {"impurity", "max_samples"} | impurity | Decides (only in multiclass classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the "ovo" multiclass_strategy |
 |     | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features). <br>Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
 |     | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
 |     | max_features | \<int\>, \<float\> <br><br>or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:<br>If int, then consider max_features features at each split.<br>If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.<br>If “auto”, then max_features=sqrt(n_features).<br>If “sqrt”, then max_features=sqrt(n_features).<br>If “log2”, then max_features=log2(n_features).<br>If None, then max_features=n_features. |
-|     | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features). <br>Supported strategies are “best” to choose the best feature set and “random” to choose a random combination. <br>The algorithm generates 5 candidates at most to choose from in both strategies. |
+|     | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **"best"**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **"random"**: the algorithm generates 5 candidates and chooses one randomly. **"mutual"**: chooses the best features w.r.t. their mutual info with the label |
 |     | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets. **"ovo"**: one versus one. **"ovr"**: one versus rest |
 
 \* Hyperparameter used by the support vector classifier of every node


@@ -1,8 +1,9 @@
-# Stree
+# STree
 
 [![Codeship Status for Doctorado-ML/STree](https://app.codeship.com/projects/8b2bd350-8a1b-0138-5f2c-3ad36f3eb318/status?branch=master)](https://app.codeship.com/projects/399170)
 [![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
+[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/STree.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/STree/context:python)
 
 Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.


@@ -1,5 +1,4 @@
 import setuptools
-import stree
 
 
 def readme():
@@ -7,29 +6,45 @@ def readme():
         return f.read()
 
 
-VERSION = stree.__version__
+def get_data(field):
+    item = ""
+    with open("stree/__init__.py") as f:
+        for line in f.readlines():
+            if line.startswith(f"__{field}__"):
+                delim = '"' if '"' in line else "'"
+                item = line.split(delim)[1]
+                break
+        else:
+            raise RuntimeError(f"Unable to find {field} string.")
+    return item
 
 
 setuptools.setup(
     name="STree",
-    version=stree.__version__,
-    license=stree.__license__,
+    version=get_data("version"),
+    license=get_data("license"),
     description="Oblique decision tree with svm nodes",
     long_description=readme(),
    long_description_content_type="text/markdown",
     packages=setuptools.find_packages(),
-    url=stree.__url__,
-    author=stree.__author__,
-    author_email=stree.__author_email__,
+    url="https://github.com/Doctorado-ML/STree#stree",
+    project_urls={
+        "Code": "https://github.com/Doctorado-ML/STree",
+        "Documentation": "https://stree.readthedocs.io/en/latest/index.html",
+    },
+    author=get_data("author"),
+    author_email=get_data("author_email"),
     keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
 tree svm svc",
     classifiers=[
         "Development Status :: 5 - Production/Stable",
-        "License :: OSI Approved :: " + stree.__license__,
+        "License :: OSI Approved :: " + get_data("license"),
         "Programming Language :: Python :: 3.8",
         "Natural Language :: English",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
         "Intended Audience :: Science/Research",
     ],
-    install_requires=["scikit-learn", "numpy", "ipympl"],
+    install_requires=["scikit-learn", "numpy"],
     test_suite="stree.tests",
     zip_safe=False,
 )
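setup.py now scrapes its metadata out of stree/__init__.py with get_data instead of importing stree, so building the package no longer requires its dependencies to be importable. A standalone sketch of the same parsing idea; the helper name and sample lines here are illustrative, not part of the patch:

```python
# Extract the value of a dunder assignment such as: __version__ = "1.1"
def parse_dunder(line: str) -> str:
    # Pick whichever quote style the line uses, then take the quoted value
    delim = '"' if '"' in line else "'"
    return line.split(delim)[1]

assert parse_dunder('__version__ = "1.1"') == "1.1"
assert parse_dunder("__license__ = 'MIT License'") == "MIT License"
```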

stree/.readthedocs.yaml (new file)

@@ -0,0 +1,10 @@
+version: 2
+
+sphinx:
+  configuration: docs/source/conf.py
+
+python:
+  version: 3.8
+  install:
+    - requirements: requirements.txt
+    - requirements: docs/requirements.txt


@@ -11,7 +11,7 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
-from sklearn.feature_selection import SelectKBest
+from sklearn.feature_selection import SelectKBest, mutual_info_classif
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
@@ -155,6 +155,10 @@ class Siterator:
         self._stack = []
         self._push(tree)
 
+    def __iter__(self):
+        # To complete the iterator interface
+        return self
+
     def _push(self, node: Snode):
         if node is not None:
             self._stack.append(node)
@@ -205,9 +209,9 @@ class Splitter:
                 f"criteria has to be max_samples or impurity; got ({criteria})"
             )
 
-        if feature_select not in ["random", "best"]:
+        if feature_select not in ["random", "best", "mutual"]:
             raise ValueError(
-                "splitter must be either random or best, got "
+                "splitter must be in {random, best, mutual} got "
                 f"({feature_select})"
             )
         self.criterion_function = getattr(self, f"_{self._criterion}")
@@ -373,19 +377,28 @@ class Splitter:
         tuple
             indices of the features selected
         """
+        # No feature reduction
         if dataset.shape[1] == max_features:
-            # No feature reduction applies
             return tuple(range(dataset.shape[1]))
+        # Random feature reduction
         if self._feature_select == "random":
             features_sets = self._generate_spaces(
                 dataset.shape[1], max_features
             )
             return self._select_best_set(dataset, labels, features_sets)
-        # Take KBest features
-        return (
-            SelectKBest(k=max_features)
-            .fit(dataset, labels)
-            .get_support(indices=True)
-        )
+        # return the KBest features
+        if self._feature_select == "best":
+            return (
+                SelectKBest(k=max_features)
+                .fit(dataset, labels)
+                .get_support(indices=True)
+            )
+        # return best features with mutual info with the label
+        feature_list = mutual_info_classif(dataset, labels)
+        return tuple(
+            sorted(
+                range(len(feature_list)), key=lambda sub: feature_list[sub]
+            )[-max_features:]
+        )
 
     def get_subspace(
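The new "mutual" branch above keeps the max_features features with the highest mutual information with the label. A standalone sketch of the same score-then-take-top-k idea, outside the patch; the wine data and the printed indices are illustrative:

```python
from sklearn.datasets import load_wine
from sklearn.feature_selection import mutual_info_classif

X, y = load_wine(return_X_y=True)
max_features = 4

# Score every feature against the label, then keep the k highest-scoring indices
scores = mutual_info_classif(X, y)
best = sorted(range(len(scores)), key=lambda i: scores[i])[-max_features:]
print(sorted(best))  # should match the [6, 9, 11, 12] rows added to Splitter_test below
```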
@@ -561,6 +574,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         min_samples_split: int = 0,
         max_features=None,
         splitter: str = "random",
+        multiclass_strategy: str = "ovo",
         normalize: bool = False,
     ):
         self.max_iter = max_iter
@@ -577,6 +591,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.criterion = criterion
         self.splitter = splitter
         self.normalize = normalize
+        self.multiclass_strategy = multiclass_strategy
 
     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -621,7 +636,23 @@ class Stree(BaseEstimator, ClassifierMixin):
                 f"Maximum depth has to be greater than 1... got (max_depth=\
 {self.max_depth})"
             )
-        kernels = ["linear", "rbf", "poly", "sigmoid"]
+        if self.multiclass_strategy not in ["ovr", "ovo"]:
+            raise ValueError(
+                "multiclass_strategy has to be either ovr or ovo"
+                f" but got {self.multiclass_strategy}"
+            )
+        if self.multiclass_strategy == "ovo":
+            if self.kernel == "liblinear":
+                raise ValueError(
+                    "The kernel liblinear is incompatible with ovo "
+                    "multiclass_strategy"
+                )
+            if self.split_criteria == "max_samples":
+                raise ValueError(
+                    "The multiclass_strategy 'ovo' is incompatible with "
+                    "split_criteria 'max_samples'"
+                )
+        kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
         if self.kernel not in kernels:
             raise ValueError(f"Kernel {self.kernel} not in {kernels}")
         check_classification_targets(y)
@@ -653,12 +684,12 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.n_features_ = X.shape[1]
         self.n_features_in_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
-        self.tree_ = self.train(X, y, sample_weight, 1, "root")
+        self.tree_ = self._train(X, y, sample_weight, 1, "root")
         self.X_ = X
         self.y_ = y
         return self
 
-    def train(
+    def _train(
         self,
         X: np.ndarray,
         y: np.ndarray,
@@ -723,10 +754,10 @@ class Stree(BaseEstimator, ClassifierMixin):
             node.make_predictor()
             return node
         node.set_up(
-            self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
+            self._train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
         )
         node.set_down(
-            self.train(
+            self._train(
                 X_D, y_d, sw_d, depth + 1, title + f" - Down({depth+1})"
             )
         )
@@ -741,7 +772,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                 C=self.C,
                 tol=self.tol,
             )
-            if self.kernel == "linear"
+            if self.kernel == "liblinear"
             else SVC(
                 kernel=self.kernel,
                 max_iter=self.max_iter,
@@ -750,6 +781,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                 gamma=self.gamma,
                 degree=self.degree,
                 random_state=self.random_state,
+                decision_function_shape=self.multiclass_strategy,
             )
         )
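The multiclass_strategy value flows straight into scikit-learn: the liblinear kernel builds LinearSVC nodes, which are inherently one-versus-rest, while SVC nodes receive it as decision_function_shape. A minimal sketch of the underlying scikit-learn behaviour (plain sklearn, not STree code; digits is used only because it has 10 classes):

```python
from sklearn.datasets import load_digits
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)  # 10 classes

# "ovr" yields one decision-function column per class;
# "ovo" yields one column per pair of classes
for shape in ("ovr", "ovo"):
    clf = SVC(kernel="rbf", decision_function_shape=shape).fit(X, y)
    print(shape, clf.decision_function(X).shape)  # (1797, 10) then (1797, 45)
```

That difference in columns is presumably why split_criteria="max_samples", which picks a class column to split on, is rejected together with the "ovo" strategy in the validation above.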
@@ -892,6 +924,12 @@ class Stree(BaseEstimator, ClassifierMixin):
         elif self.max_features is None:
             max_features = self.n_features_
         elif isinstance(self.max_features, numbers.Integral):
+            if self.max_features > self.n_features_:
+                raise ValueError(
+                    "Invalid value for max_features. "
+                    "It can not be greater than number of features "
+                    f"({self.n_features_})"
+                )
             max_features = self.max_features
         else:  # float
             if self.max_features > 0.0:


@@ -1,11 +1,10 @@
 from .Strees import Stree, Snode, Siterator, Splitter
 
-__version__ = "1.0"
+__version__ = "1.1"
 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
 __license__ = "MIT License"
 __author_email__ = "ricardo.montanana@alu.uclm.es"
-__url__ = "https://github.com/doctorado-ml/stree"
 
 __all__ = ["Stree", "Snode", "Siterator", "Splitter"]


@@ -8,7 +8,11 @@ from .utils import load_dataset
 class Snode_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._clf = Stree(random_state=self._random_state)
+        self._clf = Stree(
+            random_state=self._random_state,
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+        )
         self._clf.fit(*load_dataset(self._random_state))
         super().__init__(*args, **kwargs)


@@ -195,10 +195,14 @@ class Splitter_test(unittest.TestCase):
             [0, 3, 7, 12],  # random entropy impurity
             [1, 7, 9, 12],  # random gini max_samples
             [1, 5, 8, 12],  # random gini impurity
+            [6, 9, 11, 12],  # mutual entropy max_samples
+            [6, 9, 11, 12],  # mutual entropy impurity
+            [6, 9, 11, 12],  # mutual gini max_samples
+            [6, 9, 11, 12],  # mutual gini impurity
         ]
         X, y = load_wine(return_X_y=True)
         rn = 0
-        for feature_select in ["best", "random"]:
+        for feature_select in ["best", "random", "mutual"]:
             for criterion in ["entropy", "gini"]:
                 for criteria in [
                     "max_samples",
@@ -221,7 +225,7 @@ class Splitter_test(unittest.TestCase):
                 #         criteria,
                 #     )
                 # )
-                self.assertListEqual(expected, list(computed))
+                self.assertListEqual(expected, sorted(list(computed)))
                 self.assertListEqual(
                     X[:, computed].tolist(), dataset.tolist()
                 )


@@ -14,7 +14,7 @@ from .utils import load_dataset
 class Stree_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._kernels = ["linear", "rbf", "poly"]
+        self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
         super().__init__(*args, **kwargs)
 
     @classmethod
@@ -22,10 +22,9 @@ class Stree_test(unittest.TestCase):
         os.environ["TESTING"] = "1"
 
     def test_valid_kernels(self):
-        valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
         X, y = load_dataset()
-        for kernel in valid_kernels:
-            clf = Stree(kernel=kernel)
+        for kernel in self._kernels:
+            clf = Stree(kernel=kernel, multiclass_strategy="ovr")
             clf.fit(X, y)
             self.assertIsNotNone(clf.tree_)
@@ -55,14 +54,19 @@ class Stree_test(unittest.TestCase):
         # i.e. The partition algorithm didn't forget any sample
         self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
         unique_y, count_y = np.unique(node._y, return_counts=True)
-        _, count_d = np.unique(y_down, return_counts=True)
-        _, count_u = np.unique(y_up, return_counts=True)
+        labels_d, count_d = np.unique(y_down, return_counts=True)
+        labels_u, count_u = np.unique(y_up, return_counts=True)
+        dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
+        dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
         #
         for i in unique_y:
-            number_up = count_u[i]
             try:
-                number_down = count_d[i]
-            except IndexError:
+                number_up = dict_u[i]
+            except KeyError:
+                number_up = 0
+            try:
+                number_down = dict_d[i]
+            except KeyError:
                 number_down = 0
             self.assertEqual(count_y[i], number_down + number_up)
         # Is the partition made the same as the prediction?
@@ -77,14 +81,22 @@ class Stree_test(unittest.TestCase):
         """Check if the tree is built the same way as predictions of models"""
         warnings.filterwarnings("ignore")
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel="sigmoid",
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(*load_dataset(self._random_state))
             self._check_tree(clf.tree_)
 
     def test_single_prediction(self):
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
             self.assertEqual(yp[0], y[0])
@@ -92,8 +104,12 @@ class Stree_test(unittest.TestCase):
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = load_dataset(self._random_state)
-        for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+        for kernel in ["liblinear", "linear", "rbf", "poly"]:
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict(X[:num, :])
             self.assertListEqual(y[:num].tolist(), yp.tolist())
@@ -103,7 +119,11 @@ class Stree_test(unittest.TestCase):
         """
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(X, y)
             # Compute prediction line by line
             yp_line = np.array([], dtype=int)
@@ -135,9 +155,13 @@ class Stree_test(unittest.TestCase):
         ]
         computed = []
         expected_string = ""
-        clf = Stree(kernel="linear", random_state=self._random_state)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+        )
         clf.fit(*load_dataset(self._random_state))
-        for node in clf:
+        for node in iter(clf):
             computed.append(str(node))
             expected_string += str(node) + "\n"
         self.assertListEqual(expected, computed)
@@ -173,7 +197,12 @@ class Stree_test(unittest.TestCase):
     def test_check_max_depth(self):
         depths = (3, 4)
         for depth in depths:
-            tcl = Stree(random_state=self._random_state, max_depth=depth)
+            tcl = Stree(
+                kernel="liblinear",
+                multiclass_strategy="ovr",
+                random_state=self._random_state,
+                max_depth=depth,
+            )
             tcl.fit(*load_dataset(self._random_state))
             self.assertEqual(depth, tcl.depth_)
@@ -194,7 +223,7 @@ class Stree_test(unittest.TestCase):
         for kernel in self._kernels:
             clf = Stree(
                 kernel=kernel,
-                split_criteria="max_samples",
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
                 random_state=self._random_state,
             )
             px = [[1, 2], [5, 6], [9, 10]]
@@ -205,26 +234,36 @@ class Stree_test(unittest.TestCase):
         self.assertListEqual(py, clf.classes_.tolist())
 
     def test_muticlass_dataset(self):
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         datasets = {
             "Synt": load_dataset(random_state=self._random_state, n_classes=3),
             "Iris": load_wine(return_X_y=True),
         }
         outcomes = {
             "Synt": {
-                "max_samples linear": 0.9606666666666667,
-                "max_samples rbf": 0.7133333333333334,
-                "max_samples poly": 0.618,
-                "impurity linear": 0.9606666666666667,
-                "impurity rbf": 0.7133333333333334,
-                "impurity poly": 0.618,
+                "max_samples liblinear": 0.9493333333333334,
+                "max_samples linear": 0.9426666666666667,
+                "max_samples rbf": 0.9606666666666667,
+                "max_samples poly": 0.9373333333333334,
+                "max_samples sigmoid": 0.824,
+                "impurity liblinear": 0.9493333333333334,
+                "impurity linear": 0.9426666666666667,
+                "impurity rbf": 0.9606666666666667,
+                "impurity poly": 0.9373333333333334,
+                "impurity sigmoid": 0.824,
             },
             "Iris": {
+                "max_samples liblinear": 0.9550561797752809,
                 "max_samples linear": 1.0,
-                "max_samples rbf": 0.6910112359550562,
-                "max_samples poly": 0.6966292134831461,
-                "impurity linear": 1,
-                "impurity rbf": 0.6910112359550562,
-                "impurity poly": 0.6966292134831461,
+                "max_samples rbf": 0.6685393258426966,
+                "max_samples poly": 0.6853932584269663,
+                "max_samples sigmoid": 0.6404494382022472,
+                "impurity liblinear": 0.9550561797752809,
+                "impurity linear": 1.0,
+                "impurity rbf": 0.6685393258426966,
+                "impurity poly": 0.6853932584269663,
+                "impurity sigmoid": 0.6404494382022472,
             },
         }
@@ -233,18 +272,22 @@ class Stree_test(unittest.TestCase):
             for criteria in ["max_samples", "impurity"]:
                 for kernel in self._kernels:
                     clf = Stree(
-                        C=55,
-                        max_iter=1e5,
+                        max_iter=1e4,
+                        multiclass_strategy="ovr"
+                        if kernel == "liblinear"
+                        else "ovo",
                         kernel=kernel,
                         random_state=self._random_state,
                     )
                     clf.fit(px, py)
                     outcome = outcomes[name][f"{criteria} {kernel}"]
-                    # print(
-                    #     f"{name} {criteria} {kernel} {outcome} {clf.score(px"
-                    #     ", py)}"
-                    # )
-                    self.assertAlmostEqual(outcome, clf.score(px, py))
+                    # print(f'"{criteria} {kernel}": {clf.score(px, py)},')
+                    self.assertAlmostEqual(
+                        outcome,
+                        clf.score(px, py),
+                        5,
+                        f"{name} - {criteria} - {kernel}",
+                    )
 
     def test_max_features(self):
         n_features = 16
@@ -269,6 +312,12 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             _ = clf._initialize_max_features()
 
+    def test_wrong_max_features(self):
+        X, y = load_dataset(n_features=15)
+        clf = Stree(max_features=16)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
     def test_get_subspaces(self):
         dataset = np.random.random((10, 16))
         y = np.random.randint(0, 2, 10)
@@ -306,17 +355,19 @@ class Stree_test(unittest.TestCase):
             clf.predict(X[:, :3])
 
     # Tests of score
 
     def test_score_binary(self):
         X, y = load_dataset(self._random_state)
         accuracies = [
             0.9506666666666667,
+            0.9493333333333334,
             0.9606666666666667,
             0.9433333333333334,
+            0.9153333333333333,
         ]
         for kernel, accuracy_expected in zip(self._kernels, accuracies):
             clf = Stree(
                 random_state=self._random_state,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
                 kernel=kernel,
             )
             clf.fit(X, y)
@@ -328,7 +379,12 @@ class Stree_test(unittest.TestCase):
     def test_score_max_features(self):
         X, y = load_dataset(self._random_state)
-        clf = Stree(random_state=self._random_state, max_features=2)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            max_features=2,
+        )
         clf.fit(X, y)
         self.assertAlmostEqual(0.9453333333333334, clf.score(X, y))
@@ -340,7 +396,9 @@ class Stree_test(unittest.TestCase):
     def test_multiclass_classifier_integrity(self):
         """Checks if the multiclass operation is done right"""
         X, y = load_iris(return_X_y=True)
-        clf = Stree(random_state=0)
+        clf = Stree(
+            kernel="liblinear", multiclass_strategy="ovr", random_state=0
+        )
         clf.fit(X, y)
         score = clf.score(X, y)
         # Check accuracy of the whole model
@@ -396,10 +454,10 @@ class Stree_test(unittest.TestCase):
         clf2 = Stree(
             kernel="rbf", random_state=self._random_state, normalize=True
         )
-        self.assertEqual(0.768, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.966, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.964, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
-        self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y))
         self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_poly(self):
def test_score_multiclass_poly(self): def test_score_multiclass_poly(self):
@@ -417,24 +475,78 @@ class Stree_test(unittest.TestCase):
             random_state=self._random_state,
             normalize=True,
         )
-        self.assertEqual(0.786, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.946, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.972, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
-        self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
-        self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
+        self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
+
+    def test_score_multiclass_liblinear(self):
+        X, y = load_dataset(
+            random_state=self._random_state,
+            n_classes=3,
+            n_features=5,
+            n_samples=500,
+        )
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            C=10,
+        )
+        clf2 = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            normalize=True,
+        )
+        self.assertEqual(0.968, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.97, clf2.fit(X, y).score(X, y))
+        X, y = load_wine(return_X_y=True)
+        self.assertEqual(1.0, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
+
+    def test_score_multiclass_sigmoid(self):
+        X, y = load_dataset(
+            random_state=self._random_state,
+            n_classes=3,
+            n_features=5,
+            n_samples=500,
+        )
+        clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10)
+        clf2 = Stree(
+            kernel="sigmoid",
+            random_state=self._random_state,
+            normalize=True,
+            C=10,
+        )
+        self.assertEqual(0.796, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.952, clf2.fit(X, y).score(X, y))
+        X, y = load_wine(return_X_y=True)
+        self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_linear(self):
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         X, y = load_dataset(
             random_state=self._random_state,
             n_classes=3,
             n_features=5,
             n_samples=1500,
         )
-        clf = Stree(kernel="linear", random_state=self._random_state)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+        )
         self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
         # Check with context based standardization
         clf2 = Stree(
-            kernel="linear", random_state=self._random_state, normalize=True
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+            normalize=True,
         )
         self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
@@ -461,7 +573,7 @@ class Stree_test(unittest.TestCase):
             ]
         )
         y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5])
-        yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5])
+        yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5])
         w = [1, 1, 1, 0, 0, 0, 1, 1, 1]
         model1 = Stree().fit(X, y)
         model2 = Stree().fit(X, y, w)
@@ -498,14 +610,14 @@ class Stree_test(unittest.TestCase):
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         nodes, leaves = clf.nodes_leaves()
-        self.assertEqual(25, nodes)
-        self.assertEqual(13, leaves)
+        self.assertEqual(31, nodes)
+        self.assertEqual(16, leaves)
         X, y = load_wine(return_X_y=True)
         clf = Stree(random_state=self._random_state)
         clf.fit(X, y)
         nodes, leaves = clf.nodes_leaves()
-        self.assertEqual(9, nodes)
-        self.assertEqual(5, leaves)
+        self.assertEqual(11, nodes)
+        self.assertEqual(6, leaves)
 
     def test_nodes_leaves_artificial(self):
         n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
@@ -524,3 +636,27 @@ class Stree_test(unittest.TestCase):
         nodes, leaves = clf.nodes_leaves()
         self.assertEqual(6, nodes)
         self.assertEqual(2, leaves)
+
+    def test_bogus_multiclass_strategy(self):
+        clf = Stree(multiclass_strategy="other")
+        X, y = load_wine(return_X_y=True)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
+    def test_multiclass_strategy(self):
+        X, y = load_wine(return_X_y=True)
+        clf_o = Stree(multiclass_strategy="ovo")
+        clf_r = Stree(multiclass_strategy="ovr")
+        score_o = clf_o.fit(X, y).score(X, y)
+        score_r = clf_r.fit(X, y).score(X, y)
+        self.assertEqual(1.0, score_o)
+        self.assertEqual(0.9269662921348315, score_r)
+
+    def test_incompatible_hyperparameters(self):
+        X, y = load_wine(return_X_y=True)
+        clf = Stree(kernel="liblinear", multiclass_strategy="ovo")
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+        clf = Stree(multiclass_strategy="ovo", split_criteria="max_samples")
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)