Compare commits

..

21 Commits

Author SHA1 Message Date
d1e30a3372 Refactor predict and score and make mypy --strict 2020-07-01 18:37:10 +02:00
fa001f97a4 First Approach 2020-06-28 02:46:20 +02:00
be552fdd6c Add test for getting 3 feature_sets in Splitter
Add ensemble notebook
2020-06-28 02:45:08 +02:00
5e3a8e3ec5 Change adaboost notebook 2020-06-27 23:34:15 +02:00
554ec03c32 Get only 3 sets for best split
Fix flaky test in Splitter_test
2020-06-27 18:29:40 +02:00
4b7e4a3fb0 better solution to the sklearn bagging problem
Add better tests
enhance .coveragerc
2020-06-26 11:22:45 +02:00
76723993fd Solve Warning class label not found when bagging 2020-06-25 13:07:50 +02:00
ecd0b86f4d Solve the mistake of min and max distance
The split criteria functions min and max distance return classes while
max_samples return distances positives and negatives to hyperplane of
the class with more samples in node
2020-06-17 00:13:52 +02:00
3e52a4746c Fix entroy and information_gain functions 2020-06-16 13:56:02 +02:00
Ricardo Montañana Gómez
a20e45e8e7 Merge pull request #10 from Doctorado-ML/add_subspaces
#2 Add subspaces
2020-06-15 11:30:53 +02:00
9334951d1b #2 Cosmetic and style updates 2020-06-15 11:09:11 +02:00
736ab7ef20 #2 update benchmark notebook 2020-06-15 10:33:51 +02:00
c94bc068bd #2 Refactor Stree & create Splitter
Add and test splitter parameter
2020-06-15 00:22:57 +02:00
502ee72799 #2 Add predict and score support
Add a test in features notebook
Show max_features in main.py
2020-06-14 14:00:21 +02:00
f1ee4de37b #2 - Add gini and entropy measures
rename get_dataset to load_dataset
add features and impurity to  __str__ of node
2020-06-14 03:08:55 +02:00
ae1c199e21 # 2 - add max_features parameters 2020-06-13 17:58:45 +02:00
1bfe273a70 Fix problem in _min_distance
Remove grapher (moved to another repo)
2020-06-12 00:50:25 +02:00
Ricardo Montañana Gómez
647d21bdb5 Merge pull request #9 from Doctorado-ML/add_multiclass
#6 Add multiclass
2020-06-11 16:30:16 +02:00
1d392d534f #6 - Update tests and codecov conf 2020-06-11 13:45:24 +02:00
f360a2640c #6 - Add multiclass support
Removed (by now) predict_proba. Created a notebook in jupyter
Added split_criteria parameter with min_distance and max_samples values
Refactor _distances
Refactor _split_criteria
Refactor _reorder_results
2020-06-11 13:10:52 +02:00
Ricardo Montañana Gómez
45510b43bc Merge pull request #5 from Doctorado-ML/add_kernels
#3 Add kernels to STree
2020-06-09 13:43:31 +02:00
20 changed files with 1495 additions and 1509 deletions

View File

@@ -10,5 +10,4 @@ exclude_lines =
if __name__ == .__main__.:
ignore_errors = True
omit =
stree/tests/*
stree/__init__.py

4
.gitignore vendored
View File

@@ -130,4 +130,6 @@ dmypy.json
.idea
.vscode
.pre-commit-config.yaml
.pre-commit-config.yaml
**.csv

View File

@@ -3,9 +3,6 @@ overage:
project:
default:
target: 90%
patch:
default:
target: 90%
comment:
layout: "reach, diff, flags, files"
behavior: default

87
main.py
View File

@@ -1,88 +1,29 @@
import time
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from stree import Stree
random_state = 1
X, y = load_iris(return_X_y=True)
def load_creditcard(n_examples=0):
import pandas as pd
import numpy as np
import random
df = pd.read_csv("data/creditcard.csv")
print(
"Fraud: {0:.3f}% {1}".format(
df.Class[df.Class == 1].count() * 100 / df.shape[0],
df.Class[df.Class == 1].count(),
)
)
print(
"Valid: {0:.3f}% {1}".format(
df.Class[df.Class == 0].count() * 100 / df.shape[0],
df.Class[df.Class == 0].count(),
)
)
y = np.expand_dims(df.Class.values, axis=1)
X = df.drop(["Class", "Time", "Amount"], axis=1).values
if n_examples > 0:
# Take first n_examples samples
X = X[:n_examples, :]
y = y[:n_examples, :]
else:
# Take all the positive samples with a number of random negatives
if n_examples < 0:
Xt = X[(y == 1).ravel()]
yt = y[(y == 1).ravel()]
indices = random.sample(range(X.shape[0]), -1 * n_examples)
X = np.append(Xt, X[indices], axis=0)
y = np.append(yt, y[indices], axis=0)
print("X.shape", X.shape, " y.shape", y.shape)
print(
"Fraud: {0:.3f}% {1}".format(
len(y[y == 1]) * 100 / X.shape[0], len(y[y == 1])
)
)
print(
"Valid: {0:.3f}% {1}".format(
len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])
)
)
Xtrain, Xtest, ytrain, ytest = train_test_split(
X,
y,
train_size=0.7,
shuffle=True,
random_state=random_state,
stratify=y,
)
return Xtrain, Xtest, ytrain, ytest
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
# data = load_creditcard(5000) # Take the first 5000 samples
data = load_creditcard() # Take all the samples
Xtrain = data[0]
Xtest = data[1]
ytrain = data[2]
ytest = data[3]
Xtrain, Xtest, ytrain, ytest = train_test_split(
X, y, test_size=0.2, random_state=random_state
)
now = time.time()
print("Predicting with max_features=sqrt(n_features)")
clf = Stree(C=0.01, random_state=random_state, max_features="auto")
clf.fit(Xtrain, ytrain)
print(f"Took {time.time() - now:.2f} seconds to train")
print(clf)
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
print("=" * 40)
print("Predicting with max_features=n_features")
clf = Stree(C=0.01, random_state=random_state)
clf.fit(Xtrain, ytrain)
print(f"Took {time.time() - now:.2f} seconds to train")
print(clf)
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
proba = clf.predict_proba(Xtest)
print(
"Checking that we have correct probabilities, these are probabilities of "
"sample belonging to class 1"
)
res0 = proba[proba[:, 0] == 0]
res1 = proba[proba[:, 0] == 1]
print("++++++++++res0 > .8++++++++++++")
print(res0[res0[:, 1] > 0.8])
print("**********res1 < .4************")
print(res1[res1[:, 1] < 0.4])

File diff suppressed because one or more lines are too long

View File

@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test AdaBoost with different configurations"
"# Test Stree with AdaBoost and Bagging with different configurations"
]
},
{
@@ -34,11 +34,8 @@
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree"
]
},
@@ -57,12 +54,14 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
}
],
"source": [
@@ -117,18 +116,20 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## STree alone on the whole dataset and linear kernel"
"## STree alone with 100.000 samples and linear kernel"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
"text": "Score Train: 0.9985784146480154\nScore Test: 0.9981093273185617\nTook 73.27 seconds\n"
}
],
"source": [
@@ -144,7 +145,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Different kernels with different configuations"
"## Adaboost"
]
},
{
@@ -161,18 +162,20 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
"text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" score_train = clf.score(Xtrain, ytrain)\n",
" score_test = clf.score(Xtest, ytest)\n",
@@ -183,24 +186,37 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
"## Bagging"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 10\n",
"C = 7\n",
"max_depth = 3"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
"text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" score_train = clf.score(Xtrain, ytrain)\n",
" score_test = clf.score(Xtest, ytest)\n",
@@ -223,7 +239,7 @@
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
}
},

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -1,5 +1,4 @@
numpy
scikit-learn
pandas
matplotlib
ipympl

View File

@@ -30,7 +30,7 @@ setuptools.setup(
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research",
],
install_requires=["scikit-learn>=0.23.0", "numpy", "matplotlib", "ipympl"],
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"],
test_suite="stree.tests",
zip_safe=False,
)

View File

@@ -6,21 +6,24 @@ __version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
"""
from __future__ import annotations
import os
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import (
check_X_y,
check_array,
import random
import warnings
from typing import Optional, List, Union, Tuple
from math import log
from itertools import combinations
import numpy as np # type: ignore
from sklearn.base import BaseEstimator, ClassifierMixin # type: ignore
from sklearn.svm import SVC, LinearSVC # type: ignore
from sklearn.utils.multiclass import ( # type: ignore
check_classification_targets,
)
from sklearn.exceptions import ConvergenceWarning # type: ignore
from sklearn.utils.validation import ( # type: ignore
check_is_fitted,
_check_sample_weight,
)
from sklearn.utils.sparsefuncs import count_nonzero
from sklearn.metrics._classification import _weighted_sum, _check_targets
class Snode:
@@ -28,37 +31,60 @@ class Snode:
dataset assigned to it
"""
def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str):
self._clf = clf
self._title = title
self._belief = 0.0
def __init__(
self,
clf: Union[SVC, LinearSVC],
X: np.ndarray,
y: np.ndarray,
features: np.array,
impurity: float,
title: str,
weight: np.ndarray = None,
):
self._clf: Union[SVC, LinearSVC] = clf
self._title: str = title
self._belief: float = 0.0
# Only store dataset in Testing
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
self._y = y
self._down = None
self._up = None
self._X: Optional[np.array] = X if os.environ.get(
"TESTING", "NS"
) != "NS" else None
self._y: np.array = y
self._down: Optional[Snode] = None
self._up: Optional[Snode] = None
self._class = None
self._sample_weight: Optional[np.array] = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features: Tuple[int, ...] = features
self._impurity: float = impurity
@classmethod
def copy(cls, node: "Snode") -> "Snode":
return cls(node._clf, node._X, node._y, node._title)
def copy(cls, node: Snode) -> Snode:
return cls(
node._clf,
node._X,
node._y,
node._features,
node._impurity,
node._title,
)
def set_down(self, son):
def set_down(self, son: Snode) -> None:
self._down = son
def set_up(self, son):
def set_up(self, son: Snode) -> None:
self._up = son
def is_leaf(self) -> bool:
return self._up is None and self._down is None
def get_down(self) -> "Snode":
def get_down(self) -> Optional[Snode]:
return self._down
def get_up(self) -> "Snode":
def get_up(self) -> Optional[Snode]:
return self._up
def make_predictor(self):
def make_predictor(self) -> None:
"""Compute the class of the predictor and its belief based on the
subdataset of the node only if it is a leaf
"""
@@ -82,25 +108,26 @@ class Snode:
count_values = np.unique(self._y, return_counts=True)
result = (
f"{self._title} - Leaf class={self._class} belief="
f"{self._belief: .6f} counts={count_values}"
f"{self._belief: .6f} impurity={self._impurity:.4f} "
f"counts={count_values}"
)
return result
else:
return f"{self._title}"
return (
f"{self._title} feaures={self._features} impurity="
f"{self._impurity:.4f}"
)
class Siterator:
"""Stree preorder iterator
"""
def __init__(self, tree: Snode):
self._stack = []
def __init__(self, tree: Optional[Snode]):
self._stack: List[Snode] = []
self._push(tree)
def __iter__(self):
return self
def _push(self, node: Snode):
def _push(self, node: Optional[Snode]) -> None:
if node is not None:
self._stack.append(node)
@@ -113,7 +140,235 @@ class Siterator:
return node
class Stree(BaseEstimator, ClassifierMixin):
class Splitter:
def __init__(
self,
clf: Union[SVC, LinearSVC] = None,
criterion: str = "",
splitter_type: str = "",
criteria: str = "",
min_samples_split: int = 0,
random_state: Optional[int] = None,
):
self._clf: Union[SVC, LinearSVC] = clf
self._random_state: Optional[int] = random_state
if random_state is not None:
random.seed(random_state)
self._criterion: str = criterion
self._min_samples_split: int = min_samples_split
self._criteria: str = criteria
self._splitter_type: str = splitter_type
if clf is None:
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
if criterion not in ["gini", "entropy"]:
raise ValueError(
f"criterion must be gini or entropy got({criterion})"
)
if criteria not in ["min_distance", "max_samples", "max_distance"]:
raise ValueError(
"split_criteria has to be min_distance "
f"max_distance or max_samples got ({criteria})"
)
if splitter_type not in ["random", "best"]:
raise ValueError(
f"splitter must be either random or best got({splitter_type})"
)
self.criterion_function = getattr(self, f"_{self._criterion}")
self.decision_criteria = getattr(self, f"_{self._criteria}")
def impurity(self, y: np.array) -> np.array:
return self.criterion_function(y)
@staticmethod
def _gini(y: np.array) -> float:
_, count = np.unique(y, return_counts=True)
return float(1 - np.sum(np.square(count / np.sum(count))))
@staticmethod
def _entropy(y: np.array) -> float:
n_labels = len(y)
if n_labels <= 1:
return 0
counts = np.bincount(y)
proportions = counts / n_labels
n_classes = np.count_nonzero(proportions)
if n_classes <= 1:
return 0
entropy = 0.0
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log(prop, n_classes)
return entropy
def information_gain(
self, labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
imp_prev = self.criterion_function(labels)
card_up = card_dn = imp_up = imp_dn = 0
if labels_up is not None:
card_up = labels_up.shape[0]
imp_up = self.criterion_function(labels_up)
if labels_dn is not None:
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_dn = self.criterion_function(labels_dn)
samples = card_up + card_dn
if samples == 0:
return 0.0
else:
result = float(
imp_prev
- (card_up / samples) * imp_up
- (card_dn / samples) * imp_dn
)
return result
def _select_best_set(
self,
dataset: np.array,
labels: np.array,
features_sets: List[Tuple[int, ...]],
) -> Tuple[int, ...]:
max_gain: float = 0.0
selected: Union[Tuple[int, ...], None] = None
warnings.filterwarnings("ignore", category=ConvergenceWarning)
for feature_set in features_sets:
self._clf.fit(dataset[:, feature_set], labels)
node = Snode(
self._clf, dataset, labels, feature_set, 0.0, "subset"
)
self.partition(dataset, node)
y1, y2 = self.part(labels)
gain = self.information_gain(labels, y1, y2)
if gain > max_gain:
max_gain = gain
selected = feature_set
return selected if selected is not None else feature_set
def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int
) -> np.array:
features = range(dataset.shape[1])
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1:
if self._splitter_type == "random":
index = random.randint(0, len(features_sets) - 1)
return features_sets[index]
else:
# get only 3 sets at most
if len(features_sets) > 3:
features_sets = random.sample(features_sets, 3)
return self._select_best_set(dataset, labels, features_sets)
else:
return features_sets[0]
def get_subspace(
self, dataset: np.array, labels: np.array, max_features: int
) -> Tuple[np.array, np.array]:
"""Return the best subspace to make a split
"""
indices = self._get_subspaces_set(dataset, labels, max_features)
return dataset[:, indices], indices
@staticmethod
def _min_distance(data: np.array, _: np.array) -> np.array:
"""Assign class to min distances
return a vector of classes so partition can separate class 0 from
the rest of classes, ie. class 0 goes to one splitted node and the
rest of classes go to the other
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param _: enable call compat with other measures
:type _: None
:return: vector with the class assigned to each sample
:rtype: np.array shape (m,)
"""
return np.argmin(data, axis=1)
@staticmethod
def _max_distance(data: np.array, _: np.array) -> np.array:
"""Assign class to max distances
return a vector of classes so partition can separate class 0 from
the rest of classes, ie. class 0 goes to one splitted node and the
rest of classes go to the other
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param _: enable call compat with other measures
:type _: None
:return: vector with the class assigned to each sample values
(can be 0, 1, ...)
:rtype: np.array shape (m,)
"""
return np.argmax(data, axis=1)
@staticmethod
def _max_samples(data: np.array, y: np.array) -> np.array:
"""return distances of the class with more samples
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: vector with distances to hyperplane (can be positive or neg.)
:rtype: np.array shape (m,)
"""
# select the class with max number of samples
_, samples = np.unique(y, return_counts=True)
selected = np.argmax(samples)
return data[:, selected]
def partition(self, samples: np.array, node: Snode) -> None:
"""Set the criteria to split arrays. Compute the indices of the samples
that should go to one side of the tree (down)
"""
data = self._distances(node, samples)
if data.shape[0] < self._min_samples_split:
self._down = np.ones((data.shape[0]), dtype=bool)
return
if data.ndim > 1:
# split criteria for multiclass
data = self.decision_criteria(data, node._y)
self._down = data > 0
@staticmethod
def _distances(node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
:param node: node containing the svm classifier
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:rtype: np.array
"""
return node._clf.decision_function(data[:, node._features])
def part(self, origin: np.array) -> Tuple[np.array, np.array]:
"""Split an array in two based on indices (down) and its complement
:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~self._down
return (
origin[up] if any(up) else None,
origin[self._down] if any(self._down) else None,
)
class Stree(BaseEstimator, ClassifierMixin): # type: ignore
"""Estimator that is based on binary trees of svm nodes
can deal with sample_weights in predict, used in boosting sklearn methods
inheriting from BaseEstimator implements get_params and set_params methods
@@ -126,83 +381,34 @@ class Stree(BaseEstimator, ClassifierMixin):
C: float = 1.0,
kernel: str = "linear",
max_iter: int = 1000,
random_state: int = None,
max_depth: int = None,
random_state: Optional[int] = None,
max_depth: Optional[int] = None,
tol: float = 1e-4,
degree: int = 3,
gamma="scale",
gamma: Union[float, str] = "scale",
split_criteria: str = "max_samples",
criterion: str = "gini",
min_samples_split: int = 0,
max_features: Optional[Union[str, int, float]] = None,
splitter: str = "random",
):
self.max_iter = max_iter
self.C = C
self.kernel = kernel
self.random_state = random_state
self.max_depth = max_depth
self.tol = tol
self.gamma = gamma
self.degree = degree
self.min_samples_split = min_samples_split
def _more_tags(self) -> dict:
"""Required by sklearn to tell that this estimator is a binary classifier
:return: the tag required
:rtype: dict
"""
return {"binary_only": True, "requires_y": True}
def _split_array(self, origin: np.array, down: np.array) -> list:
"""Split an array in two based on indices passed as down and its complement
:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~down
return (
origin[up[:, 0]] if any(up) else None,
origin[down[:, 0]] if any(down) else None,
)
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
:param node: node containing the svm classifier
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:rtype: np.array
"""
res = node._clf.decision_function(data)
if res.ndim == 1:
return np.expand_dims(res, 1)
elif res.shape[1] > 1:
# remove multiclass info
res = np.delete(res, slice(1, res.shape[1]), axis=1)
return res
def _split_criteria(self, data: np.array) -> np.array:
"""Set the criteria to split arrays
:param data: [description]
:type data: np.array
:return: [description]
:rtype: np.array
"""
return (
data > 0
if data.shape[0] >= self.min_samples_split
else np.ones((data.shape[0], 1), dtype=bool)
)
self.C: float = C
self.kernel: str = kernel
self.random_state: Optional[int] = random_state
self.max_depth: Optional[int] = max_depth
self.tol: float = tol
self.gamma: Union[float, str] = gamma
self.degree: int = degree
self.min_samples_split: int = min_samples_split
self.split_criteria: str = split_criteria
self.max_features: Union[str, int, float, None] = max_features
self.criterion: str = criterion
self.splitter: str = splitter
def fit(
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
) -> "Stree":
) -> Stree:
"""Build the tree based on the dataset of samples and its labels
:param X: dataset of samples to make predictions
@@ -232,52 +438,30 @@ class Stree(BaseEstimator, ClassifierMixin):
{self.max_depth})"
)
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
X, y = self._validate_data(X, y)
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
# Initialize computed parameters
self.splitter_ = Splitter(
clf=self._build_clf(),
criterion=self.criterion,
splitter_type=self.splitter,
criteria=self.split_criteria,
random_state=self.random_state,
min_samples_split=self.min_samples_split,
)
if self.random_state is not None:
random.seed(self.random_state)
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_ = self.classes_.shape[0]
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_in_ = X.shape[1]
self.max_features_ = self._initialize_max_features()
self.tree_ = self.train(X, y, sample_weight, 1, "root")
self._build_predictor()
return self
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self.tree_)
def _build_clf(self):
""" Build the correct classifier for the node
"""
return (
LinearSVC(
max_iter=self.max_iter,
random_state=self.random_state,
C=self.C,
tol=self.tol,
)
if self.kernel == "linear"
else SVC(
kernel=self.kernel,
max_iter=self.max_iter,
tol=self.tol,
C=self.C,
gamma=self.gamma,
degree=self.degree,
)
)
def train(
self,
X: np.ndarray,
@@ -285,7 +469,7 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight: np.ndarray,
depth: int,
title: str,
) -> Snode:
) -> Optional[Snode]:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
@@ -307,24 +491,95 @@ class Stree(BaseEstimator, ClassifierMixin):
return None
if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset
return Snode(None, X, y, title + ", <pure>")
return Snode(
clf=None,
X=X,
y=y,
features=X.shape[1],
impurity=0.0,
title=title + ", <pure>",
weight=sample_weight,
)
# Train the model
clf = self._build_clf()
clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title)
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
# solve WARNING: class label 0 specified in weight is not found
# in bagging
if any(sample_weight == 0):
indices = sample_weight == 0
y_next = y[~indices]
# touch weights if removing any class
if np.unique(y_next).shape[0] != self.n_classes_:
sample_weight += 1e-5
clf.fit(Xs, y, sample_weight=sample_weight)
impurity = self.splitter_.impurity(y)
node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_)
down = self._split_criteria(self._distances(tree, X))
X_U, X_D = self._split_array(X, down)
y_u, y_d = self._split_array(y, down)
sw_u, sw_d = self._split_array(sample_weight, down)
self.splitter_.partition(X, node)
X_U, X_D = self.splitter_.part(X)
y_u, y_d = self.splitter_.part(y)
sw_u, sw_d = self.splitter_.part(sample_weight)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ", <cgaf>")
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
return tree
return Snode(
clf,
X,
y,
features=X.shape[1],
impurity=impurity,
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(
self.train( # type: ignore
X_U, y_u, sw_u, depth + 1, title + " - Up"
)
)
node.set_down(
self.train( # type: ignore
X_D, y_d, sw_d, depth + 1, title + " - Down"
)
)
return node
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
def _build_predictor(self) -> None:
"""Process the leaves to make them predictors
"""
def run_tree(node: Optional[Snode]) -> None:
if node is None:
raise ValueError("Can't build predictors on None")
if node.is_leaf():
node.make_predictor()
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self.tree_)
def _build_clf(self) -> Union[LinearSVC, SVC]:
""" Build the correct classifier for the node
"""
return (
LinearSVC(
max_iter=self.max_iter,
random_state=self.random_state,
C=self.C,
tol=self.tol,
)
if self.kernel == "linear"
else SVC(
kernel=self.kernel,
max_iter=self.max_iter,
tol=self.tol,
C=self.C,
gamma=self.gamma,
degree=self.degree,
)
)
@staticmethod
def _reorder_results(y: np.array, indices: np.array) -> np.array:
"""Reorder an array based on the array of indices passed
:param y: data untidy
@@ -334,12 +589,8 @@ class Stree(BaseEstimator, ClassifierMixin):
:return: array y ordered
:rtype: np.array
"""
if y.ndim > 1 and y.shape[1] > 1:
# if predict_proba return np.array of floats
y_ordered = np.zeros(y.shape, dtype=float)
else:
# return array of same type given in y
y_ordered = y.copy()
# return array of same type given in y
y_ordered = y.copy()
indices = indices.astype(int)
for i, index in enumerate(indices):
y_ordered[index] = y[i]
@@ -355,25 +606,30 @@ class Stree(BaseEstimator, ClassifierMixin):
"""
def predict_class(
xp: np.array, indices: np.array, node: Snode
xp: np.array, indices: np.array, node: Optional[Snode]
) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
if node.is_leaf(): # type: ignore
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
prediction = np.full(
(xp.shape[0], 1), node._class # type: ignore
)
return prediction, indices
down = self._split_criteria(self._distances(node, xp))
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
self.splitter_.partition(xp, node) # type: ignore
x_u, x_d = self.splitter_.part(xp)
i_u, i_d = self.splitter_.part(indices)
prx_u, prin_u = predict_class(
x_u, i_u, node.get_up() # type: ignore
)
prx_d, prin_d = predict_class(
x_d, i_d, node.get_down() # type: ignore
)
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ["tree_"])
check_is_fitted(self, "n_features_in_")
# Input validation
X = check_array(X)
X = self._validate_data(X, reset=False)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
result = (
@@ -383,102 +639,9 @@ class Stree(BaseEstimator, ClassifierMixin):
)
return self.classes_[result]
def predict_proba(self, X: np.array) -> np.array:
"""Computes an approximation of the probability of samples belonging to
class 0 and 1
:param X: dataset
:type X: np.array
:return: array array of shape (m, num_classes), probability of being
each class
:rtype: np.array
"""
def predict_class(
xp: np.array, indices: np.array, dist: np.array, node: Snode
) -> np.array:
"""Run the tree to compute predictions
:param xp: subdataset of samples
:type xp: np.array
:param indices: indices of subdataset samples to rebuild original
order
:type indices: np.array
:param dist: distances of every sample to the hyperplane or the
father node
:type dist: np.array
:param node: node of the leaf with the class
:type node: Snode
:return: array of labels and distances, array of indices
:rtype: np.array
"""
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = dist
return np.append(prediction, prediction_proba, axis=1), indices
distances = self._distances(node, xp)
down = self._split_criteria(distances)
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
di_u, di_d = self._split_array(distances, down)
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ["tree_"])
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
empty_dist = np.empty((X.shape[0], 1), dtype=float)
result, indices = predict_class(X, indices, empty_dist, self.tree_)
result = result.reshape(X.shape[0], 2)
# Turn distances to hyperplane into probabilities based on fitting
# distances of samples to its hyperplane that classified them, to the
# sigmoid function
# Probability of being 1
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
# Probability of being 0
result[:, 0] = 1 - result[:, 1]
return self._reorder_results(result, indices)
def score(
self, X: np.array, y: np.array, sample_weight: np.array = None
) -> float:
"""Compute accuracy of the prediction
:param X: dataset of samples to make predictions
:type X: np.array
:param y_true: samples labels
:type y_true: np.array
:param sample_weight: weights of the samples. Rescale C per sample.
Hi' weights force the classifier to put more emphasis on these points
:type sample_weight: np.array optional
:return: accuracy of the prediction
:rtype: float
"""
# sklearn check
check_is_fitted(self)
check_classification_targets(y)
X, y = check_X_y(X, y)
y_pred = self.predict(X).reshape(y.shape)
# Compute accuracy for each possible representation
y_type, y_true, y_pred = _check_targets(y, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
if y_type.startswith("multilabel"):
differing_labels = count_nonzero(y_true - y_pred, axis=1)
score = differing_labels == 0
else:
score = y_true == y_pred
return _weighted_sum(score, sample_weight, normalize=True)
def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in preorder,
can make a list with all the nodes in preorder
"""Create an iterator to be able to visit the nodes of the tree in
preorder, can make a list with all the nodes in preorder
:return: an iterator, can for i in... and list(...)
:rtype: Siterator
@@ -499,3 +662,34 @@ class Stree(BaseEstimator, ClassifierMixin):
for i in self:
output += str(i) + "\n"
return output
def _initialize_max_features(self) -> int:
if isinstance(self.max_features, str):
if self.max_features == "auto":
max_features = max(1, int(np.sqrt(self.n_features_in_)))
elif self.max_features == "sqrt":
max_features = max(1, int(np.sqrt(self.n_features_in_)))
elif self.max_features == "log2":
max_features = max(1, int(np.log2(self.n_features_in_)))
else:
raise ValueError(
"Invalid value for max_features. "
"Allowed string values are 'auto', "
"'sqrt' or 'log2'."
)
elif self.max_features is None:
max_features = self.n_features_in_
elif isinstance(self.max_features, int):
max_features = self.max_features
else: # float
if self.max_features > 0.0:
max_features = max(
1, int(self.max_features * self.n_features_in_)
)
else:
raise ValueError(
"Invalid value for max_features."
"Allowed float must be in range (0, 1] "
f"got ({self.max_features})"
)
return max_features

View File

@@ -1,205 +0,0 @@
"""
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Plot 3D views of nodes in Stree
"""
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode):
def __init__(self, node: Stree):
self._plot_size = (8, 8)
self._xlimits = (None, None)
self._ylimits = (None, None)
self._zlimits = (None, None)
n = Snode.copy(node)
super().__init__(n._clf, n._X, n._y, n._title)
def set_plot_size(self, size: tuple):
self._plot_size = size
def get_plot_size(self) -> tuple:
return self._plot_size
def _is_pure(self) -> bool:
"""is considered pure a leaf node with one label
"""
if self.is_leaf():
return self._belief == 1.0
return False
def set_axis_limits(self, limits: tuple):
self._xlimits, self._ylimits, self._zlimits = limits
def get_axis_limits(self) -> tuple:
return self._xlimits, self._ylimits, self._zlimits
def _set_graphics_axis(self, ax: Axes3D):
ax.set_xlim(self._xlimits)
ax.set_ylim(self._ylimits)
ax.set_zlim(self._zlimits)
def save_hyperplane(
self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
):
_, fig = self.plot_hyperplane()
name = os.path.join(save_folder, f"{save_prefix}STnode{save_seq}.png")
fig.savefig(name, bbox_inches="tight")
plt.close(fig)
def _get_cmap(self):
cmap = "jet"
if self._is_pure() and self._class == 1:
cmap = "jet_r"
return cmap
def _graph_title(self):
n_class, card = np.unique(self._y, return_counts=True)
return f"{self._title} {n_class} {card}"
def plot_hyperplane(self, plot_distribution: bool = True):
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection="3d")
if not self._is_pure():
# Can't plot hyperplane of leaves with one label because it hasn't
# classiffier
# get the splitting hyperplane
def hyperplane(x, y):
return (
-self._clf.intercept_
- self._clf.coef_[0][0] * x
- self._clf.coef_[0][1] * y
) / self._clf.coef_[0][2]
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
xx, yy = np.meshgrid(tmpx, tmpy)
ax.plot_surface(
xx,
yy,
hyperplane(xx, yy),
alpha=0.5,
antialiased=True,
rstride=1,
cstride=1,
cmap="seismic",
)
self._set_graphics_axis(ax)
if plot_distribution:
self.plot_distribution(ax)
else:
plt.title(self._graph_title())
plt.show()
return ax, fig
def plot_distribution(self, ax: Axes3D = None):
if ax is None:
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection="3d")
plt.title(self._graph_title())
cmap = self._get_cmap()
ax.scatter(
self._X[:, 0], self._X[:, 1], self._X[:, 2], c=self._y, cmap=cmap
)
ax.set_xlabel("X0")
ax.set_ylabel("X1")
ax.set_zlabel("X2")
plt.show()
class Stree_grapher(Stree):
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall
make its magic
"""
def __init__(self, params: dict):
self._plot_size = (8, 8)
self._tree_gr = None
# make Snode store X's
os.environ["TESTING"] = "1"
self._fitted = False
self._pca = None
super().__init__(**params)
def __del__(self):
try:
os.environ.pop("TESTING")
except KeyError:
pass
def _copy_tree(self, node: Snode) -> Snode_graph:
mirror = Snode_graph(node)
# clone node
mirror._class = node._class
mirror._belief = node._belief
if node.get_down() is not None:
mirror.set_down(self._copy_tree(node.get_down()))
if node.get_up() is not None:
mirror.set_up(self._copy_tree(node.get_up()))
return mirror
def fit(
self, X: np.array, y: np.array, sample_weight: np.array = None
) -> "Stree_grapher":
"""Fit the Stree and copy the tree in a Snode_graph tree
:param X: Dataset
:type X: np.array
:param y: Labels
:type y: np.array
:return: Stree model
:rtype: Stree
"""
if X.shape[1] != 3:
self._pca = PCA(n_components=3)
X = self._pca.fit_transform(X)
super().fit(X, y, sample_weight=sample_weight)
self._tree_gr = self._copy_tree(self.tree_)
self._fitted = True
return self
def score(self, X: np.array, y: np.array) -> float:
self._check_fitted()
if X.shape[1] != 3:
X = self._pca.transform(X)
return super().score(X, y)
def _check_fitted(self):
if not self._fitted:
raise Exception("Have to fit the grapher first!")
def save_all(self, save_folder: str = "./", save_prefix: str = ""):
"""Save all the node plots in png format, each with a sequence number
:param save_folder: folder where the plots are saved, defaults to './'
:type save_folder: str, optional
"""
self._check_fitted()
if not os.path.isdir(save_folder):
os.mkdir(save_folder)
seq = 1
for node in self:
node.save_hyperplane(
save_folder=save_folder, save_prefix=save_prefix, save_seq=seq
)
seq += 1
def plot_all(self):
"""Plots all the nodes
"""
self._check_fitted()
for node in self:
node.plot_hyperplane()
def __iter__(self):
return Siterator(self._tree_gr)

View File

@@ -1,4 +1,3 @@
from .Strees import Stree, Snode, Siterator
from .Strees_grapher import Stree_grapher, Snode_graph
from .Strees import Stree, Snode, Siterator, Splitter
__all__ = ["Stree", "Snode", "Siterator", "Stree_grapher", "Snode_graph"]
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]

89
stree/tests/Snode_test.py Normal file
View File

@@ -0,0 +1,89 @@
# type: ignore
import os
import unittest
import numpy as np
from stree import Stree, Snode
from .utils import load_dataset
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._clf = Stree(random_state=self._random_state)
self._clf.fit(*load_dataset(self._random_state))
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a
predictor
"""
def check_leave(node: Snode):
if not node.is_leaf():
check_leave(node.get_down())
check_leave(node.get_up())
return
# Check Belief in leave
classes, card = np.unique(node._y, return_counts=True)
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
belief = max_card / (max_card + min_card)
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
check_leave(self._clf.tree_)
def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
def run_tree(node: Snode):
if node._belief < 1:
# only exclude pure leaves
self.assertIsNotNone(node._clf)
self.assertIsNotNone(node._clf.coef_)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._clf.tree_)
def test_make_predictor_on_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
test.make_predictor()
self.assertEqual(1, test._class)
self.assertEqual(0.75, test._belief)
def test_make_predictor_on_not_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
test.make_predictor()
self.assertIsNone(test._class)
self.assertEqual(0, test._belief)
def test_make_predictor_on_leaf_bogus_data(self):
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
test.make_predictor()
self.assertIsNone(test._class)
def test_copy_node(self):
px = [1, 2, 3, 4]
py = [1]
test = Snode(Stree(), px, py, [], 0.0, "test")
computed = Snode.copy(test)
self.assertListEqual(computed._X, px)
self.assertListEqual(computed._y, py)
self.assertEqual("test", computed._title)
self.assertIsInstance(computed._clf, Stree)

View File

@@ -0,0 +1,230 @@
# type: ignore
import os
import unittest
import random
import numpy as np
from sklearn.svm import SVC
from sklearn.datasets import load_wine, load_iris
from stree import Splitter
class Splitter_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
super().__init__(*args, **kwargs)
@staticmethod
def build(
clf=SVC,
min_samples_split=0,
splitter_type="random",
criterion="gini",
criteria="min_distance",
random_state=None,
):
return Splitter(
clf=clf(random_state=random_state, kernel="rbf"),
min_samples_split=min_samples_split,
splitter_type=splitter_type,
criterion=criterion,
criteria=criteria,
random_state=random_state,
)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def test_init(self):
with self.assertRaises(ValueError):
self.build(criterion="duck")
with self.assertRaises(ValueError):
self.build(splitter_type="duck")
with self.assertRaises(ValueError):
self.build(criteria="duck")
with self.assertRaises(ValueError):
_ = Splitter(clf=None)
for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]:
for criteria in [
"min_distance",
"max_samples",
"max_distance",
]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
criteria=criteria,
)
self.assertEqual(splitter_type, tcl._splitter_type)
self.assertEqual(criterion, tcl._criterion)
self.assertEqual(criteria, tcl._criteria)
def test_gini(self):
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
([0], 0),
([1, 1, 1, 1], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._gini(labels))
tcl = self.build(criterion="gini")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
def test_entropy(self):
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
([0, 0, 1, 1, 1, 1, 0, 0], 1),
([0, 0, 1, 1, 2, 2, 3, 3], 1),
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
([1], 0),
([0, 0, 0, 0], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._entropy(labels))
tcl = self.build(criterion="entropy")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
def test_information_gain(self):
expected_values = [
(
[0, 1, 1, 1, 1, 1],
[0, 0, 0, 1],
0.16333333333333333,
0.25642589168200297,
),
(
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
[5, 3, 2, 1, 1],
0.007381776239907684,
-0.03328610916207225,
),
([], [], 0.0, 0.0),
([1], [], 0.0, 0.0),
([], [1], 0.0, 0.0),
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
([], [1, 1, 1, 2], 0.0, 0.0),
(None, [1, 2, 3], 0.0, 0.0),
([1, 2, 3], None, 0.0, 0.0),
]
for yu, yd, expected_gini, expected_entropy in expected_values:
yu = np.array(yu, dtype=np.int32) if yu is not None else None
yd = np.array(yd, dtype=np.int32) if yd is not None else None
if yu is not None and yd is not None:
complete = np.append(yu, yd)
elif yd is not None:
complete = yd
else:
complete = yu
tcl = self.build(criterion="gini")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_gini, computed)
tcl = self.build(criterion="entropy")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_entropy, computed)
def test_max_samples(self):
tcl = self.build(criteria="max_samples")
data = np.array(
[
[-0.1, 0.2, -0.3],
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
]
)
expected = np.array([0.2, 0.01, -0.9, 0.2])
y = [1, 2, 1, 0]
computed = tcl._max_samples(data, y)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_min_distance(self):
tcl = self.build()
data = np.array(
[
[-0.1, 0.2, -0.3],
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
]
)
expected = np.array([2, 2, 1, 0])
computed = tcl._min_distance(data, None)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_max_distance(self):
tcl = self.build(criteria="max_distance")
data = np.array(
[
[-0.1, 0.2, -0.3],
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
]
)
expected = np.array([1, 0, 0, 2])
computed = tcl._max_distance(data, None)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1)
tcl = self.build(splitter_type="best", random_state=self._random_state)
dataset, computed = tcl.get_subspace(X, y, max_features=2)
self.assertListEqual([0, 2], list(computed))
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
def test_splitter_parameter(self):
expected_values = [
[2, 3, 5, 7], # best entropy min_distance
[0, 2, 4, 5], # best entropy max_samples
[0, 2, 8, 12], # best entropy max_distance
[1, 2, 5, 12], # best gini min_distance
[0, 3, 4, 10], # best gini max_samples
[1, 2, 9, 12], # best gini max_distance
[3, 9, 11, 12], # random entropy min_distance
[1, 5, 6, 9], # random entropy max_samples
[1, 2, 4, 8], # random entropy max_distance
[2, 6, 7, 12], # random gini min_distance
[3, 9, 10, 11], # random gini max_samples
[2, 5, 8, 12], # random gini max_distance
]
X, y = load_wine(return_X_y=True)
rn = 0
for splitter_type in ["best", "random"]:
for criterion in ["entropy", "gini"]:
for criteria in [
"min_distance",
"max_samples",
"max_distance",
]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
criteria=criteria,
)
expected = expected_values.pop(0)
random.seed(rn)
rn += 1
dataset, computed = tcl.get_subspace(X, y, max_features=4)
# print(
# "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), splitter_type, criterion,
# criteria,
# )
# )
self.assertListEqual(expected, list(computed))
self.assertListEqual(
X[:, computed].tolist(), dataset.tolist()
)

438
stree/tests/Stree_test.py Normal file
View File

@@ -0,0 +1,438 @@
# type: ignore
import os
import unittest
import warnings
import numpy as np
from sklearn.datasets import load_iris, load_wine
from sklearn.exceptions import ConvergenceWarning
from stree import Stree, Snode
from .utils import load_dataset
class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._kernels = ["linear", "rbf", "poly"]
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the
correct number of labels and its sons have the right number of elements
in their dataset
Arguments:
node {Snode} -- node to check
"""
if node.is_leaf():
return
y_prediction = node._clf.predict(node._X)
y_down = node.get_down()._y
y_up = node.get_up()._y
# Is a correct partition in terms of cadinality?
# i.e. The partition algorithm didn't forget any sample
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
unique_y, count_y = np.unique(node._y, return_counts=True)
_, count_d = np.unique(y_down, return_counts=True)
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
number_down = count_d[i]
try:
number_up = count_u[i]
except IndexError:
number_up = 0
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
# as the node is not a leaf...
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[0], y_up.shape[0])
self.assertEqual(count_yp[1], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models
"""
warnings.filterwarnings("ignore")
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(*load_dataset(self._random_state))
self._check_tree(clf.tree_)
def test_single_prediction(self):
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
self.assertEqual(yp[0], y[0])
def test_multiple_prediction(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
"""
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(X, y)
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
)
# Compute prediction at once
yp_once = clf.predict(X)
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
def test_iterator_and_str(self):
"""Check preorder iterator
"""
expected = [
"root feaures=(0, 1, 2) impurity=0.5000",
"root - Down feaures=(0, 1, 2) impurity=0.0671",
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 "
"impurity=0.0469 counts=(array([0, 1]), array([ 17, 691]))",
"root - Down - Up feaures=(0, 1, 2) impurity=0.3967",
"root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
"impurity=0.3750 counts=(array([0, 1]), array([1, 3]))",
"root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
"impurity=0.0000 counts=(array([0]), array([7]))",
"root - Up, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.1331"
" counts=(array([0, 1]), array([725, 56]))",
]
computed = []
expected_string = ""
clf = Stree(kernel="linear", random_state=self._random_state)
clf.fit(*load_dataset(self._random_state))
for node in clf:
computed.append(str(node))
expected_string += str(node) + "\n"
self.assertListEqual(expected, computed)
self.assertEqual(expected_string, str(clf))
@staticmethod
def test_is_a_sklearn_classifier():
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator
check_estimator(Stree())
def test_exception_if_C_is_negative(self):
tclf = Stree(C=-1)
with self.assertRaises(ValueError):
tclf.fit(*load_dataset(self._random_state))
def test_exception_if_bogus_split_criteria(self):
tclf = Stree(split_criteria="duck")
with self.assertRaises(ValueError):
tclf.fit(*load_dataset(self._random_state))
def test_check_max_depth_is_positive_or_None(self):
tcl = Stree()
self.assertIsNone(tcl.max_depth)
tcl = Stree(max_depth=1)
self.assertGreaterEqual(1, tcl.max_depth)
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*load_dataset(self._random_state))
def test_check_max_depth(self):
depths = (3, 4)
for depth in depths:
tcl = Stree(random_state=self._random_state, max_depth=depth)
tcl.fit(*load_dataset(self._random_state))
self.assertEqual(depth, tcl.depth_)
def test_unfitted_tree_is_iterable(self):
tcl = Stree()
self.assertEqual(0, len(list(tcl)))
def test_min_samples_split(self):
dataset = [[1], [2], [3]], [1, 1, 0]
tcl_split = Stree(min_samples_split=3).fit(*dataset)
self.assertIsNotNone(tcl_split.tree_.get_down())
self.assertIsNotNone(tcl_split.tree_.get_up())
tcl_nosplit = Stree(min_samples_split=4).fit(*dataset)
self.assertIsNone(tcl_nosplit.tree_.get_down())
self.assertIsNone(tcl_nosplit.tree_.get_up())
def test_simple_muticlass_dataset(self):
for kernel in self._kernels:
clf = Stree(
kernel=kernel,
split_criteria="max_samples",
random_state=self._random_state,
)
px = [[1, 2], [5, 6], [9, 10]]
py = [0, 1, 2]
clf.fit(px, py)
self.assertEqual(1.0, clf.score(px, py))
self.assertListEqual(py, clf.predict(px).tolist())
self.assertListEqual(py, clf.classes_.tolist())
def test_muticlass_dataset(self):
datasets = {
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
"Iris": load_iris(return_X_y=True),
}
outcomes = {
"Synt": {
"max_samples linear": 0.9533333333333334,
"max_samples rbf": 0.836,
"max_samples poly": 0.9473333333333334,
"min_distance linear": 0.9533333333333334,
"min_distance rbf": 0.836,
"min_distance poly": 0.9473333333333334,
"max_distance linear": 0.9533333333333334,
"max_distance rbf": 0.836,
"max_distance poly": 0.9473333333333334,
},
"Iris": {
"max_samples linear": 0.98,
"max_samples rbf": 1.0,
"max_samples poly": 1.0,
"min_distance linear": 0.98,
"min_distance rbf": 1.0,
"min_distance poly": 1.0,
"max_distance linear": 0.98,
"max_distance rbf": 1.0,
"max_distance poly": 1.0,
},
}
for name, dataset in datasets.items():
px, py = dataset
for criteria in ["max_samples", "min_distance", "max_distance"]:
for kernel in self._kernels:
clf = Stree(
C=1e4,
max_iter=1e4,
kernel=kernel,
random_state=self._random_state,
)
clf.fit(px, py)
outcome = outcomes[name][f"{criteria} {kernel}"]
self.assertAlmostEqual(outcome, clf.score(px, py))
def test_max_features(self):
n_features = 16
expected_values = [
("auto", 4),
("log2", 4),
("sqrt", 4),
(0.5, 8),
(3, 3),
(None, 16),
]
clf = Stree()
clf.n_features_in_ = n_features
for max_features, expected in expected_values:
clf.set_params(**dict(max_features=max_features))
computed = clf._initialize_max_features()
self.assertEqual(expected, computed)
# Check bogus max_features
values = ["duck", -0.1, 0.0]
for max_features in values:
clf.set_params(**dict(max_features=max_features))
with self.assertRaises(ValueError):
_ = clf._initialize_max_features()
def test_get_subspaces(self):
dataset = np.random.random((10, 16))
y = np.random.randint(0, 2, 10)
expected_values = [
("auto", 4),
("log2", 4),
("sqrt", 4),
(0.5, 8),
(3, 3),
(None, 16),
]
clf = Stree()
for max_features, expected in expected_values:
clf.set_params(**dict(max_features=max_features))
clf.fit(dataset, y)
computed, indices = clf.splitter_.get_subspace(
dataset, y, clf.max_features_
)
self.assertListEqual(
dataset[:, indices].tolist(), computed.tolist()
)
self.assertEqual(expected, len(indices))
def test_bogus_criterion(self):
clf = Stree(criterion="duck")
with self.assertRaises(ValueError):
clf.fit(*load_dataset())
def test_predict_feature_dimensions(self):
X = np.random.rand(10, 5)
y = np.random.randint(0, 2, 10)
clf = Stree()
clf.fit(X, y)
with self.assertRaises(ValueError):
clf.predict(X[:, :3])
# Tests of score
def test_score_binary(self):
X, y = load_dataset(self._random_state)
accuracies = [
0.9506666666666667,
0.9606666666666667,
0.9433333333333334,
]
for kernel, accuracy_expected in zip(self._kernels, accuracies):
clf = Stree(random_state=self._random_state, kernel=kernel,)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_score_max_features(self):
X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y)
self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
def test_score_multi_class(self):
warnings.filterwarnings("ignore")
accuracies = [
0.8258427, # Wine linear min_distance
0.6741573, # Wine linear max_distance
0.8314607, # Wine linear max_samples
0.6629213, # Wine rbf min_distance
1.0000000, # Wine rbf max_distance
0.4044944, # Wine rbf max_samples
0.9157303, # Wine poly min_distance
1.0000000, # Wine poly max_distance
0.7640449, # Wine poly max_samples
0.9933333, # Iris linear min_distance
0.9666667, # Iris linear max_distance
0.9666667, # Iris linear max_samples
0.9800000, # Iris rbf min_distance
0.9800000, # Iris rbf max_distance
0.9800000, # Iris rbf max_samples
1.0000000, # Iris poly min_distance
1.0000000, # Iris poly max_distance
1.0000000, # Iris poly max_samples
0.8993333, # Synthetic linear min_distance
0.6533333, # Synthetic linear max_distance
0.9313333, # Synthetic linear max_samples
0.8320000, # Synthetic rbf min_distance
0.6660000, # Synthetic rbf max_distance
0.8320000, # Synthetic rbf max_samples
0.6066667, # Synthetic poly min_distance
0.6840000, # Synthetic poly max_distance
0.6340000, # Synthetic poly max_samples
]
datasets = [
("Wine", load_wine(return_X_y=True)),
("Iris", load_iris(return_X_y=True)),
(
"Synthetic",
load_dataset(self._random_state, n_classes=3, n_features=5),
),
]
for dataset_name, dataset in datasets:
X, y = dataset
for kernel in self._kernels:
for criteria in [
"min_distance",
"max_distance",
"max_samples",
]:
clf = Stree(
C=17,
random_state=self._random_state,
kernel=kernel,
split_criteria=criteria,
degree=5,
gamma="auto",
)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
# print(
# "{:.7f}, # {:7} {:5} {}".format(
# accuracy_score, dataset_name, kernel, criteria
# )
# )
accuracy_expected = accuracies.pop(0)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck")
with self.assertRaises(ValueError):
clf.fit(*load_dataset())
def test_weights_removing_class(self):
# This patch solves an stderr message from sklearn svm lib
# "WARNING: class label x specified in weight is not found"
X = np.array(
[
[0.1, 0.1],
[0.1, 0.2],
[0.2, 0.1],
[5, 6],
[8, 9],
[6, 7],
[0.2, 0.2],
]
)
y = np.array([0, 0, 0, 1, 1, 1, 0])
epsilon = 1e-5
weights = [1, 1, 1, 0, 0, 0, 1]
weights = np.array(weights, dtype="float64")
weights_epsilon = [x + epsilon for x in weights]
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
original = weights_no_zero.copy()
clf = Stree()
clf.fit(X, y)
node = clf.train(X, y, weights, 1, "test",)
# if a class is lost with zero weights the patch adds epsilon
self.assertListEqual(weights.tolist(), weights_epsilon)
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
# zero weights are ok when they don't erase a class
_ = clf.train(X, y, weights_no_zero, 1, "test")
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
def test_build_predictor(self):
X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state)
with self.assertRaises(ValueError):
clf.tree_ = None
clf._build_predictor()
clf.fit(X, y)
node = clf.tree_.get_down().get_down()
expected_impurity = 0.04686951386893923
expected_class = 1
expected_belief = 0.9759887005649718
self.assertAlmostEqual(expected_impurity, node._impurity)
self.assertAlmostEqual(expected_belief, node._belief)
self.assertEqual(expected_class, node._class)
node._belief = 0.0
node._class = None
clf._build_predictor()
node = clf.tree_.get_down().get_down()
self.assertAlmostEqual(expected_belief, node._belief)
self.assertEqual(expected_class, node._class)

View File

@@ -1,211 +0,0 @@
import os
import imghdr
import unittest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
from sklearn.datasets import make_classification
from stree import Stree_grapher, Snode_graph, Snode
def get_dataset(random_state=0, n_features=3):
X, y = make_classification(
n_samples=1500,
n_features=n_features,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
n_clusters_per_class=2,
class_sep=1.5,
flip_y=0,
weights=[0.5, 0.5],
random_state=random_state,
)
return X, y
class Stree_grapher_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._clf = Stree_grapher(dict(random_state=self._random_state))
self._clf.fit(*get_dataset(self._random_state, n_features=4))
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def test_iterator(self):
"""Check preorder iterator
"""
expected = [
"root",
"root - Down",
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.976023 counts"
"=(array([0, 1]), array([ 17, 692]))",
"root - Down - Up",
"root - Down - Up - Down, <cgaf> - Leaf class=0 belief= 0.500000 "
"counts=(array([0, 1]), array([1, 1]))",
"root - Down - Up - Up, <cgaf> - Leaf class=0 belief= 0.888889 "
"counts=(array([0, 1]), array([8, 1]))",
"root - Up, <cgaf> - Leaf class=0 belief= 0.928205 counts=(array("
"[0, 1]), array([724, 56]))",
]
computed = []
for node in self._clf:
computed.append(str(node))
self.assertListEqual(expected, computed)
def test_score(self):
X, y = get_dataset(self._random_state)
accuracy_score = self._clf.score(X, y)
yp = self._clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.86)
def test_save_all(self):
folder_name = os.path.join(os.sep, "tmp", "stree")
if os.path.isdir(folder_name):
os.rmdir(folder_name)
file_names = [
os.path.join(folder_name, f"STnode{i}.png") for i in range(1, 8)
]
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
self._clf.save_all(save_folder=folder_name)
for file_name in file_names:
self.assertTrue(os.path.exists(file_name))
self.assertEqual("png", imghdr.what(file_name))
os.remove(file_name)
os.rmdir(folder_name)
def test_plot_all(self):
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
num_figures_before = plt.gcf().number
self._clf.plot_all()
num_figures_after = plt.gcf().number
self.assertEqual(7, num_figures_after - num_figures_before)
class Snode_graph_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._clf = Stree_grapher(dict(random_state=self._random_state))
self._clf.fit(*get_dataset(self._random_state))
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def test_plot_size(self):
default = self._clf._tree_gr.get_plot_size()
expected = (17, 3)
self._clf._tree_gr.set_plot_size(expected)
self.assertEqual(expected, self._clf._tree_gr.get_plot_size())
self._clf._tree_gr.set_plot_size(default)
self.assertEqual(default, self._clf._tree_gr.get_plot_size())
def test_attributes_in_leaves_graph(self):
"""Check if the attributes in leaves have correct values so they form a
predictor
"""
def check_leave(node: Snode_graph):
if not node.is_leaf():
check_leave(node.get_down())
check_leave(node.get_up())
return
# Check Belief in leave
classes, card = np.unique(node._y, return_counts=True)
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
check_leave(self._clf._tree_gr)
def test_nodes_graph_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
def run_tree(node: Snode_graph):
if node._belief < 1:
# only exclude pure leaves
self.assertIsNotNone(node._clf)
self.assertIsNotNone(node._clf.coef_)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._clf._tree_gr)
def test_save_hyperplane(self):
folder_name = "/tmp/"
file_name = os.path.join(folder_name, "STnode1.png")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
self._clf._tree_gr.save_hyperplane(folder_name)
self.assertTrue(os.path.exists(file_name))
self.assertEqual("png", imghdr.what(file_name))
os.remove(file_name)
def test_plot_hyperplane_with_distribution(self):
plt.close()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
num_figures_before = plt.gcf().number
self._clf._tree_gr.plot_hyperplane(plot_distribution=True)
num_figures_after = plt.gcf().number
self.assertEqual(1, num_figures_after - num_figures_before)
def test_plot_hyperplane_without_distribution(self):
plt.close()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
num_figures_before = plt.gcf().number
self._clf._tree_gr.plot_hyperplane(plot_distribution=False)
num_figures_after = plt.gcf().number
self.assertEqual(1, num_figures_after - num_figures_before)
def test_plot_distribution(self):
plt.close()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
matplotlib.use("Agg")
num_figures_before = plt.gcf().number
self._clf._tree_gr.plot_distribution()
num_figures_after = plt.gcf().number
self.assertEqual(1, num_figures_after - num_figures_before)
def test_set_axis_limits(self):
node = Snode_graph(Snode(None, None, None, "test"))
limits = (-2, 2), (-3, 3), (-4, 4)
node.set_axis_limits(limits)
computed = node.get_axis_limits()
x, y, z = limits
xx, yy, zz = computed
self.assertEqual(x, xx)
self.assertEqual(y, yy)
self.assertEqual(z, zz)

View File

@@ -1,340 +0,0 @@
import os
import unittest
import numpy as np
from sklearn.datasets import make_classification
from stree import Stree, Snode
def get_dataset(random_state=0):
X, y = make_classification(
n_samples=1500,
n_features=3,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
n_clusters_per_class=2,
class_sep=1.5,
flip_y=0,
weights=[0.5, 0.5],
random_state=random_state,
)
return X, y
class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._kernels = ["linear", "rbf", "poly"]
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the
correct number of labels and its sons have the right number of elements
in their dataset
Arguments:
node {Snode} -- node to check
"""
if node.is_leaf():
return
y_prediction = node._clf.predict(node._X)
y_down = node.get_down()._y
y_up = node.get_up()._y
# Is a correct partition in terms of cadinality?
# i.e. The partition algorithm didn't forget any sample
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
unique_y, count_y = np.unique(node._y, return_counts=True)
_, count_d = np.unique(y_down, return_counts=True)
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
except IndexError:
number_down = 0
try:
number_up = count_u[i]
except IndexError:
number_up = 0
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
# as the node is not a leaf...
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[0], y_up.shape[0])
self.assertEqual(count_yp[1], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models
"""
import warnings
warnings.filterwarnings("ignore")
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(*get_dataset(self._random_state))
self._check_tree(clf.tree_)
def _find_out(
self, px: np.array, x_original: np.array, y_original
) -> list:
"""Find the original values of y for a given array of samples
Arguments:
px {np.array} -- array of samples to search for
x_original {np.array} -- original dataset
y_original {[type]} -- original classes
Returns:
np.array -- classes of the given samples
"""
res = []
for needle in px:
for row in range(x_original.shape[0]):
if all(x_original[row, :] == needle):
res.append(y_original[row])
return res
def test_single_prediction(self):
probs = [0.29026400766, 0.73105613, 0.0307635]
X, y = get_dataset(self._random_state)
for kernel, prob in zip(self._kernels, probs):
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
self.assertEqual(yp[0], y[0])
def test_multiple_prediction(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = get_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_score(self):
X, y = get_dataset(self._random_state)
for kernel, accuracy_expected in zip(
self._kernels,
[0.9506666666666667, 0.9606666666666667, 0.9433333333333334],
):
clf = Stree(random_state=self._random_state, kernel=kernel,)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_single_predict_proba(self):
"""Check the element 28 probability of being 1
"""
decimals = 5
element = 28
probs = [0.29026400766, 0.73105613, 0.0307635]
X, y = get_dataset(self._random_state)
self.assertEqual(1, y[element])
for kernel, prob in zip(self._kernels, probs):
clf = Stree(kernel=kernel, random_state=self._random_state)
yp = clf.fit(X, y).predict_proba(
X[element, :].reshape(-1, X.shape[1])
)
self.assertAlmostEqual(
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
)
self.assertAlmostEqual(
round(prob, decimals), round(yp[0, 1], decimals), decimals
)
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = get_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(X, y)
yp = clf.predict_proba(X[:num, :])
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
"""
X, y = get_dataset(self._random_state)
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(X, y)
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
)
# Compute prediction at once
yp_once = clf.predict(X)
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
def test_iterator_and_str(self):
"""Check preorder iterator
"""
expected = [
"root",
"root - Down",
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts"
"=(array([0, 1]), array([ 17, 691]))",
"root - Down - Up",
"root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
"counts=(array([0, 1]), array([1, 3]))",
"root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
"counts=(array([0]), array([7]))",
"root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array("
"[0, 1]), array([725, 56]))",
]
computed = []
expected_string = ""
clf = Stree(kernel="linear", random_state=self._random_state)
clf.fit(*get_dataset(self._random_state))
for node in clf:
computed.append(str(node))
expected_string += str(node) + "\n"
self.assertListEqual(expected, computed)
self.assertEqual(expected_string, str(clf))
def test_is_a_sklearn_classifier(self):
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator
check_estimator(Stree())
def test_exception_if_C_is_negative(self):
tclf = Stree(C=-1)
with self.assertRaises(ValueError):
tclf.fit(*get_dataset(self._random_state))
def test_check_max_depth_is_positive_or_None(self):
tcl = Stree()
self.assertIsNone(tcl.max_depth)
tcl = Stree(max_depth=1)
self.assertGreaterEqual(1, tcl.max_depth)
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*get_dataset(self._random_state))
def test_check_max_depth(self):
depths = (3, 4)
for depth in depths:
tcl = Stree(random_state=self._random_state, max_depth=depth)
tcl.fit(*get_dataset(self._random_state))
self.assertEqual(depth, tcl.depth_)
def test_unfitted_tree_is_iterable(self):
tcl = Stree()
self.assertEqual(0, len(list(tcl)))
def test_min_samples_split(self):
tcl_split = Stree(min_samples_split=3)
tcl_nosplit = Stree(min_samples_split=4)
dataset = [[1], [2], [3]], [1, 1, 0]
tcl_split.fit(*dataset)
self.assertIsNotNone(tcl_split.tree_.get_down())
self.assertIsNotNone(tcl_split.tree_.get_up())
tcl_nosplit.fit(*dataset)
self.assertIsNone(tcl_nosplit.tree_.get_down())
self.assertIsNone(tcl_nosplit.tree_.get_up())
def test_muticlass_dataset(self):
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
px = [[1, 2], [3, 4], [5, 6]]
py = [1, 2, 3]
clf.fit(px, py)
self.assertEqual(1.0, clf.score(px, py))
self.assertListEqual([1, 2, 3], clf.predict(px).tolist())
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
self._random_state = 1
self._clf = Stree(random_state=self._random_state)
self._clf.fit(*get_dataset(self._random_state))
super().__init__(*args, **kwargs)
@classmethod
def setUp(cls):
os.environ["TESTING"] = "1"
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a
predictor
"""
def check_leave(node: Snode):
if not node.is_leaf():
check_leave(node.get_down())
check_leave(node.get_up())
return
# Check Belief in leave
classes, card = np.unique(node._y, return_counts=True)
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
check_leave(self._clf.tree_)
def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
def run_tree(node: Snode):
if node._belief < 1:
# only exclude pure leaves
self.assertIsNotNone(node._clf)
self.assertIsNotNone(node._clf.coef_)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(self._clf.tree_)
def test_make_predictor_on_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
test.make_predictor()
self.assertEqual(1, test._class)
self.assertEqual(0.75, test._belief)
def test_make_predictor_on_not_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
test.set_up(Snode(None, [1], [1], "another_test"))
test.make_predictor()
self.assertIsNone(test._class)
self.assertEqual(0, test._belief)
def test_make_predictor_on_leaf_bogus_data(self):
test = Snode(None, [1, 2, 3, 4], [], "test")
test.make_predictor()
self.assertIsNone(test._class)

View File

@@ -1,9 +1,6 @@
from .Strees_test import Stree_test, Snode_test
from .Strees_grapher_test import Stree_grapher_test, Snode_graph_test
# type: ignore
from .Stree_test import Stree_test
from .Snode_test import Snode_test
from .Splitter_test import Splitter_test
__all__ = [
"Stree_test",
"Snode_test",
"Stree_grapher_test",
"Snode_graph_test",
]
__all__ = ["Stree_test", "Snode_test", "Splitter_test"]

18
stree/tests/utils.py Normal file
View File

@@ -0,0 +1,18 @@
# type: ignore
from sklearn.datasets import make_classification
def load_dataset(random_state=0, n_classes=2, n_features=3):
X, y = make_classification(
n_samples=1500,
n_features=n_features,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=n_classes,
n_clusters_per_class=2,
class_sep=1.5,
flip_y=0,
random_state=random_state,
)
return X, y