#4 Add code coverage & codacy badge

Add code coverage configuration in codecov
Add some tests
This commit is contained in:
2020-06-06 03:04:18 +02:00
parent b4816b2995
commit b9f14aec05
14 changed files with 608 additions and 204 deletions

14
.coveragerc Normal file
View File

@@ -0,0 +1,14 @@
[run]
branch = True
source = stree
[report]
exclude_lines =
if self.debug:
pragma: no cover
raise NotImplementedError
if __name__ == .__main__.:
ignore_errors = True
omit =
stree/tests/*
stree/__init__.py

3
.gitignore vendored
View File

@@ -129,4 +129,5 @@ dmypy.json
.pyre/
.idea
.vscode
.vscode
.pre-commit-config.yaml

View File

@@ -3,6 +3,7 @@ os: linux
dist: xenial
install:
- pip install -r requirements.txt
- pip install --upgrade codecov coverage black flake8
notifications:
email:
recipients:
@@ -10,4 +11,10 @@ notifications:
on_success: never # default: change
on_failure: always # default: always
# command to run tests
script: python -m unittest stree.tests
script:
- black --check --diff stree
- flake8 --count --exclude __init__.py stree
- coverage run -m unittest -v stree.tests
after_success:
- codecov
- bash <(curl -Ls https://coverage.codacy.com/get.sh)

View File

@@ -1,5 +1,6 @@
[![Build Status](https://travis-ci.com/Doctorado-ML/STree.svg?branch=master)](https://travis-ci.com/Doctorado-ML/STree)
[![codecov](https://codecov.io/gh/doctorado-ml/stree/branch/master/graph/badge.svg)](https://codecov.io/gh/doctorado-ml/stree)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/35fa3dfd53a24a339344b33d9f9f2f3d)](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&amp;utm_medium=referral&amp;utm_content=Doctorado-ML/STree&amp;utm_campaign=Badge_Grade)
# Stree
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.

15
codecov.yml Normal file
View File

@@ -0,0 +1,15 @@
coverage:
status:
project:
default:
target: auto
patch:
default:
target: auto
comment:
layout: "reach, diff, flags, files"
behavior: default
require_changes: false
require_base: yes
require_head: yes
branches: null

57
main.py
View File

@@ -2,17 +2,29 @@ import time
from sklearn.model_selection import train_test_split
from stree import Stree
random_state=1
random_state = 1
def load_creditcard(n_examples=0):
import pandas as pd
import numpy as np
import random
df = pd.read_csv('data/creditcard.csv')
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
df = pd.read_csv("data/creditcard.csv")
print(
"Fraud: {0:.3f}% {1}".format(
df.Class[df.Class == 1].count() * 100 / df.shape[0],
df.Class[df.Class == 1].count(),
)
)
print(
"Valid: {0:.3f}% {1}".format(
df.Class[df.Class == 0].count() * 100 / df.shape[0],
df.Class[df.Class == 0].count(),
)
)
y = np.expand_dims(df.Class.values, axis=1)
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
X = df.drop(["Class", "Time", "Amount"], axis=1).values
if n_examples > 0:
# Take first n_examples samples
X = X[:n_examples, :]
@@ -26,14 +38,30 @@ def load_creditcard(n_examples=0):
X = np.append(Xt, X[indices], axis=0)
y = np.append(yt, y[indices], axis=0)
print("X.shape", X.shape, " y.shape", y.shape)
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
print(
"Fraud: {0:.3f}% {1}".format(
len(y[y == 1]) * 100 / X.shape[0], len(y[y == 1])
)
)
print(
"Valid: {0:.3f}% {1}".format(
len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])
)
)
Xtrain, Xtest, ytrain, ytest = train_test_split(
X,
y,
train_size=0.7,
shuffle=True,
random_state=random_state,
stratify=y,
)
return Xtrain, Xtest, ytrain, ytest
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
# data = load_creditcard(5000) # Take the first 5000 samples
data = load_creditcard() # Take all the samples
data = load_creditcard() # Take all the samples
Xtrain = data[0]
Xtest = data[1]
@@ -41,17 +69,20 @@ ytrain = data[2]
ytest = data[3]
now = time.time()
clf = Stree(C=.01, random_state=random_state)
clf = Stree(C=0.01, random_state=random_state)
clf.fit(Xtrain, ytrain)
print(f"Took {time.time() - now:.2f} seconds to train")
print(clf)
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
proba = clf.predict_proba(Xtest)
print("Checking that we have correct probabilities, these are probabilities of sample belonging to class 1")
print(
"Checking that we have correct probabilities, these are probabilities of "
"sample belonging to class 1"
)
res0 = proba[proba[:, 0] == 0]
res1 = proba[proba[:, 0] == 1]
print("++++++++++res0 > .8++++++++++++")
print(res0[res0[:, 1] > .8])
print(res0[res0[:, 1] > 0.8])
print("**********res1 < .4************")
print(res1[res1[:, 1] < .4])
print(res1[res1[:, 1] < 0.4])

16
pyproject.toml Normal file
View File

@@ -0,0 +1,16 @@
[tool.black]
line-length = 79
include = '\.pyi?$'
exclude = '''
/(
\.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
'''

View File

@@ -5,37 +5,32 @@ __author__ = "Ricardo Montañana Gómez"
def readme():
with open('README.md') as f:
with open("README.md") as f:
return f.read()
setuptools.setup(
name='STree',
name="STree",
version=__version__,
license='MIT License',
description='Oblique decision tree with svm nodes',
license="MIT License",
description="Oblique decision tree with svm nodes",
long_description=readme(),
long_description_content_type='text/markdown',
long_description_content_type="text/markdown",
packages=setuptools.find_packages(),
url='https://github.com/doctorado-ml/stree',
url="https://github.com/doctorado-ml/stree",
author=__author__,
author_email='ricardo.montanana@alu.uclm.es',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc',
author_email="ricardo.montanana@alu.uclm.es",
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc",
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3.7',
'Natural Language :: English',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Intended Audience :: Science/Research'
],
install_requires=[
'scikit-learn>=0.23.0',
'numpy',
'matplotlib',
'ipympl'
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.7",
"Natural Language :: English",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research",
],
install_requires=["scikit-learn>=0.23.0", "numpy", "matplotlib", "ipympl"],
test_suite="stree.tests",
zip_safe=False
zip_safe=False,
)

View File

@@ -1,11 +1,11 @@
'''
"""
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
Uses LinearSVC
'''
"""
import os
@@ -13,8 +13,12 @@ import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \
_check_sample_weight
from sklearn.utils.validation import (
check_X_y,
check_array,
check_is_fitted,
_check_sample_weight,
)
class Snode:
@@ -22,22 +26,23 @@ class Snode:
dataset assigned to it
"""
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray,
title: str):
def __init__(
self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str
):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_
self._interceptor = 0.0 if clf is None else clf.intercept_
self._title = title
self._belief = 0.
self._belief = 0.0
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
self._y = y
self._down = None
self._up = None
self._class = None
@classmethod
def copy(cls, node: 'Snode') -> 'Snode':
def copy(cls, node: "Snode") -> "Snode":
return cls(node._clf, node._X, node._y, node._title)
def set_down(self, son):
@@ -49,10 +54,10 @@ class Snode:
def is_leaf(self) -> bool:
return self._up is None and self._down is None
def get_down(self) -> 'Snode':
def get_down(self) -> "Snode":
return self._down
def get_up(self) -> 'Snode':
def get_up(self) -> "Snode":
return self._up
def make_predictor(self):
@@ -68,7 +73,7 @@ class Snode:
try:
self._belief = max_card / (max_card + min_card)
except ZeroDivisionError:
self._belief = 0.
self._belief = 0.0
self._class = classes[card == max_card][0]
else:
self._belief = 1
@@ -77,8 +82,10 @@ class Snode:
def __str__(self) -> str:
if self.is_leaf():
count_values = np.unique(self._y, return_counts=True)
result = f"{self._title} - Leaf class={self._class} belief="\
result = (
f"{self._title} - Leaf class={self._class} belief="
f"{self._belief: .6f} counts={count_values}"
)
return result
else:
return f"{self._title}"
@@ -116,9 +123,15 @@ class Stree(BaseEstimator, ClassifierMixin):
with "classifier" as value
"""
def __init__(self, C: float = 1.0, max_iter: int = 1000,
random_state: int = None, max_depth: int = None,
tol: float = 1e-4, use_predictions: bool = False):
def __init__(
self,
C: float = 1.0,
max_iter: int = 1000,
random_state: int = None,
max_depth: int = None,
tol: float = 1e-4,
use_predictions: bool = False,
):
self.max_iter = max_iter
self.C = C
self.random_state = random_state
@@ -132,7 +145,7 @@ class Stree(BaseEstimator, ClassifierMixin):
:return: the tag required
:rtype: dict
"""
return {'binary_only': True, 'requires_y': True}
return {"binary_only": True, "requires_y": True}
def _linear_function(self, data: np.array, node: Snode) -> np.array:
"""Compute the distance of set of samples to a hyperplane, in
@@ -140,9 +153,9 @@ class Stree(BaseEstimator, ClassifierMixin):
hyperplane of each class
:param data: dataset of samples
:type data: np.array
:type data: np.array shape(m, n)
:param node: the node that contains the hyperplane coefficients
:type node: Snode
:type node: Snode shape(1, n)
:return: array of distances of each sample to the hyperplane
:rtype: np.array
"""
@@ -160,8 +173,10 @@ class Stree(BaseEstimator, ClassifierMixin):
:rtype: list
"""
up = ~down
return origin[up[:, 0]] if any(up) else None, \
origin[down[:, 0]] if any(down) else None
return (
origin[up[:, 0]] if any(up) else None,
origin[down[:, 0]] if any(down) else None,
)
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
@@ -194,8 +209,9 @@ class Stree(BaseEstimator, ClassifierMixin):
"""
return data > 0
def fit(self, X: np.ndarray, y: np.ndarray,
sample_weight: np.array = None) -> 'Stree':
def fit(
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
) -> "Stree":
"""Build the tree based on the dataset of samples and its labels
:raises ValueError: if parameters C or max_depth are out of bounds
@@ -203,17 +219,22 @@ class Stree(BaseEstimator, ClassifierMixin):
:rtype: Stree
"""
# Check parameters are Ok.
if type(y).__name__ == 'np.ndarray':
if type(y).__name__ == "np.ndarray":
y = y.ravel()
if self.C < 0:
raise ValueError(
f"Penalty term must be positive... got (C={self.C:f})")
self.__max_depth = np.iinfo(
np.int32).max if self.max_depth is None else self.max_depth
f"Penalty term must be positive... got (C={self.C:f})"
)
self.__max_depth = (
np.iinfo(np.int32).max
if self.max_depth is None
else self.max_depth
)
if self.__max_depth < 1:
raise ValueError(
f"Maximum depth has to be greater than 1... got (max_depth=\
{self.max_depth})")
{self.max_depth})"
)
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
@@ -223,13 +244,14 @@ class Stree(BaseEstimator, ClassifierMixin):
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_in_ = X.shape[1]
self.tree_ = self.train(X, y, sample_weight, 1, 'root')
self.tree_ = self.train(X, y, sample_weight, 1, "root")
self._build_predictor()
return self
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
@@ -239,8 +261,14 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(self.tree_)
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray,
depth: int, title: str) -> Snode:
def train(
self,
X: np.ndarray,
y: np.ndarray,
sample_weight: np.ndarray,
depth: int,
title: str,
) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
@@ -261,10 +289,11 @@ class Stree(BaseEstimator, ClassifierMixin):
return None
if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure>')
return Snode(None, X, y, title + ", <pure>")
# Train the model
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
C=self.C) # , sample_weight=sample_weight)
clf = LinearSVC(
max_iter=self.max_iter, random_state=self.random_state, C=self.C
) # , sample_weight=sample_weight)
clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title)
self.depth_ = max(depth, self.depth_)
@@ -274,9 +303,9 @@ class Stree(BaseEstimator, ClassifierMixin):
sw_u, sw_d = self._split_array(sample_weight, down)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ', <cgaf>')
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
return Snode(clf, X, y, title + ", <cgaf>")
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
return tree
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
@@ -308,8 +337,10 @@ class Stree(BaseEstimator, ClassifierMixin):
:return: array of labels
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array,
node: Snode) -> np.array:
def predict_class(
xp: np.array, indices: np.array, node: Snode
) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
@@ -322,14 +353,18 @@ class Stree(BaseEstimator, ClassifierMixin):
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
check_is_fitted(self, ["tree_"])
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
result = self._reorder_results(
*predict_class(X, indices, self.tree_)).astype(int).ravel()
result = (
self._reorder_results(*predict_class(X, indices, self.tree_))
.astype(int)
.ravel()
)
return self.classes_[result]
def predict_proba(self, X: np.array) -> np.array:
@@ -341,8 +376,10 @@ class Stree(BaseEstimator, ClassifierMixin):
each class
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array, dist: np.array,
node: Snode) -> np.array:
def predict_class(
xp: np.array, indices: np.array, dist: np.array, node: Snode
) -> np.array:
"""Run the tree to compute predictions
:param xp: subdataset of samples
@@ -375,7 +412,7 @@ class Stree(BaseEstimator, ClassifierMixin):
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
check_is_fitted(self, ["tree_"])
# Input validation
X = check_array(X)
# setup prediction & make it happen
@@ -426,7 +463,7 @@ class Stree(BaseEstimator, ClassifierMixin):
:return: description of nodes in the tree in preorder
:rtype: str
"""
output = ''
output = ""
for i in self:
output += str(i) + '\n'
output += str(i) + "\n"
return output

View File

@@ -1,10 +1,10 @@
'''
"""
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Plot 3D views of nodes in Stree
'''
"""
import os
@@ -17,7 +17,6 @@ from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode):
def __init__(self, node: Stree):
self._plot_size = (8, 8)
self._xlimits = (None, None)
@@ -29,34 +28,36 @@ class Snode_graph(Snode):
def set_plot_size(self, size: tuple):
self._plot_size = size
def get_plot_size(self) -> tuple:
return self._plot_size
def _is_pure(self) -> bool:
"""is considered pure a leaf node with one label
"""
if self.is_leaf():
return self._belief == 1.
return self._belief == 1.0
return False
def set_axis_limits(self, limits: tuple):
self._xlimits = limits[0]
self._ylimits = limits[1]
self._zlimits = limits[2]
self._xlimits, self._ylimits, self._zlimits = limits
def _set_graphics_axis(self, ax: Axes3D):
ax.set_xlim(self._xlimits)
ax.set_ylim(self._ylimits)
ax.set_zlim(self._zlimits)
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '',
save_seq: int = 1):
def save_hyperplane(
self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
):
_, fig = self.plot_hyperplane()
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
fig.savefig(name, bbox_inches='tight')
fig.savefig(name, bbox_inches="tight")
plt.close(fig)
def _get_cmap(self):
cmap = 'jet'
cmap = "jet"
if self._is_pure() and self._class == 1:
cmap = 'jet_r'
cmap = "jet_r"
return cmap
def _graph_title(self):
@@ -65,22 +66,31 @@ class Snode_graph(Snode):
def plot_hyperplane(self, plot_distribution: bool = True):
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax = fig.add_subplot(1, 1, 1, projection="3d")
if not self._is_pure():
# Can't plot the hyperplane of leaves with one label because they have
# no classifier
# get the splitting hyperplane
def hyperplane(x, y): return (-self._interceptor
- self._vector[0][0] * x
- self._vector[0][1] * y) \
/ self._vector[0][2]
def hyperplane(x, y):
return (
-self._interceptor
- self._vector[0][0] * x
- self._vector[0][1] * y
) / self._vector[0][2]
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
xx, yy = np.meshgrid(tmpx, tmpy)
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5,
antialiased=True, rstride=1, cstride=1,
cmap='seismic')
ax.plot_surface(
xx,
yy,
hyperplane(xx, yy),
alpha=0.5,
antialiased=True,
rstride=1,
cstride=1,
cmap="seismic",
)
self._set_graphics_axis(ax)
if plot_distribution:
self.plot_distribution(ax)
@@ -92,14 +102,15 @@ class Snode_graph(Snode):
def plot_distribution(self, ax: Axes3D = None):
if ax is None:
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax = fig.add_subplot(1, 1, 1, projection="3d")
plt.title(self._graph_title())
cmap = self._get_cmap()
ax.scatter(self._X[:, 0], self._X[:, 1],
self._X[:, 2], c=self._y, cmap=cmap)
ax.set_xlabel('X0')
ax.set_ylabel('X1')
ax.set_zlabel('X2')
ax.scatter(
self._X[:, 0], self._X[:, 1], self._X[:, 2], c=self._y, cmap=cmap
)
ax.set_xlabel("X0")
ax.set_ylabel("X1")
ax.set_zlabel("X2")
plt.show()
@@ -112,17 +123,17 @@ class Stree_grapher(Stree):
self._plot_size = (8, 8)
self._tree_gr = None
# make Snode store X's
os.environ['TESTING'] = '1'
os.environ["TESTING"] = "1"
self._fitted = False
self._pca = None
super().__init__(**params)
def __del__(self):
try:
os.environ.pop('TESTING')
os.environ.pop("TESTING")
except KeyError:
pass
plt.close('all')
plt.close("all")
def _copy_tree(self, node: Snode) -> Snode_graph:
mirror = Snode_graph(node)
@@ -161,9 +172,9 @@ class Stree_grapher(Stree):
def _check_fitted(self):
if not self._fitted:
raise Exception('Have to fit the grapher first!')
raise Exception("Have to fit the grapher first!")
def save_all(self, save_folder: str = './', save_prefix: str = ''):
def save_all(self, save_folder: str = "./", save_prefix: str = ""):
"""Save all the node plots in png format, each with a sequence number
:param save_folder: folder where the plots are saved, defaults to './'
@@ -174,8 +185,9 @@ class Stree_grapher(Stree):
os.mkdir(save_folder)
seq = 1
for node in self:
node.save_hyperplane(save_folder=save_folder,
save_prefix=save_prefix, save_seq=seq)
node.save_hyperplane(
save_folder=save_folder, save_prefix=save_prefix, save_seq=seq
)
seq += 1
def plot_all(self):

View File

@@ -1,2 +1,4 @@
from .Strees import Stree, Snode, Siterator
from .Strees_grapher import Stree_grapher, Snode_graph
from .Strees_grapher import Stree_grapher, Snode_graph
__all__ = ["Stree", "Snode", "Siterator", "Stree_grapher", "Snode_graph"]

View File

@@ -0,0 +1,211 @@
import os
import imghdr
import unittest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
from sklearn.datasets import make_classification
from stree import Stree_grapher, Snode_graph
def get_dataset(random_state=0, n_features=3):
    """Generate the synthetic two-class dataset used by the grapher tests.

    :param random_state: seed forwarded to make_classification, defaults to 0
    :type random_state: int
    :param n_features: total number of features to generate, defaults to 3
    :type n_features: int
    :return: the (X, y) pair produced by make_classification
    :rtype: tuple
    """
    # Every setting except the seed and the feature count is fixed so the
    # expected values hard-coded in the tests stay reproducible.
    settings = dict(
        n_samples=1500,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        weights=[0.5, 0.5],
        random_state=random_state,
    )
    return make_classification(**settings)
class Stree_grapher_test(unittest.TestCase):
    """Tests for Stree_grapher fitted on a 4-feature synthetic dataset."""

    @classmethod
    def setUpClass(cls):
        """Fit a single grapher shared by every test of the class.

        The fixture lives here instead of in ``__init__`` so the model is
        trained once per class (not once per test method at collection time)
        and the TESTING variable is set right before the tests that need it.
        """
        # Snode only stores the samples when TESTING is defined
        os.environ["TESTING"] = "1"
        cls._random_state = 1
        cls._clf = Stree_grapher(
            dict(random_state=cls._random_state, use_predictions=False)
        )
        cls._clf.fit(*get_dataset(cls._random_state, n_features=4))

    @classmethod
    def tearDownClass(cls):
        """Remove the testing environ variable
        """
        try:
            os.environ.pop("TESTING")
        except KeyError:
            pass

    def test_iterator(self):
        """Check preorder iterator
        """
        expected = [
            "root",
            "root - Down",
            "root - Down - Down, <cgaf> - Leaf class=1 belief= 0.976023 counts"
            "=(array([0, 1]), array([ 17, 692]))",
            "root - Down - Up",
            "root - Down - Up - Down, <cgaf> - Leaf class=0 belief= 0.500000 "
            "counts=(array([0, 1]), array([1, 1]))",
            "root - Down - Up - Up, <cgaf> - Leaf class=0 belief= 0.888889 "
            "counts=(array([0, 1]), array([8, 1]))",
            "root - Up, <cgaf> - Leaf class=0 belief= 0.928205 counts=(array("
            "[0, 1]), array([724, 56]))",
        ]
        computed = [str(node) for node in self._clf]
        self.assertListEqual(expected, computed)

    def test_score(self):
        """Check that score() matches the accuracy computed from predict()
        """
        X, y = get_dataset(self._random_state)
        accuracy_score = self._clf.score(X, y)
        yp = self._clf.predict(X)
        accuracy_computed = np.mean(yp == y)
        self.assertEqual(accuracy_score, accuracy_computed)
        self.assertGreater(accuracy_score, 0.86)

    def test_save_all(self):
        """Check that save_all writes one png file per node of the tree
        """
        folder_name = "/tmp/"
        file_names = [f"{folder_name}STnode{i}.png" for i in range(1, 8)]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Non interactive backend so no window is opened
            matplotlib.use("Agg")
            self._clf.save_all(save_folder=folder_name)
        for file_name in file_names:
            self.assertTrue(os.path.exists(file_name))
            self.assertEqual("png", imghdr.what(file_name))
            os.remove(file_name)

    def test_plot_all(self):
        """Check that plot_all creates one figure per node of the tree
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            num_figures_before = plt.gcf().number
            self._clf.plot_all()
            num_figures_after = plt.gcf().number
        self.assertEqual(7, num_figures_after - num_figures_before)

    def test_filt_4_dims_dataset(self):
        """Smoke test: fitting a dataset with 4 features must not raise
        """
        self._clf.fit(*get_dataset(self._random_state, n_features=4))
class Snode_graph_test(unittest.TestCase):
    """Tests for the Snode_graph nodes of a fitted Stree_grapher."""

    @classmethod
    def setUpClass(cls):
        """Fit a single grapher shared by every test of the class.

        The fixture lives here instead of in ``__init__`` so the model is
        trained once per class (not once per test method at collection time)
        and the TESTING variable is set right before the tests that need it.
        """
        # Snode only stores the samples when TESTING is defined
        os.environ["TESTING"] = "1"
        cls._random_state = 1
        cls._clf = Stree_grapher(
            dict(random_state=cls._random_state, use_predictions=False)
        )
        cls._clf.fit(*get_dataset(cls._random_state))

    @classmethod
    def tearDownClass(cls):
        """Remove the testing environ variable
        """
        try:
            os.environ.pop("TESTING")
        except KeyError:
            pass

    def _assert_one_new_figure(self, plot_call):
        """Run plot_call and check that exactly one new figure was created

        :param plot_call: callable without arguments that does the plotting
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Non interactive backend so no window is opened
            matplotlib.use("Agg")
            num_figures_before = plt.gcf().number
            plot_call()
            num_figures_after = plt.gcf().number
        self.assertEqual(1, num_figures_after - num_figures_before)

    def test_plot_size(self):
        """Check that the plot size can be changed and restored
        """
        default = self._clf._tree_gr.get_plot_size()
        expected = (17, 3)
        self._clf._tree_gr.set_plot_size(expected)
        self.assertEqual(expected, self._clf._tree_gr.get_plot_size())
        self._clf._tree_gr.set_plot_size(default)
        self.assertEqual(default, self._clf._tree_gr.get_plot_size())

    def test_attributes_in_leaves_graph(self):
        """Check if the attributes in leaves have correct values so they form a
        predictor
        """

        def check_leave(node: Snode_graph):
            if not node.is_leaf():
                check_leave(node.get_down())
                check_leave(node.get_up())
                return
            # Check Belief in leave
            classes, card = np.unique(node._y, return_counts=True)
            max_card = max(card)
            min_card = min(card)
            if len(classes) > 1:
                try:
                    belief = max_card / (max_card + min_card)
                except ZeroDivisionError:
                    belief = 0.0
            else:
                belief = 1
            self.assertEqual(belief, node._belief)
            # Check Class: take the first label with the highest count so a
            # tie yields a scalar instead of an array that can't be compared
            class_computed = classes[card == max_card][0]
            self.assertEqual(class_computed, node._class)

        check_leave(self._clf._tree_gr)

    def test_nodes_graph_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """

        def run_tree(node: Snode_graph):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
                self.assertIsNotNone(node._vector)
                self.assertIsNotNone(node._interceptor)
            if node.is_leaf():
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self._clf._tree_gr)

    def test_save_hyperplane(self):
        """Check that save_hyperplane writes a png file for the root node
        """
        folder_name = "/tmp/"
        file_name = f"{folder_name}STnode1.png"
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            matplotlib.use("Agg")
            self._clf._tree_gr.save_hyperplane(folder_name)
        self.assertTrue(os.path.exists(file_name))
        self.assertEqual("png", imghdr.what(file_name))
        os.remove(file_name)

    def test_plot_hyperplane_with_distribution(self):
        """Check that plotting hyperplane and samples creates one figure
        """
        self._assert_one_new_figure(
            lambda: self._clf._tree_gr.plot_hyperplane(plot_distribution=True)
        )

    def test_plot_hyperplane_without_distribution(self):
        """Check that plotting only the hyperplane creates one figure
        """
        self._assert_one_new_figure(
            lambda: self._clf._tree_gr.plot_hyperplane(plot_distribution=False)
        )

    def test_plot_distribution(self):
        """Check that plotting only the samples creates one figure
        """
        self._assert_one_new_figure(
            lambda: self._clf._tree_gr.plot_distribution()
        )

View File

@@ -7,30 +7,54 @@ from sklearn.datasets import make_classification
from stree import Stree, Snode
class Stree_test(unittest.TestCase):
def get_dataset(random_state=0):
    """Generate the 3-feature synthetic two-class dataset used by the tests.

    :param random_state: seed forwarded to make_classification, defaults to 0
    :type random_state: int
    :return: the (X, y) pair produced by make_classification
    :rtype: tuple
    """
    # Everything but the seed is fixed so the expected values hard-coded in
    # the tests stay reproducible.
    settings = dict(
        n_samples=1500,
        n_features=3,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        weights=[0.5, 0.5],
        random_state=random_state,
    )
    return make_classification(**settings)
class Stree_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
os.environ['TESTING'] = '1'
os.environ["TESTING"] = "1"
self._random_state = 1
self._clf = Stree(random_state=self._random_state,
use_predictions=False)
self._clf.fit(*self._get_Xy())
self._clf = Stree(
random_state=self._random_state, use_predictions=False
)
self._clf.fit(*get_dataset(self._random_state))
super().__init__(*args, **kwargs)
@classmethod
def tearDownClass(cls):
try:
os.environ.pop('TESTING')
os.environ.pop("TESTING")
except KeyError:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3,
n_informative=3, n_redundant=0,
n_repeated=0, n_classes=2,
n_clusters_per_class=2, class_sep=1.5,
flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
X, y = make_classification(
n_samples=1500,
n_features=3,
n_informative=3,
n_redundant=0,
n_repeated=0,
n_classes=2,
n_clusters_per_class=2,
class_sep=1.5,
flip_y=0,
weights=[0.5, 0.5],
random_state=self._random_state,
)
return X, y
def _check_tree(self, node: Snode):
@@ -85,15 +109,16 @@ class Stree_test(unittest.TestCase):
Returns:
tuple -- tuple with samples, categories
"""
data = np.genfromtxt(file_name, delimiter=',')
data = np.genfromtxt(file_name, delimiter=",")
data = np.array(data)
column_y = data.shape[1] - 1
fy = data[:, column_y]
fx = np.delete(data, column_y, axis=1)
return fx, fy
def _find_out(self, px: np.array, x_original: np.array,
y_original) -> list:
def _find_out(
self, px: np.array, x_original: np.array, y_original
) -> list:
"""Find the original values of y for a given array of samples
Arguments:
@@ -112,19 +137,19 @@ class Stree_test(unittest.TestCase):
return res
def test_single_prediction(self):
X, y = self._get_Xy()
X, y = get_dataset(self._random_state)
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
self.assertEqual(yp[0], y[0])
def test_multiple_prediction(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
X, y = get_dataset(self._random_state)
yp = self._clf.predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_score(self):
X, y = self._get_Xy()
X, y = get_dataset(self._random_state)
accuracy_score = self._clf.score(X, y)
yp = self._clf.predict(X)
accuracy_computed = np.mean(yp == y)
@@ -138,35 +163,55 @@ class Stree_test(unittest.TestCase):
# Element 28 has a different prediction than the truth
decimals = 5
prob = 0.29026400766
X, y = self._get_Xy()
X, y = get_dataset(self._random_state)
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(np.round(1 - prob, decimals),
np.round(yp[0:, 0], decimals))
self.assertEqual(
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
)
self.assertEqual(1, y[28])
self.assertAlmostEqual(
round(prob, decimals),
round(yp[0, 1], decimals),
decimals
round(prob, decimals), round(yp[0, 1], decimals), decimals
)
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
decimals = 5
X, y = self._get_Xy()
X, y = get_dataset(self._random_state)
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833,
0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877,
0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624,
0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257,
0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083]
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
)
expected_proba = [
0.88395641,
0.36746962,
0.84158767,
0.34106833,
0.14269291,
0.85193236,
0.29876058,
0.7282164,
0.85958616,
0.89517877,
0.99745224,
0.18860349,
0.30756427,
0.8318412,
0.18981198,
0.15564624,
0.25740655,
0.22923355,
0.87365959,
0.49928689,
0.95574351,
0.28761257,
0.28906333,
0.32643692,
0.29788483,
0.01657364,
0.81149083,
]
expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist()
for i in range(len(expected)):
@@ -178,11 +223,13 @@ class Stree_test(unittest.TestCase):
use vector of coefficients to compute both predictions and splitted
data
"""
model_clf = Stree(random_state=self._random_state,
use_predictions=True)
model_computed = Stree(random_state=self._random_state,
use_predictions=False)
X, y = self._get_Xy()
model_clf = Stree(
random_state=self._random_state, use_predictions=True
)
model_computed = Stree(
random_state=self._random_state, use_predictions=False
)
X, y = get_dataset(self._random_state)
model_clf.fit(X, y)
model_computed.fit(X, y)
return model_clf, model_computed, X, y
@@ -194,74 +241,76 @@ class Stree_test(unittest.TestCase):
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
use_clf.predict(X).tolist(),
use_math.predict(X).tolist()
use_clf.predict(X).tolist(), use_math.predict(X).tolist()
)
def test_use_model_score(self):
    """Check that both models from ``build_models`` score identically.

    The score of the second model returned by ``build_models`` is used as
    the reference: the first model must reach exactly the same score and
    that score must be greater than 0.95.
    """
    use_clf, use_math, X, y = self.build_models()
    reference_score = use_math.score(X, y)
    self.assertEqual(use_clf.score(X, y), reference_score)
    self.assertGreater(reference_score, 0.95)
def test_use_model_predict_proba(self):
    """Check that both models from ``build_models`` give the same
    ``predict_proba`` output for every sample
    """
    use_clf, use_math, X, _ = self.build_models()
    self.assertListEqual(
        use_clf.predict_proba(X).tolist(),
        use_math.predict_proba(X).tolist(),
    )
def test_single_vs_multiple_prediction(self):
    """Check if predicting sample by sample gives the same result as
    predicting all samples at once
    """
    X, _ = get_dataset(self._random_state)
    n_features = X.shape[1]
    # Compute prediction line by line; every sample is reshaped into a
    # one-row matrix because predict is called with 2D input
    yp_line = []
    for xp in X:
        row_prediction = self._clf.predict(xp.reshape(-1, n_features))
        yp_line.extend(row_prediction.tolist())
    # Compute prediction at once
    yp_once = self._clf.predict(X)
    self.assertListEqual(yp_line, yp_once.tolist())
def test_iterator_and_str(self):
    """Check preorder iterator and the string form of the classifier

    Iterating the fitted classifier must yield nodes whose ``str`` values
    match ``expected`` in order, and ``str`` of the classifier must be
    those same node strings, each followed by a newline.
    """
    expected = [
        "root",
        "root - Down",
        "root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts"
        "=(array([0, 1]), array([ 17, 691]))",
        "root - Down - Up",
        "root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
        "counts=(array([0, 1]), array([1, 3]))",
        "root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
        "counts=(array([0]), array([7]))",
        "root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array("
        "[0, 1]), array([725, 56]))",
    ]
    computed = [str(node) for node in self._clf]
    # One line per node, each terminated by a newline (empty tree -> "")
    expected_string = "".join(line + "\n" for line in computed)
    self.assertListEqual(expected, computed)
    self.assertEqual(expected_string, str(self._clf))
def test_is_a_sklearn_classifier(self):
    """Run scikit-learn's ``check_estimator`` on a default ``Stree``

    ConvergenceWarning and RuntimeWarning are silenced while the checks
    run.
    """
    import warnings
    from sklearn.exceptions import ConvergenceWarning
    from sklearn.utils.estimator_checks import check_estimator

    # catch_warnings restores the previous filters on exit, so the
    # "ignore" rules do not leak into the tests that run afterwards
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        check_estimator(Stree())
def test_exception_if_C_is_negative(self):
    """Fitting an ``Stree`` built with ``C=-1`` must raise ValueError"""
    tclf = Stree(C=-1)
    # Build the dataset outside the assertion so that only an error
    # raised by fit itself can satisfy assertRaises
    X, y = get_dataset(self._random_state)
    with self.assertRaises(ValueError):
        tclf.fit(X, y)
def test_check_max_depth_is_positive_or_None(self):
tcl = Stree()
@@ -270,12 +319,12 @@ class Stree_test(unittest.TestCase):
self.assertGreaterEqual(1, tcl.max_depth)
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*self._get_Xy())
tcl.fit(*get_dataset(self._random_state))
def test_check_max_depth(self):
    """A tree fitted with ``max_depth=3`` must report ``depth_`` equal
    to 3
    """
    depth = 3
    tcl = Stree(random_state=self._random_state, max_depth=depth)
    tcl.fit(*get_dataset(self._random_state))
    self.assertEqual(depth, tcl.depth_)
def test_unfitted_tree_is_iterable(self):
@@ -284,13 +333,13 @@ class Stree_test(unittest.TestCase):
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
    """Fit the classifier shared by the tests, then initialise the
    TestCase with the arguments received
    """
    # NOTE(review): TESTING is presumably read by the code under test;
    # its consumer is not visible here. The classmethod further down
    # pops it from the environment again.
    os.environ["TESTING"] = "1"
    self._random_state = 1
    self._clf = Stree(
        random_state=self._random_state, use_predictions=True
    )
    self._clf.fit(*get_dataset(self._random_state))
    super().__init__(*args, **kwargs)
@classmethod
@@ -298,18 +347,10 @@ class Snode_test(unittest.TestCase):
"""[summary]
"""
try:
os.environ.pop('TESTING')
os.environ.pop("TESTING")
except KeyError:
pass
def _get_Xy(self):
    """Return the ``(X, y)`` pair generated by ``make_classification``

    The generator is called with 1500 samples, 3 features (all
    informative), 2 classes and ``self._random_state`` as seed.
    """
    dataset_options = dict(
        n_samples=1500,
        n_features=3,
        n_informative=3,
        n_redundant=0,
        n_classes=2,
        n_repeated=0,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        weights=[0.5, 0.5],
        random_state=self._random_state,
    )
    features, labels = make_classification(**dataset_options)
    return features, labels
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a
predictor
@@ -328,7 +369,7 @@ class Snode_test(unittest.TestCase):
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.
belief = 0.0
else:
belief = 1
self.assertEqual(belief, node._belief)
@@ -355,3 +396,16 @@ class Snode_test(unittest.TestCase):
run_tree(node.get_up())
run_tree(self._clf.tree_)
def test_make_predictor_on_leaf(self):
    """``make_predictor`` on a node without children and labels
    [1, 0, 1, 1] must set class 1 with belief 0.75
    """
    samples = [1, 2, 3, 4]
    labels = [1, 0, 1, 1]
    leaf = Snode(None, samples, labels, "test")
    leaf.make_predictor()
    self.assertEqual(1, leaf._class)
    self.assertEqual(0.75, leaf._belief)
def test_make_predictor_on_not_leaf(self):
    """``make_predictor`` on a node that has an up child must leave the
    class unset (None) and the belief at 0
    """
    inner = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
    child = Snode(None, [1], [1], "another_test")
    inner.set_up(child)
    inner.make_predictor()
    self.assertIsNone(inner._class)
    self.assertEqual(0, inner._belief)

View File

@@ -1 +1,9 @@
from .Strees_test import Stree_test, Snode_test
from .Strees_test import Stree_test, Snode_test
from .Strees_grapher_test import Stree_grapher_test, Snode_graph_test
__all__ = [
"Stree_test",
"Snode_test",
"Stree_grapher_test",
"Snode_graph_test",
]