Mirror of https://github.com/Doctorado-ML/STree.git, synced 2025-08-17 00:16:07 +00:00

Make project python package friendly

- Add setup.py
- Move classes to module files
- Move tests folder inside module folder
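The setup.py mentioned in the commit message is not shown in this diff. A minimal sketch of what such a packaging script could look like is given below; the package name follows the new stree/ folder, the version and author are taken from stree/__init__.py, and the dependency list is only an assumption drawn from the imports in the new modules:

# Hypothetical sketch of setup.py -- not the file added by this commit
from setuptools import setup, find_packages

setup(
    name="stree",                          # matches the new package folder
    version="0.9rc1",                      # mirrors stree/__init__.py
    author="Ricardo Montañana Gómez",
    license="MIT",
    packages=find_packages(),              # picks up stree and stree.tests
    install_requires=["scikit-learn", "numpy", "matplotlib"],  # assumed from the imports
)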
stree/Strees.py (Normal file, 311 lines)
@@ -0,0 +1,311 @@
'''
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
Uses LinearSVC
'''

import typing
import os

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


class Snode:
    def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
        self._clf = clf
        self._vector = None if clf is None else clf.coef_
        self._interceptor = 0. if clf is None else clf.intercept_
        self._title = title
        self._belief = 0.  # belief of the prediction in a leaf node based on samples
        # Only store the dataset when testing
        self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
        self._y = y
        self._down = None
        self._up = None
        self._class = None

    @classmethod
    def copy(cls, node: 'Snode') -> 'Snode':
        return cls(node._clf, node._X, node._y, node._title)

    def set_down(self, son):
        self._down = son

    def set_up(self, son):
        self._up = son

    def is_leaf(self) -> bool:
        return self._up is None and self._down is None

    def get_down(self) -> 'Snode':
        return self._down

    def get_up(self) -> 'Snode':
        return self._up

    def make_predictor(self):
        """Compute the class of the predictor and its belief based on the
        subdataset of the node, only if it is a leaf
        """
        if not self.is_leaf():
            return
        classes, card = np.unique(self._y, return_counts=True)
        if len(classes) > 1:
            max_card = max(card)
            min_card = min(card)
            try:
                self._belief = max_card / (max_card + min_card)
            except ZeroDivisionError:
                self._belief = 0.
            self._class = classes[card == max_card][0]
        else:
            self._belief = 1
            self._class = classes[0]

    def __str__(self) -> str:
        if self.is_leaf():
            return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
        return f"{self._title}"


class Siterator:
    """Stree preorder iterator
    """

    def __init__(self, tree: Snode):
        self._stack = []
        self._push(tree)

    def __iter__(self):
        return self

    def _push(self, node: Snode):
        if node is not None:
            self._stack.append(node)

    def __next__(self) -> Snode:
        if len(self._stack) == 0:
            raise StopIteration()
        node = self._stack.pop()
        self._push(node.get_up())
        self._push(node.get_down())
        return node


class Stree(BaseEstimator, ClassifierMixin):
    """Oblique tree classifier that splits each node with a LinearSVC hyperplane
    """

    def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0,
                 use_predictions: bool = False):
        self._max_iter = max_iter
        self._C = C
        self._random_state = random_state
        self._tree = None
        self.__folder = 'data/'
        self.__use_predictions = use_predictions
        self.__trained = False
        self.__proba = False

    def get_params(self, deep=True):
        """Get a dict with the hyperparameters and their values, as required by the sklearn API
        """
        return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter}

    def set_params(self, **parameters):
        """Set hyperparameters as specified by sklearn, needed for grid searches
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def _linear_function(self, data: np.array, node: Snode) -> np.array:
        coef = node._vector[0, :].reshape(-1, data.shape[1])
        return data.dot(coef.T) + node._interceptor[0]

    def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
        if self.__use_predictions:
            yp = node._clf.predict(data)
            down = (yp == 1).reshape(-1, 1)
            res = np.expand_dims(node._clf.decision_function(data), 1)
        else:
            # doesn't work with multiclass, as each sample has to compute the inner
            # product with its own coefficients
            # computes the position of every sample w.r.t. the hyperplane
            res = self._linear_function(data, node)
            down = res > 0
        up = ~down
        data_down = data[down[:, 0]] if any(down) else None
        indices_down = indices[down[:, 0]] if any(down) else None
        res_down = res[down[:, 0]] if any(down) else None
        data_up = data[up[:, 0]] if any(up) else None
        indices_up = indices[up[:, 0]] if any(up) else None
        res_up = res[up[:, 0]] if any(up) else None
        return [data_up, indices_up, data_down, indices_down, res_up, res_down]

    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
        X, y = check_X_y(X, y.ravel())
        self.n_features_in_ = X.shape[1]
        self._tree = self.train(X, y.ravel(), title)
        self._build_predictor()
        self.__trained = True
        return self

    def _build_predictor(self):
        """Process the leaves to make them predictors
        """

        def run_tree(node: Snode):
            if node.is_leaf():
                node.make_predictor()
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self._tree)

    def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
            return Snode(None, X, y, title + ', <pure>')
        # Train the model
        clf = LinearSVC(max_iter=self._max_iter, C=self._C,
                        random_state=self._random_state)
        clf.fit(X, y)
        tree = Snode(clf, X, y, title)
        X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
        if X_U is None or X_D is None:
            # the split didn't separate anything
            return Snode(clf, X, y, title + ', <cgaf>')
        tree.set_up(self.train(X_U, y_u, title + ' - Up'))
        tree.set_down(self.train(X_D, y_d, title + ' - Down'))
        return tree

    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
        indices = indices.astype(int)
        for i, index in enumerate(indices):
            y_ordered[index] = y[i]
        return y_ordered

    def predict(self, X: np.array) -> np.array:
        def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                return prediction, indices
            u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
            k, l = predict_class(d, i_d, node.get_down())
            m, n = predict_class(u, i_u, node.get_up())
            return np.append(k, m), np.append(l, n)

        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        return self._reorder_results(*predict_class(X, indices, self._tree))

    def predict_proba(self, X: np.array) -> np.array:
        """Computes an approximation of the probability of samples belonging to class 1
        (nothing more, nothing less)

        :param X: dataset
        :type X: np.array
        """

        def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
            """Run the tree to compute predictions

            :param xp: subdataset of samples
            :type xp: np.array
            :param indices: indices of subdataset samples to rebuild original order
            :type indices: np.array
            :param dist: distances of every sample to the hyperplane of the parent node
            :type dist: np.array
            :param node: node of the leaf with the class
            :type node: Snode
            :return: array of labels and distances, array of indices
            :rtype: np.array
            """
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                prediction_proba = dist
                return np.append(prediction, prediction_proba, axis=1), indices
            u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
            k, l = predict_class(d, i_d, r_d, node.get_down())
            m, n = predict_class(u, i_u, r_u, node.get_up())
            return np.append(k, m), np.append(l, n)

        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        result, indices = predict_class(X, indices, [], self._tree)
        result = result.reshape(X.shape[0], 2)
        # Turn the distances to the hyperplane into probabilities by feeding the
        # distance of each sample to the hyperplane that classified it into the sigmoid
        result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
        return self._reorder_results(result, indices)

    def score(self, X: np.array, y: np.array) -> float:
        """Return accuracy
        """
        if not self.__trained:
            self.fit(X, y)
        yp = self.predict(X).reshape(y.shape)
        right = (yp == y).astype(int)
        return np.sum(right) / len(y)

    def __iter__(self):
        return Siterator(self._tree)

    def __str__(self) -> str:
        output = ''
        for i in self:
            output += str(i) + '\n'
        return output

    def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
        """Save the dataset of the node in a csv file

        :param tree: node with data to save
        :type tree: Snode
        :param catalog: catalog file handler
        :type catalog: typing.TextIO
        :param number: sequential number for the generated file name
        :type number: int
        """
        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
        name = f"{self.__folder}dataset{number}.csv"
        np.savetxt(name, data, delimiter=",")
        catalog.write(f"{name}, - {str(tree)}")
        if tree.is_leaf():
            return
        self._save_datasets(tree.get_down(), catalog, number + 1)
        self._save_datasets(tree.get_up(), catalog, number + 2)

    def get_catalog_name(self):
        return self.__folder + "catalog.txt"

    def save_sub_datasets(self):
        """Save every dataset stored in the tree so it can be checked with a manual classifier
        """
        if not os.path.isdir(self.__folder):
            os.mkdir(self.__folder)
        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
            self._save_datasets(self._tree, catalog, 1)
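A minimal usage sketch of the Stree classifier defined above; the synthetic dataset mirrors the one used in the test suite and the printed output is illustrative, not a guaranteed result:

# Sketch: train an oblique tree and inspect it
from sklearn.datasets import make_classification
from stree import Stree

X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, random_state=1)
clf = Stree(C=1.0, max_iter=1000, random_state=1, use_predictions=False)
clf.fit(X, y)
print(clf)                         # preorder dump of the tree, one Snode per line
print(clf.score(X, y))             # accuracy on the training set
proba = clf.predict_proba(X[:5])   # column 0: label, column 1: sigmoid of the distance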

stree/Strees_grapher.py (Normal file, 182 lines)
@@ -0,0 +1,182 @@
'''
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Plot 3D views of nodes in Stree
'''

import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

from .Strees import Stree, Snode, Siterator


class Snode_graph(Snode):

    def __init__(self, node: Snode):
        self._plot_size = (8, 8)
        self._xlimits = (None, None)
        self._ylimits = (None, None)
        self._zlimits = (None, None)
        n = Snode.copy(node)
        super().__init__(n._clf, n._X, n._y, n._title)

    def set_plot_size(self, size: tuple):
        self._plot_size = size

    def _is_pure(self) -> bool:
        """A leaf node with a single label is considered pure
        """
        if self.is_leaf():
            return self._belief == 1.
        return False

    def set_axis_limits(self, limits: tuple):
        self._xlimits = limits[0]
        self._ylimits = limits[1]
        self._zlimits = limits[2]

    def _set_graphics_axis(self, ax: Axes3D):
        ax.set_xlim(self._xlimits)
        ax.set_ylim(self._ylimits)
        ax.set_zlim(self._zlimits)

    def save_hyperplane(self, save_folder: str = './', save_prefix: str = '', save_seq: int = 1):
        _, fig = self.plot_hyperplane()
        name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
        fig.savefig(name, bbox_inches='tight')
        plt.close(fig)

    def _get_cmap(self):
        cmap = 'jet'
        if self._is_pure():
            if self._class == 1:
                cmap = 'jet_r'
        return cmap

    def _graph_title(self):
        n_class, card = np.unique(self._y, return_counts=True)
        return f"{self._title} {n_class} {card}"

    def plot_hyperplane(self, plot_distribution: bool = True):
        fig = plt.figure(figsize=self._plot_size)
        ax = fig.add_subplot(1, 1, 1, projection='3d')
        if not self._is_pure():
            # A pure leaf has no classifier, so its hyperplane can't be plotted
            # get the splitting hyperplane
            def hyperplane(x, y): return (-self._interceptor - self._vector[0][0] * x
                                          - self._vector[0][1] * y) / self._vector[0][2]

            tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
            tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
            xx, yy = np.meshgrid(tmpx, tmpy)
            ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5, antialiased=True,
                            rstride=1, cstride=1, cmap='seismic')
            self._set_graphics_axis(ax)
        if plot_distribution:
            self.plot_distribution(ax)
        else:
            plt.title(self._graph_title())
            plt.show()
        return ax, fig

    def plot_distribution(self, ax: Axes3D = None):
        if ax is None:
            fig = plt.figure(figsize=self._plot_size)
            ax = fig.add_subplot(1, 1, 1, projection='3d')
        plt.title(self._graph_title())
        cmap = self._get_cmap()
        ax.scatter(self._X[:, 0], self._X[:, 1],
                   self._X[:, 2], c=self._y, cmap=cmap)
        ax.set_xlabel('X0')
        ax.set_ylabel('X1')
        ax.set_zlabel('X2')
        plt.show()


class Stree_grapher(Stree):
    """Build 3D graphs of any dataset; if it has more than 3 features, PCA
    reduces it to 3 components first
    """

    def __init__(self, params: dict):
        self._plot_size = (8, 8)
        self._tree_gr = None
        # make Snode store X's
        os.environ['TESTING'] = '1'
        self._fitted = False
        self._pca = None
        super().__init__(**params)

    def __del__(self):
        try:
            os.environ.pop('TESTING')
        except KeyError:
            pass
        plt.close('all')

    def _copy_tree(self, node: Snode) -> Snode_graph:
        mirror = Snode_graph(node)
        # clone node
        mirror._class = node._class
        mirror._belief = node._belief
        if node.get_down() is not None:
            mirror.set_down(self._copy_tree(node.get_down()))
        if node.get_up() is not None:
            mirror.set_up(self._copy_tree(node.get_up()))
        return mirror

    def fit(self, X: np.array, y: np.array) -> Stree:
        """Fit the Stree and copy the tree in a Snode_graph tree

        :param X: Dataset
        :type X: np.array
        :param y: Labels
        :type y: np.array
        :return: Stree model
        :rtype: Stree
        """
        if X.shape[1] != 3:
            self._pca = PCA(n_components=3)
            X = self._pca.fit_transform(X)
        res = super().fit(X, y)
        self._tree_gr = self._copy_tree(self._tree)
        self._fitted = True
        return res

    def score(self, X: np.array, y: np.array) -> float:
        self._check_fitted()
        if X.shape[1] != 3:
            X = self._pca.transform(X)
        return super().score(X, y)

    def _check_fitted(self):
        if not self._fitted:
            raise Exception('Have to fit the grapher first!')

    def save_all(self, save_folder: str = './', save_prefix: str = ''):
        """Save all the node plots in png format, each with a sequence number

        :param save_folder: folder where the plots are saved, defaults to './'
        :type save_folder: str, optional
        :param save_prefix: prefix prepended to every file name, defaults to ''
        :type save_prefix: str, optional
        """
        self._check_fitted()
        seq = 1
        for node in self:
            node.save_hyperplane(save_folder=save_folder,
                                 save_prefix=save_prefix, save_seq=seq)
            seq += 1

    def plot_all(self):
        """Plots all the nodes
        """
        self._check_fitted()
        for node in self:
            node.plot_hyperplane()

    def __iter__(self):
        return Siterator(self._tree_gr)
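A short sketch of how the grapher above could be driven; the dataset, the parameters and the output folder are illustrative, and the folder is assumed to exist:

# Sketch: fit the grapher and export one PNG per node
from sklearn.datasets import make_classification
from stree import Stree_grapher

X, y = make_classification(n_samples=500, n_features=4, n_informative=4,
                           n_redundant=0, random_state=1)
gr = Stree_grapher(dict(C=1.0, max_iter=1000, random_state=1))
gr.fit(X, y)       # more than 3 features, so PCA reduces X to 3 components
gr.save_all(save_folder='./plots/', save_prefix='demo_')  # writes demo_STnode1.png, ...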

stree/__init__.py (Normal file, 4 lines)
@@ -0,0 +1,4 @@
__version__ = "0.9rc1"
__author__ = "Ricardo Montañana Gómez"
from .Strees import Stree, Snode, Siterator
from .Strees_grapher import Stree_grapher, Snode_graph

stree/tests/Strees_test.py (Normal file, 313 lines)
@@ -0,0 +1,313 @@
import csv
import os
import unittest

import numpy as np
from sklearn.datasets import make_classification

from stree import Stree, Snode


class Stree_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=False)
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        try:
            os.environ.pop('TESTING')
        except KeyError:
            pass

    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2,
                                   n_clusters_per_class=2, class_sep=1.5, flip_y=0,
                                   weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def _check_tree(self, node: Snode):
        """Check recursively that the nodes that are not leaves have the correct
        number of labels and that their sons have the right number of elements
        in their dataset

        Arguments:
            node {Snode} -- node to check
        """
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is the partition correct in terms of cardinality?
        # i.e. the partition algorithm didn't forget any sample
        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
        unique_y, count_y = np.unique(node._y, return_counts=True)
        _, count_d = np.unique(y_down, return_counts=True)
        _, count_u = np.unique(y_up, return_counts=True)
        #
        for i in unique_y:
            try:
                number_down = count_d[i]
            except IndexError:
                number_down = 0
            try:
                number_up = count_u[i]
            except IndexError:
                number_up = 0
            self.assertEqual(count_y[i], number_down + number_up)
        # Is the partition the same as the prediction?
        # as the node is not a leaf...
        _, count_yp = np.unique(y_prediction, return_counts=True)
        self.assertEqual(count_yp[0], y_up.shape[0])
        self.assertEqual(count_yp[1], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

    def test_build_tree(self):
        """Check that the tree is built consistently with the node classifiers' predictions
        """
        self._check_tree(self._clf._tree)

    def _get_file_data(self, file_name: str) -> tuple:
        """Return X, y from data; y is the last column of the array

        Arguments:
            file_name {str} -- the file name

        Returns:
            tuple -- tuple with samples, categories
        """
        data = np.genfromtxt(file_name, delimiter=',')
        data = np.array(data)
        column_y = data.shape[1] - 1
        fy = data[:, column_y]
        fx = np.delete(data, column_y, axis=1)
        return fx, fy

    def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
        """Find the original values of y for a given array of samples

        Arguments:
            px {np.array} -- array of samples to search for
            x_original {np.array} -- original dataset
            y_original {np.array} -- original classes

        Returns:
            list -- classes of the given samples
        """
        res = []
        for needle in px:
            for row in range(x_original.shape[0]):
                if all(x_original[row, :] == needle):
                    res.append(y_original[row])
        return res

    def test_subdatasets(self):
        """Check that the sub-dataset files have the same labels as the original dataset
        """
        self._clf.save_sub_datasets()
        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
            for row in catalog:
                X, y = self._get_Xy()
                x_file, y_file = self._get_file_data(row[0])
                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
                self.assertTrue(np.array_equal(y_file, y_original))

    def test_single_prediction(self):
        X, y = self._get_Xy()
        yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
        self.assertEqual(yp[0], y[0])

    def test_multiple_prediction(self):
        # For the first 27 elements the predictions match the ground truth
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp.tolist())

    def test_score(self):
        X, y = self._get_Xy()
        accuracy_score = self._clf.score(X, y)
        yp = self._clf.predict(X)
        right = (yp == y).astype(int)
        accuracy_computed = sum(right) / len(y)
        self.assertEqual(accuracy_score, accuracy_computed)
        self.assertGreater(accuracy_score, 0.8)

    def test_single_predict_proba(self):
        """Check that element 28 has a prediction different from the true label
        """
        # Element 28 has a different prediction than the truth
        decimals = 5
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
        self.assertEqual(0, yp[0:, 0])
        self.assertEqual(1, y[28])
        self.assertAlmostEqual(
            round(0.29026400766, decimals),
            round(yp[0, 1], decimals),
            decimals
        )

    def test_multiple_predict_proba(self):
        # For the first 27 elements the predictions match the ground truth
        num = 27
        decimals = 5
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
        expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
                          0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
                          0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
                          0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
                          0.29788483, 0.01657364, 0.81149083]
        expected = np.round(expected_proba, decimals=decimals).tolist()
        computed = np.round(yp[:, 1], decimals=decimals).tolist()
        for i in range(len(expected)):
            self.assertAlmostEqual(expected[i], computed[i], decimals)

    def build_models(self):
        """Build and train two models: model_clf will use the sklearn classifier to
        compute predictions and split data, model_computed will use the vector of
        coefficients to compute both the predictions and the split data
        """
        model_clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        model_computed = Stree(random_state=self._random_state,
                               use_predictions=False)
        X, y = self._get_Xy()
        model_clf.fit(X, y)
        model_computed.fit(X, y)
        return model_clf, model_computed, X, y

    def test_use_model_predict(self):
        """Check that we get the same results whether we use the estimator in the nodes
        to compute the labels or we use the hyperplane and the position of the samples
        with respect to it
        """
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict(X).tolist(),
            use_math.predict(X).tolist()
        )

    def test_use_model_score(self):
        use_clf, use_math, X, y = self.build_models()
        b = use_math.score(X, y)
        self.assertEqual(
            use_clf.score(X, y),
            b
        )
        self.assertGreater(b, .95)

    def test_use_model_predict_proba(self):
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict_proba(X).tolist(),
            use_math.predict_proba(X).tolist()
        )

    def test_single_vs_multiple_prediction(self):
        """Check that predicting sample by sample gives the same result as predicting
        all samples at once
        """
        X, _ = self._get_Xy()
        # Compute prediction line by line
        yp_line = np.array([], dtype=int)
        for xp in X:
            yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
        # Compute prediction at once
        yp_once = self._clf.predict(X)
        #
        self.assertListEqual(yp_line.tolist(), yp_once.tolist())

    def test_iterator(self):
        """Check the preorder iterator
        """
        expected = [
            'root',
            'root - Down',
            'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
            'root - Down - Up',
            'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
            'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
            'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
        ]
        computed = []
        for node in self._clf:
            computed.append(str(node))
        self.assertListEqual(expected, computed)

class Snode_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        try:
            os.environ.pop('TESTING')
        except KeyError:
            pass

    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2,
                                   n_clusters_per_class=2, class_sep=1.5, flip_y=0,
                                   weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def test_attributes_in_leaves(self):
        """Check that the attributes in the leaves have correct values, so they form a predictor
        """

        def check_leave(node: Snode):
            if not node.is_leaf():
                check_leave(node.get_down())
                check_leave(node.get_up())
                return
            # Check the belief in the leaf
            classes, card = np.unique(node._y, return_counts=True)
            max_card = max(card)
            min_card = min(card)
            if len(classes) > 1:
                try:
                    belief = max_card / (max_card + min_card)
                except ZeroDivisionError:
                    belief = 0.
            else:
                belief = 1
            self.assertEqual(belief, node._belief)
            # Check the class
            class_computed = classes[card == max_card]
            self.assertEqual(class_computed, node._class)

        check_leave(self._clf._tree)

    def test_nodes_coefs(self):
        """Check that the nodes of the tree have the right attributes filled
        """

        def run_tree(node: Snode):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
                self.assertIsNotNone(node._vector)
                self.assertIsNotNone(node._interceptor)
            if node.is_leaf():
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self._clf._tree)

stree/tests/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
from .Strees_test import Stree_test, Snode_test
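With the tests now living inside the module folder, the suite can be run with the standard unittest runner; a sketch, assuming it is executed from the repository root:

# Sketch: discover and run the package tests programmatically
import unittest

suite = unittest.defaultTestLoader.discover('stree/tests', pattern='*_test.py')
unittest.TextTestRunner(verbosity=2).run(suite)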