import csv
import os
import unittest

import numpy as np
from sklearn.datasets import make_classification

from trees.Stree import Stree, Snode


class Stree_test(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=False)
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        try:
            os.environ.pop('TESTING')
        except KeyError:
            pass

    def _get_Xy(self):
        X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                   n_redundant=0, n_repeated=0, n_classes=2,
                                   n_clusters_per_class=2, class_sep=1.5, flip_y=0,
                                   weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def _check_tree(self, node: Snode):
        """Check recursively that the nodes that are not leaves have the correct
        number of labels and that their children hold the right number of samples

        Arguments:
            node {Snode} -- node to check
        """
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is it a correct partition in terms of cardinality?
        # i.e. the partition algorithm didn't lose any sample
        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
        unique_y, count_y = np.unique(node._y, return_counts=True)
        # Build label -> count maps so counts are looked up by label, not by position
        labels_d, count_d = np.unique(y_down, return_counts=True)
        labels_u, count_u = np.unique(y_up, return_counts=True)
        dict_d = dict(zip(labels_d, count_d))
        dict_u = dict(zip(labels_u, count_u))
        # Every label of the node must be split between both children without loss
        for label, count in zip(unique_y, count_y):
            number_down = dict_d.get(label, 0)
            number_up = dict_u.get(label, 0)
            self.assertEqual(count, number_down + number_up)
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
        _, count_yp = np.unique(y_prediction, return_counts=True)
        self.assertEqual(count_yp[0], y_up.shape[0])
        self.assertEqual(count_yp[1], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

    def test_build_tree(self):
        """Check that the tree is built consistently with the predictions of the
        node models
        """
        self._check_tree(self._clf._tree)

    def _get_file_data(self, file_name: str) -> tuple:
        """Return X, y from a csv data file; y is the last column of the array

        Arguments:
            file_name {str} -- the file name

        Returns:
            tuple -- tuple with samples, labels
        """
        data = np.genfromtxt(file_name, delimiter=',')
        data = np.array(data)
        column_y = data.shape[1] - 1
        fy = data[:, column_y]
        fx = np.delete(data, column_y, axis=1)
        return fx, fy

    def _find_out(self, px: np.array, x_original: np.array, y_original: np.array) -> list:
        """Find the original values of y for a given array of samples

        Arguments:
            px {np.array} -- array of samples to search for
            x_original {np.array} -- original dataset
            y_original {np.array} -- original labels

        Returns:
            list -- labels of the given samples
        """
        res = []
        for needle in px:
            for row in range(x_original.shape[0]):
                if all(x_original[row, :] == needle):
                    res.append(y_original[row])
        return res

    def test_subdatasets(self):
        """Check that the subdataset files have the same labels as the original dataset
        """
        self._clf.save_sub_datasets()
        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
            for row in catalog:
                X, y = self._get_Xy()
                x_file, y_file = self._get_file_data(row[0])
                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
                self.assertTrue(np.array_equal(y_file, y_original))

    def test_single_prediction(self):
        X, y = self._get_Xy()
        yp = self._clf.predict(X[0, :].reshape(-1, X.shape[1]))
        self.assertEqual(yp[0], y[0])

    def test_multiple_prediction(self):
        # For the first 27 elements the predictions match the true labels
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp.tolist())

    def test_score(self):
        X, y = self._get_Xy()
        accuracy_score = self._clf.score(X, y)
        yp = self._clf.predict(X)
        right = (yp == y).astype(int)
        accuracy_computed = sum(right) / len(y)
        self.assertEqual(accuracy_score, accuracy_computed)
        self.assertGreater(accuracy_score, 0.8)

    def test_single_predict_proba(self):
        """Check that element 28 gets a prediction different from its true label
        """
        # Element 28 has a different prediction than the truth
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
        self.assertEqual(0, yp[0, 0])
        self.assertEqual(1, y[28])
        self.assertEqual(0.29026400766, round(yp[0, 1], 11))

    def test_multiple_predict_proba(self):
        # For the first 27 elements the predictions match the true labels
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
        expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
                          0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
                          0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
                          0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
                          0.29788483, 0.01657364, 0.81149083]
        self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())

    def build_models(self):
        """Build and train two models: model_clf uses the sklearn classifier to
        compute predictions and split data, while model_computed uses the vector of
        coefficients to compute both the predictions and the split data
        """
        model_clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        model_computed = Stree(random_state=self._random_state,
                               use_predictions=False)
        X, y = self._get_Xy()
        model_clf.fit(X, y)
        model_computed.fit(X, y)
        return model_clf, model_computed, X, y

    def test_use_model_predict(self):
        """Check that we get the same results whether we use the estimator in the nodes
        to compute the labels or we use the hyperplane and the position of the samples
        with respect to it
        """
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict(X).tolist(),
            use_math.predict(X).tolist()
        )

    def test_use_model_score(self):
        use_clf, use_math, X, y = self.build_models()
        b = use_math.score(X, y)
        self.assertEqual(
            use_clf.score(X, y),
            b
        )
        self.assertGreater(b, .95)

    def test_use_model_predict_proba(self):
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict_proba(X).tolist(),
            use_math.predict_proba(X).tolist()
        )

    def test_single_vs_multiple_prediction(self):
        """Check if predicting sample by sample gives the same result as predicting
        all samples at once
        """
        X, _ = self._get_Xy()
        # Compute the prediction line by line
        yp_line = np.array([], dtype=int)
        for xp in X:
            yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
        # Compute the prediction at once
        yp_once = self._clf.predict(X)
        self.assertListEqual(yp_line.tolist(), yp_once.tolist())

    def test_iterator(self):
        """Check the preorder iterator
        """
        expected = [
            'root',
            'root - Down',
            'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
            'root - Down - Up',
            'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
            'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
            'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
        ]
        computed = []
        for node in self._clf:
            computed.append(str(node))
        self.assertListEqual(expected, computed)
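

# Minimal entry point so this module can also be run directly, e.g. `python Stree_test.py`
# (standard unittest convention; `python -m unittest` discovery works as well).
if __name__ == "__main__":
    unittest.main()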