mirror of https://github.com/Doctorado-ML/STree.git (synced 2025-08-16 07:56:06 +00:00)
Implement predict & predict_proba optimization
Reduces time by two orders of magnitude on the creditcard dataset.
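The commit message claims a two-orders-of-magnitude speedup for predict and predict_proba; the tests in this diff only check that per-sample and batched predictions agree. Purely as a hedged illustration of the kind of batched traversal that could produce such a speedup (not the code in this commit), a sketch follows. Snode's is_leaf(), get_down(), get_up() and _clf appear in the tests below; node._class and the routing of positive predictions to the "down" branch are assumptions.

import numpy as np

def predict_in_batch(node, X, indices):
    # Illustration only: classify all samples that reach this node with a
    # single classifier call, instead of walking the tree once per sample.
    if indices.shape[0] == 0:
        return indices, np.empty(0, dtype=int)
    if node.is_leaf():
        # Assumption: a leaf can report the single label it predicts
        # (node._class is a hypothetical attribute, not confirmed by the diff).
        return indices, np.full(indices.shape[0], node._class, dtype=int)
    decision = node._clf.predict(X[indices])   # one vectorized call per node
    # Assumption: samples predicted as 1 follow the "down" branch.
    idx_d, yp_d = predict_in_batch(node.get_down(), X, indices[decision == 1])
    idx_u, yp_u = predict_in_batch(node.get_up(), X, indices[decision == 0])
    return np.concatenate((idx_d, idx_u)), np.concatenate((yp_d, yp_u))

def batched_predict(root, X):
    indices, yp = predict_in_batch(root, X, np.arange(X.shape[0]))
    return yp[np.argsort(indices)]             # restore the original sample order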
@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
             os.environ.pop('TESTING')
         except:
             pass
 
     def _get_Xy(self):
         X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                                    n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
         return X, y
 
     def _check_tree(self, node: Snode):
+        """Check recursively that the nodes that are not leaves have the correct
+        number of labels and their sons have the right number of elements in their dataset
+
+        Arguments:
+            node {Snode} -- node to check
+        """
         if node.is_leaf():
             return
         y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
         unique_y, count_y = np.unique(node._y, return_counts=True)
         _, count_d = np.unique(y_down, return_counts=True)
         _, count_u = np.unique(y_up, return_counts=True)
         #
         for i in unique_y:
             try:
                 number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
             self.assertEqual(count_y[i], number_down + number_up)
         # Is the partition made the same as the prediction?
         # as the node is not a leaf...
-        unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
-        self.assertEqual(count_yp[1], y_down.shape[0])
-        self.assertEqual(count_yp[0], y_up.shape[0])
+        _, count_yp = np.unique(y_prediction, return_counts=True)
+        self.assertEqual(count_yp[1], y_up.shape[0])
+        self.assertEqual(count_yp[0], y_down.shape[0])
         self._check_tree(node.get_down())
         self._check_tree(node.get_up())
 
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
         return res
 
     def test_subdatasets(self):
-        """Check if the subdatasets files have the same predictions as the tree itself
+        """Check if the subdatasets files have the same labels as the original dataset
         """
-        model = self._clf._tree._clf
-        X, y = self._get_Xy()
-        model.fit(X, y)
         self._clf.save_sub_datasets()
         with open(self._clf.get_catalog_name()) as cat_file:
             catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
         right = (yp == y).astype(int)
         accuracy_computed = sum(right) / len(y)
         self.assertEqual(accuracy_score, accuracy_computed)
         self.assertGreater(accuracy_score, 0.8)
 
     def test_single_predict_proba(self):
         """Check that element 28 has a prediction different from its current label
         """
         # Element 28 has a different prediction than the truth
         X, y = self._get_Xy()
         yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
         self.assertEqual(0, yp[0:, 0])
         self.assertEqual(1, y[28])
         self.assertEqual(0.9282970550576184, yp[0:, 1])
 
     def test_multiple_predict_proba(self):
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = self._get_Xy()
-        yp = self._clf.predict_proba(X[:num, :])
+        yp = self._clf.predict_proba(X[:num,:])
         self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
         expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
                           0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
                           0.92829706, 0.92829706, 0.9759887]
         self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
 
+    def test_use_model_predictions(self):
+        """Check that we get the same results whether we use the estimator in the nodes
+        to compute labels or we use the hyperplane and the position of samples with respect to it
+        """
+        model_predictions = Stree(random_state=self._random_state,
+                                  use_predictions=True)
+        model_hyperplane = Stree(random_state=self._random_state,
+                                 use_predictions=False)
+        X, y = self._get_Xy()
+        model_predictions.fit(X, y)
+        model_hyperplane.fit(X, y)
+        self.assertListEqual(
+            model_predictions.predict(X).tolist(),
+            model_hyperplane.predict(X).tolist()
+        )
+        a = model_predictions.score(X, y, print_out=False)
+        b = model_hyperplane.score(X, y, print_out=False)
+        self.assertEqual(a, b)
+        self.assertGreater(b, .95)
+
+    def test_single_vs_multiple_prediction(self):
+        """Check if predicting sample by sample gives the same result as predicting
+        all samples at once
+        """
+        X, _ = self._get_Xy()
+        # Compute predictions sample by sample
+        yp_line = np.array([], dtype=int)
+        for xp in X:
+            yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
+        # Compute predictions for all samples at once
+        yp_once = self._clf.predict(X)
+        #
+        self.assertListEqual(yp_line.tolist(), yp_once.tolist())
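For reference, a minimal usage sketch of the API these tests exercise follows; the import path and any make_classification arguments not visible in the diff are assumptions.

from sklearn.datasets import make_classification
from stree import Stree   # assumption: the actual module path may differ

# Same shape of data the tests build in _get_Xy (random_state here is assumed)
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, n_repeated=0, n_classes=2,
                           n_clusters_per_class=2, random_state=1)

clf = Stree(random_state=1, use_predictions=False)   # parameters seen in the tests
clf.fit(X, y)
yp = clf.predict(X)                       # batched prediction
proba = clf.predict_proba(X[:27, :])      # tests read column 0 as label, column 1 as probability
accuracy = clf.score(X, y, print_out=False)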