Implement predict & predict_proba optimization

Reduces prediction time by two orders of magnitude on the creditcard dataset.
2020-05-15 23:35:33 +02:00
parent e56b955b92
commit 80b5cf8e72
6 changed files with 129 additions and 59 deletions
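The summary above claims roughly a hundredfold speedup. A minimal timing sketch in the spirit of the new test_single_vs_multiple_prediction test below, comparing per-sample calls against one batched call; the import path, the synthetic dataset (standing in for the creditcard data) and the parameters are assumptions for illustration, not part of the commit:

```python
# Timing sketch (assumptions: `from stree import Stree` is the import path,
# and a synthetic dataset stands in for the creditcard data).
import time

import numpy as np
from sklearn.datasets import make_classification
from stree import Stree

X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
                           n_redundant=0, random_state=1)
clf = Stree(random_state=1)
clf.fit(X, y)

# Per-sample prediction: one tree traversal per call.
start = time.perf_counter()
yp_line = np.array([clf.predict(x.reshape(1, -1))[0] for x in X])
t_line = time.perf_counter() - start

# Batched prediction: the optimized path handles all samples at once.
start = time.perf_counter()
yp_once = clf.predict(X)
t_once = time.perf_counter() - start

assert (yp_line == yp_once).all()
print(f"per sample: {t_line:.3f}s  batched: {t_once:.3f}s")
```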


@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
return X, y
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct
number of labels and that their children have the right number of elements in their datasets
Arguments:
node {Snode} -- node to check
"""
if node.is_leaf():
return
y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
unique_y, count_y = np.unique(node._y, return_counts=True)
_, count_d = np.unique(y_down, return_counts=True)
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
# as the node is not a leaf...
unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_down.shape[0])
self.assertEqual(count_yp[0], y_up.shape[0])
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_up.shape[0])
self.assertEqual(count_yp[0], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same predictions as the tree itself
"""Check if the subdatasets files have the same labels as the original dataset
"""
model = self._clf._tree._clf
X, y = self._get_Xy()
model.fit(X, y)
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label
"""
# Element 28 has a different prediction than the truth
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0])
self.assertEqual(1, y[28])
self.assertEqual(0.9282970550576184, yp[0:, 1])
def test_multiple_predict_proba(self):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
yp = self._clf.predict_proba(X[:num,:])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
0.92829706, 0.92829706, 0.9759887 ]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
def test_use_model_predictions(self):
"""Check that we get the same results wether we use the estimator in nodes
to compute labes or we use the hyperplane and the position of samples wrt to it
"""
model_predictions = Stree(random_state=self._random_state,
use_predictions=True)
model_hyperplane = Stree(random_state=self._random_state,
use_predictions=False)
X, y = self._get_Xy()
model_predictions.fit(X, y)
model_hyperplane.fit(X, y)
self.assertListEqual(
model_predictions.predict(X).tolist(),
model_hyperplane.predict(X).tolist()
)
a = model_predictions.score(X, y, print_out=False)
b = model_hyperplane.score(X, y, print_out=False)
self.assertEqual(a, b)
self.assertGreater(b, .95)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting
all samples at once
"""
X, _ = self._get_Xy()
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
# Compute prediction at once
yp_once = self._clf.predict(X)
#
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
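For context on what the predict optimization typically amounts to, a generic sketch of batched tree prediction that routes index arrays through the tree instead of traversing it once per sample. Only is_leaf, get_down, get_up and _clf appear in the diff above; the leaf label attribute _class and the helper names are assumptions, and the "class 1 goes up, class 0 goes down" routing is inferred from the changed assertions in _check_tree.

```python
# Generic sketch of batched prediction over a binary oblique tree; not
# necessarily the exact implementation introduced by this commit.
import numpy as np

def _predict_batch(node, X, indices, out):
    """Fill out[indices] with labels for the samples X[indices] routed to node."""
    if indices.shape[0] == 0:
        return
    if node.is_leaf():
        out[indices] = node._class  # assumed attribute: label stored at the leaf
        return
    # The node's linear classifier decides which child each sample goes to:
    # class 1 goes up, class 0 goes down (as the updated _check_tree asserts).
    split = node._clf.predict(X[indices])
    _predict_batch(node.get_up(), X, indices[split == 1], out)
    _predict_batch(node.get_down(), X, indices[split == 0], out)

def predict_batched(root, X):
    out = np.empty(X.shape[0], dtype=int)
    _predict_batch(root, X, np.arange(X.shape[0]), out)
    return out
```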