From 80b5cf8e722a0fc648624fbab69cd686847f906d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Fri, 15 May 2020 23:35:33 +0200 Subject: [PATCH] Implement predict & predict_proba optimization reduces time in two orders of magnitude in creditcard dataset --- README.md | 2 +- main.py | 3 +- test.ipynb | 41 ++++++++++++++++--------- tests/Stree_test.py | 63 ++++++++++++++++++++++++++++++++------ trees/Snode.py | 6 ++-- trees/Stree.py | 73 ++++++++++++++++++++++++++------------------- 6 files changed, 129 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 3bbea76..713d726 100644 --- a/README.md +++ b/README.md @@ -10,5 +10,5 @@ python main.py ## Tests ```python -python -m unittest tests.Stree_test tests.Snode_test +python -m unittest -v tests.Stree_test tests.Snode_test ``` diff --git a/main.py b/main.py index 9179ccb..ad7598d 100644 --- a/main.py +++ b/main.py @@ -33,8 +33,9 @@ def load_creditcard(n_examples=0): print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1]))) print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0]))) return X, y -X, y = load_creditcard(-5000) +#X, y = load_creditcard(-5000) #X, y = load_creditcard() +X, y = load_creditcard() clf = Stree(C=.01, max_iter=100, random_state=random_state) clf.fit(X, y) diff --git a/test.ipynb b/test.ipynb index 0cb291e..beee72b 100644 --- a/test.ipynb +++ b/test.ipynb @@ -19,13 +19,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n" - } - ], + "outputs": [], "source": [ "def load_creditcard(n_examples=0):\n", " df = pd.read_csv('data/creditcard.csv')\n", @@ -61,6 +55,7 @@ "\n", "#X, y = load_wine(return_X_y=True)\n", "#X, y = load_iris(return_X_y=True)\n", + 
"#y[y==2]=0\n", "\n", "X, y = load_creditcard()" ] @@ -73,7 +68,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n" + "text": "root\nroot - Down\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n60.9873 secs\n" } ], "source": [ @@ -92,7 +87,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Accuracy: 0.999512\n33.1651 secs\n" + "text": "Accuracy: 0.999512\n0.3226 secs\n" } ], "source": [ @@ -109,7 +104,7 @@ { "output_type": "stream", "name": "stdout", - "text": "(284807, 2)\n87.5212 secs\n" + "text": "(284807, 2)\n0.4148 secs\n" } ], 
"source": [ @@ -119,6 +114,15 @@ "print(f\"{time.time() - t:.4f} secs\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# outcomes without optimization executing predict_proba. 87 seconds\n", + "(284807, 2)\n", + "87.5212 secs" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -127,7 +131,7 @@ { "output_type": "stream", "name": "stdout", - "text": "0.9991397683343457\n12.6601 secs\n" + "text": "0.9991397683343457\n20.9481 secs\n" } ], "source": [ @@ -146,7 +150,7 @@ { "output_type": "stream", "name": "stdout", - "text": "1.0\n18.2638 secs\n" + "text": "1.0\n32.2779 secs\n" } ], "source": [ @@ -156,6 +160,15 @@ "print(clf3.score(X, y))\n", "print(f\"{time.time() - t:.4f} secs\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from sklearn.utils.estimator_checks import check_estimator\n", + "clf = Stree()\n", + "check_estimator(clf)" + ] } ], "metadata": { @@ -173,8 +186,8 @@ }, "orig_nbformat": 2, "kernelspec": { - "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", - "display_name": "Python 3.7.6 64-bit ('general': venv)" + "name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee", + "display_name": "Python 3.7.6 64-bit ('stree': venv)" } }, "nbformat": 4, diff --git a/tests/Stree_test.py b/tests/Stree_test.py index 60db265..c89d7f5 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase): os.environ.pop('TESTING') except: pass - + def _get_Xy(self): X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, @@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase): return X, y def _check_tree(self, node: Snode): + """Check recursively that the nodes that are not leaves have the correct + number of labels and its sons have the right number of elements in their dataset + + Arguments: + node {Snode} -- node to check + """ if node.is_leaf(): return 
y_prediction = node._clf.predict(node._X) @@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase): unique_y, count_y = np.unique(node._y, return_counts=True) _, count_d = np.unique(y_down, return_counts=True) _, count_u = np.unique(y_up, return_counts=True) + # for i in unique_y: try: number_down = count_d[i] @@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase): self.assertEqual(count_y[i], number_down + number_up) # Is the partition made the same as the prediction? # as the node is not a leaf... - unique_yp, count_yp = np.unique(y_prediction, return_counts=True) - self.assertEqual(count_yp[1], y_down.shape[0]) - self.assertEqual(count_yp[0], y_up.shape[0]) + _, count_yp = np.unique(y_prediction, return_counts=True) + self.assertEqual(count_yp[1], y_up.shape[0]) + self.assertEqual(count_yp[0], y_down.shape[0]) self._check_tree(node.get_down()) self._check_tree(node.get_up()) @@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase): return res def test_subdatasets(self): - """Check if the subdatasets files have the same predictions as the tree itself + """Check if the subdatasets files have the same labels as the original dataset """ - model = self._clf._tree._clf - X, y = self._get_Xy() - model.fit(X, y) self._clf.save_sub_datasets() with open(self._clf.get_catalog_name()) as cat_file: catalog = csv.reader(cat_file, delimiter=',') @@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase): right = (yp == y).astype(int) accuracy_computed = sum(right) / len(y) self.assertEqual(accuracy_score, accuracy_computed) + self.assertGreater(accuracy_score, 0.8) def test_single_predict_proba(self): + """Check that element 28 has a prediction different from the current label + """ # Element 28 has a different prediction than the truth X, y = self._get_Xy() yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1])) self.assertEqual(0, yp[0:, 0]) + self.assertEqual(1, y[28]) self.assertEqual(0.9282970550576184, yp[0:, 1]) def test_multiple_predict_proba(self): # First 27
elements the predictions are the same as the truth num = 27 X, y = self._get_Xy() - yp = self._clf.predict_proba(X[:num, :]) + yp = self._clf.predict_proba(X[:num,:]) self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist()) expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887, 0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706, @@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase): 0.92829706, 0.92829706, 0.9759887 ] self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist()) + def test_use_model_predictions(self): + """Check that we get the same results whether we use the estimator in nodes + to compute labels or we use the hyperplane and the position of samples with respect to it + """ + model_predictions = Stree(random_state=self._random_state, + use_predictions=True) + model_hyperplane = Stree(random_state=self._random_state, + use_predictions=False) + X, y = self._get_Xy() + model_predictions.fit(X, y) + model_hyperplane.fit(X, y) + self.assertListEqual( + model_predictions.predict(X).tolist(), + model_hyperplane.predict(X).tolist() + ) + a = model_predictions.score(X, y, print_out=False), + b = model_hyperplane.score(X, y, print_out=False) + self.assertEqual(a, b) + self.assertGreater(b, .95) + + def test_single_vs_multiple_prediction(self): + """Check if predicting sample by sample gives the same result as predicting + all samples at once + """ + X, _ = self._get_Xy() + # Compute prediction line by line + yp_line = np.array([], dtype=int) + for xp in X: + yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1]))) + # Compute prediction at once + yp_once = self._clf.predict(X) + # + self.assertListEqual(yp_line.tolist(), yp_once.tolist()) + + + + diff --git a/trees/Snode.py b/trees/Snode.py index 8e1ca59..c3d8473 100644 --- a/trees/Snode.py +++ b/trees/Snode.py @@ -18,8 +18,8 @@ class Snode: self._interceptor = 0.
if clf is None else clf.intercept_ self._title = title self._belief = 0. # belief of the prediction in a leaf node based on samples - self._X = X if os.environ.get( - 'TESTING', 'Not Set') != 'Not Set' else None + # Only store dataset in Testing + self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None self._y = y self._down = None self._up = None @@ -64,6 +64,6 @@ class Snode: def __str__(self) -> str: if self.is_leaf(): - return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n" + return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n" else: return f"{self._title}\n" diff --git a/trees/Stree.py b/trees/Stree.py index 25c9fe9..75d34ca 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -43,26 +43,25 @@ class Stree(BaseEstimator, ClassifierMixin): setattr(self, parameter, value) return self - def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list: + def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list: if self.__use_predictions: - yp = clf.predict(X) + yp = node._clf.predict(data) down = (yp == 1).reshape(-1, 1) else: # doesn't work with multiclass as each sample has to do inner product with its own coeficients # computes positition of every sample is w.r.t. 
the hyperplane - coef = clf.coef_[0, :].reshape(-1, X.shape[1]) - intercept = clf.intercept_[0] - res = X.dot(coef.T) + intercept + coef = node._vector[0, :].reshape(-1, data.shape[1]) + res = data.dot(coef.T) + node._interceptor[0] down = res > 0 up = ~down - X_down = X[down[:, 0]] if any(down) else None - y_down = y[down[:, 0]] if any(down) else None - X_up = X[up[:, 0]] if any(up) else None - y_up = y[up[:, 0]] if any(up) else None - return [X_up, y_up, X_down, y_down] + data_down = data[down[:, 0]] if any(down) else None + indices_down = indices[down[:, 0]] if any(down) else None + data_up = data[up[:, 0]] if any(up) else None + indices_up = indices[up[:, 0]] if any(up) else None + return [data_down, indices_down, data_up, indices_up] def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': - X, y = check_X_y(X, y) + X, y = check_X_y(X, y.ravel()) self.n_features_in_ = X.shape[1] self._tree = self.train(X, y.ravel(), title) self._build_predictor() @@ -83,47 +82,59 @@ class Stree(BaseEstimator, ClassifierMixin): def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode: if np.unique(y).shape[0] == 1: # only 1 class => pure dataset - return Snode(None, X, y, title + ', ') + return Snode(None, X, y, title + ', ') # Train the model clf = LinearSVC(max_iter=self._max_iter, C=self._C, random_state=self._random_state) clf.fit(X, y) tree = Snode(clf, X, y, title) - X_U, y_u, X_D, y_d = self._split_data(clf, X, y) + X_U, y_u, X_D, y_d = self._split_data(tree, X, y) if X_U is None or X_D is None: # didn't part anything - return Snode(clf, X, y, title + ', ') + return Snode(clf, X, y, title + ', ') tree.set_up(self.train(X_U, y_u, title + ' - Up')) tree.set_down(self.train(X_D, y_d, title + ' - Down')) return tree - def predict(self, X: np.array) -> np.array: - def predict_class(xp: np.array, tree: Snode) -> np.array: - if tree.is_leaf(): + def _predict_values(self, X: np.array) -> np.array: + def predict_class(xp: np.array, indices: 
np.array, node: Snode) -> np.array: + if xp is None: + return [], [] + if node.is_leaf(): + # set a class for every sample in dataset + prediction = np.full((xp.shape[0], 1), node._class) if self.__proba: - return [tree._class, tree._belief] + prediction_proba = np.full((xp.shape[0], 1), node._belief) + return np.append(prediction, prediction_proba, axis=1), indices else: - return tree._class - coef = tree._vector[0, :].reshape(-1, xp.shape[1]) - if xp.dot(coef.T) + tree._interceptor[0] > 0: - return predict_class(xp, tree.get_down()) - return predict_class(xp, tree.get_up()) - + return prediction, indices + u, i_u, d, i_d = self._split_data(node, xp, indices) + k, l = predict_class(d, i_d, node.get_down()) + m, n = predict_class(u, i_u, node.get_up()) + return np.append(k, m), np.append(l, n) # sklearn check check_is_fitted(self) # Input validation X = check_array(X) # setup prediction & make it happen - y = np.array([], dtype=int) - for xp in X: - y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree)) - return y + indices = np.arange(X.shape[0]) + return predict_class(X, indices, self._tree) + + def _reorder_results(self, y: np.array, indices: np.array) -> np.array: + y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float) + indices = indices.astype(int) + for i, index in enumerate(indices): + y_ordered[index] = y[i] + return y_ordered + + def predict(self, X: np.array) -> np.array: + return self._reorder_results(*self._predict_values(X)) def predict_proba(self, X: np.array) -> np.array: self.__proba = True - result = self.predict(X).reshape(X.shape[0], 2) + result, indices = self._predict_values(X) self.__proba = False - return result + return self._reorder_results(result.reshape(X.shape[0], 2), indices) def score(self, X: np.array, y: np.array, print_out=True) -> float: if not self.__trained: @@ -180,4 +191,4 @@ class Stree(BaseEstimator, ClassifierMixin): """Save the every dataset stored in the tree to check with manual classifier 
""" with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog: - self._save_datasets(self._tree, catalog, 1) + self._save_datasets(self._tree, catalog, 1) \ No newline at end of file