From e52cbbb192fe246df4ef16d307ce4c4e14df4792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 18 May 2020 11:51:27 +0200 Subject: [PATCH] First approach to Platt scaling --- test2.ipynb | 46 +++++++++++++++++++++++++------- tests/Stree_test.py | 53 ++++++++++++++++++++++++------------ trees/Stree.py | 65 ++++++++++++++++++++++++++++----------------- 3 files changed, 114 insertions(+), 50 deletions(-) diff --git a/test2.ipynb b/test2.ipynb index 349bdab..404123e 100644 --- a/test2.ipynb +++ b/test2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -29,9 +29,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n" + } + ], "source": [ "import time\n", "from sklearn.model_selection import train_test_split\n", @@ -68,7 +74,7 @@ "\n", "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", "# data = load_creditcard(5000) # Take the first 5000 samples\n", - "data = load_creditcard() # Take all the samples\n", + "data = load_creditcard(-1000) # Take all the samples\n", "\n", "Xtrain = data[0]\n", "Xtest = data[1]\n", @@ -78,9 +84,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693, 40]))\n\n\n0.0277 secs\n" + } + ], "source": [ "t = time.time()\n", "clf = Stree(C=.01, random_state=random_state)\n", @@ -90,6 +102,22 @@ "print(f\"{time.time() - t:.4f} secs\")" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0. 0.94542974]\n [1. 0.98392283]\n [0. 0.94542974]\n ...\n [0. 0.94542974]\n [0. 0.94542974]\n [1. 0.98392283]]\n" + } + ], + "source": [ + "k = clf.predict_proba(Xtrain)\n", + "print(k)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -154,9 +182,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" } }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/tests/Stree_test.py b/tests/Stree_test.py index 8fe5785..344e3bf 100644 --- a/tests/Stree_test.py +++ b/tests/Stree_test.py @@ -63,8 +63,8 @@ class Stree_test(unittest.TestCase): # Is the partition made the same as the prediction? # as the node is not a leaf... _, count_yp = np.unique(y_prediction, return_counts=True) - self.assertEqual(count_yp[1], y_up.shape[0]) - self.assertEqual(count_yp[0], y_down.shape[0]) + self.assertEqual(count_yp[0], y_up.shape[0]) + self.assertEqual(count_yp[1], y_down.shape[0]) self._check_tree(node.get_down()) self._check_tree(node.get_up()) @@ -154,35 +154,55 @@ class Stree_test(unittest.TestCase): # First 27 elements the predictions are the same as the truth num = 27 X, y = self._get_Xy() - yp = self._clf.predict_proba(X[:num,:]) + yp = self._clf.predict_proba(X[:num, :]) self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist()) expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887, 0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, 0.92829706, 0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, - 0.92829706, 0.92829706, 0.9759887 ] + 0.92829706, 0.92829706, 0.9759887] self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist()) - def test_use_model_predictions(self): - """Check that we get the same results wether we use the estimator in nodes - to compute labes or we use the hyperplane and the position of samples wrt to it + def build_models(self): + """Build and train two models, model_clf will use the sklearn classifier to + compute predictions and split data. model_computed will use vector of + coefficients to compute both predictions and splitted data """ - model_predictions = Stree(random_state=self._random_state, + model_clf = Stree(random_state=self._random_state, use_predictions=True) - model_hyperplane = Stree(random_state=self._random_state, + model_computed = Stree(random_state=self._random_state, use_predictions=False) X, y = self._get_Xy() - model_predictions.fit(X, y) - model_hyperplane.fit(X, y) + model_clf.fit(X, y) + model_computed.fit(X, y) + return model_clf, model_computed, X, y + + def test_use_model_predict(self): + """Check that we get the same results wether we use the estimator in nodes + to compute labels or we use the hyperplane and the position of samples wrt to it + """ + use_clf, use_math, X, _ = self.build_models() self.assertListEqual( - model_predictions.predict(X).tolist(), - model_hyperplane.predict(X).tolist() + use_clf.predict(X).tolist(), + use_math.predict(X).tolist() + ) + + def test_use_model_score(self): + use_clf, use_math, X, y = self.build_models() + b = use_math.score(X, y) + self.assertEqual( + use_clf.score(X, y), + b ) - a = model_predictions.score(X, y), - b = model_hyperplane.score(X, y) - self.assertEqual(a, b) self.assertGreater(b, .95) + def test_use_model_predict_proba(self): + use_clf, use_math, X, _ = self.build_models() + self.assertListEqual( + use_clf.predict_proba(X).tolist(), + use_math.predict_proba(X).tolist() + ) + def test_single_vs_multiple_prediction(self): """Check if predicting sample by sample gives the same result as predicting all samples at once @@ -196,7 +216,6 @@ class Stree_test(unittest.TestCase): yp_once = self._clf.predict(X) # self.assertListEqual(yp_line.tolist(), yp_once.tolist()) - diff --git a/trees/Stree.py b/trees/Stree.py index 1f93df6..86156a6 100644 --- a/trees/Stree.py +++ b/trees/Stree.py @@ -52,6 +52,7 @@ class Stree(BaseEstimator, ClassifierMixin): if self.__use_predictions: yp = node._clf.predict(data) down = (yp == 1).reshape(-1, 1) + res = node._clf.decision_function(data) else: # doesn't work with multiclass as each sample has to do inner product with its own coeficients # computes positition of every sample is w.r.t. the hyperplane @@ -60,9 +61,15 @@ class Stree(BaseEstimator, ClassifierMixin): up = ~down data_down = data[down[:, 0]] if any(down) else None indices_down = indices[down[:, 0]] if any(down) else None + res_down = res[down[:, 0]] if any(down) else None data_up = data[up[:, 0]] if any(up) else None indices_up = indices[up[:, 0]] if any(up) else None - return [data_down, indices_down, data_up, indices_up] + res_up = res[up[:, 0]] if any(up) else None + #if any(up): + # print("+++++up", data_up.shape, indices_up.shape, res_up.shape) + #if any(down): + # print("+++++down", data_down.shape, indices_down.shape, res_down.shape ) + return [data_up, indices_up, data_down, indices_down, res_up, res_down] def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': X, y = check_X_y(X, y.ravel()) @@ -92,7 +99,7 @@ class Stree(BaseEstimator, ClassifierMixin): random_state=self._random_state) clf.fit(X, y) tree = Snode(clf, X, y, title) - X_U, y_u, X_D, y_d = self._split_data(tree, X, y) + X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y) if X_U is None or X_D is None: # didn't part anything return Snode(clf, X, y, title + ', ') @@ -100,20 +107,22 @@ class Stree(BaseEstimator, ClassifierMixin): tree.set_down(self.train(X_D, y_d, title + ' - Down')) return tree - def _predict_values(self, X: np.array) -> np.array: + def _reorder_results(self, y: np.array, indices: np.array) -> np.array: + y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float) + indices = indices.astype(int) + for i, index in enumerate(indices): + y_ordered[index] = y[i] + return y_ordered + + def predict(self, X: np.array) -> np.array: def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array: if xp is None: return [], [] if node.is_leaf(): # set a class for every sample in dataset prediction = np.full((xp.shape[0], 1), node._class) - if self.__proba: - prediction_proba = np.full((xp.shape[0], 1), node._belief) - #prediction_proba = self._linear_function(xp, node) - return np.append(prediction, prediction_proba, axis=1), indices - else: - return prediction, indices - u, i_u, d, i_d = self._split_data(node, xp, indices) + return prediction, indices + u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices) k, l = predict_class(d, i_d, node.get_down()) m, n = predict_class(u, i_u, node.get_up()) return np.append(k, m), np.append(l, n) @@ -123,22 +132,30 @@ class Stree(BaseEstimator, ClassifierMixin): X = check_array(X) # setup prediction & make it happen indices = np.arange(X.shape[0]) - return predict_class(X, indices, self._tree) - - def _reorder_results(self, y: np.array, indices: np.array) -> np.array: - y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float) - indices = indices.astype(int) - for i, index in enumerate(indices): - y_ordered[index] = y[i] - return y_ordered - - def predict(self, X: np.array) -> np.array: - return self._reorder_results(*self._predict_values(X)) + return self._reorder_results(*predict_class(X, indices, self._tree)) def predict_proba(self, X: np.array) -> np.array: - self.__proba = True - result, indices = self._predict_values(X) - self.__proba = False + def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array: + if xp is None: + return [], [] + if node.is_leaf(): + # set a class for every sample in dataset + prediction = np.full((xp.shape[0], 1), node._class) + prediction_proba = np.full((xp.shape[0], 1), node._belief) + #prediction_proba = dist + #print("******", prediction.shape, prediction_proba.shape) + return np.append(prediction, prediction_proba, axis=1), indices + u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices) + k, l = predict_class(d, i_d, r_u, node.get_down()) + m, n = predict_class(u, i_u, r_d, node.get_up()) + return np.append(k, m), np.append(l, n) + # sklearn check + check_is_fitted(self) + # Input validation + X = check_array(X) + # setup prediction & make it happen + indices = np.arange(X.shape[0]) + result, indices = predict_class(X, indices, [], self._tree) result = result.reshape(X.shape[0], 2) # Sigmoidize distance like in sklearn based on Platt(1999) #result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))