From e52cbbb192fe246df4ef16d307ce4c4e14df4792 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= <rmontanana@gmail.com>
Date: Mon, 18 May 2020 11:51:27 +0200
Subject: [PATCH] First approach to Platt scaling

---
 test2.ipynb         | 46 +++++++++++++++++++++++++-------
 tests/Stree_test.py | 53 ++++++++++++++++++++++++------------
 trees/Stree.py      | 65 ++++++++++++++++++++++++++++-----------------
 3 files changed, 114 insertions(+), 50 deletions(-)
diff --git a/test2.ipynb b/test2.ipynb
index 349bdab..404123e 100644
--- a/test2.ipynb
+++ b/test2.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,9 +29,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28)  y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
+    }
+   ],
    "source": [
     "import time\n",
     "from sklearn.model_selection import train_test_split\n",
@@ -68,7 +74,7 @@
     "\n",
     "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
     "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
-    "data = load_creditcard() # Take all the samples\n",
+    "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
     "\n",
     "Xtrain = data[0]\n",
     "Xtest = data[1]\n",
@@ -78,9 +84,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([  5, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693,  40]))\n\n\n0.0277 secs\n"
+    }
+   ],
    "source": [
     "t = time.time()\n",
     "clf = Stree(C=.01, random_state=random_state)\n",
@@ -90,6 +102,22 @@
     "print(f\"{time.time() - t:.4f} secs\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0.         0.94542974]\n [1.         0.98392283]\n [0.         0.94542974]\n ...\n [0.         0.94542974]\n [0.         0.94542974]\n [1.         0.98392283]]\n"
+    }
+   ],
+   "source": [
+    "k = clf.predict_proba(Xtrain)\n",
+    "print(k)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -154,9 +182,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.6-final"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
\ No newline at end of file
diff --git a/tests/Stree_test.py b/tests/Stree_test.py
index 8fe5785..344e3bf 100644
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -63,8 +63,8 @@ class Stree_test(unittest.TestCase):
         # Is the partition made the same as the prediction?
         # as the node is not a leaf...
         _, count_yp = np.unique(y_prediction, return_counts=True)
-        self.assertEqual(count_yp[1], y_up.shape[0])
-        self.assertEqual(count_yp[0], y_down.shape[0])
+        self.assertEqual(count_yp[0], y_up.shape[0])
+        self.assertEqual(count_yp[1], y_down.shape[0])
         self._check_tree(node.get_down())
         self._check_tree(node.get_up())
 
@@ -154,35 +154,55 @@ class Stree_test(unittest.TestCase):
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = self._get_Xy()
-        yp = self._clf.predict_proba(X[:num,:])
+        yp = self._clf.predict_proba(X[:num, :])
         self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
         expected_proba = [0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.9759887, 
                         0.92829706, 0.9759887,  0.9759887,  0.9759887,  0.9759887,  0.92829706, 
                         0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.92829706, 0.92829706, 
                         0.9759887,  0.92829706, 0.9759887,  0.92829706, 0.92829706, 0.92829706,
-                        0.92829706, 0.92829706, 0.9759887 ]
+                        0.92829706, 0.92829706, 0.9759887]
         self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
 
-    def test_use_model_predictions(self):
-        """Check that we get the same results wether we use the estimator in nodes
-        to compute labes or we use the hyperplane and the position of samples wrt to it
+    def build_models(self):
+        """Build and train two models: model_clf uses the sklearn classifier to
+        compute predictions and split data; model_computed uses the vector of
+        coefficients to compute both the predictions and the split data
         """
-        model_predictions = Stree(random_state=self._random_state,
+        model_clf = Stree(random_state=self._random_state,
                             use_predictions=True)
-        model_hyperplane = Stree(random_state=self._random_state,
+        model_computed = Stree(random_state=self._random_state,
                             use_predictions=False)
         X, y = self._get_Xy()
-        model_predictions.fit(X, y)
-        model_hyperplane.fit(X, y)
+        model_clf.fit(X, y)
+        model_computed.fit(X, y)
+        return model_clf, model_computed, X, y
+
+    def test_use_model_predict(self):
+        """Check that we get the same results whether we use the estimator in nodes
+        to compute labels or we use the hyperplane and the position of samples w.r.t. it
+        """
+        use_clf, use_math, X, _ = self.build_models()
         self.assertListEqual(
-            model_predictions.predict(X).tolist(),
-            model_hyperplane.predict(X).tolist()
+            use_clf.predict(X).tolist(),
+            use_math.predict(X).tolist()
+        )
+    
+    def test_use_model_score(self):
+        use_clf, use_math, X, y = self.build_models()
+        b = use_math.score(X, y)
+        self.assertEqual(
+            use_clf.score(X, y),
+           b
         )
-        a = model_predictions.score(X, y),
-        b = model_hyperplane.score(X, y)
-        self.assertEqual(a, b)
         self.assertGreater(b, .95)
 
+    def test_use_model_predict_proba(self):
+        use_clf, use_math, X, _ = self.build_models()
+        self.assertListEqual(
+            use_clf.predict_proba(X).tolist(),
+            use_math.predict_proba(X).tolist()
+        )
+
     def test_single_vs_multiple_prediction(self):
         """Check if predicting sample by sample gives the same result as predicting
         all samples at once
@@ -196,7 +216,6 @@ class Stree_test(unittest.TestCase):
         yp_once = self._clf.predict(X)
         #
         self.assertListEqual(yp_line.tolist(), yp_once.tolist())
-        
 
 
 
diff --git a/trees/Stree.py b/trees/Stree.py
index 1f93df6..86156a6 100644
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -52,6 +52,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         if self.__use_predictions:
             yp = node._clf.predict(data)
             down = (yp == 1).reshape(-1, 1)
+            res = node._clf.decision_function(data)
         else:
             # doesn't work with multiclass as each sample has to do inner product with its own coeficients
             # computes positition of every sample is w.r.t. the hyperplane
@@ -60,9 +61,15 @@ class Stree(BaseEstimator, ClassifierMixin):
         up = ~down
         data_down = data[down[:, 0]] if any(down) else None
         indices_down = indices[down[:, 0]] if any(down) else None
+        res_down = res[down[:, 0]] if any(down) else None
         data_up = data[up[:, 0]] if any(up) else None
         indices_up = indices[up[:, 0]] if any(up) else None
-        return [data_down, indices_down, data_up, indices_up]
+        res_up = res[up[:, 0]] if any(up) else None
+        #if any(up):
+        #    print("+++++up", data_up.shape, indices_up.shape, res_up.shape)
+        #if any(down):
+        #    print("+++++down", data_down.shape, indices_down.shape, res_down.shape )
+        return [data_up, indices_up, data_down, indices_down, res_up, res_down]
 
     def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
         X, y = check_X_y(X, y.ravel())
@@ -92,7 +99,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                         random_state=self._random_state)
         clf.fit(X, y)
         tree = Snode(clf, X, y, title)
-        X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
+        X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
         if X_U is None or X_D is None:
             # didn't part anything
             return Snode(clf, X, y, title + ', <cgaf>')
@@ -100,20 +107,22 @@ class Stree(BaseEstimator, ClassifierMixin):
         tree.set_down(self.train(X_D, y_d, title + ' - Down'))
         return tree
 
-    def _predict_values(self, X: np.array) -> np.array:
+    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
+        # Scatter every result row back to the original position of its sample
+        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
+        indices = indices.astype(int)
+        y_ordered[indices] = y  # vectorized: y_ordered[indices[i]] = y[i]
+        return y_ordered
+
+    def predict(self, X: np.array) -> np.array:
         def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
             if xp is None:
                 return [], []
             if node.is_leaf():
                 # set a class for every sample in dataset
                 prediction = np.full((xp.shape[0], 1), node._class)
-                if self.__proba:
-                    prediction_proba = np.full((xp.shape[0], 1), node._belief)
-                    #prediction_proba = self._linear_function(xp, node)
-                    return np.append(prediction, prediction_proba, axis=1), indices
-                else:
-                    return prediction, indices
-            u, i_u, d, i_d = self._split_data(node, xp, indices)
+                return prediction, indices
+            u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
             k, l = predict_class(d, i_d, node.get_down())
             m, n = predict_class(u, i_u, node.get_up())
             return np.append(k, m), np.append(l, n)
@@ -123,22 +132,30 @@ class Stree(BaseEstimator, ClassifierMixin):
         X = check_array(X)
         # setup prediction & make it happen
         indices = np.arange(X.shape[0])
-        return predict_class(X, indices, self._tree)
-    
-    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
-        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
-        indices = indices.astype(int)
-        for i, index in enumerate(indices):
-            y_ordered[index] = y[i]
-        return y_ordered
-
-    def predict(self, X: np.array) -> np.array:
-        return self._reorder_results(*self._predict_values(X))
+        return self._reorder_results(*predict_class(X, indices, self._tree))
 
     def predict_proba(self, X: np.array) -> np.array:
-        self.__proba = True
-        result, indices = self._predict_values(X)
-        self.__proba = False
+        def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
+            if xp is None:
+                return [], []
+            if node.is_leaf():
+                # every sample that reaches this leaf gets the leaf class and belief
+                prediction = np.full((xp.shape[0], 1), node._class)
+                prediction_proba = np.full((xp.shape[0], 1), node._belief)
+                # dist holds the values _split_data returned for these samples; it is
+                # not used yet (presumably reserved for Platt scaling - TODO confirm)
+                return np.append(prediction, prediction_proba, axis=1), indices
+            u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
+            k, l = predict_class(d, i_d, r_d, node.get_down())
+            m, n = predict_class(u, i_u, r_u, node.get_up())
+            return np.append(k, m), np.append(l, n)
+        # sklearn check
+        check_is_fitted(self)
+        # Input validation
+        X = check_array(X)
+        # setup prediction & make it happen
+        indices = np.arange(X.shape[0])
+        result, indices = predict_class(X, indices, [], self._tree)
         result = result.reshape(X.shape[0], 2)
         # Sigmoidize distance like in sklearn based on Platt(1999)
         #result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))