First approach to Platt scaling

2025-08-16 16:06:01 +00:00 · 2020-05-18 11:51:27 +02:00
parent 86a9ef2f3a
commit e52cbbb192
3 changed files with 114 additions and 50 deletions
--- a/test2.ipynb
+++ b/test2.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -29,9 +29,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28)  y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
    }
   ],
   "source": [
    "import time\n",
    "from sklearn.model_selection import train_test_split\n",
@@ -68,7 +74,7 @@
    "\n",
    "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
-    "data = load_creditcard() # Take all the samples\n",
+    "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
    "\n",
    "Xtrain = data[0]\n",
    "Xtest = data[1]\n",
@@ -78,9 +84,15 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([  5, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693,  40]))\n\n\n0.0277 secs\n"
    }
   ],
   "source": [
    "t = time.time()\n",
    "clf = Stree(C=.01, random_state=random_state)\n",
@@ -90,6 +102,22 @@
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0.         0.94542974]\n [1.         0.98392283]\n [0.         0.94542974]\n ...\n [0.         0.94542974]\n [0.         0.94542974]\n [1.         0.98392283]]\n"
    }
   ],
   "source": [
    "k = clf.predict_proba(Xtrain)\n",
    "print(k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -154,7 +182,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.7.6-final"
  }
 },
 "nbformat": 4,
--- a/tests/Stree_test.py
+++ b/tests/Stree_test.py
@@ -63,8 +63,8 @@ class Stree_test(unittest.TestCase):
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
        _, count_yp = np.unique(y_prediction, return_counts=True)
-        self.assertEqual(count_yp[1], y_up.shape[0])
+        self.assertEqual(count_yp[0], y_up.shape[0])
-        self.assertEqual(count_yp[0], y_down.shape[0])
+        self.assertEqual(count_yp[1], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())
@@ -163,26 +163,46 @@ class Stree_test(unittest.TestCase):
                        0.92829706, 0.92829706, 0.9759887]
        self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
-    def test_use_model_predictions(self):
+    def build_models(self):
-        """Check that we get the same results wether we use the estimator in nodes
+        """Build and train two models, model_clf will use the sklearn classifier to 
-        to compute labes or we use the hyperplane and the position of samples wrt to it
+        compute predictions and split data. model_computed will use vector of 
        coefficients to compute both predictions and split data
        """
-        model_predictions = Stree(random_state=self._random_state,
+        model_clf = Stree(random_state=self._random_state,
                            use_predictions=True)
-        model_hyperplane = Stree(random_state=self._random_state,
+        model_computed = Stree(random_state=self._random_state,
                            use_predictions=False)
        X, y = self._get_Xy()
-        model_predictions.fit(X, y)
+        model_clf.fit(X, y)
-        model_hyperplane.fit(X, y)
+        model_computed.fit(X, y)
        return model_clf, model_computed, X, y
    def test_use_model_predict(self):
        """Check that we get the same results whether we use the estimator in nodes
        to compute labels or we use the hyperplane and the position of samples wrt to it
        """
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
-            model_predictions.predict(X).tolist(),
+            use_clf.predict(X).tolist(),
-            model_hyperplane.predict(X).tolist()
+            use_math.predict(X).tolist()
        )
    def test_use_model_score(self):
        use_clf, use_math, X, y = self.build_models()
        b = use_math.score(X, y)
        self.assertEqual(
            use_clf.score(X, y),
           b
        )
        a = model_predictions.score(X, y),
        b = model_hyperplane.score(X, y)
        self.assertEqual(a, b)
        self.assertGreater(b, .95)
    def test_use_model_predict_proba(self):
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict_proba(X).tolist(),
            use_math.predict_proba(X).tolist()
        )
    def test_single_vs_multiple_prediction(self):
        """Check if predicting sample by sample gives the same result as predicting
        all samples at once
@@ -201,4 +221,3 @@ class Stree_test(unittest.TestCase):
--- a/trees/Stree.py
+++ b/trees/Stree.py
@@ -52,6 +52,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        if self.__use_predictions:
            yp = node._clf.predict(data)
            down = (yp == 1).reshape(-1, 1)
            res = node._clf.decision_function(data)
        else:
            # doesn't work with multiclass as each sample has to do inner product with its own coefficients
            # computes the position of every sample w.r.t. the hyperplane
@@ -60,9 +61,15 @@ class Stree(BaseEstimator, ClassifierMixin):
        up = ~down
        data_down = data[down[:, 0]] if any(down) else None
        indices_down = indices[down[:, 0]] if any(down) else None
        res_down = res[down[:, 0]] if any(down) else None
        data_up = data[up[:, 0]] if any(up) else None
        indices_up = indices[up[:, 0]] if any(up) else None
-        return [data_down, indices_down, data_up, indices_up]
+        res_up = res[up[:, 0]] if any(up) else None
        #if any(up):
        #    print("+++++up", data_up.shape, indices_up.shape, res_up.shape)
        #if any(down):
        #    print("+++++down", data_down.shape, indices_down.shape, res_down.shape )
        return [data_up, indices_up, data_down, indices_down, res_up, res_down]
    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
        X, y = check_X_y(X, y.ravel())
@@ -92,7 +99,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                        random_state=self._random_state)
        clf.fit(X, y)
        tree = Snode(clf, X, y, title)
-        X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
+        X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
        if X_U is None or X_D is None:
            # didn't part anything
            return Snode(clf, X, y, title + ', <cgaf>')
@@ -100,31 +107,6 @@ class Stree(BaseEstimator, ClassifierMixin):
        tree.set_down(self.train(X_D, y_d, title + ' - Down'))
        return tree
    def _predict_values(self, X: np.array) -> np.array:
        def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                if self.__proba:
                    prediction_proba = np.full((xp.shape[0], 1), node._belief)
                    #prediction_proba = self._linear_function(xp, node)
                    return np.append(prediction, prediction_proba, axis=1), indices
                else:
                    return prediction, indices
            u, i_u, d, i_d = self._split_data(node, xp, indices)
            k, l = predict_class(d, i_d, node.get_down())
            m, n = predict_class(u, i_u, node.get_up())
            return np.append(k, m), np.append(l, n)
        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        return predict_class(X, indices, self._tree)
    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
        y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
        indices = indices.astype(int)
@@ -133,12 +115,47 @@ class Stree(BaseEstimator, ClassifierMixin):
        return y_ordered
    def predict(self, X: np.array) -> np.array:
-        return self._reorder_results(*self._predict_values(X))
+        def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                return prediction, indices
            u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
            k, l = predict_class(d, i_d, node.get_down())
            m, n = predict_class(u, i_u, node.get_up())
            return np.append(k, m), np.append(l, n)
        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        return self._reorder_results(*predict_class(X, indices, self._tree))
    def predict_proba(self, X: np.array) -> np.array:
-        self.__proba = True
+        def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
-        result, indices = self._predict_values(X)
+            if xp is None:
-        self.__proba = False
+                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                prediction_proba = np.full((xp.shape[0], 1), node._belief)
                #prediction_proba = dist
                #print("******", prediction.shape, prediction_proba.shape)
                return np.append(prediction, prediction_proba, axis=1), indices
            u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
            k, l = predict_class(d, i_d, r_u, node.get_down())
            m, n = predict_class(u, i_u, r_d, node.get_up())
            return np.append(k, m), np.append(l, n)
        # sklearn check
        check_is_fitted(self)
        # Input validation
        X = check_array(X)
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        result, indices = predict_class(X, indices, [], self._tree)
        result = result.reshape(X.shape[0], 2)
        # Sigmoidize distance like in sklearn based on Platt(1999)
        #result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))