First approach to Platt scaling

This commit is contained in:
2020-05-18 11:51:27 +02:00
parent 86a9ef2f3a
commit e52cbbb192
3 changed files with 114 additions and 50 deletions

View File

@@ -2,7 +2,7 @@
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -17,7 +17,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@@ -29,9 +29,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
}
],
"source": [ "source": [
"import time\n", "import time\n",
"from sklearn.model_selection import train_test_split\n", "from sklearn.model_selection import train_test_split\n",
@@ -68,7 +74,7 @@
"\n", "\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n", "# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard() # Take all the samples\n", "data = load_creditcard(-1000) # Take all the samples\n",
"\n", "\n",
"Xtrain = data[0]\n", "Xtrain = data[0]\n",
"Xtest = data[1]\n", "Xtest = data[1]\n",
@@ -78,9 +84,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693, 40]))\n\n\n0.0277 secs\n"
}
],
"source": [ "source": [
"t = time.time()\n", "t = time.time()\n",
"clf = Stree(C=.01, random_state=random_state)\n", "clf = Stree(C=.01, random_state=random_state)\n",
@@ -90,6 +102,22 @@
"print(f\"{time.time() - t:.4f} secs\")" "print(f\"{time.time() - t:.4f} secs\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0. 0.94542974]\n [1. 0.98392283]\n [0. 0.94542974]\n ...\n [0. 0.94542974]\n [0. 0.94542974]\n [1. 0.98392283]]\n"
}
],
"source": [
"k = clf.predict_proba(Xtrain)\n",
"print(k)"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@@ -154,7 +182,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.7.6" "version": "3.7.6-final"
} }
}, },
"nbformat": 4, "nbformat": 4,

View File

@@ -63,8 +63,8 @@ class Stree_test(unittest.TestCase):
# Is the partition made the same as the prediction? # Is the partition made the same as the prediction?
# as the node is not a leaf... # as the node is not a leaf...
_, count_yp = np.unique(y_prediction, return_counts=True) _, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_up.shape[0]) self.assertEqual(count_yp[0], y_up.shape[0])
self.assertEqual(count_yp[0], y_down.shape[0]) self.assertEqual(count_yp[1], y_down.shape[0])
self._check_tree(node.get_down()) self._check_tree(node.get_down())
self._check_tree(node.get_up()) self._check_tree(node.get_up())
@@ -163,26 +163,46 @@ class Stree_test(unittest.TestCase):
0.92829706, 0.92829706, 0.9759887] 0.92829706, 0.92829706, 0.9759887]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist()) self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
def test_use_model_predictions(self): def build_models(self):
"""Check that we get the same results wether we use the estimator in nodes """Build and train two models, model_clf will use the sklearn classifier to
to compute labes or we use the hyperplane and the position of samples wrt to it compute predictions and split data. model_computed will use vector of
coefficients to compute both predictions and split data
""" """
model_predictions = Stree(random_state=self._random_state, model_clf = Stree(random_state=self._random_state,
use_predictions=True) use_predictions=True)
model_hyperplane = Stree(random_state=self._random_state, model_computed = Stree(random_state=self._random_state,
use_predictions=False) use_predictions=False)
X, y = self._get_Xy() X, y = self._get_Xy()
model_predictions.fit(X, y) model_clf.fit(X, y)
model_hyperplane.fit(X, y) model_computed.fit(X, y)
return model_clf, model_computed, X, y
def test_use_model_predict(self):
"""Check that we get the same results whether we use the estimator in nodes
to compute labels or we use the hyperplane and the position of samples with respect to it
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual( self.assertListEqual(
model_predictions.predict(X).tolist(), use_clf.predict(X).tolist(),
model_hyperplane.predict(X).tolist() use_math.predict(X).tolist()
)
def test_use_model_score(self):
use_clf, use_math, X, y = self.build_models()
b = use_math.score(X, y)
self.assertEqual(
use_clf.score(X, y),
b
) )
a = model_predictions.score(X, y),
b = model_hyperplane.score(X, y)
self.assertEqual(a, b)
self.assertGreater(b, .95) self.assertGreater(b, .95)
def test_use_model_predict_proba(self):
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
use_clf.predict_proba(X).tolist(),
use_math.predict_proba(X).tolist()
)
def test_single_vs_multiple_prediction(self): def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting """Check if predicting sample by sample gives the same result as predicting
all samples at once all samples at once
@@ -201,4 +221,3 @@ class Stree_test(unittest.TestCase):

View File

@@ -52,6 +52,7 @@ class Stree(BaseEstimator, ClassifierMixin):
if self.__use_predictions: if self.__use_predictions:
yp = node._clf.predict(data) yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1) down = (yp == 1).reshape(-1, 1)
res = node._clf.decision_function(data)
else: else:
# doesn't work with multiclass as each sample has to do inner product with its own coefficients # doesn't work with multiclass as each sample has to do inner product with its own coefficients
# computes the position of every sample w.r.t. the hyperplane # computes the position of every sample w.r.t. the hyperplane
@@ -60,9 +61,15 @@ class Stree(BaseEstimator, ClassifierMixin):
up = ~down up = ~down
data_down = data[down[:, 0]] if any(down) else None data_down = data[down[:, 0]] if any(down) else None
indices_down = indices[down[:, 0]] if any(down) else None indices_down = indices[down[:, 0]] if any(down) else None
res_down = res[down[:, 0]] if any(down) else None
data_up = data[up[:, 0]] if any(up) else None data_up = data[up[:, 0]] if any(up) else None
indices_up = indices[up[:, 0]] if any(up) else None indices_up = indices[up[:, 0]] if any(up) else None
return [data_down, indices_down, data_up, indices_up] res_up = res[up[:, 0]] if any(up) else None
#if any(up):
# print("+++++up", data_up.shape, indices_up.shape, res_up.shape)
#if any(down):
# print("+++++down", data_down.shape, indices_down.shape, res_down.shape )
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
X, y = check_X_y(X, y.ravel()) X, y = check_X_y(X, y.ravel())
@@ -92,7 +99,7 @@ class Stree(BaseEstimator, ClassifierMixin):
random_state=self._random_state) random_state=self._random_state)
clf.fit(X, y) clf.fit(X, y)
tree = Snode(clf, X, y, title) tree = Snode(clf, X, y, title)
X_U, y_u, X_D, y_d = self._split_data(tree, X, y) X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
if X_U is None or X_D is None: if X_U is None or X_D is None:
# didn't part anything # didn't part anything
return Snode(clf, X, y, title + ', <cgaf>') return Snode(clf, X, y, title + ', <cgaf>')
@@ -100,31 +107,6 @@ class Stree(BaseEstimator, ClassifierMixin):
tree.set_down(self.train(X_D, y_d, title + ' - Down')) tree.set_down(self.train(X_D, y_d, title + ' - Down'))
return tree return tree
def _predict_values(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
if self.__proba:
prediction_proba = np.full((xp.shape[0], 1), node._belief)
#prediction_proba = self._linear_function(xp, node)
return np.append(prediction, prediction_proba, axis=1), indices
else:
return prediction, indices
u, i_u, d, i_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
return predict_class(X, indices, self._tree)
def _reorder_results(self, y: np.array, indices: np.array) -> np.array: def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float) y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
indices = indices.astype(int) indices = indices.astype(int)
@@ -133,12 +115,47 @@ class Stree(BaseEstimator, ClassifierMixin):
return y_ordered return y_ordered
def predict(self, X: np.array) -> np.array: def predict(self, X: np.array) -> np.array:
return self._reorder_results(*self._predict_values(X)) def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
return self._reorder_results(*predict_class(X, indices, self._tree))
def predict_proba(self, X: np.array) -> np.array: def predict_proba(self, X: np.array) -> np.array:
self.__proba = True def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
result, indices = self._predict_values(X) if xp is None:
self.__proba = False return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = np.full((xp.shape[0], 1), node._belief)
#prediction_proba = dist
#print("******", prediction.shape, prediction_proba.shape)
return np.append(prediction, prediction_proba, axis=1), indices
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, r_u, node.get_down())
m, n = predict_class(u, i_u, r_d, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
result, indices = predict_class(X, indices, [], self._tree)
result = result.reshape(X.shape[0], 2) result = result.reshape(X.shape[0], 2)
# Sigmoidize distance like in sklearn based on Platt(1999) # Sigmoidize distance like in sklearn based on Platt(1999)
#result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) #result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))