First approach to Platt scaling

This commit is contained in:
2020-05-18 11:51:27 +02:00
parent 86a9ef2f3a
commit e52cbbb192
3 changed files with 114 additions and 50 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -29,9 +29,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
}
],
"source": [
"import time\n",
"from sklearn.model_selection import train_test_split\n",
@@ -68,7 +74,7 @@
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard() # Take all the samples\n",
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
@@ -78,9 +84,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945430 counts=(array([0, 1]), array([693, 40]))\n\n\n0.0277 secs\n"
}
],
"source": [
"t = time.time()\n",
"clf = Stree(C=.01, random_state=random_state)\n",
@@ -90,6 +102,22 @@
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "+++++up (733, 28) (733,) (733, 1)\n+++++down (311, 28) (311,) (311, 1)\n****** (311, 1) (311, 1)\n****** (733, 1) (733, 1)\n[[0. 0.94542974]\n [1. 0.98392283]\n [0. 0.94542974]\n ...\n [0. 0.94542974]\n [0. 0.94542974]\n [1. 0.98392283]]\n"
}
],
"source": [
"k = clf.predict_proba(Xtrain)\n",
"print(k)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -154,9 +182,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@@ -63,8 +63,8 @@ class Stree_test(unittest.TestCase):
# Is the partition made the same as the prediction?
# as the node is not a leaf...
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_up.shape[0])
self.assertEqual(count_yp[0], y_down.shape[0])
self.assertEqual(count_yp[0], y_up.shape[0])
self.assertEqual(count_yp[1], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
@@ -154,35 +154,55 @@ class Stree_test(unittest.TestCase):
# First 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num,:])
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706, 0.92829706,
0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.92829706,
0.92829706, 0.92829706, 0.9759887 ]
0.92829706, 0.92829706, 0.9759887]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
def test_use_model_predictions(self):
"""Check that we get the same results whether we use the estimator in nodes
to compute labels or we use the hyperplane and the position of samples w.r.t. it
def build_models(self):
"""Build and train two models: model_clf will use the sklearn classifier to
compute predictions and split data; model_computed will use the vector of
coefficients to compute both the predictions and the split data
"""
model_predictions = Stree(random_state=self._random_state,
model_clf = Stree(random_state=self._random_state,
use_predictions=True)
model_hyperplane = Stree(random_state=self._random_state,
model_computed = Stree(random_state=self._random_state,
use_predictions=False)
X, y = self._get_Xy()
model_predictions.fit(X, y)
model_hyperplane.fit(X, y)
model_clf.fit(X, y)
model_computed.fit(X, y)
return model_clf, model_computed, X, y
def test_use_model_predict(self):
"""Check that we get the same results whether we use the estimator in nodes
to compute labels or we use the hyperplane and the position of samples w.r.t. it
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
model_predictions.predict(X).tolist(),
model_hyperplane.predict(X).tolist()
use_clf.predict(X).tolist(),
use_math.predict(X).tolist()
)
def test_use_model_score(self):
use_clf, use_math, X, y = self.build_models()
b = use_math.score(X, y)
self.assertEqual(
use_clf.score(X, y),
b
)
a = model_predictions.score(X, y),
b = model_hyperplane.score(X, y)
self.assertEqual(a, b)
self.assertGreater(b, .95)
def test_use_model_predict_proba(self):
    """Both models built by build_models() must return the same predict_proba
    output for the training samples.
    """
    model_clf, model_computed, samples, _ = self.build_models()
    proba_clf = model_clf.predict_proba(samples).tolist()
    proba_computed = model_computed.predict_proba(samples).tolist()
    self.assertListEqual(proba_clf, proba_computed)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting
all samples at once
@@ -196,7 +216,6 @@ class Stree_test(unittest.TestCase):
yp_once = self._clf.predict(X)
#
self.assertListEqual(yp_line.tolist(), yp_once.tolist())

View File

@@ -52,6 +52,7 @@ class Stree(BaseEstimator, ClassifierMixin):
if self.__use_predictions:
yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1)
res = node._clf.decision_function(data)
else:
# doesn't work with multiclass as each sample has to do inner product with its own coefficients
# computes the position of every sample w.r.t. the hyperplane
@@ -60,9 +61,15 @@ class Stree(BaseEstimator, ClassifierMixin):
up = ~down
data_down = data[down[:, 0]] if any(down) else None
indices_down = indices[down[:, 0]] if any(down) else None
res_down = res[down[:, 0]] if any(down) else None
data_up = data[up[:, 0]] if any(up) else None
indices_up = indices[up[:, 0]] if any(up) else None
return [data_down, indices_down, data_up, indices_up]
res_up = res[up[:, 0]] if any(up) else None
#if any(up):
# print("+++++up", data_up.shape, indices_up.shape, res_up.shape)
#if any(down):
# print("+++++down", data_down.shape, indices_down.shape, res_down.shape )
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
X, y = check_X_y(X, y.ravel())
@@ -92,7 +99,7 @@ class Stree(BaseEstimator, ClassifierMixin):
random_state=self._random_state)
clf.fit(X, y)
tree = Snode(clf, X, y, title)
X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ', <cgaf>')
@@ -100,20 +107,22 @@ class Stree(BaseEstimator, ClassifierMixin):
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
return tree
def _predict_values(self, X: np.array) -> np.array:
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
    """Put the rows of ``y`` back at the positions given by ``indices``.

    Row ``i`` of ``y`` is stored at position ``indices[i]`` of the result, so
    predictions collected in a different order than the input samples can be
    returned in the original sample order.

    :param y: values to reorder; 1-D or 2-D array
    :param indices: destination position of every row of ``y``
    :return: array with the shape of ``y``; its dtype is int when ``y`` is
        1-D and float otherwise
    """
    y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
    # indices may arrive as floats (presumably after being concatenated with
    # np.append), so they are cast before being used as positions
    positions = indices.astype(int)
    # one fancy-index assignment replaces the per-element Python loop; for a
    # repeated position the last value wins, exactly as in the loop
    y_ordered[positions] = y
    return y_ordered
def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
if self.__proba:
prediction_proba = np.full((xp.shape[0], 1), node._belief)
#prediction_proba = self._linear_function(xp, node)
return np.append(prediction, prediction_proba, axis=1), indices
else:
return prediction, indices
u, i_u, d, i_d = self._split_data(node, xp, indices)
return prediction, indices
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
@@ -123,22 +132,30 @@ class Stree(BaseEstimator, ClassifierMixin):
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
return predict_class(X, indices, self._tree)
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
    """Put the rows of ``y`` back at the positions given by ``indices``.

    Row ``i`` of ``y`` is stored at position ``indices[i]`` of the result, so
    predictions collected in a different order than the input samples can be
    returned in the original sample order.

    :param y: values to reorder; 1-D or 2-D array
    :param indices: destination position of every row of ``y``
    :return: array with the shape of ``y``; its dtype is int when ``y`` is
        1-D and float otherwise
    """
    y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
    # indices may arrive as floats (presumably after being concatenated with
    # np.append), so they are cast before being used as positions
    positions = indices.astype(int)
    # one fancy-index assignment replaces the per-element Python loop; for a
    # repeated position the last value wins, exactly as in the loop
    y_ordered[positions] = y
    return y_ordered
def predict(self, X: np.array) -> np.array:
return self._reorder_results(*self._predict_values(X))
return self._reorder_results(*predict_class(X, indices, self._tree))
def predict_proba(self, X: np.array) -> np.array:
self.__proba = True
result, indices = self._predict_values(X)
self.__proba = False
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = np.full((xp.shape[0], 1), node._belief)
#prediction_proba = dist
#print("******", prediction.shape, prediction_proba.shape)
return np.append(prediction, prediction_proba, axis=1), indices
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, r_u, node.get_down())
m, n = predict_class(u, i_u, r_d, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
result, indices = predict_class(X, indices, [], self._tree)
result = result.reshape(X.shape[0], 2)
# Sigmoidize distance like in sklearn based on Platt(1999)
#result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))