Implement predict & predict_proba optimization

Reduces prediction time by two orders of magnitude on the creditcard dataset
This commit is contained in:
2020-05-15 23:35:33 +02:00
parent e56b955b92
commit 80b5cf8e72
6 changed files with 129 additions and 59 deletions

View File

@@ -10,5 +10,5 @@ python main.py
## Tests
```python
python -m unittest tests.Stree_test tests.Snode_test
python -m unittest -v tests.Stree_test tests.Snode_test
```

View File

@@ -33,8 +33,9 @@ def load_creditcard(n_examples=0):
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
return X, y
X, y = load_creditcard(-5000)
#X, y = load_creditcard(-5000)
#X, y = load_creditcard()
X, y = load_creditcard()
clf = Stree(C=.01, max_iter=100, random_state=random_state)
clf.fit(X, y)

View File

@@ -19,13 +19,7 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
}
],
"outputs": [],
"source": [
"def load_creditcard(n_examples=0):\n",
" df = pd.read_csv('data/creditcard.csv')\n",
@@ -61,6 +55,7 @@
"\n",
"#X, y = load_wine(return_X_y=True)\n",
"#X, y = load_iris(return_X_y=True)\n",
"#y[y==2]=0\n",
"\n",
"X, y = load_creditcard()"
]
@@ -73,7 +68,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n"
"text": "root\nroot - Down\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n60.9873 secs\n"
}
],
"source": [
@@ -92,7 +87,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy: 0.999512\n33.1651 secs\n"
"text": "Accuracy: 0.999512\n0.3226 secs\n"
}
],
"source": [
@@ -109,7 +104,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "(284807, 2)\n87.5212 secs\n"
"text": "(284807, 2)\n0.4148 secs\n"
}
],
"source": [
@@ -119,6 +114,15 @@
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# outcomes without optimization executing predict_proba. 87 seconds\n",
"(284807, 2)\n",
"87.5212 secs"
]
},
{
"cell_type": "code",
"execution_count": 6,
@@ -127,7 +131,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "0.9991397683343457\n12.6601 secs\n"
"text": "0.9991397683343457\n20.9481 secs\n"
}
],
"source": [
@@ -146,7 +150,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "1.0\n18.2638 secs\n"
"text": "1.0\n32.2779 secs\n"
}
],
"source": [
@@ -156,6 +160,15 @@
"print(clf3.score(X, y))\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"from sklearn.utils.estimator_checks import check_estimator\n",
"clf = Stree()\n",
"check_estimator(clf)"
]
}
],
"metadata": {
@@ -173,8 +186,8 @@
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
"name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
"display_name": "Python 3.7.6 64-bit ('stree': venv)"
}
},
"nbformat": 4,

View File

@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
os.environ.pop('TESTING')
except:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
return X, y
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct
number of labels and that their children have the right number of elements in their datasets
Arguments:
node {Snode} -- node to check
"""
if node.is_leaf():
return
y_prediction = node._clf.predict(node._X)
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
unique_y, count_y = np.unique(node._y, return_counts=True)
_, count_d = np.unique(y_down, return_counts=True)
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
# as the node is not a leaf...
unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_down.shape[0])
self.assertEqual(count_yp[0], y_up.shape[0])
_, count_yp = np.unique(y_prediction, return_counts=True)
self.assertEqual(count_yp[1], y_up.shape[0])
self.assertEqual(count_yp[0], y_down.shape[0])
self._check_tree(node.get_down())
self._check_tree(node.get_up())
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same predictions as the tree itself
"""Check if the subdatasets files have the same labels as the original dataset
"""
model = self._clf._tree._clf
X, y = self._get_Xy()
model.fit(X, y)
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different from its current label
"""
# Element 28 has a different prediction than the truth
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0])
self.assertEqual(1, y[28])
self.assertEqual(0.9282970550576184, yp[0:, 1])
def test_multiple_predict_proba(self):
# For the first 27 elements the predictions are the same as the truth
num = 27
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
yp = self._clf.predict_proba(X[:num,:])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
0.92829706, 0.92829706, 0.9759887 ]
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
def test_use_model_predictions(self):
    """Check that we get the same results whether we use the estimator in
    nodes to compute labels or we use the hyperplane and the position of
    samples with respect to it
    """
    model_predictions = Stree(random_state=self._random_state,
                              use_predictions=True)
    model_hyperplane = Stree(random_state=self._random_state,
                             use_predictions=False)
    X, y = self._get_Xy()
    model_predictions.fit(X, y)
    model_hyperplane.fit(X, y)
    # Both strategies must label every sample identically
    self.assertListEqual(
        model_predictions.predict(X).tolist(),
        model_hyperplane.predict(X).tolist()
    )
    # No trailing comma after the call: it would turn `a` into a 1-tuple
    # and the check below would no longer compare two scalar scores
    a = model_predictions.score(X, y, print_out=False)
    b = model_hyperplane.score(X, y, print_out=False)
    self.assertEqual(a, b)
    self.assertGreater(b, .95)
def test_single_vs_multiple_prediction(self):
    """Check that predicting sample by sample gives the same result as
    predicting all samples at once
    """
    X, _ = self._get_Xy()
    # Compute prediction line by line. The per-sample results are collected
    # in a list and joined once; growing an array with np.append inside the
    # loop copies the whole array on every iteration
    per_sample = [
        np.ravel(self._clf.predict(xp.reshape(-1, X.shape[1]))) for xp in X
    ]
    if per_sample:
        yp_line = np.concatenate(per_sample)
    else:
        # np.concatenate rejects an empty list; keep the original empty result
        yp_line = np.array([], dtype=int)
    # Compute prediction at once
    yp_once = self._clf.predict(X)
    # Same labels, in the same order, either way
    self.assertListEqual(yp_line.tolist(), yp_once.tolist())

View File

@@ -18,8 +18,8 @@ class Snode:
self._interceptor = 0. if clf is None else clf.intercept_
self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples
self._X = X if os.environ.get(
'TESTING', 'Not Set') != 'Not Set' else None
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y
self._down = None
self._up = None
@@ -64,6 +64,6 @@ class Snode:
def __str__(self) -> str:
if self.is_leaf():
return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
else:
return f"{self._title}\n"

View File

@@ -43,26 +43,25 @@ class Stree(BaseEstimator, ClassifierMixin):
setattr(self, parameter, value)
return self
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
if self.__use_predictions:
yp = clf.predict(X)
yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1)
else:
# doesn't work with multiclass, as each sample has to compute the inner product with its own coefficients
# computes the position of every sample w.r.t. the hyperplane
coef = clf.coef_[0, :].reshape(-1, X.shape[1])
intercept = clf.intercept_[0]
res = X.dot(coef.T) + intercept
coef = node._vector[0, :].reshape(-1, data.shape[1])
res = data.dot(coef.T) + node._interceptor[0]
down = res > 0
up = ~down
X_down = X[down[:, 0]] if any(down) else None
y_down = y[down[:, 0]] if any(down) else None
X_up = X[up[:, 0]] if any(up) else None
y_up = y[up[:, 0]] if any(up) else None
return [X_up, y_up, X_down, y_down]
data_down = data[down[:, 0]] if any(down) else None
indices_down = indices[down[:, 0]] if any(down) else None
data_up = data[up[:, 0]] if any(up) else None
indices_up = indices[up[:, 0]] if any(up) else None
return [data_down, indices_down, data_up, indices_up]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
X, y = check_X_y(X, y)
X, y = check_X_y(X, y.ravel())
self.n_features_in_ = X.shape[1]
self._tree = self.train(X, y.ravel(), title)
self._build_predictor()
@@ -83,47 +82,59 @@ class Stree(BaseEstimator, ClassifierMixin):
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure> ')
return Snode(None, X, y, title + ', <pure>')
# Train the model
clf = LinearSVC(max_iter=self._max_iter, C=self._C,
random_state=self._random_state)
clf.fit(X, y)
tree = Snode(clf, X, y, title)
X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ', <couldn\'t go any further>')
return Snode(clf, X, y, title + ', <cgaf>')
tree.set_up(self.train(X_U, y_u, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
return tree
def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, tree: Snode) -> np.array:
if tree.is_leaf():
def _predict_values(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
if self.__proba:
return [tree._class, tree._belief]
prediction_proba = np.full((xp.shape[0], 1), node._belief)
return np.append(prediction, prediction_proba, axis=1), indices
else:
return tree._class
coef = tree._vector[0, :].reshape(-1, xp.shape[1])
if xp.dot(coef.T) + tree._interceptor[0] > 0:
return predict_class(xp, tree.get_down())
return predict_class(xp, tree.get_up())
return prediction, indices
u, i_u, d, i_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
# sklearn check
check_is_fitted(self)
# Input validation
X = check_array(X)
# setup prediction & make it happen
y = np.array([], dtype=int)
for xp in X:
y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
return y
indices = np.arange(X.shape[0])
return predict_class(X, indices, self._tree)
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
    """Place every entry of y at the position given by its index.

    Arguments:
        y {np.array} -- values to reorder; y[i] goes to position indices[i]
        indices {np.array} -- target position of each entry of y (cast to int)

    Returns:
        np.array -- array with the shape of y (int dtype when y is 1-D,
        float otherwise) whose position indices[i] holds y[i]
    """
    y = np.asarray(y)
    y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
    positions = np.asarray(indices).astype(int)
    # One vectorized scatter instead of a Python-level loop over the samples
    y_ordered[positions] = y
    return y_ordered
def predict(self, X: np.array) -> np.array:
    """Run `_predict_values` on X and pass its two results (values and
    their indices) to `_reorder_results`, returning what it produces.

    Arguments:
        X {np.array} -- samples to predict

    Returns:
        np.array -- output of `_reorder_results` for the computed values
    """
    values, indices = self._predict_values(X)
    return self._reorder_results(values, indices)
def predict_proba(self, X: np.array) -> np.array:
self.__proba = True
result = self.predict(X).reshape(X.shape[0], 2)
result, indices = self._predict_values(X)
self.__proba = False
return result
return self._reorder_results(result.reshape(X.shape[0], 2), indices)
def score(self, X: np.array, y: np.array, print_out=True) -> float:
if not self.__trained:
@@ -180,4 +191,4 @@ class Stree(BaseEstimator, ClassifierMixin):
"""Save every dataset stored in the tree to check against a manual classifier
"""
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
self._save_datasets(self._tree, catalog, 1)
self._save_datasets(self._tree, catalog, 1)