mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 23:46:02 +00:00
Implement predict & predict_proba optimization
Reduces prediction time by two orders of magnitude on the creditcard dataset
This commit is contained in:
@@ -10,5 +10,5 @@ python main.py
|
||||
## Tests
|
||||
|
||||
```python
|
||||
python -m unittest tests.Stree_test tests.Snode_test
|
||||
python -m unittest -v tests.Stree_test tests.Snode_test
|
||||
```
|
||||
|
3
main.py
3
main.py
@@ -33,8 +33,9 @@ def load_creditcard(n_examples=0):
|
||||
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
|
||||
print("Valid: {0:.3f}% {1}".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))
|
||||
return X, y
|
||||
X, y = load_creditcard(-5000)
|
||||
#X, y = load_creditcard(-5000)
|
||||
#X, y = load_creditcard()
|
||||
X, y = load_creditcard()
|
||||
|
||||
clf = Stree(C=.01, max_iter=100, random_state=random_state)
|
||||
clf.fit(X, y)
|
||||
|
41
test.ipynb
41
test.ipynb
@@ -19,13 +19,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def load_creditcard(n_examples=0):\n",
|
||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||
@@ -61,6 +55,7 @@
|
||||
"\n",
|
||||
"#X, y = load_wine(return_X_y=True)\n",
|
||||
"#X, y = load_iris(return_X_y=True)\n",
|
||||
"#y[y==2]=0\n",
|
||||
"\n",
|
||||
"X, y = load_creditcard()"
|
||||
]
|
||||
@@ -73,7 +68,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\nroot - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Down - Up - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Down - Up\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\n\n44.3767 secs\n"
|
||||
"text": "root\nroot - Down\nLeaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nLeaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nLeaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nLeaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nLeaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nLeaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nLeaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n60.9873 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -92,7 +87,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Accuracy: 0.999512\n33.1651 secs\n"
|
||||
"text": "Accuracy: 0.999512\n0.3226 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -109,7 +104,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "(284807, 2)\n87.5212 secs\n"
|
||||
"text": "(284807, 2)\n0.4148 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -119,6 +114,15 @@
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# outcomes without optimization executing predict_proba. 87 seconds\n",
|
||||
"(284807, 2)\n",
|
||||
"87.5212 secs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
@@ -127,7 +131,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "0.9991397683343457\n12.6601 secs\n"
|
||||
"text": "0.9991397683343457\n20.9481 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -146,7 +150,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1.0\n18.2638 secs\n"
|
||||
"text": "1.0\n32.2779 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -156,6 +160,15 @@
|
||||
"print(clf3.score(X, y))\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||
"clf = Stree()\n",
|
||||
"check_estimator(clf)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -173,8 +186,8 @@
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||
"name": "python37664bitstreevenva9e4a4efdc1042b6b577bd15fbe145ee",
|
||||
"display_name": "Python 3.7.6 64-bit ('stree': venv)"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@@ -24,7 +24,7 @@ class Stree_test(unittest.TestCase):
|
||||
os.environ.pop('TESTING')
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
def _get_Xy(self):
|
||||
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
||||
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
||||
@@ -32,6 +32,12 @@ class Stree_test(unittest.TestCase):
|
||||
return X, y
|
||||
|
||||
def _check_tree(self, node: Snode):
|
||||
"""Check recursively that the nodes that are not leaves have the correct
|
||||
number of labels and its sons have the right number of elements in their dataset
|
||||
|
||||
Arguments:
|
||||
node {Snode} -- node to check
|
||||
"""
|
||||
if node.is_leaf():
|
||||
return
|
||||
y_prediction = node._clf.predict(node._X)
|
||||
@@ -43,6 +49,7 @@ class Stree_test(unittest.TestCase):
|
||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||
_, count_d = np.unique(y_down, return_counts=True)
|
||||
_, count_u = np.unique(y_up, return_counts=True)
|
||||
#
|
||||
for i in unique_y:
|
||||
try:
|
||||
number_down = count_d[i]
|
||||
@@ -55,9 +62,9 @@ class Stree_test(unittest.TestCase):
|
||||
self.assertEqual(count_y[i], number_down + number_up)
|
||||
# Is the partition made the same as the prediction?
|
||||
# as the node is not a leaf...
|
||||
unique_yp, count_yp = np.unique(y_prediction, return_counts=True)
|
||||
self.assertEqual(count_yp[1], y_down.shape[0])
|
||||
self.assertEqual(count_yp[0], y_up.shape[0])
|
||||
_, count_yp = np.unique(y_prediction, return_counts=True)
|
||||
self.assertEqual(count_yp[1], y_up.shape[0])
|
||||
self.assertEqual(count_yp[0], y_down.shape[0])
|
||||
self._check_tree(node.get_down())
|
||||
self._check_tree(node.get_up())
|
||||
|
||||
@@ -101,11 +108,8 @@ class Stree_test(unittest.TestCase):
|
||||
return res
|
||||
|
||||
def test_subdatasets(self):
|
||||
"""Check if the subdatasets files have the same predictions as the tree itself
|
||||
"""Check if the subdatasets files have the same labels as the original dataset
|
||||
"""
|
||||
model = self._clf._tree._clf
|
||||
X, y = self._get_Xy()
|
||||
model.fit(X, y)
|
||||
self._clf.save_sub_datasets()
|
||||
with open(self._clf.get_catalog_name()) as cat_file:
|
||||
catalog = csv.reader(cat_file, delimiter=',')
|
||||
@@ -134,19 +138,23 @@ class Stree_test(unittest.TestCase):
|
||||
right = (yp == y).astype(int)
|
||||
accuracy_computed = sum(right) / len(y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertGreater(accuracy_score, 0.8)
|
||||
|
||||
def test_single_predict_proba(self):
|
||||
"""Check that element 28 has a prediction different that the current label
|
||||
"""
|
||||
# Element 28 has a different prediction than the truth
|
||||
X, y = self._get_Xy()
|
||||
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
|
||||
self.assertEqual(0, yp[0:, 0])
|
||||
self.assertEqual(1, y[28])
|
||||
self.assertEqual(0.9282970550576184, yp[0:, 1])
|
||||
|
||||
def test_multiple_predict_proba(self):
|
||||
# First 27 elements the predictions are the same as the truth
|
||||
num = 27
|
||||
X, y = self._get_Xy()
|
||||
yp = self._clf.predict_proba(X[:num, :])
|
||||
yp = self._clf.predict_proba(X[:num,:])
|
||||
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
|
||||
expected_proba = [0.9759887, 0.92829706, 0.9759887, 0.92829706, 0.92829706, 0.9759887,
|
||||
0.92829706, 0.9759887, 0.9759887, 0.9759887, 0.9759887, 0.92829706,
|
||||
@@ -155,5 +163,42 @@ class Stree_test(unittest.TestCase):
|
||||
0.92829706, 0.92829706, 0.9759887 ]
|
||||
self.assertListEqual(expected_proba, np.round(yp[:, 1], decimals=8).tolist())
|
||||
|
||||
def test_use_model_predictions(self):
    """Check that we get the same results whether we use the estimator in
    the nodes to compute labels or we use the hyperplane and the position
    of the samples with respect to it
    """
    model_predictions = Stree(random_state=self._random_state,
                              use_predictions=True)
    model_hyperplane = Stree(random_state=self._random_state,
                             use_predictions=False)
    X, y = self._get_Xy()
    model_predictions.fit(X, y)
    model_hyperplane.fit(X, y)
    self.assertListEqual(
        model_predictions.predict(X).tolist(),
        model_hyperplane.predict(X).tolist()
    )
    # Both scores are plain values: a trailing comma after the first call
    # would wrap it in a one-element tuple and the equality check below
    # would stop being a score-to-score comparison
    a = model_predictions.score(X, y, print_out=False)
    b = model_hyperplane.score(X, y, print_out=False)
    self.assertEqual(a, b)
    self.assertGreater(b, .95)
|
||||
|
||||
def test_single_vs_multiple_prediction(self):
    """Check if predicting sample by sample gives the same result as
    predicting all samples at once
    """
    X, _ = self._get_Xy()
    # Compute prediction line by line. The per-sample results are gathered
    # in a list and flattened once at the end, instead of letting np.append
    # copy the whole accumulated array on every iteration
    per_sample = [
        self._clf.predict(xp.reshape(-1, X.shape[1])) for xp in X
    ]
    yp_line = np.append(np.array([], dtype=int), per_sample)
    # Compute prediction at once
    yp_once = self._clf.predict(X)
    # Both strategies have to label every sample identically
    self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@@ -18,8 +18,8 @@ class Snode:
|
||||
self._interceptor = 0. if clf is None else clf.intercept_
|
||||
self._title = title
|
||||
self._belief = 0. # belief of the prediction in a leaf node based on samples
|
||||
self._X = X if os.environ.get(
|
||||
'TESTING', 'Not Set') != 'Not Set' else None
|
||||
# Only store dataset in Testing
|
||||
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
|
||||
self._y = y
|
||||
self._down = None
|
||||
self._up = None
|
||||
@@ -64,6 +64,6 @@ class Snode:
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.is_leaf():
|
||||
return f"Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
|
||||
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}\n"
|
||||
else:
|
||||
return f"{self._title}\n"
|
||||
|
@@ -43,26 +43,25 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
setattr(self, parameter, value)
|
||||
return self
|
||||
|
||||
def _split_data(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray) -> list:
|
||||
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
|
||||
if self.__use_predictions:
|
||||
yp = clf.predict(X)
|
||||
yp = node._clf.predict(data)
|
||||
down = (yp == 1).reshape(-1, 1)
|
||||
else:
|
||||
# doesn't work with multiclass as each sample has to do inner product with its own coefficients
|
||||
# computes the position of every sample w.r.t. the hyperplane
|
||||
coef = clf.coef_[0, :].reshape(-1, X.shape[1])
|
||||
intercept = clf.intercept_[0]
|
||||
res = X.dot(coef.T) + intercept
|
||||
coef = node._vector[0, :].reshape(-1, data.shape[1])
|
||||
res = data.dot(coef.T) + node._interceptor[0]
|
||||
down = res > 0
|
||||
up = ~down
|
||||
X_down = X[down[:, 0]] if any(down) else None
|
||||
y_down = y[down[:, 0]] if any(down) else None
|
||||
X_up = X[up[:, 0]] if any(up) else None
|
||||
y_up = y[up[:, 0]] if any(up) else None
|
||||
return [X_up, y_up, X_down, y_down]
|
||||
data_down = data[down[:, 0]] if any(down) else None
|
||||
indices_down = indices[down[:, 0]] if any(down) else None
|
||||
data_up = data[up[:, 0]] if any(up) else None
|
||||
indices_up = indices[up[:, 0]] if any(up) else None
|
||||
return [data_down, indices_down, data_up, indices_up]
|
||||
|
||||
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
|
||||
X, y = check_X_y(X, y)
|
||||
X, y = check_X_y(X, y.ravel())
|
||||
self.n_features_in_ = X.shape[1]
|
||||
self._tree = self.train(X, y.ravel(), title)
|
||||
self._build_predictor()
|
||||
@@ -83,47 +82,59 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
|
||||
if np.unique(y).shape[0] == 1:
|
||||
# only 1 class => pure dataset
|
||||
return Snode(None, X, y, title + ', <pure> ')
|
||||
return Snode(None, X, y, title + ', <pure>')
|
||||
# Train the model
|
||||
clf = LinearSVC(max_iter=self._max_iter, C=self._C,
|
||||
random_state=self._random_state)
|
||||
clf.fit(X, y)
|
||||
tree = Snode(clf, X, y, title)
|
||||
X_U, y_u, X_D, y_d = self._split_data(clf, X, y)
|
||||
X_U, y_u, X_D, y_d = self._split_data(tree, X, y)
|
||||
if X_U is None or X_D is None:
|
||||
# didn't part anything
|
||||
return Snode(clf, X, y, title + ', <couldn\'t go any further>')
|
||||
return Snode(clf, X, y, title + ', <cgaf>')
|
||||
tree.set_up(self.train(X_U, y_u, title + ' - Up'))
|
||||
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
|
||||
return tree
|
||||
|
||||
def predict(self, X: np.array) -> np.array:
|
||||
def predict_class(xp: np.array, tree: Snode) -> np.array:
|
||||
if tree.is_leaf():
|
||||
def _predict_values(self, X: np.array) -> np.array:
|
||||
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
|
||||
if xp is None:
|
||||
return [], []
|
||||
if node.is_leaf():
|
||||
# set a class for every sample in dataset
|
||||
prediction = np.full((xp.shape[0], 1), node._class)
|
||||
if self.__proba:
|
||||
return [tree._class, tree._belief]
|
||||
prediction_proba = np.full((xp.shape[0], 1), node._belief)
|
||||
return np.append(prediction, prediction_proba, axis=1), indices
|
||||
else:
|
||||
return tree._class
|
||||
coef = tree._vector[0, :].reshape(-1, xp.shape[1])
|
||||
if xp.dot(coef.T) + tree._interceptor[0] > 0:
|
||||
return predict_class(xp, tree.get_down())
|
||||
return predict_class(xp, tree.get_up())
|
||||
|
||||
return prediction, indices
|
||||
u, i_u, d, i_d = self._split_data(node, xp, indices)
|
||||
k, l = predict_class(d, i_d, node.get_down())
|
||||
m, n = predict_class(u, i_u, node.get_up())
|
||||
return np.append(k, m), np.append(l, n)
|
||||
# sklearn check
|
||||
check_is_fitted(self)
|
||||
# Input validation
|
||||
X = check_array(X)
|
||||
# setup prediction & make it happen
|
||||
y = np.array([], dtype=int)
|
||||
for xp in X:
|
||||
y = np.append(y, predict_class(xp.reshape(-1, X.shape[1]), self._tree))
|
||||
return y
|
||||
indices = np.arange(X.shape[0])
|
||||
return predict_class(X, indices, self._tree)
|
||||
|
||||
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
|
||||
y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
|
||||
indices = indices.astype(int)
|
||||
for i, index in enumerate(indices):
|
||||
y_ordered[index] = y[i]
|
||||
return y_ordered
|
||||
|
||||
def predict(self, X: np.array) -> np.array:
    """Predict the class of every sample in X.

    Arguments:
        X {np.array} -- samples to classify

    Returns:
        np.array -- result of _reorder_results applied to the values and
            indices produced by _predict_values
    """
    values, indices = self._predict_values(X)
    return self._reorder_results(values, indices)
|
||||
|
||||
def predict_proba(self, X: np.array) -> np.array:
|
||||
self.__proba = True
|
||||
result = self.predict(X).reshape(X.shape[0], 2)
|
||||
result, indices = self._predict_values(X)
|
||||
self.__proba = False
|
||||
return result
|
||||
return self._reorder_results(result.reshape(X.shape[0], 2), indices)
|
||||
|
||||
def score(self, X: np.array, y: np.array, print_out=True) -> float:
|
||||
if not self.__trained:
|
||||
@@ -180,4 +191,4 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
"""Save the every dataset stored in the tree to check with manual classifier
|
||||
"""
|
||||
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
|
||||
self._save_datasets(self._tree, catalog, 1)
|
||||
self._save_datasets(self._tree, catalog, 1)
|
Reference in New Issue
Block a user