Adapt some notebooks

This commit is contained in:
2020-05-30 11:09:59 +02:00
parent a22ae81b54
commit 724a4855fb
5 changed files with 279 additions and 78 deletions

View File

@@ -2,7 +2,7 @@
# Stree
Oblique Tree classifier based on SVM nodes
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn LinearSVC models.Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)

190
notebooks/adaboost.ipynb Normal file
View File

@@ -0,0 +1,190 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from stree import Stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"# data = load_creditcard(0) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf = Stree(max_depth=3, random_state=random_state)\n",
"clf.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf2.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
"clf3.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf4.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long

View File

@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -24,7 +24,7 @@
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m<ipython-input-12-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
]
}

View File

@@ -152,11 +152,6 @@ class Stree(BaseEstimator, ClassifierMixin):
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
# computes positition of every sample is w.r.t. the hyperplane
res = self._linear_function(data, node)
# data_up, data_down = self._split_array(data, down)
# indices_up, indices_down = self._split_array(indices, down)
# res_up, res_down = self._split_array(res, down)
# weight_up, weight_down = self._split_array(weights, down)
#return [data_up, indices_up, data_down, indices_down, weight_up, weight_down, res_up, res_down]
return res
def _split_criteria(self, data: np.array) -> np.array:
@@ -176,7 +171,6 @@ class Stree(BaseEstimator, ClassifierMixin):
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
# Initialize computed parameters
#self.random_state = check_random_state(self.random_state)
self.classes_ = np.unique(y)
self.n_iter_ = self.max_iter
self.depth_ = 0
@@ -316,8 +310,7 @@ class Stree(BaseEstimator, ClassifierMixin):
# sklearn check
check_is_fitted(self)
yp = self.predict(X).reshape(y.shape)
right = (yp == y).astype(int)
return np.sum(right) / len(y)
return np.mean(yp == y)
def __iter__(self) -> Siterator:
try: