Compare commits

...

7 Commits

Author SHA1 Message Date
b4816b2995 Show sample_weight use in test2 notebook
Update revision to RC4
Lint Stree grapher
2020-05-30 23:59:40 +02:00
5e5fea9c6a Document & lint code 2020-05-30 23:10:10 +02:00
724a4855fb Adapt some notebooks 2020-05-30 11:09:59 +02:00
a22ae81b54 Refactor split_data adding sample_weight 2020-05-29 18:52:23 +02:00
ed98054f0d First approach
Added max_depth, tol, weighted samples
2020-05-29 12:46:10 +02:00
e95bd9697a Make Stree a sklearn estimator
Added check_estimator in notebook test2
Added a Stree test with check_estimator
2020-05-25 19:51:39 +02:00
5956cd0cd2 Update google colab setup in notebooks
Update save_all in grapher to make dest. folder if it doesn't exist
2020-05-24 20:13:27 +02:00
14 changed files with 1221 additions and 665 deletions

View File

@@ -2,7 +2,7 @@
# Stree # Stree
Oblique Tree classifier based on SVM nodes Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png) ![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)
@@ -18,15 +18,15 @@ pip install git+https://github.com/doctorado-ml/stree
##### Slow launch but better integration ##### Slow launch but better integration
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb) Test notebook * [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook
##### Fast launch but have to run first commented out cell for setup ##### Fast launch but have to run first commented out cell for setup
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test.ipynb) Test notebook * [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test2.ipynb) Another Test notebook * [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test_graphs.ipynb) Test Graphics notebook * [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook
### Command line ### Command line

1
data/.gitignore vendored
View File

@@ -1 +0,0 @@
*

190
notebooks/adaboost.ipynb Normal file
View File

@@ -0,0 +1,190 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from stree import Stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
"    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
"    !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
"    \"\"\"Load the creditcard dataset and return a stratified train/test split.\n",
"\n",
"    n_examples > 0: take the first n_examples samples.\n",
"    n_examples < 0: take every fraud sample plus -n_examples random negatives.\n",
"    n_examples == 0: take the whole dataset.\n",
"    \"\"\"\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    import random\n",
"    df = pd.read_csv('data/creditcard.csv')\n",
"    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
"    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
"    y = df.Class\n",
"    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
"    if n_examples > 0:\n",
"        # Take first n_examples samples\n",
"        X = X[:n_examples, :]\n",
"        y = y[:n_examples]  # y is a 1-D pandas Series; a 2-d slice would raise\n",
"    else:\n",
"        # Take all the positive samples with a number of random negatives\n",
"        if n_examples < 0:\n",
"            Xt = X[(y == 1).ravel()]\n",
"            yt = y[(y == 1).ravel()]\n",
"            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
"            X = np.append(Xt, X[indices], axis=0)\n",
"            y = np.append(yt, y[indices], axis=0)\n",
"    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
"    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
"    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
"    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
"    return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"# data = load_creditcard(0) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf = Stree(max_depth=3, random_state=random_state)\n",
"clf.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf2.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
"clf3.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf4.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
}
],
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -9,10 +9,7 @@
"#\n", "#\n",
"# Google Colab setup\n", "# Google Colab setup\n",
"#\n", "#\n",
"#import os\n", "#!pip install git+https://github.com/doctorado-ml/stree"
"#os.chdir(\"/content\")\n",
"#!git clone https://github.com/Doctorado-ML/STree.git\n",
"#os.chdir(\"/content/STree\")"
] ]
}, },
{ {

236
notebooks/gridsearch.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@@ -9,10 +9,7 @@
"#\n", "#\n",
"# Google Colab setup\n", "# Google Colab setup\n",
"#\n", "#\n",
"#import os\n", "#!pip install git+https://github.com/doctorado-ml/stree"
"#os.chdir(\"/content\")\n",
"#!git clone https://github.com/Doctorado-ML/STree.git\n",
"#os.chdir(\"/content/STree\")"
] ]
}, },
{ {

225
notebooks/test2.ipynb Normal file
View File

@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
"    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
"    !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
"    \"\"\"Load the creditcard dataset and return a stratified train/test split.\n",
"\n",
"    n_examples > 0: take the first n_examples samples.\n",
"    n_examples < 0: take every fraud sample plus -n_examples random negatives.\n",
"    n_examples == 0: take the whole dataset.\n",
"    \"\"\"\n",
"    import pandas as pd\n",
"    import numpy as np\n",
"    import random\n",
"    df = pd.read_csv('data/creditcard.csv')\n",
"    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
"    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
"    y = df.Class\n",
"    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
"    if n_examples > 0:\n",
"        # Take first n_examples samples\n",
"        X = X[:n_examples, :]\n",
"        y = y[:n_examples]  # y is a 1-D pandas Series; a 2-d slice would raise\n",
"    else:\n",
"        # Take all the positive samples with a number of random negatives\n",
"        if n_examples < 0:\n",
"            Xt = X[(y == 1).ravel()]\n",
"            yt = y[(y == 1).ravel()]\n",
"            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
"            X = np.append(Xt, X[indices], axis=0)\n",
"            y = np.append(yt, y[indices], axis=0)\n",
"    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
"    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
"    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
"    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
"    return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]\n",
"# Set weights inverse to its count class in dataset\n",
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
"weights[ytrain==1] = 1.99755 "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy of Train without weights 0.996415770609319\nAccuracy of Train with weights 0.994026284348865\nAccuracy of Tests without weights 0.9665738161559888\nAccuracy of Tests with weights 0.9721448467966574\n"
}
],
"source": [
"C = 23\n",
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
"    clf = Stree(C=C, random_state=random_state)\n",
"    clf.fit(Xtrain, ytrain)\n",
"    print(f\"************** C={C} ****************************\")\n",
"    print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
"    print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
"    print(clf)\n",
"    print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"scaler = StandardScaler()\n",
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"print(res[:4, :])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
"    print(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#check iterator again\n",
"for i in clf:\n",
"    print(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
"check_estimator(Stree())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Stree(), generate_only=True)\n",
"for check in checks:\n",
"    c += 1\n",
"    print(c, check[1])\n",
"    check[1](check[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

197
notebooks/test_graphs.ipynb Normal file
View File

@@ -0,0 +1,197 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import random\n",
"import numpy as np\n",
"from sklearn.datasets import make_blobs\n",
"from sklearn.svm import LinearSVC\n",
"from stree import Stree, Stree_grapher"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def build_data(random_state):\n",
"    \"\"\"Build a binary blob dataset by collapsing 10 blob centers to 2 labels.\"\"\"\n",
"    random.seed(random_state)\n",
"    X, y = make_blobs(centers=10, n_features=3, n_samples=500, random_state=random_state)\n",
"    def make_binary(y):\n",
"        for i in range(2, 10):\n",
"            y[y==i] = random.randint(0, 1)\n",
"        return y\n",
"    y = make_binary(y)\n",
"    #print(X.shape, np.unique(y), y[y==0].shape, y[y==1].shape)\n",
"    return X, y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X, y = build_data(10)\n",
"gr = Stree_grapher(dict(C=.01, max_iter=200))\n",
"gr.fit(X, y)\n",
"#gr.save_all(save_folder='data/', save_prefix='7')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(gr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib\n",
"matplotlib.use('Agg')\n",
"gr.save_all(save_folder='data/')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
"#%matplotlib inline\n",
"%matplotlib widget\n",
"gr._tree_gr.plot_hyperplane()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
"%matplotlib inline\n",
"#%matplotlib widget\n",
"gr.plot_all()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,8 +1,9 @@
import setuptools import setuptools
__version__ = "0.9rc1" __version__ = "0.9rc4"
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
def readme(): def readme():
with open('README.md') as f: with open('README.md') as f:
return f.read() return f.read()
@@ -12,14 +13,15 @@ setuptools.setup(
name='STree', name='STree',
version=__version__, version=__version__,
license='MIT License', license='MIT License',
description='a python interface to oblique decision tree implementations', description='Oblique decision tree with svm nodes',
long_description=readme(), long_description=readme(),
long_description_content_type='text/markdown', long_description_content_type='text/markdown',
packages=setuptools.find_packages(), packages=setuptools.find_packages(),
url='https://github.com/doctorado-ml/stree', url='https://github.com/doctorado-ml/stree',
author=__author__, author=__author__,
author_email='ricardo.montanana@alu.uclm.es', author_email='ricardo.montanana@alu.uclm.es',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc', keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc',
classifiers=[ classifiers=[
'Development Status :: 4 - Beta', 'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License', 'License :: OSI Approved :: MIT License',
@@ -34,7 +36,6 @@ setuptools.setup(
'matplotlib', 'matplotlib',
'ipympl' 'ipympl'
], ],
data_files=[('data', ['data/.gitignore'])],
test_suite="stree.tests", test_suite="stree.tests",
zip_safe=False zip_safe=False
) )

View File

@@ -7,21 +7,28 @@ Build an oblique tree classifier based on SVM Trees
Uses LinearSVC Uses LinearSVC
''' '''
import typing
import os import os
import numpy as np import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \
_check_sample_weight
class Snode: class Snode:
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str): """Nodes of the tree that keeps the svm classifier and if testing the
dataset assigned to it
"""
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray,
title: str):
self._clf = clf self._clf = clf
self._vector = None if clf is None else clf.coef_ self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_ self._interceptor = 0. if clf is None else clf.intercept_
self._title = title self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples self._belief = 0.
# Only store dataset in Testing # Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y self._y = y
@@ -49,8 +56,8 @@ class Snode:
return self._up return self._up
def make_predictor(self): def make_predictor(self):
"""Compute the class of the predictor and its belief based on the subdataset of the node """Compute the class of the predictor and its belief based on the
only if it is a leaf subdataset of the node only if it is a leaf
""" """
if not self.is_leaf(): if not self.is_leaf():
return return
@@ -60,7 +67,7 @@ class Snode:
min_card = min(card) min_card = min(card)
try: try:
self._belief = max_card / (max_card + min_card) self._belief = max_card / (max_card + min_card)
except: except ZeroDivisionError:
self._belief = 0. self._belief = 0.
self._class = classes[card == max_card][0] self._class = classes[card == max_card][0]
else: else:
@@ -69,7 +76,10 @@ class Snode:
def __str__(self) -> str: def __str__(self) -> str:
if self.is_leaf(): if self.is_leaf():
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}" count_values = np.unique(self._y, return_counts=True)
result = f"{self._title} - Leaf class={self._class} belief="\
f"{self._belief: .6f} counts={count_values}"
return result
else: else:
return f"{self._title}" return f"{self._title}"
@@ -97,67 +107,129 @@ class Siterator:
self._push(node.get_down()) self._push(node.get_down())
return node return node
class Stree(BaseEstimator, ClassifierMixin): class Stree(BaseEstimator, ClassifierMixin):
""" """Estimator that is based on binary trees of svm nodes
can deal with sample_weights in predict, used in boosting sklearn methods
inheriting from BaseEstimator implements get_params and set_params methods
inheriting from ClassifierMixin implement the attribute _estimator_type
with "classifier" as value
""" """
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False): def __init__(self, C: float = 1.0, max_iter: int = 1000,
self._max_iter = max_iter random_state: int = None, max_depth: int = None,
self._C = C tol: float = 1e-4, use_predictions: bool = False):
self._random_state = random_state self.max_iter = max_iter
self._tree = None self.C = C
self.__folder = 'data/' self.random_state = random_state
self.__use_predictions = use_predictions self.use_predictions = use_predictions
self.__trained = False self.max_depth = max_depth
self.__proba = False self.tol = tol
def get_params(self, deep=True): def _more_tags(self) -> dict:
"""Get dict with hyperparameters and its values to accomplish sklearn rules """Required by sklearn to tell that this estimator is a binary classifier
"""
return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter}
def set_params(self, **parameters): :return: the tag required
"""Set hyperparmeters as specified by sklearn, needed in Gridsearchs :rtype: dict
""" """
for parameter, value in parameters.items(): return {'binary_only': True, 'requires_y': True}
setattr(self, parameter, value)
return self
def _linear_function(self, data: np.array, node: Snode) -> np.array: def _linear_function(self, data: np.array, node: Snode) -> np.array:
"""Compute the distance of set of samples to a hyperplane, in
multiclass classification it should compute the distance to a
hyperplane of each class
:param data: dataset of samples
:type data: np.array
:param node: the node that contains the hyperplance coefficients
:type node: Snode
:return: array of distances of each sample to the hyperplane
:rtype: np.array
"""
coef = node._vector[0, :].reshape(-1, data.shape[1]) coef = node._vector[0, :].reshape(-1, data.shape[1])
return data.dot(coef.T) + node._interceptor[0] return data.dot(coef.T) + node._interceptor[0]
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list: def _split_array(self, origin: np.array, down: np.array) -> list:
if self.__use_predictions: """Split an array in two based on indices passed as down and its complement
yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1) :param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~down
return origin[up[:, 0]] if any(up) else None, \
origin[down[:, 0]] if any(down) else None
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
:param node: node containing the svm classifier
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:rtype: np.array
"""
if self.use_predictions:
res = np.expand_dims(node._clf.decision_function(data), 1) res = np.expand_dims(node._clf.decision_function(data), 1)
else: else:
# doesn't work with multiclass as each sample has to do inner product with its own coeficients """doesn't work with multiclass as each sample has to do inner
# computes positition of every sample is w.r.t. the hyperplane product with its own coefficients computes positition of every
sample is w.r.t. the hyperplane
"""
res = self._linear_function(data, node) res = self._linear_function(data, node)
down = res > 0 return res
up = ~down
data_down = data[down[:, 0]] if any(down) else None
indices_down = indices[down[:, 0]] if any(down) else None
res_down = res[down[:, 0]] if any(down) else None
data_up = data[up[:, 0]] if any(up) else None
indices_up = indices[up[:, 0]] if any(up) else None
res_up = res[up[:, 0]] if any(up) else None
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree': def _split_criteria(self, data: np.array) -> np.array:
X, y = check_X_y(X, y.ravel()) """Set the criteria to split arrays
:param data: [description]
:type data: np.array
:return: [description]
:rtype: np.array
"""
return data > 0
def fit(self, X: np.ndarray, y: np.ndarray,
sample_weight: np.array = None) -> 'Stree':
"""Build the tree based on the dataset of samples and its labels
:raises ValueError: if parameters C or max_depth are out of bounds
:return: itself to be able to chain actions: fit().predict() ...
:rtype: Stree
"""
# Check parameters are Ok.
if type(y).__name__ == 'np.ndarray':
y = y.ravel()
if self.C < 0:
raise ValueError(
f"Penalty term must be positive... got (C={self.C:f})")
self.__max_depth = np.iinfo(
np.int32).max if self.max_depth is None else self.max_depth
if self.__max_depth < 1:
raise ValueError(
f"Maximum depth has to be greater than 1... got (max_depth=\
{self.max_depth})")
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
# Initialize computed parameters
self.classes_, y = np.unique(y, return_inverse=True)
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_in_ = X.shape[1] self.n_features_in_ = X.shape[1]
self._tree = self.train(X, y.ravel(), title) self.tree_ = self.train(X, y, sample_weight, 1, 'root')
self._build_predictor() self._build_predictor()
self.__trained = True
return self return self
def _build_predictor(self): def _build_predictor(self):
"""Process the leaves to make them predictors """Process the leaves to make them predictors
""" """
def run_tree(node: Snode): def run_tree(node: Snode):
if node.is_leaf(): if node.is_leaf():
node.make_predictor() node.make_predictor()
@@ -165,69 +237,121 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(node.get_down()) run_tree(node.get_down())
run_tree(node.get_up()) run_tree(node.get_up())
run_tree(self._tree) run_tree(self.tree_)
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode: def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray,
depth: int, title: str) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
:param X: samples dataset
:type X: np.ndarray
:param y: samples labels
:type y: np.ndarray
:param sample_weight: weight of samples (used in boosting)
:type sample_weight: np.ndarray
:param depth: actual depth in the tree
:type depth: int
:param title: description of the node
:type title: str
:return: binary tree
:rtype: Snode
"""
if depth > self.__max_depth:
return None
if np.unique(y).shape[0] == 1: if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset # only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure>') return Snode(None, X, y, title + ', <pure>')
# Train the model # Train the model
clf = LinearSVC(max_iter=self._max_iter, C=self._C, clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
random_state=self._random_state) C=self.C) # , sample_weight=sample_weight)
clf.fit(X, y) clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title) tree = Snode(clf, X, y, title)
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y) self.depth_ = max(depth, self.depth_)
down = self._split_criteria(self._distances(tree, X))
X_U, X_D = self._split_array(X, down)
y_u, y_d = self._split_array(y, down)
sw_u, sw_d = self._split_array(sample_weight, down)
if X_U is None or X_D is None: if X_U is None or X_D is None:
# didn't part anything # didn't part anything
return Snode(clf, X, y, title + ', <cgaf>') return Snode(clf, X, y, title + ', <cgaf>')
tree.set_up(self.train(X_U, y_u, title + ' - Up')) tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, title + ' - Down')) tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
return tree return tree
def _reorder_results(self, y: np.array, indices: np.array) -> np.array: def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float) """Reorder an array based on the array of indices passed
:param y: data untidy
:type y: np.array
:param indices: indices used to set order
:type indices: np.array
:return: array y ordered
:rtype: np.array
"""
if y.ndim > 1 and y.shape[1] > 1:
# if predict_proba return np.array of floats
y_ordered = np.zeros(y.shape, dtype=float)
else:
# return array of same type given in y
y_ordered = y.copy()
indices = indices.astype(int) indices = indices.astype(int)
for i, index in enumerate(indices): for i, index in enumerate(indices):
y_ordered[index] = y[i] y_ordered[index] = y[i]
return y_ordered return y_ordered
def predict(self, X: np.array) -> np.array: def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array: """Predict labels for each sample in dataset passed
:param X: dataset of samples
:type X: np.array
:return: array of labels
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array,
node: Snode) -> np.array:
if xp is None: if xp is None:
return [], [] return [], []
if node.is_leaf(): if node.is_leaf():
# set a class for every sample in dataset # set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class) prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices return prediction, indices
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices) down = self._split_criteria(self._distances(node, xp))
k, l = predict_class(d, i_d, node.get_down()) X_U, X_D = self._split_array(xp, down)
m, n = predict_class(u, i_u, node.get_up()) i_u, i_d = self._split_array(indices, down)
return np.append(k, m), np.append(l, n) prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check # sklearn check
check_is_fitted(self) check_is_fitted(self, ['tree_'])
# Input validation # Input validation
X = check_array(X) X = check_array(X)
# setup prediction & make it happen # setup prediction & make it happen
indices = np.arange(X.shape[0]) indices = np.arange(X.shape[0])
return self._reorder_results(*predict_class(X, indices, self._tree)) result = self._reorder_results(
*predict_class(X, indices, self.tree_)).astype(int).ravel()
return self.classes_[result]
def predict_proba(self, X: np.array) -> np.array: def predict_proba(self, X: np.array) -> np.array:
"""Computes an approximation of the probability of samples belonging to class 1 """Computes an approximation of the probability of samples belonging to
(nothing more, nothing less) class 0 and 1
:param X: dataset :param X: dataset
:type X: np.array :type X: np.array
:return: array array of shape (m, num_classes), probability of being
each class
:rtype: np.array
""" """
def predict_class(xp: np.array, indices: np.array, dist: np.array,
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array: node: Snode) -> np.array:
"""Run the tree to compute predictions """Run the tree to compute predictions
:param xp: subdataset of samples :param xp: subdataset of samples
:type xp: np.array :type xp: np.array
:param indices: indices of subdataset samples to rebuild original order :param indices: indices of subdataset samples to rebuild original
order
:type indices: np.array :type indices: np.array
:param dist: distances of every sample to the hyperplane or the father node :param dist: distances of every sample to the hyperplane or the
father node
:type dist: np.array :type dist: np.array
:param node: node of the leaf with the class :param node: node of the leaf with the class
:type node: Snode :type node: Snode
@@ -241,71 +365,68 @@ class Stree(BaseEstimator, ClassifierMixin):
prediction = np.full((xp.shape[0], 1), node._class) prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = dist prediction_proba = dist
return np.append(prediction, prediction_proba, axis=1), indices return np.append(prediction, prediction_proba, axis=1), indices
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices) distances = self._distances(node, xp)
k, l = predict_class(d, i_d, r_d, node.get_down()) down = self._split_criteria(distances)
m, n = predict_class(u, i_u, r_u, node.get_up()) X_U, X_D = self._split_array(xp, down)
return np.append(k, m), np.append(l, n) i_u, i_d = self._split_array(indices, down)
di_u, di_d = self._split_array(distances, down)
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check # sklearn check
check_is_fitted(self) check_is_fitted(self, ['tree_'])
# Input validation # Input validation
X = check_array(X) X = check_array(X)
# setup prediction & make it happen # setup prediction & make it happen
indices = np.arange(X.shape[0]) indices = np.arange(X.shape[0])
result, indices = predict_class(X, indices, [], self._tree) empty_dist = np.empty((X.shape[0], 1), dtype=float)
result, indices = predict_class(X, indices, empty_dist, self.tree_)
result = result.reshape(X.shape[0], 2) result = result.reshape(X.shape[0], 2)
# Turn distances to hyperplane into probabilities based on fitting distances # Turn distances to hyperplane into probabilities based on fitting
# of samples to its hyperplane that classified them, to the sigmoid function # distances of samples to its hyperplane that classified them, to the
# sigmoid function
# Probability of being 1
result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
# Probability of being 0
result[:, 0] = 1 - result[:, 1]
return self._reorder_results(result, indices) return self._reorder_results(result, indices)
def score(self, X: np.array, y: np.array) -> float: def score(self, X: np.array, y: np.array) -> float:
"""Return accuracy """Compute accuracy of the prediction
"""
if not self.__trained:
self.fit(X, y)
yp = self.predict(X).reshape(y.shape)
right = (yp == y).astype(int)
return np.sum(right) / len(y)
def __iter__(self): :param X: dataset of samples to make predictions
return Siterator(self._tree) :type X: np.array
:param y: samples labels
:type y: np.array
:return: accuracy of the prediction
:rtype: float
"""
# sklearn check
check_is_fitted(self)
yp = self.predict(X).reshape(y.shape)
return np.mean(yp == y)
def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in preorder,
can make a list with all the nodes in preorder
:return: an iterator, can for i in... and list(...)
:rtype: Siterator
"""
try:
tree = self.tree_
except AttributeError:
tree = None
return Siterator(tree)
def __str__(self) -> str: def __str__(self) -> str:
"""String representation of the tree
:return: description of nodes in the tree in preorder
:rtype: str
"""
output = '' output = ''
for i in self: for i in self:
output += str(i) + '\n' output += str(i) + '\n'
return output return output
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
"""Save the dataset of the node in a csv file
:param tree: node with data to save
:type tree: Snode
:param catalog: catalog file handler
:type catalog: typing.TextIO
:param number: sequential number for the generated file name
:type number: int
"""
data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
name = f"{self.__folder}dataset{number}.csv"
np.savetxt(name, data, delimiter=",")
catalog.write(f"{name}, - {str(tree)}")
if tree.is_leaf():
return
self._save_datasets(tree.get_down(), catalog, number + 1)
self._save_datasets(tree.get_up(), catalog, number + 2)
def get_catalog_name(self):
return self.__folder + "catalog.txt"
def save_sub_datasets(self):
"""Save the every dataset stored in the tree to check with manual classifier
"""
if not os.path.isdir(self.__folder):
os.mkdir(self.__folder)
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
self._save_datasets(self._tree, catalog, 1)

View File

@@ -15,6 +15,7 @@ from mpl_toolkits.mplot3d import Axes3D
from .Strees import Stree, Snode, Siterator from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode): class Snode_graph(Snode):
def __init__(self, node: Stree): def __init__(self, node: Stree):
@@ -45,7 +46,8 @@ class Snode_graph(Snode):
ax.set_ylim(self._ylimits) ax.set_ylim(self._ylimits)
ax.set_zlim(self._zlimits) ax.set_zlim(self._zlimits)
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '', save_seq: int = 1): def save_hyperplane(self, save_folder: str = './', save_prefix: str = '',
save_seq: int = 1):
_, fig = self.plot_hyperplane() _, fig = self.plot_hyperplane()
name = f"{save_folder}{save_prefix}STnode{save_seq}.png" name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
fig.savefig(name, bbox_inches='tight') fig.savefig(name, bbox_inches='tight')
@@ -53,8 +55,7 @@ class Snode_graph(Snode):
def _get_cmap(self): def _get_cmap(self):
cmap = 'jet' cmap = 'jet'
if self._is_pure(): if self._is_pure() and self._class == 1:
if self._class == 1:
cmap = 'jet_r' cmap = 'jet_r'
return cmap return cmap
@@ -66,16 +67,20 @@ class Snode_graph(Snode):
fig = plt.figure(figsize=self._plot_size) fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d') ax = fig.add_subplot(1, 1, 1, projection='3d')
if not self._is_pure(): if not self._is_pure():
# Can't plot hyperplane of leaves with one label because it hasn't classiffier # Can't plot hyperplane of leaves with one label because it hasn't
# classiffier
# get the splitting hyperplane # get the splitting hyperplane
def hyperplane(x, y): return (-self._interceptor - self._vector[0][0] * x def hyperplane(x, y): return (-self._interceptor
- self._vector[0][1] * y) / self._vector[0][2] - self._vector[0][0] * x
- self._vector[0][1] * y) \
/ self._vector[0][2]
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max()) tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max()) tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
xx, yy = np.meshgrid(tmpx, tmpy) xx, yy = np.meshgrid(tmpx, tmpy)
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5, antialiased=True, ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5,
rstride=1, cstride=1, cmap='seismic') antialiased=True, rstride=1, cstride=1,
cmap='seismic')
self._set_graphics_axis(ax) self._set_graphics_axis(ax)
if plot_distribution: if plot_distribution:
self.plot_distribution(ax) self.plot_distribution(ax)
@@ -97,6 +102,7 @@ class Snode_graph(Snode):
ax.set_zlabel('X2') ax.set_zlabel('X2')
plt.show() plt.show()
class Stree_grapher(Stree): class Stree_grapher(Stree):
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall """Build 3d graphs of any dataset, if it's more than 3 features PCA shall
make its magic make its magic
@@ -114,7 +120,7 @@ class Stree_grapher(Stree):
def __del__(self): def __del__(self):
try: try:
os.environ.pop('TESTING') os.environ.pop('TESTING')
except: except KeyError:
pass pass
plt.close('all') plt.close('all')
@@ -143,7 +149,7 @@ class Stree_grapher(Stree):
self._pca = PCA(n_components=3) self._pca = PCA(n_components=3)
X = self._pca.fit_transform(X) X = self._pca.fit_transform(X)
res = super().fit(X, y) res = super().fit(X, y)
self._tree_gr = self._copy_tree(self._tree) self._tree_gr = self._copy_tree(self.tree_)
self._fitted = True self._fitted = True
return res return res
@@ -164,6 +170,8 @@ class Stree_grapher(Stree):
:type save_folder: str, optional :type save_folder: str, optional
""" """
self._check_fitted() self._check_fitted()
if not os.path.isdir(save_folder):
os.mkdir(save_folder)
seq = 1 seq = 1
for node in self: for node in self:
node.save_hyperplane(save_folder=save_folder, node.save_hyperplane(save_folder=save_folder,
@@ -179,4 +187,3 @@ class Stree_grapher(Stree):
def __iter__(self): def __iter__(self):
return Siterator(self._tree_gr) return Siterator(self._tree_gr)

View File

@@ -1,4 +1,3 @@
import csv
import os import os
import unittest import unittest
@@ -22,18 +21,22 @@ class Stree_test(unittest.TestCase):
def tearDownClass(cls): def tearDownClass(cls):
try: try:
os.environ.pop('TESTING') os.environ.pop('TESTING')
except: except KeyError:
pass pass
def _get_Xy(self): def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, X, y = make_classification(n_samples=1500, n_features=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, n_informative=3, n_redundant=0,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state) n_repeated=0, n_classes=2,
n_clusters_per_class=2, class_sep=1.5,
flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y return X, y
def _check_tree(self, node: Snode): def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct """Check recursively that the nodes that are not leaves have the
number of labels and its sons have the right number of elements in their dataset correct number of labels and its sons have the right number of elements
in their dataset
Arguments: Arguments:
node {Snode} -- node to check node {Snode} -- node to check
@@ -53,11 +56,11 @@ class Stree_test(unittest.TestCase):
for i in unique_y: for i in unique_y:
try: try:
number_down = count_d[i] number_down = count_d[i]
except: except IndexError:
number_down = 0 number_down = 0
try: try:
number_up = count_u[i] number_up = count_u[i]
except: except IndexError:
number_up = 0 number_up = 0
self.assertEqual(count_y[i], number_down + number_up) self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction? # Is the partition made the same as the prediction?
@@ -71,7 +74,7 @@ class Stree_test(unittest.TestCase):
def test_build_tree(self): def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models """Check if the tree is built the same way as predictions of models
""" """
self._check_tree(self._clf._tree) self._check_tree(self._clf.tree_)
def _get_file_data(self, file_name: str) -> tuple: def _get_file_data(self, file_name: str) -> tuple:
"""Return X, y from data, y is the last column in array """Return X, y from data, y is the last column in array
@@ -89,7 +92,8 @@ class Stree_test(unittest.TestCase):
fx = np.delete(data, column_y, axis=1) fx = np.delete(data, column_y, axis=1)
return fx, fy return fx, fy
def _find_out(self, px: np.array, x_original: np.array, y_original) -> list: def _find_out(self, px: np.array, x_original: np.array,
y_original) -> list:
"""Find the original values of y for a given array of samples """Find the original values of y for a given array of samples
Arguments: Arguments:
@@ -107,18 +111,6 @@ class Stree_test(unittest.TestCase):
res.append(y_original[row]) res.append(y_original[row])
return res return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same labels as the original dataset
"""
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
for row in catalog:
X, y = self._get_Xy()
x_file, y_file = self._get_file_data(row[0])
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
self.assertTrue(np.array_equal(y_file, y_original))
def test_single_prediction(self): def test_single_prediction(self):
X, y = self._get_Xy() X, y = self._get_Xy()
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1]))) yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
@@ -135,22 +127,25 @@ class Stree_test(unittest.TestCase):
X, y = self._get_Xy() X, y = self._get_Xy()
accuracy_score = self._clf.score(X, y) accuracy_score = self._clf.score(X, y)
yp = self._clf.predict(X) yp = self._clf.predict(X)
right = (yp == y).astype(int) accuracy_computed = np.mean(yp == y)
accuracy_computed = sum(right) / len(y)
self.assertEqual(accuracy_score, accuracy_computed) self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8) self.assertGreater(accuracy_score, 0.9)
def test_single_predict_proba(self): def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label """Check that element 28 has a prediction different that the current
label
""" """
# Element 28 has a different prediction than the truth # Element 28 has a different prediction than the truth
decimals = 5 decimals = 5
prob = 0.29026400766
X, y = self._get_Xy() X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1])) yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(0, yp[0:, 0]) self.assertEqual(np.round(1 - prob, decimals),
np.round(yp[0:, 0], decimals))
self.assertEqual(1, y[28]) self.assertEqual(1, y[28])
self.assertAlmostEqual( self.assertAlmostEqual(
round(0.29026400766, decimals), round(prob, decimals),
round(yp[0, 1], decimals), round(yp[0, 1], decimals),
decimals decimals
) )
@@ -161,11 +156,16 @@ class Stree_test(unittest.TestCase):
decimals = 5 decimals = 5
X, y = self._get_Xy() X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :]) yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist()) self.assertListEqual(
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236, y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349, expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833,
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355, 0.14269291, 0.85193236,
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692, 0.29876058, 0.7282164, 0.85958616, 0.89517877,
0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624,
0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257,
0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083] 0.29788483, 0.01657364, 0.81149083]
expected = np.round(expected_proba, decimals=decimals).tolist() expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist() computed = np.round(yp[:, 1], decimals=decimals).tolist()
@@ -173,9 +173,10 @@ class Stree_test(unittest.TestCase):
self.assertAlmostEqual(expected[i], computed[i], decimals) self.assertAlmostEqual(expected[i], computed[i], decimals)
def build_models(self): def build_models(self):
"""Build and train two models, model_clf will use the sklearn classifier to """Build and train two models, model_clf will use the sklearn
compute predictions and split data. model_computed will use vector of classifier to compute predictions and split data. model_computed will
coefficients to compute both predictions and splitted data use vector of coefficients to compute both predictions and splitted
data
""" """
model_clf = Stree(random_state=self._random_state, model_clf = Stree(random_state=self._random_state,
use_predictions=True) use_predictions=True)
@@ -187,8 +188,9 @@ class Stree_test(unittest.TestCase):
return model_clf, model_computed, X, y return model_clf, model_computed, X, y
def test_use_model_predict(self): def test_use_model_predict(self):
"""Check that we get the same results wether we use the estimator in nodes """Check that we get the same results wether we use the estimator in
to compute labels or we use the hyperplane and the position of samples wrt to it nodes to compute labels or we use the hyperplane and the position of
samples wrt to it
""" """
use_clf, use_math, X, _ = self.build_models() use_clf, use_math, X, _ = self.build_models()
self.assertListEqual( self.assertListEqual(
@@ -213,14 +215,15 @@ class Stree_test(unittest.TestCase):
) )
def test_single_vs_multiple_prediction(self): def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting """Check if predicting sample by sample gives the same result as
all samples at once predicting all samples at once
""" """
X, _ = self._get_Xy() X, _ = self._get_Xy()
# Compute prediction line by line # Compute prediction line by line
yp_line = np.array([], dtype=int) yp_line = np.array([], dtype=int)
for xp in X: for xp in X:
yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1]))) yp_line = np.append(yp_line, self._clf.predict(
xp.reshape(-1, X.shape[1])))
# Compute prediction at once # Compute prediction at once
yp_once = self._clf.predict(X) yp_once = self._clf.predict(X)
# #
@@ -232,17 +235,54 @@ class Stree_test(unittest.TestCase):
expected = [ expected = [
'root', 'root',
'root - Down', 'root - Down',
'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))', 'root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts'
'=(array([0, 1]), array([ 17, 691]))',
'root - Down - Up', 'root - Down - Up',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))', 'root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 '
'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))', 'counts=(array([0, 1]), array([1, 3]))',
'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))', 'root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 '
'counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array('
'[0, 1]), array([725, 56]))',
] ]
computed = [] computed = []
for node in self._clf: for node in self._clf:
computed.append(str(node)) computed.append(str(node))
self.assertListEqual(expected, computed) self.assertListEqual(expected, computed)
def test_is_a_sklearn_classifier(self):
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator
check_estimator(Stree())
def test_exception_if_C_is_negative(self):
tclf = Stree(C=-1)
with self.assertRaises(ValueError):
tclf.fit(*self._get_Xy())
def test_check_max_depth_is_positive_or_None(self):
tcl = Stree()
self.assertIsNone(tcl.max_depth)
tcl = Stree(max_depth=1)
self.assertGreaterEqual(1, tcl.max_depth)
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*self._get_Xy())
def test_check_max_depth(self):
depth = 3
tcl = Stree(random_state=self._random_state, max_depth=depth)
tcl.fit(*self._get_Xy())
self.assertEqual(depth, tcl.depth_)
def test_unfitted_tree_is_iterable(self):
tcl = Stree()
self.assertEqual(0, len(list(tcl)))
class Snode_test(unittest.TestCase): class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
@@ -255,19 +295,24 @@ class Snode_test(unittest.TestCase):
@classmethod @classmethod
def tearDownClass(cls): def tearDownClass(cls):
"""[summary]
"""
try: try:
os.environ.pop('TESTING') os.environ.pop('TESTING')
except: except KeyError:
pass pass
def _get_Xy(self): def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, X, y = make_classification(n_samples=1500, n_features=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, n_informative=3, n_redundant=0, n_classes=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state) n_repeated=0, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y return X, y
def test_attributes_in_leaves(self): def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a predictor """Check if the attributes in leaves have correct values so they form a
predictor
""" """
def check_leave(node: Snode): def check_leave(node: Snode):
@@ -282,7 +327,7 @@ class Snode_test(unittest.TestCase):
if len(classes) > 1: if len(classes) > 1:
try: try:
belief = max_card / (max_card + min_card) belief = max_card / (max_card + min_card)
except: except ZeroDivisionError:
belief = 0. belief = 0.
else: else:
belief = 1 belief = 1
@@ -291,7 +336,7 @@ class Snode_test(unittest.TestCase):
class_computed = classes[card == max_card] class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class) self.assertEqual(class_computed, node._class)
check_leave(self._clf._tree) check_leave(self._clf.tree_)
def test_nodes_coefs(self): def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled """Check if the nodes of the tree have the right attributes filled
@@ -309,5 +354,4 @@ class Snode_test(unittest.TestCase):
run_tree(node.get_down()) run_tree(node.get_down())
run_tree(node.get_up()) run_tree(node.get_up())
run_tree(self._clf._tree) run_tree(self._clf.tree_)

View File

@@ -1,194 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#import os\n",
"#os.chdir(\"/content\")\n",
"#!git clone https://github.com/Doctorado-ML/STree.git\n",
"#os.chdir(\"/content/STree\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.980519 counts=(array([0, 1]), array([ 6, 302]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.940217 counts=(array([0, 1]), array([692, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.986842 counts=(array([0, 1]), array([ 4, 300]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.937754 counts=(array([0, 1]), array([693, 46]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9636\nClassifier's accuracy (test) : 0.9688\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([8]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.947802 counts=(array([0, 1]), array([690, 38]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([11]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), 
array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.951456 counts=(array([0, 1]), array([686, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9741\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 
counts=(array([0, 1]), array([675, 27]))\n\n**************************************************\n0.7816 secs\n"
}
],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
" clf = Stree(C=C, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" print(f\"************** C={C} ****************************\")\n",
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
" print(clf)\n",
" print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"scaler = StandardScaler()\n",
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"#an array containing probabilities of belonging to the 1st class"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
}
],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
}
],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long