Compare commits

...

5 Commits

Author SHA1 Message Date
b4816b2995 Show sample_weight use in test2 notebook
Update revision to RC4
Lint Stree grapher
2020-05-30 23:59:40 +02:00
5e5fea9c6a Document & lint code 2020-05-30 23:10:10 +02:00
724a4855fb Adapt some notebooks 2020-05-30 11:09:59 +02:00
a22ae81b54 Refactor split_data adding sample_weight 2020-05-29 18:52:23 +02:00
ed98054f0d First approach
Added max_depth, tol, weighted samples
2020-05-29 12:46:10 +02:00
14 changed files with 1182 additions and 676 deletions

View File

@@ -2,7 +2,7 @@
# Stree
Oblique Tree classifier based on SVM nodes
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn LinearSVC models.Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)
@@ -18,15 +18,15 @@ pip install git+https://github.com/doctorado-ml/stree
##### Slow launch but better integration
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb) Test notebook
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook
##### Fast launch but have to run first commented out cell for setup
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test.ipynb) Test notebook
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test2.ipynb) Another Test notebook
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test_graphs.ipynb) Test Graphics notebook
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook
### Command line

1
data/.gitignore vendored
View File

@@ -1 +0,0 @@
*

190
notebooks/adaboost.ipynb Normal file
View File

@@ -0,0 +1,190 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from stree import Stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"# data = load_creditcard(0) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf = Stree(max_depth=3, random_state=random_state)\n",
"clf.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf2.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
"clf3.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n"
}
],
"source": [
"now = time.time()\n",
"clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
"clf4.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
"print(f\"Took {time.time() - now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

236
notebooks/gridsearch.ipynb Normal file

File diff suppressed because one or more lines are too long

225
notebooks/test2.ipynb Normal file
View File

@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]\n",
"# Set weights inverse to its count class in dataset\n",
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
"weights[ytrain==1] = 1.99755 "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy of Train without weights 0.996415770609319\nAccuracy of Train with weights 0.994026284348865\nAccuracy of Tests without weights 0.9665738161559888\nAccuracy of Tests with weights 0.9721448467966574\n"
}
],
"source": [
"C = 23\n",
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
" clf = Stree(C=C, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" print(f\"************** C={C} ****************************\")\n",
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
" print(clf)\n",
" print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"scaler = StandardScaler()\n",
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"print(res[:4, :])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
"check_estimator(Stree())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Stree(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

197
notebooks/test_graphs.ipynb Normal file
View File

@@ -0,0 +1,197 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "ModuleNotFoundError",
"evalue": "No module named 'stree'",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-12-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
]
}
],
"source": [
"import time\n",
"import random\n",
"import numpy as np\n",
"from sklearn.datasets import make_blobs\n",
"from sklearn.svm import LinearSVC\n",
"from stree import Stree, Stree_grapher"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def build_data(random_state):\n",
" random.seed(random_state)\n",
" X, y = make_blobs(centers=10, n_features=3, n_samples=500, random_state=random_state)\n",
" def make_binary(y):\n",
" for i in range(2, 10):\n",
" y[y==i] = random.randint(0, 1)\n",
" return y\n",
" y = make_binary(y)\n",
" #print(X.shape, np.unique(y), y[y==0].shape, y[y==1].shape)\n",
" return X, y"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'Stree_grapher' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-b909470cb406>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m.01\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m#gr.save_all(save_folder='data/', save_prefix='7')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'Stree_grapher' is not defined"
]
}
],
"source": [
"X, y = build_data(10)\n",
"gr = Stree_grapher(dict(C=.01, max_iter=200))\n",
"gr.fit(X, y)\n",
"#gr.save_all(save_folder='data/', save_prefix='7')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gr' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-efa3db892bfd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
]
}
],
"source": [
"print(gr)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gr' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-0e62f081c9aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Agg'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_folder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'data/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
]
}
],
"source": [
"import matplotlib\n",
"matplotlib.use('Agg')\n",
"gr.save_all(save_folder='data/')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gr' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-b0484cfe9d26>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m#%matplotlib inline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'widget'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tree_gr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_hyperplane\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
]
}
],
"source": [
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
"#%matplotlib inline\n",
"%matplotlib widget\n",
"gr._tree_gr.plot_hyperplane()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "name 'gr' is not defined",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-8-4277c1aacbe2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'inline'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m#%matplotlib widget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
]
}
],
"source": [
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
"%matplotlib inline\n",
"#%matplotlib widget\n",
"gr.plot_all()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,8 +1,9 @@
import setuptools
__version__ = "0.9rc3"
__version__ = "0.9rc4"
__author__ = "Ricardo Montañana Gómez"
def readme():
with open('README.md') as f:
return f.read()
@@ -19,7 +20,8 @@ setuptools.setup(
url='https://github.com/doctorado-ml/stree',
author=__author__,
author_email='ricardo.montanana@alu.uclm.es',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc',
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\
tree svm svc',
classifiers=[
'Development Status :: 4 - Beta',
'License :: OSI Approved :: MIT License',

View File

@@ -7,22 +7,28 @@ Build an oblique tree classifier based on SVM Trees
Uses LinearSVC
'''
import typing
import os
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \
_check_sample_weight
class Snode:
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
"""Nodes of the tree that keeps the svm classifier and if testing the
dataset assigned to it
"""
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray,
title: str):
self._clf = clf
self._vector = None if clf is None else clf.coef_
self._interceptor = 0. if clf is None else clf.intercept_
self._title = title
self._belief = 0. # belief of the prediction in a leaf node based on samples
self._belief = 0.
# Only store dataset in Testing
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
self._y = y
@@ -50,8 +56,8 @@ class Snode:
return self._up
def make_predictor(self):
"""Compute the class of the predictor and its belief based on the subdataset of the node
only if it is a leaf
"""Compute the class of the predictor and its belief based on the
subdataset of the node only if it is a leaf
"""
if not self.is_leaf():
return
@@ -61,7 +67,7 @@ class Snode:
min_card = min(card)
try:
self._belief = max_card / (max_card + min_card)
except:
except ZeroDivisionError:
self._belief = 0.
self._class = classes[card == max_card][0]
else:
@@ -70,7 +76,10 @@ class Snode:
def __str__(self) -> str:
if self.is_leaf():
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
count_values = np.unique(self._y, return_counts=True)
result = f"{self._title} - Leaf class={self._class} belief="\
f"{self._belief: .6f} counts={count_values}"
return result
else:
return f"{self._title}"
@@ -100,77 +109,127 @@ class Siterator:
class Stree(BaseEstimator, ClassifierMixin):
"""Estimator that is based on binary trees of svm nodes
can deal with sample_weights in predict, used in boosting sklearn methods
inheriting from BaseEstimator implements get_params and set_params methods
inheriting from ClassifierMixin implement the attribute _estimator_type
with "classifier" as value
"""
"""
__folder = 'data/'
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
def __init__(self, C: float = 1.0, max_iter: int = 1000,
random_state: int = None, max_depth: int = None,
tol: float = 1e-4, use_predictions: bool = False):
self.max_iter = max_iter
self.C = C
self.random_state = random_state
self.use_predictions = use_predictions
self.max_depth = max_depth
self.tol = tol
def get_params(self, deep=True):
"""Get dict with hyperparameters and its values to accomplish sklearn rules
def _more_tags(self) -> dict:
"""Required by sklearn to tell that this estimator is a binary classifier
:return: the tag required
:rtype: dict
"""
return {
'C': self.C,
'random_state': self.random_state,
'max_iter': self.max_iter,
'use_predictions': self.use_predictions
}
def set_params(self, **parameters):
"""Set hyperparmeters as specified by sklearn, needed in Gridsearchs
"""
for parameter, value in parameters.items():
setattr(self, parameter, value)
return self
# Added binary_only tag as required by sklearn check_estimator
def _more_tags(self):
return {'binary_only': True}
return {'binary_only': True, 'requires_y': True}
def _linear_function(self, data: np.array, node: Snode) -> np.array:
"""Compute the distance of set of samples to a hyperplane, in
multiclass classification it should compute the distance to a
hyperplane of each class
:param data: dataset of samples
:type data: np.array
:param node: the node that contains the hyperplance coefficients
:type node: Snode
:return: array of distances of each sample to the hyperplane
:rtype: np.array
"""
coef = node._vector[0, :].reshape(-1, data.shape[1])
return data.dot(coef.T) + node._interceptor[0]
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
def _split_array(self, origin: np.array, down: np.array) -> list:
"""Split an array in two based on indices passed as down and its complement
:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~down
return origin[up[:, 0]] if any(up) else None, \
origin[down[:, 0]] if any(down) else None
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
"""Compute distances of the samples to the hyperplane of the node
:param node: node containing the svm classifier
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:rtype: np.array
"""
if self.use_predictions:
yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1)
res = np.expand_dims(node._clf.decision_function(data), 1)
else:
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
# computes positition of every sample is w.r.t. the hyperplane
"""doesn't work with multiclass as each sample has to do inner
product with its own coefficients computes positition of every
sample is w.r.t. the hyperplane
"""
res = self._linear_function(data, node)
down = res > 0
up = ~down
data_down = data[down[:, 0]] if any(down) else None
indices_down = indices[down[:, 0]] if any(down) else None
res_down = res[down[:, 0]] if any(down) else None
data_up = data[up[:, 0]] if any(up) else None
indices_up = indices[up[:, 0]] if any(up) else None
res_up = res[up[:, 0]] if any(up) else None
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
return res
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
from sklearn.utils.multiclass import check_classification_targets
def _split_criteria(self, data: np.array) -> np.array:
"""Set the criteria to split arrays
:param data: [description]
:type data: np.array
:return: [description]
:rtype: np.array
"""
return data > 0
def fit(self, X: np.ndarray, y: np.ndarray,
sample_weight: np.array = None) -> 'Stree':
"""Build the tree based on the dataset of samples and its labels
:raises ValueError: if parameters C or max_depth are out of bounds
:return: itself to be able to chain actions: fit().predict() ...
:rtype: Stree
"""
# Check parameters are Ok.
if type(y).__name__ == 'np.ndarray':
y = y.ravel()
X, y = check_X_y(X, y)
self.classes_ = np.unique(y)
self.n_iter_ = self.max_iter
if self.C < 0:
raise ValueError(
f"Penalty term must be positive... got (C={self.C:f})")
self.__max_depth = np.iinfo(
np.int32).max if self.max_depth is None else self.max_depth
if self.__max_depth < 1:
raise ValueError(
f"Maximum depth has to be greater than 1... got (max_depth=\
{self.max_depth})")
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
# Initialize computed parameters
self.classes_, y = np.unique(y, return_inverse=True)
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_in_ = X.shape[1]
self.tree_ = self.train(X, y.ravel(), title)
self.tree_ = self.train(X, y, sample_weight, 1, 'root')
self._build_predictor()
return self
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
@@ -180,25 +239,57 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(self.tree_)
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray,
depth: int, title: str) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
:param X: samples dataset
:type X: np.ndarray
:param y: samples labels
:type y: np.ndarray
:param sample_weight: weight of samples (used in boosting)
:type sample_weight: np.ndarray
:param depth: actual depth in the tree
:type depth: int
:param title: description of the node
:type title: str
:return: binary tree
:rtype: Snode
"""
if depth > self.__max_depth:
return None
if np.unique(y).shape[0] == 1:
# only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure>')
# Train the model
clf = LinearSVC(max_iter=self.max_iter, C=self.C,
random_state=self.random_state)
clf.fit(X, y)
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
C=self.C) # , sample_weight=sample_weight)
clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title)
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
self.depth_ = max(depth, self.depth_)
down = self._split_criteria(self._distances(tree, X))
X_U, X_D = self._split_array(X, down)
y_u, y_d = self._split_array(y, down)
sw_u, sw_d = self._split_array(sample_weight, down)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ', <cgaf>')
tree.set_up(self.train(X_U, y_u, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
return tree
def _reorder_results(self, y: np.array, indices: np.array, proba=False) -> np.array:
if proba:
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
"""Reorder an array based on the array of indices passed
:param y: data untidy
:type y: np.array
:param indices: indices used to set order
:type indices: np.array
:return: array y ordered
:rtype: np.array
"""
if y.ndim > 1 and y.shape[1] > 1:
# if predict_proba return np.array of floats
y_ordered = np.zeros(y.shape, dtype=float)
else:
@@ -210,40 +301,57 @@ class Stree(BaseEstimator, ClassifierMixin):
return y_ordered
def predict(self, X: np.array) -> np.array:
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
"""Predict labels for each sample in dataset passed
:param X: dataset of samples
:type X: np.array
:return: array of labels
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array,
node: Snode) -> np.array:
if xp is None:
return [], []
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
down = self._split_criteria(self._distances(node, xp))
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
# Input validation
X = check_array(X)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
return self._reorder_results(*predict_class(X, indices, self.tree_)).ravel()
result = self._reorder_results(
*predict_class(X, indices, self.tree_)).astype(int).ravel()
return self.classes_[result]
def predict_proba(self, X: np.array) -> np.array:
"""Computes an approximation of the probability of samples belonging to class 0 and 1
"""Computes an approximation of the probability of samples belonging to
class 0 and 1
:param X: dataset
:type X: np.array
:return: array array of shape (m, num_classes), probability of being
each class
:rtype: np.array
"""
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
def predict_class(xp: np.array, indices: np.array, dist: np.array,
node: Snode) -> np.array:
"""Run the tree to compute predictions
:param xp: subdataset of samples
:type xp: np.array
:param indices: indices of subdataset samples to rebuild original order
:param indices: indices of subdataset samples to rebuild original
order
:type indices: np.array
:param dist: distances of every sample to the hyperplane or the father node
:param dist: distances of every sample to the hyperplane or the
father node
:type dist: np.array
:param node: node of the leaf with the class
:type node: Snode
@@ -257,10 +365,14 @@ class Stree(BaseEstimator, ClassifierMixin):
prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = dist
return np.append(prediction, prediction_proba, axis=1), indices
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, r_d, node.get_down())
m, n = predict_class(u, i_u, r_u, node.get_up())
return np.append(k, m), np.append(l, n)
distances = self._distances(node, xp)
down = self._split_criteria(distances)
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
di_u, di_d = self._split_array(distances, down)
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
@@ -271,57 +383,50 @@ class Stree(BaseEstimator, ClassifierMixin):
empty_dist = np.empty((X.shape[0], 1), dtype=float)
result, indices = predict_class(X, indices, empty_dist, self.tree_)
result = result.reshape(X.shape[0], 2)
# Turn distances to hyperplane into probabilities based on fitting distances
# of samples to its hyperplane that classified them, to the sigmoid function
result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) # Probability of being 1
result[:, 0] = 1 - result[:, 1] # Probability of being 0
return self._reorder_results(result, indices, proba=True)
# Turn distances to hyperplane into probabilities based on fitting
# distances of samples to its hyperplane that classified them, to the
# sigmoid function
# Probability of being 1
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
# Probability of being 0
result[:, 0] = 1 - result[:, 1]
return self._reorder_results(result, indices)
def score(self, X: np.array, y: np.array) -> float:
"""Return accuracy
"""Compute accuracy of the prediction
:param X: dataset of samples to make predictions
:type X: np.array
:param y: samples labels
:type y: np.array
:return: accuracy of the prediction
:rtype: float
"""
# sklearn check
check_is_fitted(self)
yp = self.predict(X).reshape(y.shape)
right = (yp == y).astype(int)
return np.sum(right) / len(y)
return np.mean(yp == y)
def __iter__(self):
return Siterator(self.tree_)
def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in preorder,
can make a list with all the nodes in preorder
:return: an iterator, can for i in... and list(...)
:rtype: Siterator
"""
try:
tree = self.tree_
except AttributeError:
tree = None
return Siterator(tree)
def __str__(self) -> str:
"""String representation of the tree
:return: description of nodes in the tree in preorder
:rtype: str
"""
output = ''
for i in self:
output += str(i) + '\n'
return output
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
"""Save the dataset of the node in a csv file
:param tree: node with data to save
:type tree: Snode
:param catalog: catalog file handler
:type catalog: typing.TextIO
:param number: sequential number for the generated file name
:type number: int
"""
data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
name = f"{self.__folder}dataset{number}.csv"
np.savetxt(name, data, delimiter=",")
catalog.write(f"{name}, - {str(tree)}")
if tree.is_leaf():
return
self._save_datasets(tree.get_down(), catalog, number + 1)
self._save_datasets(tree.get_up(), catalog, number + 2)
def get_catalog_name(self):
return self.__folder + "catalog.txt"
def save_sub_datasets(self):
"""Save the every dataset stored in the tree to check with manual classifier
"""
if not os.path.isdir(self.__folder):
os.mkdir(self.__folder)
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
self._save_datasets(self.tree_, catalog, 1)

View File

@@ -15,6 +15,7 @@ from mpl_toolkits.mplot3d import Axes3D
from .Strees import Stree, Snode, Siterator
class Snode_graph(Snode):
def __init__(self, node: Stree):
@@ -45,7 +46,8 @@ class Snode_graph(Snode):
ax.set_ylim(self._ylimits)
ax.set_zlim(self._zlimits)
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '', save_seq: int = 1):
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '',
save_seq: int = 1):
_, fig = self.plot_hyperplane()
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
fig.savefig(name, bbox_inches='tight')
@@ -53,8 +55,7 @@ class Snode_graph(Snode):
def _get_cmap(self):
cmap = 'jet'
if self._is_pure():
if self._class == 1:
if self._is_pure() and self._class == 1:
cmap = 'jet_r'
return cmap
@@ -66,16 +67,20 @@ class Snode_graph(Snode):
fig = plt.figure(figsize=self._plot_size)
ax = fig.add_subplot(1, 1, 1, projection='3d')
if not self._is_pure():
# Can't plot hyperplane of leaves with one label because it hasn't classiffier
# Can't plot hyperplane of leaves with one label because it hasn't
# classiffier
# get the splitting hyperplane
def hyperplane(x, y): return (-self._interceptor - self._vector[0][0] * x
- self._vector[0][1] * y) / self._vector[0][2]
def hyperplane(x, y): return (-self._interceptor
- self._vector[0][0] * x
- self._vector[0][1] * y) \
/ self._vector[0][2]
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
xx, yy = np.meshgrid(tmpx, tmpy)
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5, antialiased=True,
rstride=1, cstride=1, cmap='seismic')
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5,
antialiased=True, rstride=1, cstride=1,
cmap='seismic')
self._set_graphics_axis(ax)
if plot_distribution:
self.plot_distribution(ax)
@@ -97,6 +102,7 @@ class Snode_graph(Snode):
ax.set_zlabel('X2')
plt.show()
class Stree_grapher(Stree):
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall
make its magic
@@ -114,7 +120,7 @@ class Stree_grapher(Stree):
def __del__(self):
try:
os.environ.pop('TESTING')
except:
except KeyError:
pass
plt.close('all')
@@ -181,4 +187,3 @@ class Stree_grapher(Stree):
def __iter__(self):
return Siterator(self._tree_gr)

View File

@@ -1,4 +1,3 @@
import csv
import os
import unittest
@@ -22,18 +21,22 @@ class Stree_test(unittest.TestCase):
def tearDownClass(cls):
try:
os.environ.pop('TESTING')
except:
except KeyError:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
X, y = make_classification(n_samples=1500, n_features=3,
n_informative=3, n_redundant=0,
n_repeated=0, n_classes=2,
n_clusters_per_class=2, class_sep=1.5,
flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y
def _check_tree(self, node: Snode):
"""Check recursively that the nodes that are not leaves have the correct
number of labels and its sons have the right number of elements in their dataset
"""Check recursively that the nodes that are not leaves have the
correct number of labels and its sons have the right number of elements
in their dataset
Arguments:
node {Snode} -- node to check
@@ -53,11 +56,11 @@ class Stree_test(unittest.TestCase):
for i in unique_y:
try:
number_down = count_d[i]
except:
except IndexError:
number_down = 0
try:
number_up = count_u[i]
except:
except IndexError:
number_up = 0
self.assertEqual(count_y[i], number_down + number_up)
# Is the partition made the same as the prediction?
@@ -89,7 +92,8 @@ class Stree_test(unittest.TestCase):
fx = np.delete(data, column_y, axis=1)
return fx, fy
def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
def _find_out(self, px: np.array, x_original: np.array,
y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
@@ -107,18 +111,6 @@ class Stree_test(unittest.TestCase):
res.append(y_original[row])
return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same labels as the original dataset
"""
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
for row in catalog:
X, y = self._get_Xy()
x_file, y_file = self._get_file_data(row[0])
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
self.assertTrue(np.array_equal(y_file, y_original))
def test_single_prediction(self):
X, y = self._get_Xy()
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
@@ -135,20 +127,21 @@ class Stree_test(unittest.TestCase):
X, y = self._get_Xy()
accuracy_score = self._clf.score(X, y)
yp = self._clf.predict(X)
right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
self.assertGreater(accuracy_score, 0.9)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label
"""Check that element 28 has a prediction different that the current
label
"""
# Element 28 has a different prediction than the truth
decimals = 5
prob = 0.29026400766
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
self.assertEqual(np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals))
self.assertEqual(np.round(1 - prob, decimals),
np.round(yp[0:, 0], decimals))
self.assertEqual(1, y[28])
self.assertAlmostEqual(
@@ -163,11 +156,16 @@ class Stree_test(unittest.TestCase):
decimals = 5
X, y = self._get_Xy()
yp = self._clf.predict_proba(X[:num, :])
self.assertListEqual(y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
self.assertListEqual(
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833,
0.14269291, 0.85193236,
0.29876058, 0.7282164, 0.85958616, 0.89517877,
0.99745224, 0.18860349,
0.30756427, 0.8318412, 0.18981198, 0.15564624,
0.25740655, 0.22923355,
0.87365959, 0.49928689, 0.95574351, 0.28761257,
0.28906333, 0.32643692,
0.29788483, 0.01657364, 0.81149083]
expected = np.round(expected_proba, decimals=decimals).tolist()
computed = np.round(yp[:, 1], decimals=decimals).tolist()
@@ -175,9 +173,10 @@ class Stree_test(unittest.TestCase):
self.assertAlmostEqual(expected[i], computed[i], decimals)
def build_models(self):
"""Build and train two models, model_clf will use the sklearn classifier to
compute predictions and split data. model_computed will use vector of
coefficients to compute both predictions and splitted data
"""Build and train two models, model_clf will use the sklearn
classifier to compute predictions and split data. model_computed will
use vector of coefficients to compute both predictions and splitted
data
"""
model_clf = Stree(random_state=self._random_state,
use_predictions=True)
@@ -189,8 +188,9 @@ class Stree_test(unittest.TestCase):
return model_clf, model_computed, X, y
def test_use_model_predict(self):
"""Check that we get the same results wether we use the estimator in nodes
to compute labels or we use the hyperplane and the position of samples wrt to it
"""Check that we get the same results wether we use the estimator in
nodes to compute labels or we use the hyperplane and the position of
samples wrt to it
"""
use_clf, use_math, X, _ = self.build_models()
self.assertListEqual(
@@ -215,14 +215,15 @@ class Stree_test(unittest.TestCase):
)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as predicting
all samples at once
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
"""
X, _ = self._get_Xy()
# Compute prediction line by line
yp_line = np.array([], dtype=int)
for xp in X:
yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, X.shape[1])))
yp_line = np.append(yp_line, self._clf.predict(
xp.reshape(-1, X.shape[1])))
# Compute prediction at once
yp_once = self._clf.predict(X)
#
@@ -234,11 +235,15 @@ class Stree_test(unittest.TestCase):
expected = [
'root',
'root - Down',
'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
'root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts'
'=(array([0, 1]), array([ 17, 691]))',
'root - Down - Up',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
'root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 '
'counts=(array([0, 1]), array([1, 3]))',
'root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 '
'counts=(array([0]), array([7]))',
'root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array('
'[0, 1]), array([725, 56]))',
]
computed = []
for node in self._clf:
@@ -253,6 +258,31 @@ class Stree_test(unittest.TestCase):
from sklearn.utils.estimator_checks import check_estimator
check_estimator(Stree())
def test_exception_if_C_is_negative(self):
tclf = Stree(C=-1)
with self.assertRaises(ValueError):
tclf.fit(*self._get_Xy())
def test_check_max_depth_is_positive_or_None(self):
tcl = Stree()
self.assertIsNone(tcl.max_depth)
tcl = Stree(max_depth=1)
self.assertGreaterEqual(1, tcl.max_depth)
with self.assertRaises(ValueError):
tcl = Stree(max_depth=-1)
tcl.fit(*self._get_Xy())
def test_check_max_depth(self):
depth = 3
tcl = Stree(random_state=self._random_state, max_depth=depth)
tcl.fit(*self._get_Xy())
self.assertEqual(depth, tcl.depth_)
def test_unfitted_tree_is_iterable(self):
tcl = Stree()
self.assertEqual(0, len(list(tcl)))
class Snode_test(unittest.TestCase):
def __init__(self, *args, **kwargs):
@@ -265,19 +295,24 @@ class Snode_test(unittest.TestCase):
@classmethod
def tearDownClass(cls):
"""[summary]
"""
try:
os.environ.pop('TESTING')
except:
except KeyError:
pass
def _get_Xy(self):
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
X, y = make_classification(n_samples=1500, n_features=3,
n_informative=3, n_redundant=0, n_classes=2,
n_repeated=0, n_clusters_per_class=2,
class_sep=1.5, flip_y=0, weights=[0.5, 0.5],
random_state=self._random_state)
return X, y
def test_attributes_in_leaves(self):
"""Check if the attributes in leaves have correct values so they form a predictor
"""Check if the attributes in leaves have correct values so they form a
predictor
"""
def check_leave(node: Snode):
@@ -292,7 +327,7 @@ class Snode_test(unittest.TestCase):
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except:
except ZeroDivisionError:
belief = 0.
else:
belief = 1

View File

@@ -1,227 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9579\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.987013 counts=(array([0, 1]), array([ 4, 304]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945652 counts=(array([0, 1]), array([696, 40]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9579\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.990196 counts=(array([0, 1]), array([ 3, 303]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.944444 counts=(array([0, 1]), array([697, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9693\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([6]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief=0.955923 counts=(array([0, 1]), array([694, 32]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9713\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([314]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([6]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.958564 counts=(array([0, 1]), array([694, 30]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9780\nClassifier's accuracy (test) : 0.9420\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682, 23]))\n\n**************************************************\n0.4537 secs\n"
}
],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
" clf = Stree(C=C, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" print(f\"************** C={C} ****************************\")\n",
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
" print(clf)\n",
" print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"scaler = StandardScaler()\n",
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"#an array containing probabilities of belonging to the 1st class"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682, 23]))\n"
}
],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682, 23]))\n"
}
],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
"check_estimator(Stree())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12d18e0e0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12d185200>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12d1850e0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12d17eb00>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12d17ec20>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12d17ed40>, 'Stree')\n7 functools.partial(<function check_sample_weights_invariance at 0x12d17ee60>, 'Stree')\n8 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree', readonly_memmap=True)\n10 functools.partial(<function check_complex_data at 0x12d181050>, 'Stree')\n11 functools.partial(<function check_dtype_object at 0x12d17ef80>, 'Stree')\n12 functools.partial(<function check_estimators_empty_data_messages at 0x12d185320>, 'Stree')\n13 functools.partial(<function check_pipeline_consistency at 0x12d181f80>, 'Stree')\n14 functools.partial(<function check_estimators_nan_inf at 0x12d185440>, 'Stree')\n15 functools.partial(<function check_estimators_overwrite_params at 0x12d189f80>, 'Stree')\n16 functools.partial(<function check_estimator_sparse_data at 0x12d17e9e0>, 'Stree')\n17 functools.partial(<function check_estimators_pickle at 0x12d185680>, 'Stree')\n18 functools.partial(<function check_classifier_data_not_an_array at 0x12d18e320>, 'Stree')\n19 functools.partial(<function check_classifiers_one_label at 0x12d185d40>, 'Stree')\n20 functools.partial(<function check_classifiers_classes at 0x12d1897a0>, 'Stree')\n21 functools.partial(<function check_estimators_partial_fit_n_features at 0x12d1857a0>, 'Stree')\n22 functools.partial(<function check_classifiers_train at 0x12d185e60>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x12d185e60>, 'Stree', readonly_memmap=True)\n24 functools.partial(<function check_classifiers_regression_target at 0x12d18ed40>, 'Stree')\n25 functools.partial(<function check_supervised_y_no_nan at 0x12d17cb00>, 'Stree')\n26 functools.partial(<function check_supervised_y_2d at 0x12d189440>, 'Stree')\n27 functools.partial(<function check_estimators_unfitted at 0x12d189320>, 'Stree')\n28 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12d18e8c0>, 'Stree')\n29 functools.partial(<function check_decision_proba_consistency at 0x12d18ee60>, 'Stree')\n30 functools.partial(<function check_fit2d_predict1d at 0x12d181560>, 'Stree')\n31 functools.partial(<function check_methods_subset_invariance at 0x12d181710>, 'Stree')\n32 functools.partial(<function check_fit2d_1sample at 0x12d181830>, 'Stree')\n33 functools.partial(<function check_fit2d_1feature at 0x12d181950>, 'Stree')\n34 functools.partial(<function check_fit1d at 0x12d181a70>, 'Stree')\n35 functools.partial(<function check_get_params_invariance at 0x12d18eb00>, 'Stree')\n36 functools.partial(<function check_set_params at 0x12d18ec20>, 'Stree')\n37 functools.partial(<function check_dict_unchanged at 0x12d181170>, 'Stree')\n38 functools.partial(<function check_dont_overwrite_parameters at 0x12d181440>, 'Stree')\n39 functools.partial(<function check_fit_idempotent at 0x12d192050>, 'Stree')\n"
}
],
"source": [
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Stree(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because one or more lines are too long