mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-16 07:56:06 +00:00
Merge pull request #5 from Doctorado-ML/add_kernels
#3 Add kernels to STree
This commit is contained in:
14
README.md
14
README.md
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
# Stree
|
# Stree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and splitted with sklearn LinearSVC models.Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@@ -18,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
##### Slow launch but better integration
|
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
##### Fast launch but have to run first commented out cell for setup
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||||
|
|
||||||
### Command line
|
### Command line
|
||||||
|
|
||||||
|
@@ -1,15 +1,42 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test AdaBoost with different configurations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import time\n",
|
"import time\n",
|
||||||
"from sklearn.ensemble import AdaBoostClassifier\n",
|
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
"from sklearn.svm import LinearSVC\n",
|
"from sklearn.svm import LinearSVC, SVC\n",
|
||||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||||
"from sklearn.datasets import load_iris\n",
|
"from sklearn.datasets import load_iris\n",
|
||||||
"from stree import Stree"
|
"from stree import Stree"
|
||||||
@@ -17,7 +44,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -29,13 +56,13 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -68,9 +95,10 @@
|
|||||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
"# data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
||||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
"# data = load_creditcard(0) # Take all the samples\n",
|
"# data = load_creditcard(0) # Take all the samples\n",
|
||||||
|
"data = load_creditcard(-100000)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Xtrain = data[0]\n",
|
"Xtrain = data[0]\n",
|
||||||
"Xtest = data[1]\n",
|
"Xtest = data[1]\n",
|
||||||
@@ -78,15 +106,29 @@
|
|||||||
"ytest = data[3]"
|
"ytest = data[3]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## STree alone on the whole dataset and linear kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n"
|
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -99,43 +141,21 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"## Different kernels with different configurations"
|
||||||
"clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
|
|
||||||
"clf2.fit(Xtrain, ytrain)\n",
|
|
||||||
"print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
|
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"n_estimators = 10\n",
|
||||||
"clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
|
"C = 7\n",
|
||||||
"clf3.fit(Xtrain, ytrain)\n",
|
"max_depth = 3"
|
||||||
"print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
|
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -146,24 +166,46 @@
|
|||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n"
|
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
"clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
|
" now = time.time()\n",
|
||||||
"clf4.fit(Xtrain, ytrain)\n",
|
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
|
||||||
"print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
"print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
"source": []
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@@ -1,5 +1,20 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Compare STree with different estimators"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
@@ -40,6 +55,13 @@
|
|||||||
" !tar xzf creditcard.tgz"
|
" !tar xzf creditcard.tgz"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 4,
|
||||||
@@ -55,6 +77,13 @@
|
|||||||
"print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
|
"print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Load dataset and normalize values"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 5,
|
||||||
@@ -113,6 +142,13 @@
|
|||||||
"print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")"
|
"print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build the models"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 9,
|
||||||
@@ -174,6 +210,13 @@
|
|||||||
"gradient = GradientBoostingClassifier(random_state=random_state)"
|
"gradient = GradientBoostingClassifier(random_state=random_state)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Do the test"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 15,
|
File diff suppressed because one or more lines are too long
370
notebooks/features.ipynb
Normal file
370
notebooks/features.ipynb
Normal file
@@ -0,0 +1,370 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test sample_weight, kernels, C, sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.svm import SVC\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||||
|
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from stree import Stree\n",
|
||||||
|
"import time"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" import pandas as pd\n",
|
||||||
|
" import numpy as np\n",
|
||||||
|
" import random\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = df.Class\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" # Take first n_examples samples\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Take all the positive samples with a number of random negatives\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
"\n",
|
||||||
|
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
||||||
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
|
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
||||||
|
"\n",
|
||||||
|
"Xtrain = data[0]\n",
|
||||||
|
"Xtest = data[1]\n",
|
||||||
|
"ytrain = data[2]\n",
|
||||||
|
"ytest = data[3]\n",
|
||||||
|
"# Set weights inverse to its count class in dataset\n",
|
||||||
|
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
|
||||||
|
"weights[ytrain==1] = 1.99755\n",
|
||||||
|
"weights_test = np.ones(Xtest.shape[0],) * 1.00244\n",
|
||||||
|
"weights_test[ytest==1] = 1.99755 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test sample_weights\n",
|
||||||
|
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Accuracy of Train without weights 0.9789272030651341\nAccuracy of Train with weights 0.9952107279693486\nAccuracy of Tests without weights 0.9598214285714286\nAccuracy of Tests with weights 0.9508928571428571\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"C = 23\n",
|
||||||
|
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
|
||||||
|
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test accuracy with different kernels\n",
|
||||||
|
"Compute accuracy on train and test set with default hyperparameters of every kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9683908045977011\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9875478927203065\tAccuracy_test: 0.9598214285714286\nTime: 0.06s\tKernel: poly\tAccuracy_train: 0.9885057471264368\tAccuracy_test: 0.9464285714285714\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" time_spent = time.time() - now\n",
|
||||||
|
" print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test different values of C"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"outputPrepend"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9531\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.983713 counts=(array([0, 1]), array([ 5, 302]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.940299 counts=(array([0, 1]), array([693, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.943012 counts=(array([0, 1]), array([695, 42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9655\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.950617 counts=(array([0, 1]), array([693, 36]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf 
class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.954039 counts=(array([0, 1]), array([685, 33]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9751\nClassifier's accuracy (test) : 0.9464\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n\n**************************************************\n0.6869 secs\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"t = time.time()\n",
|
||||||
|
"for C in (.001, .01, 1, 5, 17):\n",
|
||||||
|
" clf = Stree(C=C, kernel=\"linear\", random_state=random_state)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" print(f\"************** C={C} ****************************\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
||||||
|
" print(clf)\n",
|
||||||
|
" print(f\"**************************************************\")\n",
|
||||||
|
"print(f\"{time.time() - t:.4f} secs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test iterator\n",
|
||||||
|
"Check different ways of using the iterator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator\n",
|
||||||
|
"for i in list(clf):\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator again\n",
|
||||||
|
"for i in clf:\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test STree is a sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x1254f13b0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x1254e84d0>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x1254e83b0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x1254e0cb0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x1254e0dd0>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x1254e0ef0>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x1254e2050>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x1254e2170>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x1254eb4d0>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x1254eb4d0>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x1254e2320>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x1254e2290>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x1254e85f0>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x1254e8290>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x1254e8710>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x1254f1290>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x1254e0b90>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x1254e8950>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x1254f15f0>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x1254eb050>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x1254eba70>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x1254e8a70>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 
0x1254eb170>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x1254eb170>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x1254eb170>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x1254f40e0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x1254da9e0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x1254eb710>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x1254eb5f0>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x1254f1c20>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x1254f4200>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x1254e2830>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x1254e29e0>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x1254e2b00>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x1254e2c20>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x1254e2d40>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x1254f1e60>, 'Stree')\n38 functools.partial(<function check_set_params at 0x1254f1f80>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x1254e2440>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x1254e2710>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x1254f43b0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x1254f4440>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x1254f44d0>, 'Stree')\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Make checks one by one\n",
|
||||||
|
"c = 0\n",
|
||||||
|
"checks = check_estimator(Stree(), generate_only=True)\n",
|
||||||
|
"for check in checks:\n",
|
||||||
|
" c += 1\n",
|
||||||
|
" print(c, check[1])\n",
|
||||||
|
" check[1](check[0])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Check if the classifier is a sklearn estimator\n",
|
||||||
|
"check_estimator(Stree())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Compare to SVM"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "== Not Weighted ===\nSVC train score ..: 0.9521072796934866\nSTree train score : 0.9578544061302682\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9575892857142857\n==== Weighted =====\nSVC train score ..: 0.9616858237547893\nSTree train score : 0.9616858237547893\nSVC test score ...: 0.9642857142857143\nSTree test score .: 0.9598214285714286\n*SVC test score ..: 0.951413553411694\n*STree test score : 0.9480517444389333\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"svc.fit(Xtrain, ytrain)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain)\n",
|
||||||
|
"print(\"== Not Weighted ===\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"svc.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"print(\"==== Weighted =====\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"print(\"*SVC test score ..:\", svc.score(Xtest, ytest, weights_test))\n",
|
||||||
|
"print(\"*STree test score :\", clf.score(Xtest, ytest, weights_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.969325 counts=(array([0, 1]), array([ 10, 316]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.958159 counts=(array([0, 1]), array([687, 30]))\n\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(clf)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.6-final"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
@@ -1,225 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#\n",
|
|
||||||
"# Google Colab setup\n",
|
|
||||||
"#\n",
|
|
||||||
"#!pip install git+https://github.com/doctorado-ml/stree"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
||||||
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from stree import Stree\n",
|
|
||||||
"import time"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"if not os.path.isfile('data/creditcard.csv'):\n",
|
|
||||||
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
|
||||||
" !tar xzf creditcard.tgz"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 19,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"random_state=1\n",
|
|
||||||
"\n",
|
|
||||||
"def load_creditcard(n_examples=0):\n",
|
|
||||||
" import pandas as pd\n",
|
|
||||||
" import numpy as np\n",
|
|
||||||
" import random\n",
|
|
||||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
|
||||||
" y = df.Class\n",
|
|
||||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
|
||||||
" if n_examples > 0:\n",
|
|
||||||
" # Take first n_examples samples\n",
|
|
||||||
" X = X[:n_examples, :]\n",
|
|
||||||
" y = y[:n_examples, :]\n",
|
|
||||||
" else:\n",
|
|
||||||
" # Take all the positive samples with a number of random negatives\n",
|
|
||||||
" if n_examples < 0:\n",
|
|
||||||
" Xt = X[(y == 1).ravel()]\n",
|
|
||||||
" yt = y[(y == 1).ravel()]\n",
|
|
||||||
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
|
||||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
|
||||||
" y = np.append(yt, y[indices], axis=0)\n",
|
|
||||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
|
||||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
|
||||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
|
||||||
"\n",
|
|
||||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
|
||||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
|
||||||
"data = load_creditcard(-1000) # Take all the samples\n",
|
|
||||||
"\n",
|
|
||||||
"Xtrain = data[0]\n",
|
|
||||||
"Xtest = data[1]\n",
|
|
||||||
"ytrain = data[2]\n",
|
|
||||||
"ytest = data[3]\n",
|
|
||||||
"# Set weights inverse to its count class in dataset\n",
|
|
||||||
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
|
|
||||||
"weights[ytrain==1] = 1.99755 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 21,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Accuracy of Train without weights 0.996415770609319\nAccuracy of Train with weights 0.994026284348865\nAccuracy of Tests without weights 0.9665738161559888\nAccuracy of Tests with weights 0.9721448467966574\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"C = 23\n",
|
|
||||||
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
|
|
||||||
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"tags": [
|
|
||||||
"outputPrepend"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"t = time.time()\n",
|
|
||||||
"for C in (.001, .01, 1, 5, 17):\n",
|
|
||||||
" clf = Stree(C=C, random_state=random_state)\n",
|
|
||||||
" clf.fit(Xtrain, ytrain)\n",
|
|
||||||
" print(f\"************** C={C} ****************************\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
|
||||||
" print(clf)\n",
|
|
||||||
" print(f\"**************************************************\")\n",
|
|
||||||
"print(f\"{time.time() - t:.4f} secs\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.calibration import CalibratedClassifierCV\n",
|
|
||||||
"scaler = StandardScaler()\n",
|
|
||||||
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
|
|
||||||
"cclf.fit(Xtrain, ytrain)\n",
|
|
||||||
"res = cclf.predict_proba(Xtest)\n",
|
|
||||||
"print(res[:4, :])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#check iterator\n",
|
|
||||||
"for i in list(clf):\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#check iterator again\n",
|
|
||||||
"for i in clf:\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Check if the classifier is a sklearn estimator\n",
|
|
||||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
|
||||||
"check_estimator(Stree())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Make checks one by one\n",
|
|
||||||
"c = 0\n",
|
|
||||||
"checks = check_estimator(Stree(), generate_only=True)\n",
|
|
||||||
"for check in checks:\n",
|
|
||||||
" c += 1\n",
|
|
||||||
" print(c, check[1])\n",
|
|
||||||
" check[1](check[0])"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.7.6-final"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
121
stree/Strees.py
121
stree/Strees.py
@@ -4,14 +4,14 @@ __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
|||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
__version__ = "0.9"
|
__version__ = "0.9"
|
||||||
Build an oblique tree classifier based on SVM Trees
|
Build an oblique tree classifier based on SVM Trees
|
||||||
Uses LinearSVC
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
|
from sklearn.utils import check_consistent_length
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
from sklearn.utils.validation import (
|
from sklearn.utils.validation import (
|
||||||
check_X_y,
|
check_X_y,
|
||||||
@@ -19,6 +19,8 @@ from sklearn.utils.validation import (
|
|||||||
check_is_fitted,
|
check_is_fitted,
|
||||||
_check_sample_weight,
|
_check_sample_weight,
|
||||||
)
|
)
|
||||||
|
from sklearn.utils.sparsefuncs import count_nonzero
|
||||||
|
from sklearn.metrics._classification import _weighted_sum, _check_targets
|
||||||
|
|
||||||
|
|
||||||
class Snode:
|
class Snode:
|
||||||
@@ -26,12 +28,8 @@ class Snode:
|
|||||||
dataset assigned to it
|
dataset assigned to it
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str):
|
||||||
self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str
|
|
||||||
):
|
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._vector = None if clf is None else clf.coef_
|
|
||||||
self._interceptor = 0.0 if clf is None else clf.intercept_
|
|
||||||
self._title = title
|
self._title = title
|
||||||
self._belief = 0.0
|
self._belief = 0.0
|
||||||
# Only store dataset in Testing
|
# Only store dataset in Testing
|
||||||
@@ -70,14 +68,14 @@ class Snode:
|
|||||||
if len(classes) > 1:
|
if len(classes) > 1:
|
||||||
max_card = max(card)
|
max_card = max(card)
|
||||||
min_card = min(card)
|
min_card = min(card)
|
||||||
try:
|
|
||||||
self._belief = max_card / (max_card + min_card)
|
|
||||||
except ZeroDivisionError:
|
|
||||||
self._belief = 0.0
|
|
||||||
self._class = classes[card == max_card][0]
|
self._class = classes[card == max_card][0]
|
||||||
|
self._belief = max_card / (max_card + min_card)
|
||||||
else:
|
else:
|
||||||
self._belief = 1
|
self._belief = 1
|
||||||
self._class = classes[0]
|
try:
|
||||||
|
self._class = classes[0]
|
||||||
|
except IndexError:
|
||||||
|
self._class = None
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
if self.is_leaf():
|
if self.is_leaf():
|
||||||
@@ -126,19 +124,23 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
C: float = 1.0,
|
C: float = 1.0,
|
||||||
|
kernel: str = "linear",
|
||||||
max_iter: int = 1000,
|
max_iter: int = 1000,
|
||||||
random_state: int = None,
|
random_state: int = None,
|
||||||
max_depth: int = None,
|
max_depth: int = None,
|
||||||
tol: float = 1e-4,
|
tol: float = 1e-4,
|
||||||
use_predictions: bool = False,
|
degree: int = 3,
|
||||||
|
gamma="scale",
|
||||||
min_samples_split: int = 0,
|
min_samples_split: int = 0,
|
||||||
):
|
):
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
self.C = C
|
self.C = C
|
||||||
|
self.kernel = kernel
|
||||||
self.random_state = random_state
|
self.random_state = random_state
|
||||||
self.use_predictions = use_predictions
|
|
||||||
self.max_depth = max_depth
|
self.max_depth = max_depth
|
||||||
self.tol = tol
|
self.tol = tol
|
||||||
|
self.gamma = gamma
|
||||||
|
self.degree = degree
|
||||||
self.min_samples_split = min_samples_split
|
self.min_samples_split = min_samples_split
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
def _more_tags(self) -> dict:
|
||||||
@@ -149,21 +151,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
"""
|
"""
|
||||||
return {"binary_only": True, "requires_y": True}
|
return {"binary_only": True, "requires_y": True}
|
||||||
|
|
||||||
def _linear_function(self, data: np.array, node: Snode) -> np.array:
|
|
||||||
"""Compute the distance of set of samples to a hyperplane, in
|
|
||||||
multiclass classification it should compute the distance to a
|
|
||||||
hyperplane of each class
|
|
||||||
|
|
||||||
:param data: dataset of samples
|
|
||||||
:type data: np.array shape(m, n)
|
|
||||||
:param node: the node that contains the hyperplance coefficients
|
|
||||||
:type node: Snode shape(1, n)
|
|
||||||
:return: array of distances of each sample to the hyperplane
|
|
||||||
:rtype: np.array
|
|
||||||
"""
|
|
||||||
coef = node._vector[0, :].reshape(-1, data.shape[1])
|
|
||||||
return data.dot(coef.T) + node._interceptor[0]
|
|
||||||
|
|
||||||
def _split_array(self, origin: np.array, down: np.array) -> list:
|
def _split_array(self, origin: np.array, down: np.array) -> list:
|
||||||
"""Split an array in two based on indices passed as down and its complement
|
"""Split an array in two based on indices passed as down and its complement
|
||||||
|
|
||||||
@@ -191,13 +178,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
the hyperplane of the node
|
the hyperplane of the node
|
||||||
:rtype: np.array
|
:rtype: np.array
|
||||||
"""
|
"""
|
||||||
if self.use_predictions:
|
res = node._clf.decision_function(data)
|
||||||
res = np.expand_dims(node._clf.decision_function(data), 1)
|
if res.ndim == 1:
|
||||||
else:
|
return np.expand_dims(res, 1)
|
||||||
# doesn't work with multiclass as each sample has to do inner
|
elif res.shape[1] > 1:
|
||||||
# product with its own coefficients computes positition of every
|
# remove multiclass info
|
||||||
# sample is w.r.t. the hyperplane
|
res = np.delete(res, slice(1, res.shape[1]), axis=1)
|
||||||
res = self._linear_function(data, node)
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def _split_criteria(self, data: np.array) -> np.array:
|
def _split_criteria(self, data: np.array) -> np.array:
|
||||||
@@ -219,13 +205,18 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
) -> "Stree":
|
) -> "Stree":
|
||||||
"""Build the tree based on the dataset of samples and its labels
|
"""Build the tree based on the dataset of samples and its labels
|
||||||
|
|
||||||
|
:param X: dataset of samples to make predictions
|
||||||
|
:type X: np.array
|
||||||
|
:param y: samples labels
|
||||||
|
:type y: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Hi' weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
:raises ValueError: if parameters C or max_depth are out of bounds
|
:raises ValueError: if parameters C or max_depth are out of bounds
|
||||||
:return: itself to be able to chain actions: fit().predict() ...
|
:return: itself to be able to chain actions: fit().predict() ...
|
||||||
:rtype: Stree
|
:rtype: Stree
|
||||||
"""
|
"""
|
||||||
# Check parameters are Ok.
|
# Check parameters are Ok.
|
||||||
if type(y).__name__ == "np.ndarray":
|
|
||||||
y = y.ravel()
|
|
||||||
if self.C < 0:
|
if self.C < 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Penalty term must be positive... got (C={self.C:f})"
|
f"Penalty term must be positive... got (C={self.C:f})"
|
||||||
@@ -266,6 +257,27 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
|
|
||||||
run_tree(self.tree_)
|
run_tree(self.tree_)
|
||||||
|
|
||||||
|
def _build_clf(self):
|
||||||
|
""" Build the correct classifier for the node
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
LinearSVC(
|
||||||
|
max_iter=self.max_iter,
|
||||||
|
random_state=self.random_state,
|
||||||
|
C=self.C,
|
||||||
|
tol=self.tol,
|
||||||
|
)
|
||||||
|
if self.kernel == "linear"
|
||||||
|
else SVC(
|
||||||
|
kernel=self.kernel,
|
||||||
|
max_iter=self.max_iter,
|
||||||
|
tol=self.tol,
|
||||||
|
C=self.C,
|
||||||
|
gamma=self.gamma,
|
||||||
|
degree=self.degree,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def train(
|
def train(
|
||||||
self,
|
self,
|
||||||
X: np.ndarray,
|
X: np.ndarray,
|
||||||
@@ -281,7 +293,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:type X: np.ndarray
|
:type X: np.ndarray
|
||||||
:param y: samples labels
|
:param y: samples labels
|
||||||
:type y: np.ndarray
|
:type y: np.ndarray
|
||||||
:param sample_weight: weight of samples (used in boosting)
|
:param sample_weight: weight of samples. Rescale C per sample.
|
||||||
|
Hi weights force the classifier to put more emphasis on these points.
|
||||||
:type sample_weight: np.ndarray
|
:type sample_weight: np.ndarray
|
||||||
:param depth: actual depth in the tree
|
:param depth: actual depth in the tree
|
||||||
:type depth: int
|
:type depth: int
|
||||||
@@ -296,9 +309,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
# only 1 class => pure dataset
|
# only 1 class => pure dataset
|
||||||
return Snode(None, X, y, title + ", <pure>")
|
return Snode(None, X, y, title + ", <pure>")
|
||||||
# Train the model
|
# Train the model
|
||||||
clf = LinearSVC(
|
clf = self._build_clf()
|
||||||
max_iter=self.max_iter, random_state=self.random_state, C=self.C
|
|
||||||
) # , sample_weight=sample_weight)
|
|
||||||
clf.fit(X, y, sample_weight=sample_weight)
|
clf.fit(X, y, sample_weight=sample_weight)
|
||||||
tree = Snode(clf, X, y, title)
|
tree = Snode(clf, X, y, title)
|
||||||
self.depth_ = max(depth, self.depth_)
|
self.depth_ = max(depth, self.depth_)
|
||||||
@@ -434,20 +445,36 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
result[:, 0] = 1 - result[:, 1]
|
result[:, 0] = 1 - result[:, 1]
|
||||||
return self._reorder_results(result, indices)
|
return self._reorder_results(result, indices)
|
||||||
|
|
||||||
def score(self, X: np.array, y: np.array) -> float:
|
def score(
|
||||||
|
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||||
|
) -> float:
|
||||||
"""Compute accuracy of the prediction
|
"""Compute accuracy of the prediction
|
||||||
|
|
||||||
:param X: dataset of samples to make predictions
|
:param X: dataset of samples to make predictions
|
||||||
:type X: np.array
|
:type X: np.array
|
||||||
:param y: samples labels
|
:param y_true: samples labels
|
||||||
:type y: np.array
|
:type y_true: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Hi' weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
:return: accuracy of the prediction
|
:return: accuracy of the prediction
|
||||||
:rtype: float
|
:rtype: float
|
||||||
"""
|
"""
|
||||||
# sklearn check
|
# sklearn check
|
||||||
check_is_fitted(self)
|
check_is_fitted(self)
|
||||||
yp = self.predict(X).reshape(y.shape)
|
check_classification_targets(y)
|
||||||
return np.mean(yp == y)
|
X, y = check_X_y(X, y)
|
||||||
|
y_pred = self.predict(X).reshape(y.shape)
|
||||||
|
# Compute accuracy for each possible representation
|
||||||
|
y_type, y_true, y_pred = _check_targets(y, y_pred)
|
||||||
|
check_consistent_length(y_true, y_pred, sample_weight)
|
||||||
|
if y_type.startswith("multilabel"):
|
||||||
|
differing_labels = count_nonzero(y_true - y_pred, axis=1)
|
||||||
|
score = differing_labels == 0
|
||||||
|
else:
|
||||||
|
score = y_true == y_pred
|
||||||
|
|
||||||
|
return _weighted_sum(score, sample_weight, normalize=True)
|
||||||
|
|
||||||
def __iter__(self) -> Siterator:
|
def __iter__(self) -> Siterator:
|
||||||
"""Create an iterator to be able to visit the nodes of the tree in preorder,
|
"""Create an iterator to be able to visit the nodes of the tree in preorder,
|
||||||
|
@@ -41,6 +41,9 @@ class Snode_graph(Snode):
|
|||||||
def set_axis_limits(self, limits: tuple):
|
def set_axis_limits(self, limits: tuple):
|
||||||
self._xlimits, self._ylimits, self._zlimits = limits
|
self._xlimits, self._ylimits, self._zlimits = limits
|
||||||
|
|
||||||
|
def get_axis_limits(self) -> tuple:
|
||||||
|
return self._xlimits, self._ylimits, self._zlimits
|
||||||
|
|
||||||
def _set_graphics_axis(self, ax: Axes3D):
|
def _set_graphics_axis(self, ax: Axes3D):
|
||||||
ax.set_xlim(self._xlimits)
|
ax.set_xlim(self._xlimits)
|
||||||
ax.set_ylim(self._ylimits)
|
ax.set_ylim(self._ylimits)
|
||||||
@@ -50,7 +53,7 @@ class Snode_graph(Snode):
|
|||||||
self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
|
self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
|
||||||
):
|
):
|
||||||
_, fig = self.plot_hyperplane()
|
_, fig = self.plot_hyperplane()
|
||||||
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
|
name = os.path.join(save_folder, f"{save_prefix}STnode{save_seq}.png")
|
||||||
fig.savefig(name, bbox_inches="tight")
|
fig.savefig(name, bbox_inches="tight")
|
||||||
plt.close(fig)
|
plt.close(fig)
|
||||||
|
|
||||||
@@ -73,10 +76,10 @@ class Snode_graph(Snode):
|
|||||||
# get the splitting hyperplane
|
# get the splitting hyperplane
|
||||||
def hyperplane(x, y):
|
def hyperplane(x, y):
|
||||||
return (
|
return (
|
||||||
-self._interceptor
|
-self._clf.intercept_
|
||||||
- self._vector[0][0] * x
|
- self._clf.coef_[0][0] * x
|
||||||
- self._vector[0][1] * y
|
- self._clf.coef_[0][1] * y
|
||||||
) / self._vector[0][2]
|
) / self._clf.coef_[0][2]
|
||||||
|
|
||||||
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
|
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
|
||||||
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
|
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
|
||||||
|
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
|
|||||||
import warnings
|
import warnings
|
||||||
from sklearn.datasets import make_classification
|
from sklearn.datasets import make_classification
|
||||||
|
|
||||||
from stree import Stree_grapher, Snode_graph
|
from stree import Stree_grapher, Snode_graph, Snode
|
||||||
|
|
||||||
|
|
||||||
def get_dataset(random_state=0, n_features=3):
|
def get_dataset(random_state=0, n_features=3):
|
||||||
@@ -30,20 +30,14 @@ def get_dataset(random_state=0, n_features=3):
|
|||||||
|
|
||||||
class Stree_grapher_test(unittest.TestCase):
|
class Stree_grapher_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
os.environ["TESTING"] = "1"
|
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree_grapher(
|
self._clf = Stree_grapher(dict(random_state=self._random_state))
|
||||||
dict(random_state=self._random_state, use_predictions=False)
|
|
||||||
)
|
|
||||||
self._clf.fit(*get_dataset(self._random_state, n_features=4))
|
self._clf.fit(*get_dataset(self._random_state, n_features=4))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def setUp(cls):
|
||||||
try:
|
os.environ["TESTING"] = "1"
|
||||||
os.environ.pop("TESTING")
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_iterator(self):
|
def test_iterator(self):
|
||||||
"""Check preorder iterator
|
"""Check preorder iterator
|
||||||
@@ -75,8 +69,12 @@ class Stree_grapher_test(unittest.TestCase):
|
|||||||
self.assertGreater(accuracy_score, 0.86)
|
self.assertGreater(accuracy_score, 0.86)
|
||||||
|
|
||||||
def test_save_all(self):
|
def test_save_all(self):
|
||||||
folder_name = "/tmp/"
|
folder_name = os.path.join(os.sep, "tmp", "stree")
|
||||||
file_names = [f"{folder_name}STnode{i}.png" for i in range(1, 8)]
|
if os.path.isdir(folder_name):
|
||||||
|
os.rmdir(folder_name)
|
||||||
|
file_names = [
|
||||||
|
os.path.join(folder_name, f"STnode{i}.png") for i in range(1, 8)
|
||||||
|
]
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
matplotlib.use("Agg")
|
matplotlib.use("Agg")
|
||||||
@@ -85,6 +83,7 @@ class Stree_grapher_test(unittest.TestCase):
|
|||||||
self.assertTrue(os.path.exists(file_name))
|
self.assertTrue(os.path.exists(file_name))
|
||||||
self.assertEqual("png", imghdr.what(file_name))
|
self.assertEqual("png", imghdr.what(file_name))
|
||||||
os.remove(file_name)
|
os.remove(file_name)
|
||||||
|
os.rmdir(folder_name)
|
||||||
|
|
||||||
def test_plot_all(self):
|
def test_plot_all(self):
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
@@ -98,22 +97,14 @@ class Stree_grapher_test(unittest.TestCase):
|
|||||||
|
|
||||||
class Snode_graph_test(unittest.TestCase):
|
class Snode_graph_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
os.environ["TESTING"] = "1"
|
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree_grapher(
|
self._clf = Stree_grapher(dict(random_state=self._random_state))
|
||||||
dict(random_state=self._random_state, use_predictions=False)
|
|
||||||
)
|
|
||||||
self._clf.fit(*get_dataset(self._random_state))
|
self._clf.fit(*get_dataset(self._random_state))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def setUp(cls):
|
||||||
"""Remove the testing environ variable
|
os.environ["TESTING"] = "1"
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.environ.pop("TESTING")
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_plot_size(self):
|
def test_plot_size(self):
|
||||||
default = self._clf._tree_gr.get_plot_size()
|
default = self._clf._tree_gr.get_plot_size()
|
||||||
@@ -160,8 +151,6 @@ class Snode_graph_test(unittest.TestCase):
|
|||||||
# only exclude pure leaves
|
# only exclude pure leaves
|
||||||
self.assertIsNotNone(node._clf)
|
self.assertIsNotNone(node._clf)
|
||||||
self.assertIsNotNone(node._clf.coef_)
|
self.assertIsNotNone(node._clf.coef_)
|
||||||
self.assertIsNotNone(node._vector)
|
|
||||||
self.assertIsNotNone(node._interceptor)
|
|
||||||
if node.is_leaf():
|
if node.is_leaf():
|
||||||
return
|
return
|
||||||
run_tree(node.get_down())
|
run_tree(node.get_down())
|
||||||
@@ -171,7 +160,7 @@ class Snode_graph_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_save_hyperplane(self):
|
def test_save_hyperplane(self):
|
||||||
folder_name = "/tmp/"
|
folder_name = "/tmp/"
|
||||||
file_name = f"{folder_name}STnode1.png"
|
file_name = os.path.join(folder_name, "STnode1.png")
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore")
|
warnings.simplefilter("ignore")
|
||||||
matplotlib.use("Agg")
|
matplotlib.use("Agg")
|
||||||
@@ -209,3 +198,14 @@ class Snode_graph_test(unittest.TestCase):
|
|||||||
self._clf._tree_gr.plot_distribution()
|
self._clf._tree_gr.plot_distribution()
|
||||||
num_figures_after = plt.gcf().number
|
num_figures_after = plt.gcf().number
|
||||||
self.assertEqual(1, num_figures_after - num_figures_before)
|
self.assertEqual(1, num_figures_after - num_figures_before)
|
||||||
|
|
||||||
|
def test_set_axis_limits(self):
|
||||||
|
node = Snode_graph(Snode(None, None, None, "test"))
|
||||||
|
limits = (-2, 2), (-3, 3), (-4, 4)
|
||||||
|
node.set_axis_limits(limits)
|
||||||
|
computed = node.get_axis_limits()
|
||||||
|
x, y, z = limits
|
||||||
|
xx, yy, zz = computed
|
||||||
|
self.assertEqual(x, xx)
|
||||||
|
self.assertEqual(y, yy)
|
||||||
|
self.assertEqual(z, zz)
|
||||||
|
@@ -26,20 +26,13 @@ def get_dataset(random_state=0):
|
|||||||
|
|
||||||
class Stree_test(unittest.TestCase):
|
class Stree_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
os.environ["TESTING"] = "1"
|
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree(
|
self._kernels = ["linear", "rbf", "poly"]
|
||||||
random_state=self._random_state, use_predictions=False
|
|
||||||
)
|
|
||||||
self._clf.fit(*get_dataset(self._random_state))
|
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def setUp(cls):
|
||||||
try:
|
os.environ["TESTING"] = "1"
|
||||||
os.environ.pop("TESTING")
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _check_tree(self, node: Snode):
|
def _check_tree(self, node: Snode):
|
||||||
"""Check recursively that the nodes that are not leaves have the
|
"""Check recursively that the nodes that are not leaves have the
|
||||||
@@ -82,23 +75,13 @@ class Stree_test(unittest.TestCase):
|
|||||||
def test_build_tree(self):
|
def test_build_tree(self):
|
||||||
"""Check if the tree is built the same way as predictions of models
|
"""Check if the tree is built the same way as predictions of models
|
||||||
"""
|
"""
|
||||||
self._check_tree(self._clf.tree_)
|
import warnings
|
||||||
|
|
||||||
def _get_file_data(self, file_name: str) -> tuple:
|
warnings.filterwarnings("ignore")
|
||||||
"""Return X, y from data, y is the last column in array
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
Arguments:
|
clf.fit(*get_dataset(self._random_state))
|
||||||
file_name {str} -- the file name
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple -- tuple with samples, categories
|
|
||||||
"""
|
|
||||||
data = np.genfromtxt(file_name, delimiter=",")
|
|
||||||
data = np.array(data)
|
|
||||||
column_y = data.shape[1] - 1
|
|
||||||
fy = data[:, column_y]
|
|
||||||
fx = np.delete(data, column_y, axis=1)
|
|
||||||
return fx, fy
|
|
||||||
|
|
||||||
def _find_out(
|
def _find_out(
|
||||||
self, px: np.array, x_original: np.array, y_original
|
self, px: np.array, x_original: np.array, y_original
|
||||||
@@ -121,141 +104,85 @@ class Stree_test(unittest.TestCase):
|
|||||||
return res
|
return res
|
||||||
|
|
||||||
def test_single_prediction(self):
|
def test_single_prediction(self):
|
||||||
|
probs = [0.29026400766, 0.73105613, 0.0307635]
|
||||||
X, y = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
|
for kernel, prob in zip(self._kernels, probs):
|
||||||
self.assertEqual(yp[0], y[0])
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||||
|
self.assertEqual(yp[0], y[0])
|
||||||
|
|
||||||
def test_multiple_prediction(self):
|
def test_multiple_prediction(self):
|
||||||
# First 27 elements the predictions are the same as the truth
|
# First 27 elements the predictions are the same as the truth
|
||||||
num = 27
|
num = 27
|
||||||
X, y = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
yp = self._clf.predict(X[:num, :])
|
for kernel in self._kernels:
|
||||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
yp = clf.fit(X, y).predict(X[:num, :])
|
||||||
|
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||||
|
|
||||||
def test_score(self):
|
def test_score(self):
|
||||||
X, y = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
accuracy_score = self._clf.score(X, y)
|
for kernel, accuracy_expected in zip(
|
||||||
yp = self._clf.predict(X)
|
self._kernels,
|
||||||
accuracy_computed = np.mean(yp == y)
|
[0.9506666666666667, 0.9606666666666667, 0.9433333333333334],
|
||||||
self.assertEqual(accuracy_score, accuracy_computed)
|
):
|
||||||
self.assertGreater(accuracy_score, 0.9)
|
clf = Stree(random_state=self._random_state, kernel=kernel,)
|
||||||
|
clf.fit(X, y)
|
||||||
|
accuracy_score = clf.score(X, y)
|
||||||
|
yp = clf.predict(X)
|
||||||
|
accuracy_computed = np.mean(yp == y)
|
||||||
|
self.assertEqual(accuracy_score, accuracy_computed)
|
||||||
|
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||||
|
|
||||||
def test_single_predict_proba(self):
|
def test_single_predict_proba(self):
|
||||||
"""Check that element 28 has a prediction different that the current
|
"""Check the element 28 probability of being 1
|
||||||
label
|
|
||||||
"""
|
"""
|
||||||
# Element 28 has a different prediction than the truth
|
|
||||||
decimals = 5
|
decimals = 5
|
||||||
prob = 0.29026400766
|
element = 28
|
||||||
|
probs = [0.29026400766, 0.73105613, 0.0307635]
|
||||||
X, y = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
|
self.assertEqual(1, y[element])
|
||||||
self.assertEqual(
|
for kernel, prob in zip(self._kernels, probs):
|
||||||
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
)
|
yp = clf.fit(X, y).predict_proba(
|
||||||
self.assertEqual(1, y[28])
|
X[element, :].reshape(-1, X.shape[1])
|
||||||
|
)
|
||||||
self.assertAlmostEqual(
|
self.assertAlmostEqual(
|
||||||
round(prob, decimals), round(yp[0, 1], decimals), decimals
|
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
|
||||||
)
|
)
|
||||||
|
self.assertAlmostEqual(
|
||||||
|
round(prob, decimals), round(yp[0, 1], decimals), decimals
|
||||||
|
)
|
||||||
|
|
||||||
def test_multiple_predict_proba(self):
|
def test_multiple_predict_proba(self):
|
||||||
# First 27 elements the predictions are the same as the truth
|
# First 27 elements the predictions are the same as the truth
|
||||||
num = 27
|
num = 27
|
||||||
decimals = 5
|
|
||||||
X, y = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
yp = self._clf.predict_proba(X[:num, :])
|
for kernel in self._kernels:
|
||||||
self.assertListEqual(
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
|
clf.fit(X, y)
|
||||||
)
|
yp = clf.predict_proba(X[:num, :])
|
||||||
expected_proba = [
|
self.assertListEqual(
|
||||||
0.88395641,
|
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
|
||||||
0.36746962,
|
)
|
||||||
0.84158767,
|
|
||||||
0.34106833,
|
|
||||||
0.14269291,
|
|
||||||
0.85193236,
|
|
||||||
0.29876058,
|
|
||||||
0.7282164,
|
|
||||||
0.85958616,
|
|
||||||
0.89517877,
|
|
||||||
0.99745224,
|
|
||||||
0.18860349,
|
|
||||||
0.30756427,
|
|
||||||
0.8318412,
|
|
||||||
0.18981198,
|
|
||||||
0.15564624,
|
|
||||||
0.25740655,
|
|
||||||
0.22923355,
|
|
||||||
0.87365959,
|
|
||||||
0.49928689,
|
|
||||||
0.95574351,
|
|
||||||
0.28761257,
|
|
||||||
0.28906333,
|
|
||||||
0.32643692,
|
|
||||||
0.29788483,
|
|
||||||
0.01657364,
|
|
||||||
0.81149083,
|
|
||||||
]
|
|
||||||
expected = np.round(expected_proba, decimals=decimals).tolist()
|
|
||||||
computed = np.round(yp[:, 1], decimals=decimals).tolist()
|
|
||||||
for i in range(len(expected)):
|
|
||||||
self.assertAlmostEqual(expected[i], computed[i], decimals)
|
|
||||||
|
|
||||||
def build_models(self):
|
|
||||||
"""Build and train two models, model_clf will use the sklearn
|
|
||||||
classifier to compute predictions and split data. model_computed will
|
|
||||||
use vector of coefficients to compute both predictions and splitted
|
|
||||||
data
|
|
||||||
"""
|
|
||||||
model_clf = Stree(
|
|
||||||
random_state=self._random_state, use_predictions=True
|
|
||||||
)
|
|
||||||
model_computed = Stree(
|
|
||||||
random_state=self._random_state, use_predictions=False
|
|
||||||
)
|
|
||||||
X, y = get_dataset(self._random_state)
|
|
||||||
model_clf.fit(X, y)
|
|
||||||
model_computed.fit(X, y)
|
|
||||||
return model_clf, model_computed, X, y
|
|
||||||
|
|
||||||
def test_use_model_predict(self):
|
|
||||||
"""Check that we get the same results wether we use the estimator in
|
|
||||||
nodes to compute labels or we use the hyperplane and the position of
|
|
||||||
samples wrt to it
|
|
||||||
"""
|
|
||||||
use_clf, use_math, X, _ = self.build_models()
|
|
||||||
self.assertListEqual(
|
|
||||||
use_clf.predict(X).tolist(), use_math.predict(X).tolist()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_use_model_score(self):
|
|
||||||
use_clf, use_math, X, y = self.build_models()
|
|
||||||
b = use_math.score(X, y)
|
|
||||||
self.assertEqual(use_clf.score(X, y), b)
|
|
||||||
self.assertGreater(b, 0.95)
|
|
||||||
|
|
||||||
def test_use_model_predict_proba(self):
|
|
||||||
use_clf, use_math, X, _ = self.build_models()
|
|
||||||
self.assertListEqual(
|
|
||||||
use_clf.predict_proba(X).tolist(),
|
|
||||||
use_math.predict_proba(X).tolist(),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_single_vs_multiple_prediction(self):
|
def test_single_vs_multiple_prediction(self):
|
||||||
"""Check if predicting sample by sample gives the same result as
|
"""Check if predicting sample by sample gives the same result as
|
||||||
predicting all samples at once
|
predicting all samples at once
|
||||||
"""
|
"""
|
||||||
X, _ = get_dataset(self._random_state)
|
X, y = get_dataset(self._random_state)
|
||||||
# Compute prediction line by line
|
for kernel in self._kernels:
|
||||||
yp_line = np.array([], dtype=int)
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
for xp in X:
|
clf.fit(X, y)
|
||||||
yp_line = np.append(
|
# Compute prediction line by line
|
||||||
yp_line, self._clf.predict(xp.reshape(-1, X.shape[1]))
|
yp_line = np.array([], dtype=int)
|
||||||
)
|
for xp in X:
|
||||||
# Compute prediction at once
|
yp_line = np.append(
|
||||||
yp_once = self._clf.predict(X)
|
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
|
||||||
#
|
)
|
||||||
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
# Compute prediction at once
|
||||||
|
yp_once = clf.predict(X)
|
||||||
|
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||||
|
|
||||||
def test_iterator_and_str(self):
|
def test_iterator_and_str(self):
|
||||||
"""Check preorder iterator
|
"""Check preorder iterator
|
||||||
@@ -275,11 +202,13 @@ class Stree_test(unittest.TestCase):
|
|||||||
]
|
]
|
||||||
computed = []
|
computed = []
|
||||||
expected_string = ""
|
expected_string = ""
|
||||||
for node in self._clf:
|
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||||
|
clf.fit(*get_dataset(self._random_state))
|
||||||
|
for node in clf:
|
||||||
computed.append(str(node))
|
computed.append(str(node))
|
||||||
expected_string += str(node) + "\n"
|
expected_string += str(node) + "\n"
|
||||||
self.assertListEqual(expected, computed)
|
self.assertListEqual(expected, computed)
|
||||||
self.assertEqual(expected_string, str(self._clf))
|
self.assertEqual(expected_string, str(clf))
|
||||||
|
|
||||||
def test_is_a_sklearn_classifier(self):
|
def test_is_a_sklearn_classifier(self):
|
||||||
import warnings
|
import warnings
|
||||||
@@ -306,10 +235,11 @@ class Stree_test(unittest.TestCase):
|
|||||||
tcl.fit(*get_dataset(self._random_state))
|
tcl.fit(*get_dataset(self._random_state))
|
||||||
|
|
||||||
def test_check_max_depth(self):
|
def test_check_max_depth(self):
|
||||||
depth = 3
|
depths = (3, 4)
|
||||||
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
for depth in depths:
|
||||||
tcl.fit(*get_dataset(self._random_state))
|
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
||||||
self.assertEqual(depth, tcl.depth_)
|
tcl.fit(*get_dataset(self._random_state))
|
||||||
|
self.assertEqual(depth, tcl.depth_)
|
||||||
|
|
||||||
def test_unfitted_tree_is_iterable(self):
|
def test_unfitted_tree_is_iterable(self):
|
||||||
tcl = Stree()
|
tcl = Stree()
|
||||||
@@ -326,25 +256,26 @@ class Stree_test(unittest.TestCase):
|
|||||||
self.assertIsNone(tcl_nosplit.tree_.get_down())
|
self.assertIsNone(tcl_nosplit.tree_.get_down())
|
||||||
self.assertIsNone(tcl_nosplit.tree_.get_up())
|
self.assertIsNone(tcl_nosplit.tree_.get_up())
|
||||||
|
|
||||||
|
def test_muticlass_dataset(self):
|
||||||
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
px = [[1, 2], [3, 4], [5, 6]]
|
||||||
|
py = [1, 2, 3]
|
||||||
|
clf.fit(px, py)
|
||||||
|
self.assertEqual(1.0, clf.score(px, py))
|
||||||
|
self.assertListEqual([1, 2, 3], clf.predict(px).tolist())
|
||||||
|
|
||||||
|
|
||||||
class Snode_test(unittest.TestCase):
|
class Snode_test(unittest.TestCase):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
os.environ["TESTING"] = "1"
|
|
||||||
self._random_state = 1
|
self._random_state = 1
|
||||||
self._clf = Stree(
|
self._clf = Stree(random_state=self._random_state)
|
||||||
random_state=self._random_state, use_predictions=True
|
|
||||||
)
|
|
||||||
self._clf.fit(*get_dataset(self._random_state))
|
self._clf.fit(*get_dataset(self._random_state))
|
||||||
super().__init__(*args, **kwargs)
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def setUp(cls):
|
||||||
"""[summary]
|
os.environ["TESTING"] = "1"
|
||||||
"""
|
|
||||||
try:
|
|
||||||
os.environ.pop("TESTING")
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_attributes_in_leaves(self):
|
def test_attributes_in_leaves(self):
|
||||||
"""Check if the attributes in leaves have correct values so they form a
|
"""Check if the attributes in leaves have correct values so they form a
|
||||||
@@ -383,8 +314,6 @@ class Snode_test(unittest.TestCase):
|
|||||||
# only exclude pure leaves
|
# only exclude pure leaves
|
||||||
self.assertIsNotNone(node._clf)
|
self.assertIsNotNone(node._clf)
|
||||||
self.assertIsNotNone(node._clf.coef_)
|
self.assertIsNotNone(node._clf.coef_)
|
||||||
self.assertIsNotNone(node._vector)
|
|
||||||
self.assertIsNotNone(node._interceptor)
|
|
||||||
if node.is_leaf():
|
if node.is_leaf():
|
||||||
return
|
return
|
||||||
run_tree(node.get_down())
|
run_tree(node.get_down())
|
||||||
@@ -404,3 +333,8 @@ class Snode_test(unittest.TestCase):
|
|||||||
test.make_predictor()
|
test.make_predictor()
|
||||||
self.assertIsNone(test._class)
|
self.assertIsNone(test._class)
|
||||||
self.assertEqual(0, test._belief)
|
self.assertEqual(0, test._belief)
|
||||||
|
|
||||||
|
def test_make_predictor_on_leaf_bogus_data(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [], "test")
|
||||||
|
test.make_predictor()
|
||||||
|
self.assertIsNone(test._class)
|
||||||
|
Reference in New Issue
Block a user