mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
29 Commits
0.9rc2
...
add_subspa
Author | SHA1 | Date | |
---|---|---|---|
9334951d1b
|
|||
736ab7ef20
|
|||
c94bc068bd
|
|||
502ee72799
|
|||
f1ee4de37b
|
|||
ae1c199e21
|
|||
1bfe273a70
|
|||
|
647d21bdb5 | ||
1d392d534f
|
|||
f360a2640c
|
|||
|
45510b43bc | ||
286a91a3d7
|
|||
5c31c2b2a5
|
|||
7e932de072
|
|||
26273e936a
|
|||
d7c0bc3bc5
|
|||
3a48d8b405
|
|||
05b462716e
|
|||
b824229121
|
|||
8ba9b1b6a1
|
|||
37577849db
|
|||
cb10aea36e
|
|||
b9f14aec05
|
|||
b4816b2995
|
|||
5e5fea9c6a
|
|||
724a4855fb
|
|||
a22ae81b54
|
|||
ed98054f0d
|
|||
e95bd9697a
|
14
.coveragerc
Normal file
14
.coveragerc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[run]
|
||||||
|
branch = True
|
||||||
|
source = stree
|
||||||
|
|
||||||
|
[report]
|
||||||
|
exclude_lines =
|
||||||
|
if self.debug:
|
||||||
|
pragma: no cover
|
||||||
|
raise NotImplementedError
|
||||||
|
if __name__ == .__main__.:
|
||||||
|
ignore_errors = True
|
||||||
|
omit =
|
||||||
|
stree/tests/*
|
||||||
|
stree/__init__.py
|
1
.gitignore
vendored
1
.gitignore
vendored
@@ -130,3 +130,4 @@ dmypy.json
|
|||||||
|
|
||||||
.idea
|
.idea
|
||||||
.vscode
|
.vscode
|
||||||
|
.pre-commit-config.yaml
|
13
.travis.yml
13
.travis.yml
@@ -1,13 +0,0 @@
|
|||||||
language: python
|
|
||||||
os: linux
|
|
||||||
dist: xenial
|
|
||||||
install:
|
|
||||||
- pip install -r requirements.txt
|
|
||||||
notifications:
|
|
||||||
email:
|
|
||||||
recipients:
|
|
||||||
- ricardo.montanana@alu.uclm.es
|
|
||||||
on_success: never # default: change
|
|
||||||
on_failure: always # default: always
|
|
||||||
# command to run tests
|
|
||||||
script: python -m unittest stree.tests
|
|
18
README.md
18
README.md
@@ -1,8 +1,10 @@
|
|||||||
[](https://travis-ci.com/Doctorado-ML/STree)
|
[](https://app.codeship.com/projects/399170)
|
||||||
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
|
||||||
# Stree
|
# Stree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@@ -16,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
##### Slow launch but better integration
|
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb) Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
##### Fast launch but have to run first commented out cell for setup
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test.ipynb) Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test2.ipynb) Another Test notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test_graphs.ipynb) Test Graphics notebook
|
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||||
|
|
||||||
### Command line
|
### Command line
|
||||||
|
|
||||||
|
12
codecov.yml
Normal file
12
codecov.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
coverage:
|
||||||
|
status:
|
||||||
|
project:
|
||||||
|
default:
|
||||||
|
target: 90%
|
||||||
|
comment:
|
||||||
|
layout: "reach, diff, flags, files"
|
||||||
|
behavior: default
|
||||||
|
require_changes: false
|
||||||
|
require_base: yes
|
||||||
|
require_head: yes
|
||||||
|
branches: null
|
File diff suppressed because one or more lines are too long
1
data/.gitignore
vendored
1
data/.gitignore
vendored
@@ -1 +0,0 @@
|
|||||||
*
|
|
60
main.py
60
main.py
@@ -1,57 +1,29 @@
|
|||||||
import time
|
import time
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
from stree import Stree
|
from stree import Stree
|
||||||
|
|
||||||
random_state=1
|
random_state = 1
|
||||||
|
|
||||||
def load_creditcard(n_examples=0):
|
X, y = load_iris(return_X_y=True)
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import random
|
|
||||||
df = pd.read_csv('data/creditcard.csv')
|
|
||||||
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
|
|
||||||
print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
|
|
||||||
y = np.expand_dims(df.Class.values, axis=1)
|
|
||||||
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
|
|
||||||
if n_examples > 0:
|
|
||||||
# Take first n_examples samples
|
|
||||||
X = X[:n_examples, :]
|
|
||||||
y = y[:n_examples, :]
|
|
||||||
else:
|
|
||||||
# Take all the positive samples with a number of random negatives
|
|
||||||
if n_examples < 0:
|
|
||||||
Xt = X[(y == 1).ravel()]
|
|
||||||
yt = y[(y == 1).ravel()]
|
|
||||||
indices = random.sample(range(X.shape[0]), -1 * n_examples)
|
|
||||||
X = np.append(Xt, X[indices], axis=0)
|
|
||||||
y = np.append(yt, y[indices], axis=0)
|
|
||||||
print("X.shape", X.shape, " y.shape", y.shape)
|
|
||||||
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
|
|
||||||
print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
|
|
||||||
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
|
|
||||||
return Xtrain, Xtest, ytrain, ytest
|
|
||||||
|
|
||||||
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
|
Xtrain, Xtest, ytrain, ytest = train_test_split(
|
||||||
# data = load_creditcard(5000) # Take the first 5000 samples
|
X, y, test_size=0.2, random_state=random_state
|
||||||
data = load_creditcard() # Take all the samples
|
)
|
||||||
|
|
||||||
Xtrain = data[0]
|
|
||||||
Xtest = data[1]
|
|
||||||
ytrain = data[2]
|
|
||||||
ytest = data[3]
|
|
||||||
|
|
||||||
now = time.time()
|
now = time.time()
|
||||||
clf = Stree(C=.01, random_state=random_state)
|
print("Predicting with max_features=sqrt(n_features)")
|
||||||
|
clf = Stree(C=0.01, random_state=random_state, max_features="auto")
|
||||||
|
clf.fit(Xtrain, ytrain)
|
||||||
|
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||||
|
print(clf)
|
||||||
|
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||||
|
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||||
|
print("=" * 40)
|
||||||
|
print("Predicting with max_features=n_features")
|
||||||
|
clf = Stree(C=0.01, random_state=random_state)
|
||||||
clf.fit(Xtrain, ytrain)
|
clf.fit(Xtrain, ytrain)
|
||||||
print(f"Took {time.time() - now:.2f} seconds to train")
|
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||||
print(clf)
|
print(clf)
|
||||||
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||||
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||||
proba = clf.predict_proba(Xtest)
|
|
||||||
print("Checking that we have correct probabilities, these are probabilities of sample belonging to class 1")
|
|
||||||
res0 = proba[proba[:, 0] == 0]
|
|
||||||
res1 = proba[proba[:, 0] == 1]
|
|
||||||
print("++++++++++res0 > .8++++++++++++")
|
|
||||||
print(res0[res0[:, 1] > .8])
|
|
||||||
print("**********res1 < .4************")
|
|
||||||
print(res1[res1[:, 1] < .4])
|
|
232
notebooks/adaboost.ipynb
Normal file
232
notebooks/adaboost.ipynb
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test AdaBoost with different configurations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import time\n",
|
||||||
|
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from sklearn.svm import LinearSVC, SVC\n",
|
||||||
|
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||||
|
"from sklearn.datasets import load_iris\n",
|
||||||
|
"from stree import Stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" import pandas as pd\n",
|
||||||
|
" import numpy as np\n",
|
||||||
|
" import random\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = df.Class\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" # Take first n_examples samples\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Take all the positive samples with a number of random negatives\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
"\n",
|
||||||
|
"# data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
||||||
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
|
"# data = load_creditcard(0) # Take all the samples\n",
|
||||||
|
"data = load_creditcard(-100000)\n",
|
||||||
|
"\n",
|
||||||
|
"Xtrain = data[0]\n",
|
||||||
|
"Xtest = data[1]\n",
|
||||||
|
"ytrain = data[2]\n",
|
||||||
|
"ytest = data[3]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## STree alone on the whole dataset and linear kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"now = time.time()\n",
|
||||||
|
"clf = Stree(max_depth=3, random_state=random_state)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain)\n",
|
||||||
|
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
|
||||||
|
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Different kernels with different configurations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"n_estimators = 10\n",
|
||||||
|
"C = 7\n",
|
||||||
|
"max_depth = 3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.6-final"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 2,
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||||
|
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
407
notebooks/features.ipynb
Normal file
407
notebooks/features.ipynb
Normal file
@@ -0,0 +1,407 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test sample_weight, kernels, C, sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.svm import SVC\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||||
|
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from stree import Stree\n",
|
||||||
|
"import time"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" import pandas as pd\n",
|
||||||
|
" import numpy as np\n",
|
||||||
|
" import random\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = df.Class\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" # Take first n_examples samples\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Take all the positive samples with a number of random negatives\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
"\n",
|
||||||
|
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
||||||
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
|
"data = load_creditcard(-1000) # Take all the samples\n",
|
||||||
|
"\n",
|
||||||
|
"Xtrain = data[0]\n",
|
||||||
|
"Xtest = data[1]\n",
|
||||||
|
"ytrain = data[2]\n",
|
||||||
|
"ytest = data[3]\n",
|
||||||
|
"# Set weights inverse to its count class in dataset\n",
|
||||||
|
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
|
||||||
|
"weights[ytrain==1] = 1.99755\n",
|
||||||
|
"weights_test = np.ones(Xtest.shape[0],) * 1.00244\n",
|
||||||
|
"weights_test[ytest==1] = 1.99755 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test sample_weights\n",
|
||||||
|
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Accuracy of Train without weights 0.9808429118773946\nAccuracy of Train with weights 0.9904214559386973\nAccuracy of Tests without weights 0.9441964285714286\nAccuracy of Tests with weights 0.9375\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"C = 23\n",
|
||||||
|
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
|
||||||
|
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test accuracy with different kernels\n",
|
||||||
|
"Compute accuracy on train and test set with default hyperparameters of every kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Time: 0.13s\tKernel: linear\tAccuracy_train: 0.9693486590038314\tAccuracy_test: 0.9598214285714286\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9913793103448276\tAccuracy_test: 0.9375\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" time_spent = time.time() - now\n",
|
||||||
|
" print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test different values of C"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {
|
||||||
|
"tags": [
|
||||||
|
"outputPrepend"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9487\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0374\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.984076 impurity=0.0313 counts=(array([0, 1]), array([ 5, 309]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.947874 impurity=0.0988 counts=(array([0, 1]), array([691, 38]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9531\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0192\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.993506 impurity=0.0129 counts=(array([0, 1]), array([ 2, 306]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.944218 impurity=0.1053 counts=(array([0, 1]), array([694, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9643\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0189\nroot - Down - 
Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951989 impurity=0.0914 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0250\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([4]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951923 impurity=0.0915 counts=(array([0, 1]), array([693, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9665\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), 
array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n\n**************************************************\n0.4375 secs\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"t = time.time()\n",
|
||||||
|
"for C in (.001, .01, 1, 5, 17):\n",
|
||||||
|
" clf = Stree(C=C, kernel=\"linear\", random_state=random_state)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" print(f\"************** C={C} ****************************\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
||||||
|
" print(clf)\n",
|
||||||
|
" print(f\"**************************************************\")\n",
|
||||||
|
"print(f\"{time.time() - t:.4f} secs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test iterator\n",
|
||||||
|
"Check different ways of using the iterator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator\n",
|
||||||
|
"for i in list(clf):\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator again\n",
|
||||||
|
"for i in clf:\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test STree is a sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12735b3b0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x1273514d0>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x1273513b0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12734acb0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12734add0>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12734aef0>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12734d050>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12734d170>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x1273564d0>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x1273564d0>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12734d320>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12734d290>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x1273515f0>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x127351290>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x127351710>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12735b290>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12734ab90>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x127351950>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12735b5f0>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x127356050>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x127356a70>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x127351a70>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 
0x127356170>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x127356170>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x127356170>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12735f0e0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x1273449e0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x127356710>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x1273565f0>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12735bc20>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12735f200>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12734d830>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12734d9e0>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12734db00>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12734dc20>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12734dd40>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12735be60>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12735bf80>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12734d440>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12734d710>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x12735f3b0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12735f440>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x12735f4d0>, 'Stree')\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Make checks one by one\n",
|
||||||
|
"c = 0\n",
|
||||||
|
"checks = check_estimator(Stree(), generate_only=True)\n",
|
||||||
|
"for check in checks:\n",
|
||||||
|
" c += 1\n",
|
||||||
|
" print(c, check[1])\n",
|
||||||
|
" check[1](check[0])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Check if the classifier is a sklearn estimator\n",
|
||||||
|
"check_estimator(Stree())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Compare to SVM"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "== Not Weighted ===\nSVC train score ..: 0.9578544061302682\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9508928571428571\nSTree test score .: 0.9553571428571429\n==== Weighted =====\nSVC train score ..: 0.9636015325670498\nSTree train score : 0.9626436781609196\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9447820728419238\n*STree test score : 0.9447820728419238\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"svc.fit(Xtrain, ytrain)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain)\n",
|
||||||
|
"print(\"== Not Weighted ===\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"svc.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"print(\"==== Weighted =====\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"print(\"*SVC test score ..:\", svc.score(Xtest, ytest, weights_test))\n",
|
||||||
|
"print(\"*STree test score :\", clf.score(Xtest, ytest, weights_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down, <cgaf> - Leaf class=1 belief= 0.978261 impurity=0.0425 counts=(array([0, 1]), array([ 7, 315]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.955679 impurity=0.0847 counts=(array([0, 1]), array([690, 32]))\n\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(clf)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test max_features"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "****************************************\nmax_features None = 28\nTrain score : 0.9664750957854407\nTest score .: 0.9642857142857143\nTook 0.09 seconds\n****************************************\nmax_features auto = 5\nTrain score : 0.9511494252873564\nTest score .: 0.9441964285714286\nTook 0.37 seconds\n****************************************\nmax_features log2 = 4\nTrain score : 0.935823754789272\nTest score .: 0.9330357142857143\nTook 0.10 seconds\n****************************************\nmax_features 7 = 7\nTrain score : 0.9568965517241379\nTest score .: 0.9397321428571429\nTook 3.36 seconds\n****************************************\nmax_features 0.5 = 14\nTrain score : 0.960727969348659\nTest score .: 0.9486607142857143\nTook 112.42 seconds\n****************************************\nmax_features 0.1 = 2\nTrain score : 0.8793103448275862\nTest score .: 0.8839285714285714\nTook 0.06 seconds\n****************************************\nmax_features 0.7 = 19\nTrain score : 0.9655172413793104\nTest score .: 0.9553571428571429\nTook 10.59 seconds\n"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for max_features in [None, \"auto\", \"log2\", 7, .5, .1, .7]:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" print(\"*\"*40)\n",
|
||||||
|
" clf = Stree(random_state=random_state, max_features=max_features)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" print(f\"max_features {max_features} = {clf.max_features_}\")\n",
|
||||||
|
" print(\"Train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
" print(\"Test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
" print(f\"Took {time.time() - now:.2f} seconds\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.6-final"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
244
notebooks/gridsearch.ipynb
Normal file
244
notebooks/gridsearch.ipynb
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test Gridsearch\n",
|
||||||
|
"with different kernels and different configurations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "zIHKVxthDZEa",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||||
|
"from stree import Stree"
|
||||||
|
],
|
||||||
|
"execution_count": 2,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "IEmq50QgDZEi",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
],
|
||||||
|
"execution_count": 3,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "z9Q-YUfBDZEq",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {},
|
||||||
|
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" import pandas as pd\n",
|
||||||
|
" import numpy as np\n",
|
||||||
|
" import random\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = df.Class\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" # Take first n_examples samples\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Take all the positive samples with a number of random negatives\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
"\n",
|
||||||
|
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
||||||
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
|
"# data = load_creditcard(0) # Take all the samples\n",
|
||||||
|
"\n",
|
||||||
|
"Xtrain = data[0]\n",
|
||||||
|
"Xtest = data[1]\n",
|
||||||
|
"ytrain = data[2]\n",
|
||||||
|
"ytest = data[3]"
|
||||||
|
],
|
||||||
|
"execution_count": 4,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "HmX3kR4PDZEw",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"parameters = {\n",
|
||||||
|
" 'base_estimator': [Stree()],\n",
|
||||||
|
" 'n_estimators': [10, 25],\n",
|
||||||
|
" 'learning_rate': [.5, 1],\n",
|
||||||
|
" 'base_estimator__tol': [.1, 1e-02],\n",
|
||||||
|
" 'base_estimator__max_depth': [3, 5],\n",
|
||||||
|
" 'base_estimator__C': [1, 3],\n",
|
||||||
|
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
|
||||||
|
"}"
|
||||||
|
],
|
||||||
|
"execution_count": 9,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 14
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"Stree().get_params()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "CrcB8o6EDZE5",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {},
|
||||||
|
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"random_state=2020\n",
|
||||||
|
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||||
|
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||||
|
"grid.fit(Xtrain, ytrain)"
|
||||||
|
],
|
||||||
|
"execution_count": 11,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 11
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"metadata": {
|
||||||
|
"id": "ZjX88NoYDZE8",
|
||||||
|
"colab_type": "code",
|
||||||
|
"colab": {},
|
||||||
|
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"print(\"Best estimator: \", grid.best_estimator_)\n",
|
||||||
|
"print(\"Best hyperparameters: \", grid.best_params_)\n",
|
||||||
|
"print(\"Best accuracy: \", grid.best_score_)"
|
||||||
|
],
|
||||||
|
"execution_count": 16,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.6-final"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 2,
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||||
|
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||||
|
},
|
||||||
|
"colab": {
|
||||||
|
"name": "gridsearch.ipynb",
|
||||||
|
"provenance": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
|
}
|
16
pyproject.toml
Normal file
16
pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[tool.black]
|
||||||
|
line-length = 79
|
||||||
|
include = '\.pyi?$'
|
||||||
|
exclude = '''
|
||||||
|
/(
|
||||||
|
\.git
|
||||||
|
| \.hg
|
||||||
|
| \.mypy_cache
|
||||||
|
| \.tox
|
||||||
|
| \.venv
|
||||||
|
| _build
|
||||||
|
| buck-out
|
||||||
|
| build
|
||||||
|
| dist
|
||||||
|
)/
|
||||||
|
'''
|
@@ -1,5 +1,4 @@
|
|||||||
numpy
|
numpy
|
||||||
scikit-learn
|
scikit-learn
|
||||||
pandas
|
pandas
|
||||||
matplotlib
|
|
||||||
ipympl
|
ipympl
|
41
setup.py
41
setup.py
@@ -1,39 +1,36 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
|
|
||||||
__version__ = "0.9rc2"
|
__version__ = "0.9rc4"
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
|
|
||||||
|
|
||||||
def readme():
|
def readme():
|
||||||
with open('README.md') as f:
|
with open("README.md") as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='STree',
|
name="STree",
|
||||||
version=__version__,
|
version=__version__,
|
||||||
license='MIT License',
|
license="MIT License",
|
||||||
description='Oblique decision tree with svm nodes',
|
description="Oblique decision tree with svm nodes",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type='text/markdown',
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
url='https://github.com/doctorado-ml/stree',
|
url="https://github.com/doctorado-ml/stree",
|
||||||
author=__author__,
|
author=__author__,
|
||||||
author_email='ricardo.montanana@alu.uclm.es',
|
author_email="ricardo.montanana@alu.uclm.es",
|
||||||
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc',
|
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
||||||
|
tree svm svc",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
"Development Status :: 4 - Beta",
|
||||||
'License :: OSI Approved :: MIT License',
|
"License :: OSI Approved :: MIT License",
|
||||||
'Programming Language :: Python :: 3.7',
|
"Programming Language :: Python :: 3.7",
|
||||||
'Natural Language :: English',
|
"Natural Language :: English",
|
||||||
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
'Intended Audience :: Science/Research'
|
"Intended Audience :: Science/Research",
|
||||||
],
|
|
||||||
install_requires=[
|
|
||||||
'scikit-learn>=0.23.0',
|
|
||||||
'numpy',
|
|
||||||
'matplotlib',
|
|
||||||
'ipympl'
|
|
||||||
],
|
],
|
||||||
|
install_requires=["scikit-learn>=0.23.0", "numpy", "ipympl"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
699
stree/Strees.py
699
stree/Strees.py
@@ -1,37 +1,69 @@
|
|||||||
'''
|
"""
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
__version__ = "0.9"
|
__version__ = "0.9"
|
||||||
Build an oblique tree classifier based on SVM Trees
|
Build an oblique tree classifier based on SVM Trees
|
||||||
Uses LinearSVC
|
"""
|
||||||
'''
|
|
||||||
|
|
||||||
import typing
|
|
||||||
import os
|
import os
|
||||||
|
import numbers
|
||||||
|
import random
|
||||||
|
import warnings
|
||||||
|
from itertools import combinations
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
from sklearn.utils import check_consistent_length
|
||||||
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from sklearn.utils.validation import (
|
||||||
|
check_X_y,
|
||||||
|
check_array,
|
||||||
|
check_is_fitted,
|
||||||
|
_check_sample_weight,
|
||||||
|
)
|
||||||
|
from sklearn.metrics._classification import _weighted_sum, _check_targets
|
||||||
|
|
||||||
|
|
||||||
class Snode:
|
class Snode:
|
||||||
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str):
|
"""Nodes of the tree that keeps the svm classifier and if testing the
|
||||||
|
dataset assigned to it
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
clf: SVC,
|
||||||
|
X: np.ndarray,
|
||||||
|
y: np.ndarray,
|
||||||
|
features: np.array,
|
||||||
|
impurity: float,
|
||||||
|
title: str,
|
||||||
|
):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._vector = None if clf is None else clf.coef_
|
|
||||||
self._interceptor = 0. if clf is None else clf.intercept_
|
|
||||||
self._title = title
|
self._title = title
|
||||||
self._belief = 0. # belief of the prediction in a leaf node based on samples
|
self._belief = 0.0
|
||||||
# Only store dataset in Testing
|
# Only store dataset in Testing
|
||||||
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
|
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
self._y = y
|
self._y = y
|
||||||
self._down = None
|
self._down = None
|
||||||
self._up = None
|
self._up = None
|
||||||
self._class = None
|
self._class = None
|
||||||
|
self._feature = None
|
||||||
|
self._sample_weight = None
|
||||||
|
self._features = features
|
||||||
|
self._impurity = impurity
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def copy(cls, node: 'Snode') -> 'Snode':
|
def copy(cls, node: "Snode") -> "Snode":
|
||||||
return cls(node._clf, node._X, node._y, node._title)
|
return cls(
|
||||||
|
node._clf,
|
||||||
|
node._X,
|
||||||
|
node._y,
|
||||||
|
node._features,
|
||||||
|
node._impurity,
|
||||||
|
node._title,
|
||||||
|
)
|
||||||
|
|
||||||
def set_down(self, son):
|
def set_down(self, son):
|
||||||
self._down = son
|
self._down = son
|
||||||
@@ -42,15 +74,15 @@ class Snode:
|
|||||||
def is_leaf(self) -> bool:
|
def is_leaf(self) -> bool:
|
||||||
return self._up is None and self._down is None
|
return self._up is None and self._down is None
|
||||||
|
|
||||||
def get_down(self) -> 'Snode':
|
def get_down(self) -> "Snode":
|
||||||
return self._down
|
return self._down
|
||||||
|
|
||||||
def get_up(self) -> 'Snode':
|
def get_up(self) -> "Snode":
|
||||||
return self._up
|
return self._up
|
||||||
|
|
||||||
def make_predictor(self):
|
def make_predictor(self):
|
||||||
"""Compute the class of the predictor and its belief based on the subdataset of the node
|
"""Compute the class of the predictor and its belief based on the
|
||||||
only if it is a leaf
|
subdataset of the node only if it is a leaf
|
||||||
"""
|
"""
|
||||||
if not self.is_leaf():
|
if not self.is_leaf():
|
||||||
return
|
return
|
||||||
@@ -58,20 +90,29 @@ class Snode:
|
|||||||
if len(classes) > 1:
|
if len(classes) > 1:
|
||||||
max_card = max(card)
|
max_card = max(card)
|
||||||
min_card = min(card)
|
min_card = min(card)
|
||||||
try:
|
|
||||||
self._belief = max_card / (max_card + min_card)
|
|
||||||
except:
|
|
||||||
self._belief = 0.
|
|
||||||
self._class = classes[card == max_card][0]
|
self._class = classes[card == max_card][0]
|
||||||
|
self._belief = max_card / (max_card + min_card)
|
||||||
else:
|
else:
|
||||||
self._belief = 1
|
self._belief = 1
|
||||||
self._class = classes[0]
|
try:
|
||||||
|
self._class = classes[0]
|
||||||
|
except IndexError:
|
||||||
|
self._class = None
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
if self.is_leaf():
|
if self.is_leaf():
|
||||||
return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}"
|
count_values = np.unique(self._y, return_counts=True)
|
||||||
|
result = (
|
||||||
|
f"{self._title} - Leaf class={self._class} belief="
|
||||||
|
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
||||||
|
f"counts={count_values}"
|
||||||
|
)
|
||||||
|
return result
|
||||||
else:
|
else:
|
||||||
return f"{self._title}"
|
return (
|
||||||
|
f"{self._title} feaures={self._features} impurity="
|
||||||
|
f"{self._impurity:.4f}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Siterator:
|
class Siterator:
|
||||||
@@ -82,9 +123,6 @@ class Siterator:
|
|||||||
self._stack = []
|
self._stack = []
|
||||||
self._push(tree)
|
self._push(tree)
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def _push(self, node: Snode):
|
def _push(self, node: Snode):
|
||||||
if node is not None:
|
if node is not None:
|
||||||
self._stack.append(node)
|
self._stack.append(node)
|
||||||
@@ -97,63 +135,336 @@ class Siterator:
|
|||||||
self._push(node.get_down())
|
self._push(node.get_down())
|
||||||
return node
|
return node
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
|
||||||
"""
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
|
class Splitter:
|
||||||
self._max_iter = max_iter
|
def __init__(
|
||||||
self._C = C
|
self,
|
||||||
|
clf: SVC = None,
|
||||||
|
criterion: str = None,
|
||||||
|
splitter_type: str = None,
|
||||||
|
criteria: str = None,
|
||||||
|
min_samples_split: int = None,
|
||||||
|
random_state=None,
|
||||||
|
):
|
||||||
|
self._clf = clf
|
||||||
self._random_state = random_state
|
self._random_state = random_state
|
||||||
self._tree = None
|
if random_state is not None:
|
||||||
self.__folder = 'data/'
|
random.seed(random_state)
|
||||||
self.__use_predictions = use_predictions
|
self._criterion = criterion
|
||||||
self.__trained = False
|
self._min_samples_split = min_samples_split
|
||||||
self.__proba = False
|
self._criteria = criteria
|
||||||
|
self._splitter_type = splitter_type
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
if clf is None:
|
||||||
"""Get dict with hyperparameters and its values to accomplish sklearn rules
|
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
||||||
"""
|
|
||||||
return {"C": self._C, "random_state": self._random_state, 'max_iter': self._max_iter}
|
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
if criterion not in ["gini", "entropy"]:
|
||||||
"""Set hyperparmeters as specified by sklearn, needed in Gridsearchs
|
raise ValueError(
|
||||||
"""
|
f"criterion must be gini or entropy got({criterion})"
|
||||||
for parameter, value in parameters.items():
|
)
|
||||||
setattr(self, parameter, value)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def _linear_function(self, data: np.array, node: Snode) -> np.array:
|
if criteria not in ["min_distance", "max_samples"]:
|
||||||
coef = node._vector[0, :].reshape(-1, data.shape[1])
|
raise ValueError(
|
||||||
return data.dot(coef.T) + node._interceptor[0]
|
f"split_criteria has to be min_distance or \
|
||||||
|
max_samples got ({criteria})"
|
||||||
|
)
|
||||||
|
|
||||||
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
|
if splitter_type not in ["random", "best"]:
|
||||||
if self.__use_predictions:
|
raise ValueError(
|
||||||
yp = node._clf.predict(data)
|
f"splitter must be either random or best got({splitter_type})"
|
||||||
down = (yp == 1).reshape(-1, 1)
|
)
|
||||||
res = np.expand_dims(node._clf.decision_function(data), 1)
|
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||||
|
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
||||||
|
|
||||||
|
def impurity(self, y: np.array) -> np.array:
|
||||||
|
return self.criterion_function(y)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _gini(y: np.array) -> float:
|
||||||
|
_, count = np.unique(y, return_counts=True)
|
||||||
|
return 1 - np.sum(np.square(count / np.sum(count)))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _entropy(y: np.array) -> float:
|
||||||
|
_, count = np.unique(y, return_counts=True)
|
||||||
|
proportion = count / np.sum(count)
|
||||||
|
return -np.sum(proportion * np.log2(proportion))
|
||||||
|
|
||||||
|
def information_gain(
    self, labels_up: np.array, labels_dn: np.array
) -> float:
    """Return the size-weighted impurity of a two-way split.

    Despite its name, the value returned is the weighted sum of the
    impurities of both partitions, so a lower value means a better split.

    :param labels_up: labels of the samples in the "up" partition, or
        None if that partition is empty
    :type labels_up: np.array
    :param labels_dn: labels of the samples in the "down" partition, or
        None if that partition is empty
    :type labels_dn: np.array
    :return: weighted impurity of the split; 0.0 if both partitions are
        empty
    :rtype: float
    """
    card_up = labels_up.shape[0] if labels_up is not None else 0
    card_dn = labels_dn.shape[0] if labels_dn is not None else 0
    samples = card_up + card_dn
    if samples == 0:
        # nothing to weigh: avoid dividing by zero
        return 0.0
    # an empty partition contributes nothing, so the impurity function is
    # never called with None or with an empty array
    up = (
        card_up / samples * self.criterion_function(labels_up)
        if card_up
        else 0.0
    )
    dn = (
        card_dn / samples * self.criterion_function(labels_dn)
        if card_dn
        else 0.0
    )
    return up + dn
|
||||||
|
|
||||||
|
def _select_best_set(
    self, dataset: np.array, labels: np.array, features_sets: list
) -> list:
    """Return the candidate feature set that yields the purest split.

    For every candidate the classifier is fitted on the selected columns,
    the samples are partitioned with it, and the weighted impurity of the
    resulting split is computed; the candidate with the lowest value wins.

    :param dataset: samples to split
    :type dataset: np.array
    :param labels: labels of the samples
    :type labels: np.array
    :param features_sets: candidate feature index collections to evaluate
    :type features_sets: list
    :return: the feature set with the lowest weighted impurity, or None
        if features_sets is empty
    :rtype: list
    """
    # Start from +inf rather than 1: entropy is not bounded by 1 when
    # there are more than two classes, so a threshold of 1 could reject
    # every candidate and return None.
    min_impurity = np.inf
    selected = None
    # NOTE: this installs a process-wide warnings filter
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    for feature_set in features_sets:
        self._clf.fit(dataset[:, feature_set], labels)
        node = Snode(
            self._clf, dataset, labels, feature_set, 0.0, "subset"
        )
        self.partition(dataset, node)
        y1, y2 = self.part(labels)
        impurity = self.information_gain(y1, y2)
        if impurity < min_impurity:
            min_impurity = impurity
            selected = feature_set
    return selected
|
||||||
|
|
||||||
|
def _get_subspaces_set(
|
||||||
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> np.array:
|
||||||
|
features = range(dataset.shape[1])
|
||||||
|
features_sets = list(combinations(features, max_features))
|
||||||
|
if len(features_sets) > 1:
|
||||||
|
if self._splitter_type == "random":
|
||||||
|
return features_sets[random.randint(0, len(features_sets) - 1)]
|
||||||
|
else:
|
||||||
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
else:
|
else:
|
||||||
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
|
return features_sets[0]
|
||||||
# computes positition of every sample is w.r.t. the hyperplane
|
|
||||||
res = self._linear_function(data, node)
|
|
||||||
down = res > 0
|
|
||||||
up = ~down
|
|
||||||
data_down = data[down[:, 0]] if any(down) else None
|
|
||||||
indices_down = indices[down[:, 0]] if any(down) else None
|
|
||||||
res_down = res[down[:, 0]] if any(down) else None
|
|
||||||
data_up = data[up[:, 0]] if any(up) else None
|
|
||||||
indices_up = indices[up[:, 0]] if any(up) else None
|
|
||||||
res_up = res[up[:, 0]] if any(up) else None
|
|
||||||
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
|
|
||||||
|
|
||||||
def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
|
def get_subspace(
|
||||||
X, y = check_X_y(X, y.ravel())
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> list:
|
||||||
|
"""Return the best subspace to make a split
|
||||||
|
"""
|
||||||
|
indices = self._get_subspaces_set(dataset, labels, max_features)
|
||||||
|
return dataset[:, indices], indices
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _min_distance(data: np.array, _) -> np.array:
    """Pick, for every sample, the value with the smallest absolute value.

    :param data: 2D array with one row per sample and one column per
        candidate value
    :type data: np.array
    :param _: unused, kept so the signature matches the other decision
        criteria
    :return: 1D array holding, for each row, the (signed) entry whose
        absolute value is the lowest
    :rtype: np.array
    """
    # column of the lowest absolute value in every row
    indices = np.argmin(np.abs(data), axis=1)
    # pair each row number with its selected column in a single
    # vectorized lookup instead of a Python-level loop
    return data[np.arange(data.shape[0]), indices]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _max_samples(data: np.array, y: np.array) -> np.array:
|
||||||
|
# select the class with max number of samples
|
||||||
|
_, samples = np.unique(y, return_counts=True)
|
||||||
|
selected = np.argmax(samples)
|
||||||
|
return data[:, selected]
|
||||||
|
|
||||||
|
def partition(self, samples: np.array, node: Snode):
|
||||||
|
"""Set the criteria to split arrays
|
||||||
|
|
||||||
|
"""
|
||||||
|
data = self._distances(node, samples)
|
||||||
|
if data.shape[0] < self._min_samples_split:
|
||||||
|
self._down = np.ones((data.shape[0]), dtype=bool)
|
||||||
|
return
|
||||||
|
if data.ndim > 1:
|
||||||
|
# split criteria for multiclass
|
||||||
|
data = self.decision_criteria(data, node._y)
|
||||||
|
self._down = data > 0
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _distances(node: Snode, data: np.ndarray) -> np.array:
|
||||||
|
"""Compute distances of the samples to the hyperplane of the node
|
||||||
|
|
||||||
|
:param node: node containing the svm classifier
|
||||||
|
:type node: Snode
|
||||||
|
:param data: samples to find out distance to hyperplane
|
||||||
|
:type data: np.ndarray
|
||||||
|
:return: array of shape (m, 1) with the distances of every sample to
|
||||||
|
the hyperplane of the node
|
||||||
|
:rtype: np.array
|
||||||
|
"""
|
||||||
|
return node._clf.decision_function(data[:, node._features])
|
||||||
|
|
||||||
|
def part(self, origin: np.array) -> list:
    """Split an array in two using the boolean mask stored in self._down

    The mask is the one computed by the last call to partition; its
    complement selects the "up" elements.

    :param origin: array to split, indexed with the stored mask
    :type origin: np.array
    :return: list [up, down] with the two splits of the array; a split
        with no elements is returned as None
    :rtype: list
    """
    down = self._down
    up = ~down
    # ndarray.any() evaluates the mask in compiled code instead of
    # iterating it element by element with the builtin any()
    return [
        origin[up] if up.any() else None,
        origin[down] if down.any() else None,
    ]
|
||||||
|
|
||||||
|
|
||||||
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
|
"""Estimator that is based on binary trees of svm nodes
|
||||||
|
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||||
|
inheriting from BaseEstimator implements get_params and set_params methods
|
||||||
|
inheriting from ClassifierMixin implement the attribute _estimator_type
|
||||||
|
with "classifier" as value
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
C: float = 1.0,
|
||||||
|
kernel: str = "linear",
|
||||||
|
max_iter: int = 1000,
|
||||||
|
random_state: int = None,
|
||||||
|
max_depth: int = None,
|
||||||
|
tol: float = 1e-4,
|
||||||
|
degree: int = 3,
|
||||||
|
gamma="scale",
|
||||||
|
split_criteria: str = "max_samples",
|
||||||
|
criterion: str = "gini",
|
||||||
|
min_samples_split: int = 0,
|
||||||
|
max_features=None,
|
||||||
|
splitter: str = "random",
|
||||||
|
):
|
||||||
|
self.max_iter = max_iter
|
||||||
|
self.C = C
|
||||||
|
self.kernel = kernel
|
||||||
|
self.random_state = random_state
|
||||||
|
self.max_depth = max_depth
|
||||||
|
self.tol = tol
|
||||||
|
self.gamma = gamma
|
||||||
|
self.degree = degree
|
||||||
|
self.min_samples_split = min_samples_split
|
||||||
|
self.split_criteria = split_criteria
|
||||||
|
self.max_features = max_features
|
||||||
|
self.criterion = criterion
|
||||||
|
self.splitter = splitter
|
||||||
|
|
||||||
|
def _more_tags(self) -> dict:
|
||||||
|
"""Required by sklearn to supply features of the classifier
|
||||||
|
|
||||||
|
:return: the tag required
|
||||||
|
:rtype: dict
|
||||||
|
"""
|
||||||
|
return {"requires_y": True}
|
||||||
|
|
||||||
|
def fit(
|
||||||
|
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
|
||||||
|
) -> "Stree":
|
||||||
|
"""Build the tree based on the dataset of samples and its labels
|
||||||
|
|
||||||
|
:param X: dataset of samples to make predictions
|
||||||
|
:type X: np.array
|
||||||
|
:param y: samples labels
|
||||||
|
:type y: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Hi' weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
|
:raises ValueError: if parameters C or max_depth are out of bounds
|
||||||
|
:return: itself to be able to chain actions: fit().predict() ...
|
||||||
|
:rtype: Stree
|
||||||
|
"""
|
||||||
|
# Check parameters are Ok.
|
||||||
|
if self.C < 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Penalty term must be positive... got (C={self.C:f})"
|
||||||
|
)
|
||||||
|
self.__max_depth = (
|
||||||
|
np.iinfo(np.int32).max
|
||||||
|
if self.max_depth is None
|
||||||
|
else self.max_depth
|
||||||
|
)
|
||||||
|
if self.__max_depth < 1:
|
||||||
|
raise ValueError(
|
||||||
|
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||||
|
{self.max_depth})"
|
||||||
|
)
|
||||||
|
|
||||||
|
check_classification_targets(y)
|
||||||
|
X, y = check_X_y(X, y)
|
||||||
|
sample_weight = _check_sample_weight(sample_weight, X)
|
||||||
|
check_classification_targets(y)
|
||||||
|
# Initialize computed parameters
|
||||||
|
self.splitter_ = Splitter(
|
||||||
|
clf=self._build_clf(),
|
||||||
|
criterion=self.criterion,
|
||||||
|
splitter_type=self.splitter,
|
||||||
|
criteria=self.split_criteria,
|
||||||
|
random_state=self.random_state,
|
||||||
|
min_samples_split=self.min_samples_split,
|
||||||
|
)
|
||||||
|
if self.random_state is not None:
|
||||||
|
random.seed(self.random_state)
|
||||||
|
self.classes_, y = np.unique(y, return_inverse=True)
|
||||||
|
self.n_classes_ = self.classes_.shape[0]
|
||||||
|
self.n_iter_ = self.max_iter
|
||||||
|
self.depth_ = 0
|
||||||
|
self.n_features_ = X.shape[1]
|
||||||
self.n_features_in_ = X.shape[1]
|
self.n_features_in_ = X.shape[1]
|
||||||
self._tree = self.train(X, y.ravel(), title)
|
self.max_features_ = self._initialize_max_features()
|
||||||
|
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
||||||
self._build_predictor()
|
self._build_predictor()
|
||||||
self.__trained = True
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def train(
|
||||||
|
self,
|
||||||
|
X: np.ndarray,
|
||||||
|
y: np.ndarray,
|
||||||
|
sample_weight: np.ndarray,
|
||||||
|
depth: int,
|
||||||
|
title: str,
|
||||||
|
) -> Snode:
|
||||||
|
"""Recursive function to split the original dataset into predictor
|
||||||
|
nodes (leaves)
|
||||||
|
|
||||||
|
:param X: samples dataset
|
||||||
|
:type X: np.ndarray
|
||||||
|
:param y: samples labels
|
||||||
|
:type y: np.ndarray
|
||||||
|
:param sample_weight: weight of samples. Rescale C per sample.
|
||||||
|
Hi weights force the classifier to put more emphasis on these points.
|
||||||
|
:type sample_weight: np.ndarray
|
||||||
|
:param depth: actual depth in the tree
|
||||||
|
:type depth: int
|
||||||
|
:param title: description of the node
|
||||||
|
:type title: str
|
||||||
|
:return: binary tree
|
||||||
|
:rtype: Snode
|
||||||
|
"""
|
||||||
|
if depth > self.__max_depth:
|
||||||
|
return None
|
||||||
|
if np.unique(y).shape[0] == 1:
|
||||||
|
# only 1 class => pure dataset
|
||||||
|
return Snode(
|
||||||
|
clf=None,
|
||||||
|
X=X,
|
||||||
|
y=y,
|
||||||
|
features=X.shape[1],
|
||||||
|
impurity=0.0,
|
||||||
|
title=title + ", <pure>",
|
||||||
|
)
|
||||||
|
# Train the model
|
||||||
|
clf = self._build_clf()
|
||||||
|
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||||
|
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||||
|
impurity = self.splitter_.impurity(y)
|
||||||
|
node = Snode(clf, X, y, features, impurity, title)
|
||||||
|
self.depth_ = max(depth, self.depth_)
|
||||||
|
self.splitter_.partition(X, node)
|
||||||
|
X_U, X_D = self.splitter_.part(X)
|
||||||
|
y_u, y_d = self.splitter_.part(y)
|
||||||
|
sw_u, sw_d = self.splitter_.part(sample_weight)
|
||||||
|
if X_U is None or X_D is None:
|
||||||
|
# didn't part anything
|
||||||
|
return Snode(
|
||||||
|
clf,
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
features=X.shape[1],
|
||||||
|
impurity=impurity,
|
||||||
|
title=title + ", <cgaf>",
|
||||||
|
)
|
||||||
|
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||||
|
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||||
|
return node
|
||||||
|
|
||||||
def _build_predictor(self):
|
def _build_predictor(self):
|
||||||
"""Process the leaves to make them predictors
|
"""Process the leaves to make them predictors
|
||||||
"""
|
"""
|
||||||
@@ -165,147 +476,167 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
run_tree(node.get_down())
|
run_tree(node.get_down())
|
||||||
run_tree(node.get_up())
|
run_tree(node.get_up())
|
||||||
|
|
||||||
run_tree(self._tree)
|
run_tree(self.tree_)
|
||||||
|
|
||||||
def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
|
def _build_clf(self):
|
||||||
if np.unique(y).shape[0] == 1:
|
""" Build the correct classifier for the node
|
||||||
# only 1 class => pure dataset
|
"""
|
||||||
return Snode(None, X, y, title + ', <pure>')
|
return (
|
||||||
# Train the model
|
LinearSVC(
|
||||||
clf = LinearSVC(max_iter=self._max_iter, C=self._C,
|
max_iter=self.max_iter,
|
||||||
random_state=self._random_state)
|
random_state=self.random_state,
|
||||||
clf.fit(X, y)
|
C=self.C,
|
||||||
tree = Snode(clf, X, y, title)
|
tol=self.tol,
|
||||||
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
|
)
|
||||||
if X_U is None or X_D is None:
|
if self.kernel == "linear"
|
||||||
# didn't part anything
|
else SVC(
|
||||||
return Snode(clf, X, y, title + ', <cgaf>')
|
kernel=self.kernel,
|
||||||
tree.set_up(self.train(X_U, y_u, title + ' - Up'))
|
max_iter=self.max_iter,
|
||||||
tree.set_down(self.train(X_D, y_d, title + ' - Down'))
|
tol=self.tol,
|
||||||
return tree
|
C=self.C,
|
||||||
|
gamma=self.gamma,
|
||||||
|
degree=self.degree,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
|
@staticmethod
|
||||||
y_ordered = np.zeros(y.shape, dtype=int if y.ndim == 1 else float)
|
def _reorder_results(y: np.array, indices: np.array) -> np.array:
    """Reorder an array based on the array of indices passed

    Element i of y is placed at position indices[i] of the result.

    :param y: data untidy
    :type y: np.array
    :param indices: destination position of every element of y; cast to
        int before use
    :type indices: np.array
    :return: array y ordered, of the same type as y
    :rtype: np.array
    """
    # copy so the result keeps the dtype and shape of y
    y_ordered = y.copy()
    # single indexed assignment instead of a Python loop over elements
    y_ordered[indices.astype(int)] = y
    return y_ordered
|
||||||
|
|
||||||
def predict(self, X: np.array) -> np.array:
|
def predict(self, X: np.array) -> np.array:
|
||||||
def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array:
|
"""Predict labels for each sample in dataset passed
|
||||||
|
|
||||||
|
:param X: dataset of samples
|
||||||
|
:type X: np.array
|
||||||
|
:return: array of labels
|
||||||
|
:rtype: np.array
|
||||||
|
"""
|
||||||
|
|
||||||
|
def predict_class(
|
||||||
|
xp: np.array, indices: np.array, node: Snode
|
||||||
|
) -> np.array:
|
||||||
if xp is None:
|
if xp is None:
|
||||||
return [], []
|
return [], []
|
||||||
if node.is_leaf():
|
if node.is_leaf():
|
||||||
# set a class for every sample in dataset
|
# set a class for every sample in dataset
|
||||||
prediction = np.full((xp.shape[0], 1), node._class)
|
prediction = np.full((xp.shape[0], 1), node._class)
|
||||||
return prediction, indices
|
return prediction, indices
|
||||||
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
|
self.splitter_.partition(xp, node)
|
||||||
k, l = predict_class(d, i_d, node.get_down())
|
x_u, x_d = self.splitter_.part(xp)
|
||||||
m, n = predict_class(u, i_u, node.get_up())
|
i_u, i_d = self.splitter_.part(indices)
|
||||||
return np.append(k, m), np.append(l, n)
|
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
|
||||||
|
prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
|
||||||
|
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||||
|
|
||||||
# sklearn check
|
# sklearn check
|
||||||
check_is_fitted(self)
|
check_is_fitted(self, ["tree_"])
|
||||||
# Input validation
|
# Input validation
|
||||||
X = check_array(X)
|
X = check_array(X)
|
||||||
|
if X.shape[1] != self.n_features_:
|
||||||
|
raise ValueError(
|
||||||
|
f"Expected {self.n_features_} features but got "
|
||||||
|
f"({X.shape[1]})"
|
||||||
|
)
|
||||||
# setup prediction & make it happen
|
# setup prediction & make it happen
|
||||||
indices = np.arange(X.shape[0])
|
indices = np.arange(X.shape[0])
|
||||||
return self._reorder_results(*predict_class(X, indices, self._tree))
|
result = (
|
||||||
|
self._reorder_results(*predict_class(X, indices, self.tree_))
|
||||||
|
.astype(int)
|
||||||
|
.ravel()
|
||||||
|
)
|
||||||
|
return self.classes_[result]
|
||||||
|
|
||||||
def predict_proba(self, X: np.array) -> np.array:
|
def score(
|
||||||
"""Computes an approximation of the probability of samples belonging to class 1
|
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||||
(nothing more, nothing less)
|
) -> float:
|
||||||
|
"""Compute accuracy of the prediction
|
||||||
|
|
||||||
:param X: dataset
|
:param X: dataset of samples to make predictions
|
||||||
:type X: np.array
|
:type X: np.array
|
||||||
|
:param y_true: samples labels
|
||||||
|
:type y_true: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Hi' weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
|
:return: accuracy of the prediction
|
||||||
|
:rtype: float
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array:
|
|
||||||
"""Run the tree to compute predictions
|
|
||||||
|
|
||||||
:param xp: subdataset of samples
|
|
||||||
:type xp: np.array
|
|
||||||
:param indices: indices of subdataset samples to rebuild original order
|
|
||||||
:type indices: np.array
|
|
||||||
:param dist: distances of every sample to the hyperplane or the father node
|
|
||||||
:type dist: np.array
|
|
||||||
:param node: node of the leaf with the class
|
|
||||||
:type node: Snode
|
|
||||||
:return: array of labels and distances, array of indices
|
|
||||||
:rtype: np.array
|
|
||||||
"""
|
|
||||||
if xp is None:
|
|
||||||
return [], []
|
|
||||||
if node.is_leaf():
|
|
||||||
# set a class for every sample in dataset
|
|
||||||
prediction = np.full((xp.shape[0], 1), node._class)
|
|
||||||
prediction_proba = dist
|
|
||||||
return np.append(prediction, prediction_proba, axis=1), indices
|
|
||||||
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
|
|
||||||
k, l = predict_class(d, i_d, r_d, node.get_down())
|
|
||||||
m, n = predict_class(u, i_u, r_u, node.get_up())
|
|
||||||
return np.append(k, m), np.append(l, n)
|
|
||||||
|
|
||||||
# sklearn check
|
# sklearn check
|
||||||
check_is_fitted(self)
|
check_is_fitted(self)
|
||||||
# Input validation
|
check_classification_targets(y)
|
||||||
X = check_array(X)
|
X, y = check_X_y(X, y)
|
||||||
# setup prediction & make it happen
|
y_pred = self.predict(X).reshape(y.shape)
|
||||||
indices = np.arange(X.shape[0])
|
# Compute accuracy for each possible representation
|
||||||
result, indices = predict_class(X, indices, [], self._tree)
|
_, y_true, y_pred = _check_targets(y, y_pred)
|
||||||
result = result.reshape(X.shape[0], 2)
|
check_consistent_length(y_true, y_pred, sample_weight)
|
||||||
# Turn distances to hyperplane into probabilities based on fitting distances
|
score = y_true == y_pred
|
||||||
# of samples to its hyperplane that classified them, to the sigmoid function
|
return _weighted_sum(score, sample_weight, normalize=True)
|
||||||
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
|
|
||||||
return self._reorder_results(result, indices)
|
|
||||||
|
|
||||||
def score(self, X: np.array, y: np.array) -> float:
|
def __iter__(self) -> Siterator:
|
||||||
"""Return accuracy
|
"""Create an iterator to be able to visit the nodes of the tree in
|
||||||
|
preorder, can make a list with all the nodes in preorder
|
||||||
|
|
||||||
|
:return: an iterator, can for i in... and list(...)
|
||||||
|
:rtype: Siterator
|
||||||
"""
|
"""
|
||||||
if not self.__trained:
|
try:
|
||||||
self.fit(X, y)
|
tree = self.tree_
|
||||||
yp = self.predict(X).reshape(y.shape)
|
except AttributeError:
|
||||||
right = (yp == y).astype(int)
|
tree = None
|
||||||
return np.sum(right) / len(y)
|
return Siterator(tree)
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return Siterator(self._tree)
|
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
output = ''
|
"""String representation of the tree
|
||||||
|
|
||||||
|
:return: description of nodes in the tree in preorder
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
output = ""
|
||||||
for i in self:
|
for i in self:
|
||||||
output += str(i) + '\n'
|
output += str(i) + "\n"
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
|
def _initialize_max_features(self) -> int:
    """Translate the max_features hyperparameter into a feature count

    Reads self.max_features and self.n_features_:

    - "auto" or "sqrt": square root of the number of features
    - "log2": base 2 logarithm of the number of features
    - None: all the features
    - integer: used as is
    - float in (0, 1]: that fraction of the number of features

    String and float results are truncated to int and never lower than 1.

    :raises ValueError: if max_features is an unknown string or a float
        outside (0, 1]
    :return: number of features to use in every split
    :rtype: int
    """
    if isinstance(self.max_features, str):
        if self.max_features in ("auto", "sqrt"):
            max_features = max(1, int(np.sqrt(self.n_features_)))
        elif self.max_features == "log2":
            max_features = max(1, int(np.log2(self.n_features_)))
        else:
            raise ValueError(
                "Invalid value for max_features. "
                "Allowed string values are 'auto', "
                "'sqrt' or 'log2'."
            )
    elif self.max_features is None:
        max_features = self.n_features_
    elif isinstance(self.max_features, numbers.Integral):
        max_features = self.max_features
    else:  # float
        # a fraction above 1 would ask for more features than exist, so
        # enforce the upper bound the error message already states
        if 0.0 < self.max_features <= 1.0:
            max_features = max(
                1, int(self.max_features * self.n_features_)
            )
        else:
            raise ValueError(
                "Invalid value for max_features. "
                "Allowed float must be in range (0, 1] "
                f"got ({self.max_features})"
            )
    return max_features
|
||||||
|
|
||||||
|
|
||||||
|
@@ -1,184 +0,0 @@
|
|||||||
'''
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
|
||||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
|
||||||
__license__ = "MIT"
|
|
||||||
__version__ = "0.9"
|
|
||||||
Plot 3D views of nodes in Stree
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.decomposition import PCA
|
|
||||||
from mpl_toolkits.mplot3d import Axes3D
|
|
||||||
|
|
||||||
from .Strees import Stree, Snode, Siterator
|
|
||||||
|
|
||||||
class Snode_graph(Snode):
|
|
||||||
|
|
||||||
def __init__(self, node: Stree):
|
|
||||||
self._plot_size = (8, 8)
|
|
||||||
self._xlimits = (None, None)
|
|
||||||
self._ylimits = (None, None)
|
|
||||||
self._zlimits = (None, None)
|
|
||||||
n = Snode.copy(node)
|
|
||||||
super().__init__(n._clf, n._X, n._y, n._title)
|
|
||||||
|
|
||||||
def set_plot_size(self, size: tuple):
|
|
||||||
self._plot_size = size
|
|
||||||
|
|
||||||
def _is_pure(self) -> bool:
|
|
||||||
"""is considered pure a leaf node with one label
|
|
||||||
"""
|
|
||||||
if self.is_leaf():
|
|
||||||
return self._belief == 1.
|
|
||||||
return False
|
|
||||||
|
|
||||||
def set_axis_limits(self, limits: tuple):
|
|
||||||
self._xlimits = limits[0]
|
|
||||||
self._ylimits = limits[1]
|
|
||||||
self._zlimits = limits[2]
|
|
||||||
|
|
||||||
def _set_graphics_axis(self, ax: Axes3D):
|
|
||||||
ax.set_xlim(self._xlimits)
|
|
||||||
ax.set_ylim(self._ylimits)
|
|
||||||
ax.set_zlim(self._zlimits)
|
|
||||||
|
|
||||||
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '', save_seq: int = 1):
|
|
||||||
_, fig = self.plot_hyperplane()
|
|
||||||
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
|
|
||||||
fig.savefig(name, bbox_inches='tight')
|
|
||||||
plt.close(fig)
|
|
||||||
|
|
||||||
def _get_cmap(self):
|
|
||||||
cmap = 'jet'
|
|
||||||
if self._is_pure():
|
|
||||||
if self._class == 1:
|
|
||||||
cmap = 'jet_r'
|
|
||||||
return cmap
|
|
||||||
|
|
||||||
def _graph_title(self):
|
|
||||||
n_class, card = np.unique(self._y, return_counts=True)
|
|
||||||
return f"{self._title} {n_class} {card}"
|
|
||||||
|
|
||||||
def plot_hyperplane(self, plot_distribution: bool = True):
|
|
||||||
fig = plt.figure(figsize=self._plot_size)
|
|
||||||
ax = fig.add_subplot(1, 1, 1, projection='3d')
|
|
||||||
if not self._is_pure():
|
|
||||||
# Can't plot hyperplane of leaves with one label because it hasn't classiffier
|
|
||||||
# get the splitting hyperplane
|
|
||||||
def hyperplane(x, y): return (-self._interceptor - self._vector[0][0] * x
|
|
||||||
- self._vector[0][1] * y) / self._vector[0][2]
|
|
||||||
|
|
||||||
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
|
|
||||||
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
|
|
||||||
xx, yy = np.meshgrid(tmpx, tmpy)
|
|
||||||
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5, antialiased=True,
|
|
||||||
rstride=1, cstride=1, cmap='seismic')
|
|
||||||
self._set_graphics_axis(ax)
|
|
||||||
if plot_distribution:
|
|
||||||
self.plot_distribution(ax)
|
|
||||||
else:
|
|
||||||
plt.title(self._graph_title())
|
|
||||||
plt.show()
|
|
||||||
return ax, fig
|
|
||||||
|
|
||||||
def plot_distribution(self, ax: Axes3D = None):
|
|
||||||
if ax is None:
|
|
||||||
fig = plt.figure(figsize=self._plot_size)
|
|
||||||
ax = fig.add_subplot(1, 1, 1, projection='3d')
|
|
||||||
plt.title(self._graph_title())
|
|
||||||
cmap = self._get_cmap()
|
|
||||||
ax.scatter(self._X[:, 0], self._X[:, 1],
|
|
||||||
self._X[:, 2], c=self._y, cmap=cmap)
|
|
||||||
ax.set_xlabel('X0')
|
|
||||||
ax.set_ylabel('X1')
|
|
||||||
ax.set_zlabel('X2')
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
class Stree_grapher(Stree):
    """Build 3d graphs of any dataset.

    The wrapped Stree is always trained on three features so that every node
    can be drawn in 3D: when the dataset does not have exactly three features
    it is first projected with a 3-component PCA.
    """

    def __init__(self, params: dict):
        """Create the grapher.

        :param params: keyword arguments forwarded to the Stree constructor
        :type params: dict
        """
        self._plot_size = (8, 8)
        self._tree_gr = None
        # make Snode store X's
        os.environ['TESTING'] = '1'
        self._fitted = False
        self._pca = None
        super().__init__(**params)

    def __del__(self):
        """Remove the environment flag set in __init__ and close all figures
        """
        # pop() with a default never raises, so no blanket except is needed
        os.environ.pop('TESTING', None)
        plt.close('all')

    def _copy_tree(self, node: Snode) -> Snode_graph:
        """Recursively mirror a Snode tree as a Snode_graph tree

        :param node: root of the (sub)tree to copy
        :type node: Snode
        :return: root of the mirrored (sub)tree
        :rtype: Snode_graph
        """
        mirror = Snode_graph(node)
        # clone node
        mirror._class = node._class
        mirror._belief = node._belief
        down = node.get_down()
        if down is not None:
            mirror.set_down(self._copy_tree(down))
        up = node.get_up()
        if up is not None:
            mirror.set_up(self._copy_tree(up))
        return mirror

    def fit(self, X: np.array, y: np.array) -> Stree:
        """Fit the Stree and copy the tree in a Snode_graph tree

        :param X: Dataset
        :type X: np.array
        :param y: Labels
        :type y: np.array
        :return: Stree model
        :rtype: Stree
        """
        # forget the projection of a previous fit, otherwise refitting on a
        # 3-feature dataset would leave a stale PCA behind
        self._pca = None
        if X.shape[1] != 3:
            self._pca = PCA(n_components=3)
            X = self._pca.fit_transform(X)
        res = super().fit(X, y)
        self._tree_gr = self._copy_tree(self._tree)
        self._fitted = True
        return res

    def score(self, X: np.array, y: np.array) -> float:
        """Score the fitted model, projecting X the same way fit did

        :param X: Dataset
        :type X: np.array
        :param y: Labels
        :type y: np.array
        :raises Exception: if the grapher has not been fitted
        :return: value returned by Stree.score on the (projected) data
        :rtype: float
        """
        self._check_fitted()
        # decide from what fit() did, not from the shape of X: this avoids
        # calling transform on None and scoring unprojected data
        if self._pca is not None:
            X = self._pca.transform(X)
        return super().score(X, y)

    def _check_fitted(self):
        """Raise if fit has not been called yet

        :raises Exception: if the grapher has not been fitted
        """
        if not self._fitted:
            raise Exception('Have to fit the grapher first!')

    def save_all(self, save_folder: str = './', save_prefix: str = ''):
        """Save all the node plots in png format, each with a sequence number

        :param save_folder: folder where the plots are saved, defaults to './'
        :type save_folder: str, optional
        :param save_prefix: prefix handed to every node when saving,
            defaults to ''
        :type save_prefix: str, optional
        :raises Exception: if the grapher has not been fitted
        """
        self._check_fitted()
        # makedirs also creates missing parents and tolerates an existing
        # folder, unlike the isdir + mkdir pair
        os.makedirs(save_folder, exist_ok=True)
        for seq, node in enumerate(self, start=1):
            node.save_hyperplane(save_folder=save_folder,
                                 save_prefix=save_prefix, save_seq=seq)

    def plot_all(self):
        """Plots all the nodes

        :raises Exception: if the grapher has not been fitted
        """
        self._check_fitted()
        for node in self:
            node.plot_hyperplane()

    def __iter__(self):
        """Iterate over the nodes of the mirrored Snode_graph tree
        """
        return Siterator(self._tree_gr)
|
|
||||||
|
|
@@ -1,2 +1,3 @@
|
|||||||
from .Strees import Stree, Snode, Siterator
|
from .Strees import Stree, Snode, Siterator, Splitter
|
||||||
from .Strees_grapher import Stree_grapher, Snode_graph
|
|
||||||
|
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||||
|
91
stree/tests/Snode_test.py
Normal file
91
stree/tests/Snode_test.py
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from stree import Stree, Snode
|
||||||
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
|
class Snode_test(unittest.TestCase):
    """Tests for the Snode class using a tree fitted on the shared dataset
    """

    def __init__(self, *args, **kwargs):
        self._random_state = 1
        # the tree walked by the structural tests below
        model = Stree(random_state=self._random_state)
        model.fit(*load_dataset(self._random_state))
        self._clf = model
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_attributes_in_leaves(self):
        """Every leaf must hold the belief and class derived from its labels
        """
        pending = [self._clf.tree_]
        while pending:
            node = pending.pop()
            if not node.is_leaf():
                # push "up" first so that "down" is visited first
                pending.append(node.get_up())
                pending.append(node.get_down())
                continue
            # Check Belief in leaf
            labels, counts = np.unique(node._y, return_counts=True)
            largest = max(counts)
            smallest = min(counts)
            if len(labels) <= 1:
                expected_belief = 1
            else:
                try:
                    expected_belief = largest / (largest + smallest)
                except ZeroDivisionError:
                    expected_belief = 0.0
            self.assertEqual(expected_belief, node._belief)
            # Check Class
            expected_class = labels[counts == largest]
            self.assertEqual(expected_class, node._class)

    def test_nodes_coefs(self):
        """Every node that is not a pure leaf must carry a fitted classifier
        """
        pending = [self._clf.tree_]
        while pending:
            node = pending.pop()
            # only exclude pure leaves
            if node._belief < 1:
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
            if not node.is_leaf():
                # push "up" first so that "down" is visited first
                pending.append(node.get_up())
                pending.append(node.get_down())

    def test_make_predictor_on_leaf(self):
        """A leaf gets the majority class and its proportion as belief
        """
        leaf = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
        leaf.make_predictor()
        self.assertEqual(1, leaf._class)
        self.assertEqual(0.75, leaf._belief)

    def test_make_predictor_on_not_leaf(self):
        """A node with a child is left without class and with zero belief
        """
        parent = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
        child = Snode(None, [1], [1], [], 0.0, "another_test")
        parent.set_up(child)
        parent.make_predictor()
        self.assertIsNone(parent._class)
        self.assertEqual(0, parent._belief)

    def test_make_predictor_on_leaf_bogus_data(self):
        """A leaf without labels gets no class
        """
        leaf = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
        leaf.make_predictor()
        self.assertIsNone(leaf._class)

    def test_copy_node(self):
        """Snode.copy keeps samples, labels, title and classifier
        """
        samples = [1, 2, 3, 4]
        labels = [1]
        source = Snode(Stree(), samples, labels, [], 0.0, "test")
        clone = Snode.copy(source)
        self.assertListEqual(clone._X, samples)
        self.assertListEqual(clone._y, labels)
        self.assertEqual("test", clone._title)
        self.assertIsInstance(clone._clf, Stree)
|
142
stree/tests/Splitter_test.py
Normal file
142
stree/tests/Splitter_test.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
|
from stree import Splitter
|
||||||
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
|
class Splitter_test(unittest.TestCase):
    """Tests for the Splitter class
    """

    # Sentinel meaning "build() received no classifier". None cannot play
    # this role because test_init passes clf=None on purpose.
    _NO_CLF = object()

    def __init__(self, *args, **kwargs):
        self._random_state = 1
        super().__init__(*args, **kwargs)

    @staticmethod
    def build(
        clf=_NO_CLF,
        min_samples_split=0,
        splitter_type="random",
        criterion="gini",
        criteria="min_distance",
        random_state=None,
    ):
        """Create a Splitter, overriding only the arguments under test

        When clf is omitted a fresh LinearSVC is created for this call, so
        no estimator instance is shared between Splitters.
        """
        if clf is Splitter_test._NO_CLF:
            clf = LinearSVC()
        return Splitter(
            clf=clf,
            min_samples_split=min_samples_split,
            splitter_type=splitter_type,
            criterion=criterion,
            criteria=criteria,
            random_state=random_state,
        )

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def test_init(self):
        """Bogus arguments raise ValueError, valid ones are stored
        """
        with self.assertRaises(ValueError):
            self.build(criterion="duck")
        with self.assertRaises(ValueError):
            self.build(splitter_type="duck")
        with self.assertRaises(ValueError):
            self.build(criteria="duck")
        with self.assertRaises(ValueError):
            self.build(clf=None)
        for splitter_type in ["best", "random"]:
            for criterion in ["gini", "entropy"]:
                for criteria in ["min_distance", "max_samples"]:
                    tcl = self.build(
                        splitter_type=splitter_type,
                        criterion=criterion,
                        criteria=criteria,
                    )
                    self.assertEqual(splitter_type, tcl._splitter_type)
                    self.assertEqual(criterion, tcl._criterion)
                    self.assertEqual(criteria, tcl._criteria)

    def test_gini(self):
        """Gini impurity, called directly and through criterion_function
        """
        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
        expected = 0.48
        self.assertEqual(expected, Splitter._gini(y))
        tcl = self.build(criterion="gini")
        self.assertEqual(expected, tcl.criterion_function(y))

    def test_entropy(self):
        """Entropy, called directly and through criterion_function
        """
        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
        expected = 0.9709505944546686
        self.assertAlmostEqual(expected, Splitter._entropy(y))
        tcl = self.build(criterion="entropy")
        # floating point result: compare with a tolerance, as done above
        self.assertAlmostEqual(expected, tcl.criterion_function(y))

    def test_information_gain(self):
        """Information gain for both impurity criteria
        """
        yu = np.array([0, 1, 1, 1, 1, 1])
        yd = np.array([0, 0, 0, 1])
        values_expected = [
            ("gini", 0.31666666666666665),
            ("entropy", 0.7145247027726656),
        ]
        for criterion, expected in values_expected:
            tcl = self.build(criterion=criterion)
            computed = tcl.information_gain(yu, yd)
            self.assertAlmostEqual(expected, computed)

    def test_max_samples(self):
        """_max_samples returns one value per sample
        """
        tcl = self.build(criteria="max_samples")
        data = np.array(
            [
                [-0.1, 0.2, -0.3],
                [0.7, 0.01, -0.1],
                [0.7, -0.9, 0.5],
                [0.1, 0.2, 0.3],
            ]
        )
        expected = np.array([0.2, 0.01, -0.9, 0.2])
        y = [1, 2, 1, 0]
        computed = tcl._max_samples(data, y)
        self.assertEqual((4,), computed.shape)
        self.assertListEqual(expected.tolist(), computed.tolist())

    def test_min_distance(self):
        """_min_distance returns one value per sample
        """
        tcl = self.build()
        data = np.array(
            [
                [-0.1, 0.2, -0.3],
                [0.7, 0.01, -0.1],
                [0.7, -0.9, 0.5],
                [0.1, 0.2, 0.3],
            ]
        )
        expected = np.array([-0.1, 0.01, 0.5, 0.1])
        computed = tcl._min_distance(data, None)
        self.assertEqual((4,), computed.shape)
        self.assertListEqual(expected.tolist(), computed.tolist())

    def test_splitter_parameter(self):
        """get_subspace returns the expected feature indices and columns
        """
        expected_values = [
            [1, 7, 9],
            [1, 7, 9],
            [1, 7, 9],
            [1, 7, 9],
            [0, 5, 6],
            [0, 5, 6],
            [0, 5, 6],
            [0, 5, 6],
        ]
        # consume the expectations in loop order without mutating the list
        remaining = iter(expected_values)
        X, y = load_dataset(self._random_state, n_features=12)
        for splitter_type in ["best", "random"]:
            for criterion in ["gini", "entropy"]:
                for criteria in ["min_distance", "max_samples"]:
                    tcl = self.build(
                        splitter_type=splitter_type,
                        criterion=criterion,
                        criteria=criteria,
                        random_state=self._random_state,
                    )
                    expected = next(remaining)
                    dataset, computed = tcl.get_subspace(X, y, max_features=3)
                    self.assertListEqual(expected, list(computed))
                    self.assertListEqual(
                        X[:, computed].tolist(), dataset.tolist()
                    )
|
334
stree/tests/Stree_test.py
Normal file
334
stree/tests/Stree_test.py
Normal file
@@ -0,0 +1,334 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
|
|
||||||
|
from stree import Stree, Snode
|
||||||
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
|
class Stree_test(unittest.TestCase):
    """Tests for the Stree classifier
    """

    def __init__(self, *args, **kwargs):
        self._random_state = 1
        self._kernels = ["linear", "rbf", "poly"]
        super().__init__(*args, **kwargs)

    @classmethod
    def setUp(cls):
        os.environ["TESTING"] = "1"

    def _check_tree(self, node: Snode):
        """Check recursively that the nodes that are not leaves have the
        correct number of labels and its sons have the right number of elements
        in their dataset

        Arguments:
            node {Snode} -- node to check
        """
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is a correct partition in terms of cardinality?
        # i.e. The partition algorithm didn't forget any sample
        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
        unique_y, count_y = np.unique(node._y, return_counts=True)
        labels_d, count_d = np.unique(y_down, return_counts=True)
        labels_u, count_u = np.unique(y_up, return_counts=True)
        # Look counts up by label: the arrays returned by np.unique are
        # positional, so indexing them with the label value is wrong as soon
        # as a child lacks one of the classes
        down = dict(zip(labels_d.tolist(), count_d.tolist()))
        up = dict(zip(labels_u.tolist(), count_u.tolist()))
        for label, total in zip(unique_y.tolist(), count_y.tolist()):
            self.assertEqual(total, down.get(label, 0) + up.get(label, 0))
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
        _, count_yp = np.unique(y_prediction, return_counts=True)
        self.assertEqual(count_yp[0], y_up.shape[0])
        self.assertEqual(count_yp[1], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

    def test_build_tree(self):
        """Check if the tree is built the same way as predictions of models
        """
        import warnings

        # silence warnings only while fitting, not for the rest of the run
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for kernel in self._kernels:
                clf = Stree(kernel=kernel, random_state=self._random_state)
                clf.fit(*load_dataset(self._random_state))
                self._check_tree(clf.tree_)

    @staticmethod
    def _find_out(px: np.array, x_original: np.array, y_original) -> list:
        """Find the original values of y for a given array of samples

        Arguments:
            px {np.array} -- array of samples to search for
            x_original {np.array} -- original dataset
            y_original {[type]} -- original classes

        Returns:
            list -- classes of the given samples
        """
        res = []
        for needle in px:
            for row in range(x_original.shape[0]):
                if all(x_original[row, :] == needle):
                    res.append(y_original[row])
        return res

    def test_single_prediction(self):
        """The first sample is predicted with its true label
        """
        X, y = load_dataset(self._random_state)
        for kernel in self._kernels:
            clf = Stree(kernel=kernel, random_state=self._random_state)
            yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
            self.assertEqual(yp[0], y[0])

    def test_multiple_prediction(self):
        """The first 27 samples are predicted with their true labels
        """
        # First 27 elements the predictions are the same as the truth
        num = 27
        X, y = load_dataset(self._random_state)
        for kernel in self._kernels:
            clf = Stree(kernel=kernel, random_state=self._random_state)
            yp = clf.fit(X, y).predict(X[:num, :])
            self.assertListEqual(y[:num].tolist(), yp.tolist())

    def test_score(self):
        """score agrees with the accuracy of predict and with known values
        """
        X, y = load_dataset(self._random_state)
        accuracies = [
            0.9506666666666667,
            0.9606666666666667,
            0.9433333333333334,
        ]
        for kernel, accuracy_expected in zip(self._kernels, accuracies):
            clf = Stree(random_state=self._random_state, kernel=kernel,)
            clf.fit(X, y)
            accuracy_score = clf.score(X, y)
            yp = clf.predict(X)
            accuracy_computed = np.mean(yp == y)
            self.assertEqual(accuracy_score, accuracy_computed)
            self.assertAlmostEqual(accuracy_expected, accuracy_score)

    def test_single_vs_multiple_prediction(self):
        """Check if predicting sample by sample gives the same result as
        predicting all samples at once
        """
        X, y = load_dataset(self._random_state)
        for kernel in self._kernels:
            clf = Stree(kernel=kernel, random_state=self._random_state)
            clf.fit(X, y)
            # Compute prediction line by line; concatenate once instead of
            # growing an array with np.append on every sample
            yp_line = np.concatenate(
                [clf.predict(xp.reshape(-1, X.shape[1])) for xp in X]
            )
            # Compute prediction at once
            yp_once = clf.predict(X)
            self.assertListEqual(yp_line.tolist(), yp_once.tolist())

    def test_iterator_and_str(self):
        """Check preorder iterator
        """
        expected = [
            "root feaures=(0, 1, 2) impurity=0.5000",
            "root - Down feaures=(0, 1, 2) impurity=0.0671",
            "root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 "
            "impurity=0.0469 counts=(array([0, 1]), array([ 17, 691]))",
            "root - Down - Up feaures=(0, 1, 2) impurity=0.3967",
            "root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
            "impurity=0.3750 counts=(array([0, 1]), array([1, 3]))",
            "root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
            "impurity=0.0000 counts=(array([0]), array([7]))",
            # numpy pads array elements to a common width, hence the two
            # spaces before 56
            "root - Up, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.1331"
            " counts=(array([0, 1]), array([725,  56]))",
        ]
        computed = []
        expected_string = ""
        clf = Stree(kernel="linear", random_state=self._random_state)
        clf.fit(*load_dataset(self._random_state))
        for node in clf:
            computed.append(str(node))
            expected_string += str(node) + "\n"
        self.assertListEqual(expected, computed)
        self.assertEqual(expected_string, str(clf))

    @staticmethod
    def test_is_a_sklearn_classifier():
        """Run the scikit-learn estimator checks on a default Stree
        """
        import warnings
        from sklearn.exceptions import ConvergenceWarning
        from sklearn.utils.estimator_checks import check_estimator

        # keep the filters local to this test
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            check_estimator(Stree())

    def test_exception_if_C_is_negative(self):
        """A negative C makes fit raise ValueError
        """
        tclf = Stree(C=-1)
        with self.assertRaises(ValueError):
            tclf.fit(*load_dataset(self._random_state))

    def test_exception_if_bogus_split_criteria(self):
        """An unknown split_criteria makes fit raise ValueError
        """
        tclf = Stree(split_criteria="duck")
        with self.assertRaises(ValueError):
            tclf.fit(*load_dataset(self._random_state))

    def test_check_max_depth_is_positive_or_None(self):
        """max_depth defaults to None and a negative value is rejected
        """
        tcl = Stree()
        self.assertIsNone(tcl.max_depth)
        tcl = Stree(max_depth=1)
        self.assertGreaterEqual(1, tcl.max_depth)
        with self.assertRaises(ValueError):
            tcl = Stree(max_depth=-1)
            tcl.fit(*load_dataset(self._random_state))

    def test_check_max_depth(self):
        """The fitted tree reaches exactly the requested depth
        """
        depths = (3, 4)
        for depth in depths:
            tcl = Stree(random_state=self._random_state, max_depth=depth)
            tcl.fit(*load_dataset(self._random_state))
            self.assertEqual(depth, tcl.depth_)

    def test_unfitted_tree_is_iterable(self):
        """Iterating an unfitted tree yields no nodes
        """
        tcl = Stree()
        self.assertEqual(0, len(list(tcl)))

    def test_min_samples_split(self):
        """The root is split only when it holds min_samples_split samples
        """
        dataset = [[1], [2], [3]], [1, 1, 0]
        tcl_split = Stree(min_samples_split=3).fit(*dataset)
        self.assertIsNotNone(tcl_split.tree_.get_down())
        self.assertIsNotNone(tcl_split.tree_.get_up())
        tcl_nosplit = Stree(min_samples_split=4).fit(*dataset)
        self.assertIsNone(tcl_nosplit.tree_.get_down())
        self.assertIsNone(tcl_nosplit.tree_.get_up())

    def test_simple_muticlass_dataset(self):
        """A trivially separable three class dataset is learnt perfectly
        """
        for kernel in self._kernels:
            clf = Stree(
                kernel=kernel,
                split_criteria="max_samples",
                random_state=self._random_state,
            )
            px = [[1, 2], [5, 6], [9, 10]]
            py = [0, 1, 2]
            clf.fit(px, py)
            self.assertEqual(1.0, clf.score(px, py))
            self.assertListEqual(py, clf.predict(px).tolist())
            self.assertListEqual(py, clf.classes_.tolist())

    def test_muticlass_dataset(self):
        """Scores on multiclass datasets match the recorded values
        """
        datasets = {
            "Synt": load_dataset(random_state=self._random_state, n_classes=3),
            "Iris": load_iris(return_X_y=True),
        }
        outcomes = {
            "Synt": {
                "max_samples linear": 0.9533333333333334,
                "max_samples rbf": 0.836,
                "max_samples poly": 0.9473333333333334,
                "min_distance linear": 0.9533333333333334,
                "min_distance rbf": 0.836,
                "min_distance poly": 0.9473333333333334,
            },
            "Iris": {
                "max_samples linear": 0.98,
                "max_samples rbf": 1.0,
                "max_samples poly": 1.0,
                "min_distance linear": 0.98,
                "min_distance rbf": 1.0,
                "min_distance poly": 1.0,
            },
        }
        for name, dataset in datasets.items():
            px, py = dataset
            for criteria in ["max_samples", "min_distance"]:
                for kernel in self._kernels:
                    # NOTE(review): criteria only selects the expected value;
                    # it is not passed to Stree as split_criteria, so both
                    # iterations fit the same model -- confirm whether that
                    # is intended before wiring it in
                    clf = Stree(
                        C=1e4,
                        max_iter=1e4,
                        kernel=kernel,
                        random_state=self._random_state,
                    )
                    clf.fit(px, py)
                    outcome = outcomes[name][f"{criteria} {kernel}"]
                    self.assertAlmostEqual(outcome, clf.score(px, py))

    def test_max_features(self):
        """_initialize_max_features resolves every accepted max_features
        """
        n_features = 16
        expected_values = [
            ("auto", 4),
            ("log2", 4),
            ("sqrt", 4),
            (0.5, 8),
            (3, 3),
            (None, 16),
        ]
        clf = Stree()
        clf.n_features_ = n_features
        for max_features, expected in expected_values:
            clf.set_params(max_features=max_features)
            computed = clf._initialize_max_features()
            self.assertEqual(expected, computed)
        # Check bogus max_features
        values = ["duck", -0.1, 0.0]
        for max_features in values:
            clf.set_params(max_features=max_features)
            with self.assertRaises(ValueError):
                _ = clf._initialize_max_features()

    def test_get_subspaces(self):
        """get_subspace returns max_features_ columns of the dataset
        """
        dataset = np.random.random((10, 16))
        y = np.random.randint(0, 2, 10)
        expected_values = [
            ("auto", 4),
            ("log2", 4),
            ("sqrt", 4),
            (0.5, 8),
            (3, 3),
            (None, 16),
        ]
        clf = Stree()
        for max_features, expected in expected_values:
            clf.set_params(max_features=max_features)
            clf.fit(dataset, y)
            computed, indices = clf.splitter_.get_subspace(
                dataset, y, clf.max_features_
            )
            self.assertListEqual(
                dataset[:, indices].tolist(), computed.tolist()
            )
            self.assertEqual(expected, len(indices))

    def test_bogus_criterion(self):
        """An unknown criterion makes fit raise ValueError
        """
        clf = Stree(criterion="duck")
        with self.assertRaises(ValueError):
            clf.fit(*load_dataset())

    def test_predict_feature_dimensions(self):
        """predict rejects data with fewer features than the training set
        """
        X = np.random.rand(10, 5)
        y = np.random.randint(0, 2, 10)
        clf = Stree()
        clf.fit(X, y)
        with self.assertRaises(ValueError):
            clf.predict(X[:, :3])

    def test_score_max_features(self):
        """Score with max_features=2 matches the recorded value
        """
        X, y = load_dataset(self._random_state)
        clf = Stree(random_state=self._random_state, max_features=2)
        clf.fit(X, y)
        self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))

    def test_bogus_splitter_parameter(self):
        """An unknown splitter makes fit raise ValueError
        """
        clf = Stree(splitter="duck")
        with self.assertRaises(ValueError):
            clf.fit(*load_dataset())
|
@@ -1,313 +0,0 @@
|
|||||||
import csv
|
|
||||||
import os
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.datasets import make_classification
|
|
||||||
|
|
||||||
from stree import Stree, Snode
|
|
||||||
|
|
||||||
|
|
||||||
class Stree_test(unittest.TestCase):
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
os.environ['TESTING'] = '1'
|
|
||||||
self._random_state = 1
|
|
||||||
self._clf = Stree(random_state=self._random_state,
|
|
||||||
use_predictions=False)
|
|
||||||
self._clf.fit(*self._get_Xy())
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def tearDownClass(cls):
|
|
||||||
try:
|
|
||||||
os.environ.pop('TESTING')
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _get_Xy(self):
|
|
||||||
X, y = make_classification(n_samples=1500, n_features=3, n_informative=3,
|
|
||||||
n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,
|
|
||||||
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state)
|
|
||||||
return X, y
|
|
||||||
|
|
||||||
def _check_tree(self, node: Snode):
|
|
||||||
"""Check recursively that the nodes that are not leaves have the correct
|
|
||||||
number of labels and its sons have the right number of elements in their dataset
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
node {Snode} -- node to check
|
|
||||||
"""
|
|
||||||
if node.is_leaf():
|
|
||||||
return
|
|
||||||
y_prediction = node._clf.predict(node._X)
|
|
||||||
y_down = node.get_down()._y
|
|
||||||
y_up = node.get_up()._y
|
|
||||||
# Is a correct partition in terms of cadinality?
|
|
||||||
# i.e. The partition algorithm didn't forget any sample
|
|
||||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
|
||||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
|
||||||
_, count_d = np.unique(y_down, return_counts=True)
|
|
||||||
_, count_u = np.unique(y_up, return_counts=True)
|
|
||||||
#
|
|
||||||
for i in unique_y:
|
|
||||||
try:
|
|
||||||
number_down = count_d[i]
|
|
||||||
except:
|
|
||||||
number_down = 0
|
|
||||||
try:
|
|
||||||
number_up = count_u[i]
|
|
||||||
except:
|
|
||||||
number_up = 0
|
|
||||||
self.assertEqual(count_y[i], number_down + number_up)
|
|
||||||
# Is the partition made the same as the prediction?
|
|
||||||
# as the node is not a leaf...
|
|
||||||
_, count_yp = np.unique(y_prediction, return_counts=True)
|
|
||||||
self.assertEqual(count_yp[0], y_up.shape[0])
|
|
||||||
self.assertEqual(count_yp[1], y_down.shape[0])
|
|
||||||
self._check_tree(node.get_down())
|
|
||||||
self._check_tree(node.get_up())
|
|
||||||
|
|
||||||
def test_build_tree(self):
|
|
||||||
"""Check if the tree is built the same way as predictions of models
|
|
||||||
"""
|
|
||||||
self._check_tree(self._clf._tree)
|
|
||||||
|
|
||||||
def _get_file_data(self, file_name: str) -> tuple:
|
|
||||||
"""Return X, y from data, y is the last column in array
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
file_name {str} -- the file name
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple -- tuple with samples, categories
|
|
||||||
"""
|
|
||||||
data = np.genfromtxt(file_name, delimiter=',')
|
|
||||||
data = np.array(data)
|
|
||||||
column_y = data.shape[1] - 1
|
|
||||||
fy = data[:, column_y]
|
|
||||||
fx = np.delete(data, column_y, axis=1)
|
|
||||||
return fx, fy
|
|
||||||
|
|
||||||
def _find_out(self, px: np.array, x_original: np.array, y_original) -> list:
|
|
||||||
"""Find the original values of y for a given array of samples
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
px {np.array} -- array of samples to search for
|
|
||||||
x_original {np.array} -- original dataset
|
|
||||||
y_original {[type]} -- original classes
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
np.array -- classes of the given samples
|
|
||||||
"""
|
|
||||||
res = []
|
|
||||||
for needle in px:
|
|
||||||
for row in range(x_original.shape[0]):
|
|
||||||
if all(x_original[row, :] == needle):
|
|
||||||
res.append(y_original[row])
|
|
||||||
return res
|
|
||||||
|
|
||||||
def test_subdatasets(self):
|
|
||||||
"""Check if the subdatasets files have the same labels as the original dataset
|
|
||||||
"""
|
|
||||||
self._clf.save_sub_datasets()
|
|
||||||
with open(self._clf.get_catalog_name()) as cat_file:
|
|
||||||
catalog = csv.reader(cat_file, delimiter=',')
|
|
||||||
for row in catalog:
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
x_file, y_file = self._get_file_data(row[0])
|
|
||||||
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
|
|
||||||
self.assertTrue(np.array_equal(y_file, y_original))
|
|
||||||
|
|
||||||
def test_single_prediction(self):
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
|
|
||||||
self.assertEqual(yp[0], y[0])
|
|
||||||
|
|
||||||
def test_multiple_prediction(self):
|
|
||||||
# First 27 elements the predictions are the same as the truth
|
|
||||||
num = 27
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
yp = self._clf.predict(X[:num, :])
|
|
||||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
|
||||||
|
|
||||||
def test_score(self):
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
accuracy_score = self._clf.score(X, y)
|
|
||||||
yp = self._clf.predict(X)
|
|
||||||
right = (yp == y).astype(int)
|
|
||||||
accuracy_computed = sum(right) / len(y)
|
|
||||||
self.assertEqual(accuracy_score, accuracy_computed)
|
|
||||||
self.assertGreater(accuracy_score, 0.8)
|
|
||||||
|
|
||||||
def test_single_predict_proba(self):
|
|
||||||
"""Check that element 28 has a prediction different that the current label
|
|
||||||
"""
|
|
||||||
# Element 28 has a different prediction than the truth
|
|
||||||
decimals = 5
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
|
|
||||||
self.assertEqual(0, yp[0:, 0])
|
|
||||||
self.assertEqual(1, y[28])
|
|
||||||
self.assertAlmostEqual(
|
|
||||||
round(0.29026400766, decimals),
|
|
||||||
round(yp[0, 1], decimals),
|
|
||||||
decimals
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_multiple_predict_proba(self):
|
|
||||||
# First 27 elements the predictions are the same as the truth
|
|
||||||
num = 27
|
|
||||||
decimals = 5
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
yp = self._clf.predict_proba(X[:num, :])
|
|
||||||
self.assertListEqual(y[:num].tolist(), yp[:, 0].tolist())
|
|
||||||
expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236,
|
|
||||||
0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349,
|
|
||||||
0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355,
|
|
||||||
0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692,
|
|
||||||
0.29788483, 0.01657364, 0.81149083]
|
|
||||||
expected = np.round(expected_proba, decimals=decimals).tolist()
|
|
||||||
computed = np.round(yp[:, 1], decimals=decimals).tolist()
|
|
||||||
for i in range(len(expected)):
|
|
||||||
self.assertAlmostEqual(expected[i], computed[i], decimals)
|
|
||||||
|
|
||||||
def build_models(self):
|
|
||||||
"""Build and train two models, model_clf will use the sklearn classifier to
|
|
||||||
compute predictions and split data. model_computed will use vector of
|
|
||||||
coefficients to compute both predictions and splitted data
|
|
||||||
"""
|
|
||||||
model_clf = Stree(random_state=self._random_state,
|
|
||||||
use_predictions=True)
|
|
||||||
model_computed = Stree(random_state=self._random_state,
|
|
||||||
use_predictions=False)
|
|
||||||
X, y = self._get_Xy()
|
|
||||||
model_clf.fit(X, y)
|
|
||||||
model_computed.fit(X, y)
|
|
||||||
return model_clf, model_computed, X, y
|
|
||||||
|
|
||||||
def test_use_model_predict(self):
|
|
||||||
"""Check that we get the same results wether we use the estimator in nodes
|
|
||||||
to compute labels or we use the hyperplane and the position of samples wrt to it
|
|
||||||
"""
|
|
||||||
use_clf, use_math, X, _ = self.build_models()
|
|
||||||
self.assertListEqual(
|
|
||||||
use_clf.predict(X).tolist(),
|
|
||||||
use_math.predict(X).tolist()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_use_model_score(self):
|
|
||||||
use_clf, use_math, X, y = self.build_models()
|
|
||||||
b = use_math.score(X, y)
|
|
||||||
self.assertEqual(
|
|
||||||
use_clf.score(X, y),
|
|
||||||
b
|
|
||||||
)
|
|
||||||
self.assertGreater(b, .95)
|
|
||||||
|
|
||||||
def test_use_model_predict_proba(self):
|
|
||||||
use_clf, use_math, X, _ = self.build_models()
|
|
||||||
self.assertListEqual(
|
|
||||||
use_clf.predict_proba(X).tolist(),
|
|
||||||
use_math.predict_proba(X).tolist()
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_single_vs_multiple_prediction(self):
    """Check if predicting sample by sample gives the same result as predicting
    all samples at once
    """
    X, _ = self._get_Xy()
    n_features = X.shape[1]
    # Compute prediction line by line; accumulate in a plain list instead of
    # calling np.append in the loop, which copies the whole array every time
    yp_line = []
    for xp in X:
        single = self._clf.predict(xp.reshape(-1, n_features))
        yp_line.extend(single.tolist())
    # Compute prediction at once
    yp_once = self._clf.predict(X)
    self.assertListEqual(yp_line, yp_once.tolist())
|
|
||||||
|
|
||||||
def test_iterator(self):
    """Check that iterating over the classifier yields the nodes in preorder
    """
    expected = [
        'root',
        'root - Down',
        'root - Down - Down, <cgaf> - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))',
        'root - Down - Up',
        'root - Down - Up - Down, <cgaf> - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))',
        'root - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))',
        'root - Up, <cgaf> - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))',
    ]
    # The textual representation of every node, in iteration order
    computed = [str(node) for node in self._clf]
    self.assertListEqual(expected, computed)
|
|
||||||
|
|
||||||
class Snode_test(unittest.TestCase):
    """Tests that walk the nodes (Snode) of a fitted Stree classifier."""

    def __init__(self, *args, **kwargs):
        """Set the TESTING environment variable and fit the classifier used
        by the tests of this class
        """
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        # _get_Xy only reads _random_state, which is already set above
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        """Remove the TESTING environment variable set in __init__"""
        # pop with a default so a missing variable is not an error, without
        # hiding any other exception behind a bare except
        os.environ.pop('TESTING', None)

    def _get_Xy(self):
        """Return a synthetic two-class dataset (X, y) of 1500 samples and
        3 features generated with make_classification
        """
        X, y = make_classification(
            n_samples=1500, n_features=3, n_informative=3,
            n_redundant=0, n_repeated=0, n_classes=2,
            n_clusters_per_class=2, class_sep=1.5, flip_y=0,
            weights=[0.5, 0.5], random_state=self._random_state)
        return X, y

    def test_attributes_in_leaves(self):
        """Check if the attributes in leaves have correct values so they form a predictor
        """

        def check_leave(node: Snode):
            # Inner nodes are only traversed, the checks apply to leaves
            if not node.is_leaf():
                check_leave(node.get_down())
                check_leave(node.get_up())
                return
            # Check Belief in leave
            classes, card = np.unique(node._y, return_counts=True)
            max_card = max(card)
            min_card = min(card)
            if len(classes) > 1:
                # counts returned by np.unique are always >= 1, so the
                # denominator cannot be zero
                belief = max_card / (max_card + min_card)
            else:
                belief = 1
            self.assertEqual(belief, node._belief)
            # Check Class
            class_computed = classes[card == max_card]
            self.assertEqual(class_computed, node._class)

        check_leave(self._clf._tree)

    def test_nodes_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """

        def run_tree(node: Snode):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
                self.assertIsNotNone(node._vector)
                self.assertIsNotNone(node._interceptor)
            if node.is_leaf():
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self._clf._tree)
|
|
||||||
|
|
@@ -1 +1,5 @@
|
|||||||
from .Strees_test import Stree_test, Snode_test
|
from .Stree_test import Stree_test
|
||||||
|
from .Snode_test import Snode_test
|
||||||
|
from .Splitter_test import Splitter_test
|
||||||
|
|
||||||
|
__all__ = ["Stree_test", "Snode_test", "Splitter_test"]
|
||||||
|
17
stree/tests/utils.py
Normal file
17
stree/tests/utils.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from sklearn.datasets import make_classification
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(random_state=0, n_classes=2, n_features=3):
    """Generate a synthetic classification dataset of 1500 samples.

    random_state, n_classes and n_features are forwarded to
    make_classification; the remaining settings are fixed.
    Returns the tuple (X, y).
    """
    settings = dict(
        n_samples=1500,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        random_state=random_state,
    )
    X, y = make_classification(**settings)
    return X, y
|
191
test2.ipynb
191
test2.ipynb
@@ -1,191 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#\n",
|
|
||||||
"# Google Colab setup\n",
|
|
||||||
"#\n",
|
|
||||||
"#!pip install git+https://github.com/doctorado-ml/stree"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
||||||
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from stree import Stree\n",
|
|
||||||
"import time"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"if not os.path.isfile('data/creditcard.csv'):\n",
|
|
||||||
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
|
||||||
" !tar xzf creditcard.tgz"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"random_state=1\n",
|
|
||||||
"\n",
|
|
||||||
"def load_creditcard(n_examples=0):\n",
|
|
||||||
" import pandas as pd\n",
|
|
||||||
" import numpy as np\n",
|
|
||||||
" import random\n",
|
|
||||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
|
||||||
" y = df.Class\n",
|
|
||||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
|
||||||
" if n_examples > 0:\n",
|
|
||||||
" # Take first n_examples samples\n",
|
|
||||||
" X = X[:n_examples, :]\n",
|
|
||||||
" y = y[:n_examples, :]\n",
|
|
||||||
" else:\n",
|
|
||||||
" # Take all the positive samples with a number of random negatives\n",
|
|
||||||
" if n_examples < 0:\n",
|
|
||||||
" Xt = X[(y == 1).ravel()]\n",
|
|
||||||
" yt = y[(y == 1).ravel()]\n",
|
|
||||||
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
|
||||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
|
||||||
" y = np.append(yt, y[indices], axis=0)\n",
|
|
||||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
|
||||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
|
||||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
|
||||||
"\n",
|
|
||||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
|
||||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
|
||||||
"data = load_creditcard(-1000) # Take all the positive samples + 1000 random samples\n",
|
|
||||||
"\n",
|
|
||||||
"Xtrain = data[0]\n",
|
|
||||||
"Xtest = data[1]\n",
|
|
||||||
"ytrain = data[2]\n",
|
|
||||||
"ytest = data[3]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.980519 counts=(array([0, 1]), array([ 6, 302]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.940217 counts=(array([0, 1]), array([692, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9521\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.986842 counts=(array([0, 1]), array([ 4, 300]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.937754 counts=(array([0, 1]), array([693, 46]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9636\nClassifier's accuracy (test) : 0.9688\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([8]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.947802 counts=(array([0, 1]), array([690, 38]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([308]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([11]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), 
array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.951456 counts=(array([0, 1]), array([686, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9741\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 
counts=(array([0, 1]), array([675, 27]))\n\n**************************************************\n0.7816 secs\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"t = time.time()\n",
|
|
||||||
"for C in (.001, .01, 1, 5, 17):\n",
|
|
||||||
" clf = Stree(C=C, random_state=random_state)\n",
|
|
||||||
" clf.fit(Xtrain, ytrain)\n",
|
|
||||||
" print(f\"************** C={C} ****************************\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
|
||||||
" print(clf)\n",
|
|
||||||
" print(f\"**************************************************\")\n",
|
|
||||||
"print(f\"{time.time() - t:.4f} secs\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.calibration import CalibratedClassifierCV\n",
|
|
||||||
"scaler = StandardScaler()\n",
|
|
||||||
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
|
|
||||||
"cclf.fit(Xtrain, ytrain)\n",
|
|
||||||
"res = cclf.predict_proba(Xtest)\n",
|
|
||||||
"#an array containing probabilities of belonging to the 1st class"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"#check iterator\n",
|
|
||||||
"for i in list(clf):\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([306]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([10]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([7]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.961538 counts=(array([0, 1]), array([675, 27]))\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"#check iterator again\n",
|
|
||||||
"for i in clf:\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.7.6-final"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user