mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
28 Commits
add_kernel
...
Adding-Git
Author | SHA1 | Date | |
---|---|---|---|
98881cbd45
|
|||
cdb9fd6faa
|
|||
82f7352f9a
|
|||
8359e442e5
|
|||
|
673081cdc5 | ||
|
36816074ff | ||
475ad7e752
|
|||
|
1c869e154e | ||
f5706c3159
|
|||
be552fdd6c
|
|||
5e3a8e3ec5
|
|||
554ec03c32
|
|||
4b7e4a3fb0
|
|||
76723993fd
|
|||
ecd0b86f4d
|
|||
3e52a4746c
|
|||
|
a20e45e8e7 | ||
9334951d1b
|
|||
736ab7ef20
|
|||
c94bc068bd
|
|||
502ee72799
|
|||
f1ee4de37b
|
|||
ae1c199e21
|
|||
1bfe273a70
|
|||
|
647d21bdb5 | ||
1d392d534f
|
|||
f360a2640c
|
|||
|
45510b43bc |
@@ -10,5 +10,4 @@ exclude_lines =
|
||||
if __name__ == .__main__.:
|
||||
ignore_errors = True
|
||||
omit =
|
||||
stree/tests/*
|
||||
stree/__init__.py
|
47
.github/workflows/main.yml
vendored
Normal file
47
.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,47 @@
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches: [master]
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [macos-latest, ubuntu-latest]
|
||||
python: [3.8]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python ${{ matrix.python }}
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
pip install -q --upgrade pip
|
||||
pip install -q -r requirements.txt
|
||||
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
||||
- name: Lint
|
||||
run: |
|
||||
black --check --diff stree
|
||||
flake8 --count stree
|
||||
- name: Tests
|
||||
run: |
|
||||
coverage run -m unittest -v stree.tests
|
||||
coverage xml
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
files: ./coverage.xml
|
||||
- name: Run codacy-coverage-reporter
|
||||
if: runner.os == 'Linux'
|
||||
uses: codacy/codacy-coverage-reporter-action@master
|
||||
with:
|
||||
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
|
||||
coverage-reports: coverage.xml
|
2
.gitignore
vendored
2
.gitignore
vendored
@@ -131,3 +131,5 @@ dmypy.json
|
||||
.idea
|
||||
.vscode
|
||||
.pre-commit-config.yaml
|
||||
|
||||
**.csv
|
16
README.md
16
README.md
@@ -1,6 +1,6 @@
|
||||
[](https://app.codeship.com/projects/399170)
|
||||

|
||||
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||
|
||||
# Stree
|
||||
|
||||
@@ -18,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
|
||||
|
||||
### Jupyter notebooks
|
||||
|
||||
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||
|
||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||
|
||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||
|
||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||
|
||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||
|
||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||
|
||||
### Command line
|
||||
|
||||
|
@@ -3,9 +3,6 @@ overage:
|
||||
project:
|
||||
default:
|
||||
target: 90%
|
||||
patch:
|
||||
default:
|
||||
target: 90%
|
||||
comment:
|
||||
layout: "reach, diff, flags, files"
|
||||
behavior: default
|
||||
|
87
main.py
87
main.py
@@ -1,88 +1,29 @@
|
||||
import time
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.datasets import load_iris
|
||||
from stree import Stree
|
||||
|
||||
random_state = 1
|
||||
|
||||
X, y = load_iris(return_X_y=True)
|
||||
|
||||
def load_creditcard(n_examples=0):
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
df = pd.read_csv("data/creditcard.csv")
|
||||
print(
|
||||
"Fraud: {0:.3f}% {1}".format(
|
||||
df.Class[df.Class == 1].count() * 100 / df.shape[0],
|
||||
df.Class[df.Class == 1].count(),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"Valid: {0:.3f}% {1}".format(
|
||||
df.Class[df.Class == 0].count() * 100 / df.shape[0],
|
||||
df.Class[df.Class == 0].count(),
|
||||
)
|
||||
)
|
||||
y = np.expand_dims(df.Class.values, axis=1)
|
||||
X = df.drop(["Class", "Time", "Amount"], axis=1).values
|
||||
if n_examples > 0:
|
||||
# Take first n_examples samples
|
||||
X = X[:n_examples, :]
|
||||
y = y[:n_examples, :]
|
||||
else:
|
||||
# Take all the positive samples with a number of random negatives
|
||||
if n_examples < 0:
|
||||
Xt = X[(y == 1).ravel()]
|
||||
yt = y[(y == 1).ravel()]
|
||||
indices = random.sample(range(X.shape[0]), -1 * n_examples)
|
||||
X = np.append(Xt, X[indices], axis=0)
|
||||
y = np.append(yt, y[indices], axis=0)
|
||||
print("X.shape", X.shape, " y.shape", y.shape)
|
||||
print(
|
||||
"Fraud: {0:.3f}% {1}".format(
|
||||
len(y[y == 1]) * 100 / X.shape[0], len(y[y == 1])
|
||||
)
|
||||
)
|
||||
print(
|
||||
"Valid: {0:.3f}% {1}".format(
|
||||
len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])
|
||||
)
|
||||
)
|
||||
Xtrain, Xtest, ytrain, ytest = train_test_split(
|
||||
X,
|
||||
y,
|
||||
train_size=0.7,
|
||||
shuffle=True,
|
||||
random_state=random_state,
|
||||
stratify=y,
|
||||
)
|
||||
return Xtrain, Xtest, ytrain, ytest
|
||||
|
||||
|
||||
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
|
||||
# data = load_creditcard(5000) # Take the first 5000 samples
|
||||
data = load_creditcard() # Take all the samples
|
||||
|
||||
Xtrain = data[0]
|
||||
Xtest = data[1]
|
||||
ytrain = data[2]
|
||||
ytest = data[3]
|
||||
Xtrain, Xtest, ytrain, ytest = train_test_split(
|
||||
X, y, test_size=0.2, random_state=random_state
|
||||
)
|
||||
|
||||
now = time.time()
|
||||
print("Predicting with max_features=sqrt(n_features)")
|
||||
clf = Stree(C=0.01, random_state=random_state, max_features="auto")
|
||||
clf.fit(Xtrain, ytrain)
|
||||
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||
print(clf)
|
||||
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||
print("=" * 40)
|
||||
print("Predicting with max_features=n_features")
|
||||
clf = Stree(C=0.01, random_state=random_state)
|
||||
clf.fit(Xtrain, ytrain)
|
||||
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||
print(clf)
|
||||
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||
proba = clf.predict_proba(Xtest)
|
||||
print(
|
||||
"Checking that we have correct probabilities, these are probabilities of "
|
||||
"sample belonging to class 1"
|
||||
)
|
||||
res0 = proba[proba[:, 0] == 0]
|
||||
res1 = proba[proba[:, 0] == 1]
|
||||
print("++++++++++res0 > .8++++++++++++")
|
||||
print(res0[res0[:, 1] > 0.8])
|
||||
print("**********res1 < .4************")
|
||||
print(res1[res1[:, 1] < 0.4])
|
||||
|
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Test AdaBoost with different configurations"
|
||||
"# Test Stree with AdaBoost and Bagging with different configurations"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -34,11 +34,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from sklearn.svm import LinearSVC, SVC\n",
|
||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||
"from sklearn.datasets import load_iris\n",
|
||||
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from stree import Stree"
|
||||
]
|
||||
},
|
||||
@@ -57,12 +54,20 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
|
||||
"text": [
|
||||
"Fraud: 0.173% 492\n",
|
||||
"Valid: 99.827% 284315\n",
|
||||
"X.shape (100492, 28) y.shape (100492,)\n",
|
||||
"Fraud: 0.652% 655\n",
|
||||
"Valid: 99.348% 99837\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -117,23 +122,27 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## STree alone on the whole dataset and linear kernel"
|
||||
"## STree alone with 100.000 samples and linear kernel"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
|
||||
"text": [
|
||||
"Score Train: 0.9985073353804162\nScore Test: 0.9983746848878864\nTook 35.80 seconds\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"now = time.time()\n",
|
||||
"clf = Stree(max_depth=3, random_state=random_state)\n",
|
||||
"clf = Stree(max_depth=3, random_state=random_state, max_iter=1e3)\n",
|
||||
"clf.fit(Xtrain, ytrain)\n",
|
||||
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
|
||||
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
|
||||
@@ -144,7 +153,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Different kernels with different configuations"
|
||||
"## Adaboost"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -161,18 +170,24 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
|
||||
"text": [
|
||||
"Kernel: linear\tTime: 49.66 seconds\tScore Train: 0.9983225\tScore Test: 0.9983083\n",
|
||||
"Kernel: rbf\tTime: 12.73 seconds\tScore Train: 0.9934891\tScore Test: 0.9934656\n",
|
||||
"Kernel: poly\tTime: 76.24 seconds\tScore Train: 0.9972706\tScore Test: 0.9969152\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||
" now = time.time()\n",
|
||||
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf.fit(Xtrain, ytrain)\n",
|
||||
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||
" score_test = clf.score(Xtest, ytest)\n",
|
||||
@@ -183,24 +198,41 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
|
||||
"## Bagging"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 10\n",
|
||||
"C = 7\n",
|
||||
"max_depth = 3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
|
||||
"text": [
|
||||
"Kernel: linear\tTime: 231.51 seconds\tScore Train: 0.9984931\tScore Test: 0.9983083\n",
|
||||
"Kernel: rbf\tTime: 114.77 seconds\tScore Train: 0.9992323\tScore Test: 0.9983083\n",
|
||||
"Kernel: poly\tTime: 67.87 seconds\tScore Train: 0.9993319\tScore Test: 0.9985074\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||
" now = time.time()\n",
|
||||
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf.fit(Xtrain, ytrain)\n",
|
||||
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||
" score_test = clf.score(Xtest, ytest)\n",
|
||||
@@ -219,12 +251,12 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6-final"
|
||||
"version": "3.8.4-final"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||
"name": "python38464bitgeneralf6de308d3831407c8bd68d4a5e328a38",
|
||||
"display_name": "Python 3.8.4 64-bit ('general')"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
@@ -4,7 +4,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Test smple_weight, kernels, C, sklearn estimator"
|
||||
"# Test sample_weight, kernels, C, sklearn estimator"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -47,7 +47,9 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
@@ -59,12 +61,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n"
|
||||
"text": [
|
||||
"Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (5492, 28) y.shape (5492,)\nFraud: 9.141% 502\nValid: 90.859% 4990\n[0.09183143 0.09183143 0.09183143 0.09183143] [0.09041262 0.09041262 0.09041262 0.09041262]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -94,22 +100,29 @@
|
||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state)\n",
|
||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||
"\n",
|
||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
||||
"data = load_creditcard(-5000) # Take all true samples with up to 5000 of the others\n",
|
||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||
"data = load_creditcard(-1000) # Take all the samples\n",
|
||||
"# data = load_creditcard(-1000) # Take 1000 samples\n",
|
||||
"\n",
|
||||
"Xtrain = data[0]\n",
|
||||
"Xtest = data[1]\n",
|
||||
"ytrain = data[2]\n",
|
||||
"ytest = data[3]\n",
|
||||
"_, data = np.unique(ytrain, return_counts=True)\n",
|
||||
"wtrain = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
|
||||
"_, data = np.unique(ytest, return_counts=True)\n",
|
||||
"wtest = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
|
||||
"# Set weights inverse to its count class in dataset\n",
|
||||
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
|
||||
"weights[ytrain==1] = 1.99755\n",
|
||||
"weights_test = np.ones(Xtest.shape[0],) * 1.00244\n",
|
||||
"weights_test[ytest==1] = 1.99755 "
|
||||
"weights = np.ones(Xtrain.shape[0],)\n",
|
||||
"weights[ytrain==0] = wtrain[0]\n",
|
||||
"weights[ytrain==1] = wtrain[1]\n",
|
||||
"weights_test = np.ones(Xtest.shape[0],)\n",
|
||||
"weights_test[ytest==0] = wtest[0]\n",
|
||||
"weights_test[ytest==1] = wtest[1]\n",
|
||||
"print(weights[:4], weights_test[:4])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -123,19 +136,26 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test smple_weights\n",
|
||||
"## Test sample_weights\n",
|
||||
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Accuracy of Train without weights 0.9789272030651341\nAccuracy of Train with weights 0.9952107279693486\nAccuracy of Tests without weights 0.9598214285714286\nAccuracy of Tests with weights 0.9508928571428571\n"
|
||||
"text": [
|
||||
"Accuracy of Train without weights 0.9851716961498439\n",
|
||||
"Accuracy of Train with weights 0.986732570239334\n",
|
||||
"Accuracy of Tests without weights 0.9866504854368932\n",
|
||||
"Accuracy of Tests with weights 0.9781553398058253\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -157,12 +177,18 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9683908045977011\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9875478927203065\tAccuracy_test: 0.9598214285714286\nTime: 0.06s\tKernel: poly\tAccuracy_train: 0.9885057471264368\tAccuracy_test: 0.9464285714285714\n"
|
||||
"text": [
|
||||
"Time: 26.03s\tKernel: linear\tAccuracy_train: 0.9851716961498439\tAccuracy_test: 0.9866504854368932\n",
|
||||
"Time: 0.54s\tKernel: rbf\tAccuracy_train: 0.9947970863683663\tAccuracy_test: 0.9878640776699029\n",
|
||||
"Time: 0.43s\tKernel: poly\tAccuracy_train: 0.9960978147762747\tAccuracy_test: 0.9854368932038835\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -187,15 +213,65 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"outputPrepend"
|
||||
]
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9531\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.983713 counts=(array([0, 1]), array([ 5, 302]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.940299 counts=(array([0, 1]), array([693, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.943012 counts=(array([0, 1]), array([695, 42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9655\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.950617 counts=(array([0, 1]), array([693, 36]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.954039 counts=(array([0, 1]), array([685, 33]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9751\nClassifier's accuracy (test) : 0.9464\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n\n**************************************************\n0.6869 secs\n"
|
||||
"text": [
|
||||
"************** C=0.001 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9828\n",
|
||||
"Classifier's accuracy (test) : 0.9848\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.981716 impurity=0.1317 counts=(array([0, 1]), array([3490, 65]))\n",
|
||||
"root - Up, <cgaf> - Leaf class=1 belief= 0.996540 impurity=0.0333 counts=(array([0, 1]), array([ 1, 288]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=0.01 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9834\n",
|
||||
"Classifier's accuracy (test) : 0.9854\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.982269 impurity=0.1285 counts=(array([0, 1]), array([3490, 63]))\n",
|
||||
"root - Up, <cgaf> - Leaf class=1 belief= 0.996564 impurity=0.0331 counts=(array([0, 1]), array([ 1, 290]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=1 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9847\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.983371 impurity=0.1221 counts=(array([0, 1]), array([3489, 59]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0584 counts=(array([0, 1]), array([ 2, 294]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([2]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([294]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=5 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9852\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=17 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9852\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"64.5792 secs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -222,12 +298,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -239,12 +319,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -263,12 +347,58 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x1254f13b0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x1254e84d0>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x1254e83b0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x1254e0cb0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x1254e0dd0>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x1254e0ef0>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x1254e2050>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x1254e2170>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x1254eb4d0>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x1254eb4d0>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x1254e2320>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x1254e2290>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x1254e85f0>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x1254e8290>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x1254e8710>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x1254f1290>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x1254e0b90>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x1254e8950>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x1254f15f0>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x1254eb050>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x1254eba70>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x1254e8a70>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x1254eb170>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x1254eb170>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x1254eb170>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x1254f40e0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x1254da9e0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x1254eb710>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x1254eb5f0>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x1254f1c20>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x1254f4200>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x1254e2830>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x1254e29e0>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x1254e2b00>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x1254e2c20>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x1254e2d40>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x1254f1e60>, 'Stree')\n38 functools.partial(<function check_set_params at 0x1254f1f80>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x1254e2440>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x1254e2710>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x1254f43b0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x1254f4440>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x1254f44d0>, 'Stree')\n"
|
||||
"text": [
|
||||
"1 functools.partial(<function check_no_attributes_set_in_init at 0x125acaee0>, 'Stree')\n",
|
||||
"2 functools.partial(<function check_estimators_dtypes at 0x125ac7040>, 'Stree')\n",
|
||||
"3 functools.partial(<function check_fit_score_takes_y at 0x125ac2ee0>, 'Stree')\n",
|
||||
"4 functools.partial(<function check_sample_weights_pandas_series at 0x125ac0820>, 'Stree')\n",
|
||||
"5 functools.partial(<function check_sample_weights_not_an_array at 0x125ac0940>, 'Stree')\n",
|
||||
"6 functools.partial(<function check_sample_weights_list at 0x125ac0a60>, 'Stree')\n",
|
||||
"7 functools.partial(<function check_sample_weights_shape at 0x125ac0b80>, 'Stree')\n",
|
||||
"8 functools.partial(<function check_sample_weights_invariance at 0x125ac0ca0>, 'Stree')\n",
|
||||
"9 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree')\n",
|
||||
"10 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree', readonly_memmap=True)\n",
|
||||
"11 functools.partial(<function check_complex_data at 0x125ac0e50>, 'Stree')\n",
|
||||
"12 functools.partial(<function check_dtype_object at 0x125ac0dc0>, 'Stree')\n",
|
||||
"13 functools.partial(<function check_estimators_empty_data_messages at 0x125ac7160>, 'Stree')\n",
|
||||
"14 functools.partial(<function check_pipeline_consistency at 0x125ac2dc0>, 'Stree')\n",
|
||||
"15 functools.partial(<function check_estimators_nan_inf at 0x125ac7280>, 'Stree')\n",
|
||||
"16 functools.partial(<function check_estimators_overwrite_params at 0x125acadc0>, 'Stree')\n",
|
||||
"17 functools.partial(<function check_estimator_sparse_data at 0x125ac0700>, 'Stree')\n",
|
||||
"18 functools.partial(<function check_estimators_pickle at 0x125ac74c0>, 'Stree')\n",
|
||||
"19 functools.partial(<function check_classifier_data_not_an_array at 0x125acd160>, 'Stree')\n",
|
||||
"20 functools.partial(<function check_classifiers_one_label at 0x125ac7b80>, 'Stree')\n",
|
||||
"21 functools.partial(<function check_classifiers_classes at 0x125aca5e0>, 'Stree')\n",
|
||||
"22 functools.partial(<function check_estimators_partial_fit_n_features at 0x125ac75e0>, 'Stree')\n",
|
||||
"23 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree')\n",
|
||||
"24 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True)\n",
|
||||
"25 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n",
|
||||
"26 functools.partial(<function check_classifiers_regression_target at 0x125acdc10>, 'Stree')\n",
|
||||
"27 functools.partial(<function check_supervised_y_no_nan at 0x125aab790>, 'Stree')\n",
|
||||
"28 functools.partial(<function check_supervised_y_2d at 0x125aca280>, 'Stree')\n",
|
||||
"29 functools.partial(<function check_estimators_unfitted at 0x125aca160>, 'Stree')\n",
|
||||
"30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x125acd790>, 'Stree')\n",
|
||||
"31 functools.partial(<function check_decision_proba_consistency at 0x125acdd30>, 'Stree')\n",
|
||||
"32 functools.partial(<function check_fit2d_predict1d at 0x125ac23a0>, 'Stree')\n",
|
||||
"33 functools.partial(<function check_methods_subset_invariance at 0x125ac2550>, 'Stree')\n",
|
||||
"34 functools.partial(<function check_fit2d_1sample at 0x125ac2670>, 'Stree')\n",
|
||||
"35 functools.partial(<function check_fit2d_1feature at 0x125ac2790>, 'Stree')\n",
|
||||
"36 functools.partial(<function check_fit1d at 0x125ac28b0>, 'Stree')\n",
|
||||
"37 functools.partial(<function check_get_params_invariance at 0x125acd9d0>, 'Stree')\n",
|
||||
"38 functools.partial(<function check_set_params at 0x125acdaf0>, 'Stree')\n",
|
||||
"39 functools.partial(<function check_dict_unchanged at 0x125ac0f70>, 'Stree')\n",
|
||||
"40 functools.partial(<function check_dont_overwrite_parameters at 0x125ac2280>, 'Stree')\n",
|
||||
"41 functools.partial(<function check_fit_idempotent at 0x125acdee0>, 'Stree')\n",
|
||||
"42 functools.partial(<function check_n_features_in at 0x125acdf70>, 'Stree')\n",
|
||||
"43 functools.partial(<function check_requires_y_none at 0x125ad1040>, 'Stree')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -301,12 +431,27 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "== Not Weighted ===\nSVC train score ..: 0.9521072796934866\nSTree train score : 0.9578544061302682\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9575892857142857\n==== Weighted =====\nSVC train score ..: 0.9616858237547893\nSTree train score : 0.9616858237547893\nSVC test score ...: 0.9642857142857143\nSTree test score .: 0.9598214285714286\n*SVC test score ..: 0.951413553411694\n*STree test score : 0.9480517444389333\n"
|
||||
"text": [
|
||||
"== Not Weighted ===\n",
|
||||
"SVC train score ..: 0.9825702393340271\n",
|
||||
"STree train score : 0.9841311134235172\n",
|
||||
"SVC test score ...: 0.9830097087378641\n",
|
||||
"STree test score .: 0.9848300970873787\n",
|
||||
"==== Weighted =====\n",
|
||||
"SVC train score ..: 0.9786680541103018\n",
|
||||
"STree train score : 0.9802289281997919\n",
|
||||
"SVC test score ...: 0.9805825242718447\n",
|
||||
"STree test score .: 0.9817961165048543\n",
|
||||
"*SVC test score ..: 0.9439939825655582\n",
|
||||
"*STree test score : 0.9476832429673473\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -333,17 +478,89 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.969325 counts=(array([0, 1]), array([ 10, 316]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.958159 counts=(array([0, 1]), array([687, 30]))\n\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down, <cgaf> - Leaf class=0 belief= 0.990520 impurity=0.0773 counts=(array([0, 1]), array([3448, 33]))\nroot - Up, <cgaf> - Leaf class=1 belief= 0.881543 impurity=0.5249 counts=(array([0, 1]), array([ 43, 320]))\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(clf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test max_features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"****************************************\n",
|
||||
"max_features None = 28\n",
|
||||
"Train score : 0.9846514047866806\n",
|
||||
"Test score .: 0.9866504854368932\n",
|
||||
"Took 10.18 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features auto = 5\n",
|
||||
"Train score : 0.9836108220603538\n",
|
||||
"Test score .: 0.9842233009708737\n",
|
||||
"Took 5.22 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features log2 = 4\n",
|
||||
"Train score : 0.9791883454734651\n",
|
||||
"Test score .: 0.9793689320388349\n",
|
||||
"Took 2.05 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features 7 = 7\n",
|
||||
"Train score : 0.9737252861602498\n",
|
||||
"Test score .: 0.9739077669902912\n",
|
||||
"Took 2.86 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features 0.5 = 14\n",
|
||||
"Train score : 0.981789802289282\n",
|
||||
"Test score .: 0.9824029126213593\n",
|
||||
"Took 48.35 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features 0.1 = 2\n",
|
||||
"Train score : 0.9638397502601457\n",
|
||||
"Test score .: 0.9648058252427184\n",
|
||||
"Took 0.35 seconds\n",
|
||||
"****************************************\n",
|
||||
"max_features 0.7 = 19\n",
|
||||
"Train score : 0.9841311134235172\n",
|
||||
"Test score .: 0.9860436893203883\n",
|
||||
"Took 20.89 seconds\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for max_features in [None, \"auto\", \"log2\", 7, .5, .1, .7]:\n",
|
||||
" now = time.time()\n",
|
||||
" print(\"*\"*40)\n",
|
||||
" clf = Stree(random_state=random_state, max_features=max_features)\n",
|
||||
" clf.fit(Xtrain, ytrain)\n",
|
||||
" print(f\"max_features {max_features} = {clf.max_features_}\")\n",
|
||||
" print(\"Train score :\", clf.score(Xtrain, ytrain))\n",
|
||||
" print(\"Test score .:\", clf.score(Xtest, ytest))\n",
|
||||
" print(f\"Took {time.time() - now:.2f} seconds\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -362,7 +579,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6-final"
|
||||
"version": "3.8.4-final"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@@ -66,7 +66,8 @@
|
||||
"id": "z9Q-YUfBDZEq",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
|
||||
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"random_state=1\n",
|
||||
@@ -112,7 +113,9 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
|
||||
"text": [
|
||||
"Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -131,31 +134,68 @@
|
||||
"colab": {}
|
||||
},
|
||||
"source": [
|
||||
"parameters = {\n",
|
||||
"parameters = [{\n",
|
||||
" 'base_estimator': [Stree()],\n",
|
||||
" 'n_estimators': [10, 25],\n",
|
||||
" 'learning_rate': [.5, 1],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
|
||||
" 'base_estimator__tol': [.1, 1e-02],\n",
|
||||
" 'base_estimator__max_depth': [3, 5],\n",
|
||||
" 'base_estimator__C': [1, 3],\n",
|
||||
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
|
||||
"}"
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__kernel': ['linear']\n",
|
||||
"},\n",
|
||||
"{\n",
|
||||
" 'base_estimator': [Stree()],\n",
|
||||
" 'n_estimators': [10, 25],\n",
|
||||
" 'learning_rate': [.5, 1],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
|
||||
" 'base_estimator__tol': [.1, 1e-02],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__degree': [3, 5, 7],\n",
|
||||
" 'base_estimator__kernel': ['poly']\n",
|
||||
"},\n",
|
||||
"{\n",
|
||||
" 'base_estimator': [Stree()],\n",
|
||||
" 'n_estimators': [10, 25],\n",
|
||||
" 'learning_rate': [.5, 1],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
|
||||
" 'base_estimator__tol': [.1, 1e-02],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__gamma': [.1, 1, 10],\n",
|
||||
" 'base_estimator__kernel': ['rbf']\n",
|
||||
"}]"
|
||||
],
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
|
||||
"text/plain": [
|
||||
"{'C': 1.0,\n",
|
||||
" 'criterion': 'entropy',\n",
|
||||
" 'degree': 3,\n",
|
||||
" 'gamma': 'scale',\n",
|
||||
" 'kernel': 'linear',\n",
|
||||
" 'max_depth': None,\n",
|
||||
" 'max_features': None,\n",
|
||||
" 'max_iter': 100000.0,\n",
|
||||
" 'min_samples_split': 0,\n",
|
||||
" 'random_state': None,\n",
|
||||
" 'split_criteria': 'impurity',\n",
|
||||
" 'splitter': 'random',\n",
|
||||
" 'tol': 0.0001}"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 14
|
||||
"execution_count": 6
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -168,52 +208,214 @@
|
||||
"id": "CrcB8o6EDZE5",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"random_state=2020\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||
"grid.fit(Xtrain, ytrain)"
|
||||
],
|
||||
"execution_count": 11,
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
|
||||
"text": [
|
||||
"Fitting 5 folds for each of 1008 candidates, totalling 5040 fits\n",
|
||||
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 3.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 3.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 4.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 5.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 6.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 7.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 8.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 9.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 11.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 12.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.3s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 18.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 20.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 23.4s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 24.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 26.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 29.3s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 31.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 35.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 38.7s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 42.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 496 tasks | elapsed: 46.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 529 tasks | elapsed: 52.7s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 562 tasks | elapsed: 58.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 597 tasks | elapsed: 1.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 632 tasks | elapsed: 1.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 669 tasks | elapsed: 1.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 706 tasks | elapsed: 1.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 745 tasks | elapsed: 1.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 784 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 825 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 866 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 909 tasks | elapsed: 1.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 952 tasks | elapsed: 1.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 997 tasks | elapsed: 2.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1042 tasks | elapsed: 2.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1089 tasks | elapsed: 2.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1136 tasks | elapsed: 2.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1185 tasks | elapsed: 2.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1285 tasks | elapsed: 2.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1336 tasks | elapsed: 2.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1389 tasks | elapsed: 2.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1442 tasks | elapsed: 2.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1497 tasks | elapsed: 2.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1552 tasks | elapsed: 2.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1609 tasks | elapsed: 2.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1666 tasks | elapsed: 2.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1725 tasks | elapsed: 2.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1784 tasks | elapsed: 3.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1845 tasks | elapsed: 3.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1906 tasks | elapsed: 3.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1969 tasks | elapsed: 3.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2032 tasks | elapsed: 3.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2097 tasks | elapsed: 3.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2162 tasks | elapsed: 3.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2229 tasks | elapsed: 3.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2296 tasks | elapsed: 3.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2365 tasks | elapsed: 3.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2434 tasks | elapsed: 3.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2505 tasks | elapsed: 3.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2576 tasks | elapsed: 3.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2649 tasks | elapsed: 3.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2722 tasks | elapsed: 4.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2797 tasks | elapsed: 4.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2872 tasks | elapsed: 4.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2949 tasks | elapsed: 4.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3026 tasks | elapsed: 4.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3105 tasks | elapsed: 4.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3184 tasks | elapsed: 4.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3265 tasks | elapsed: 5.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3346 tasks | elapsed: 5.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3429 tasks | elapsed: 5.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3512 tasks | elapsed: 5.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3597 tasks | elapsed: 5.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3682 tasks | elapsed: 6.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3769 tasks | elapsed: 6.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3856 tasks | elapsed: 6.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3945 tasks | elapsed: 6.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4034 tasks | elapsed: 7.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4125 tasks | elapsed: 7.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4216 tasks | elapsed: 7.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4309 tasks | elapsed: 7.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4402 tasks | elapsed: 8.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4497 tasks | elapsed: 8.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4592 tasks | elapsed: 8.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4689 tasks | elapsed: 9.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4786 tasks | elapsed: 9.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4885 tasks | elapsed: 9.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4984 tasks | elapsed: 9.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 5040 out of 5040 | elapsed: 10.0min finished\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
|
||||
"text/plain": [
|
||||
"GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n",
|
||||
" n_jobs=-1,\n",
|
||||
" param_grid=[{'base_estimator': [Stree(C=7, max_depth=5,\n",
|
||||
" split_criteria='max_samples',\n",
|
||||
" tol=0.01)],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__kernel': ['linear'],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples',\n",
|
||||
" 'impurity'],\n",
|
||||
" 'base_e...\n",
|
||||
" 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n",
|
||||
" {'base_estimator': [Stree()],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__gamma': [0.1, 1, 10],\n",
|
||||
" 'base_estimator__kernel': ['rbf'],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples',\n",
|
||||
" 'impurity'],\n",
|
||||
" 'base_estimator__tol': [0.1, 0.01],\n",
|
||||
" 'learning_rate': [0.5, 1],\n",
|
||||
" 'n_estimators': [10, 25]}],\n",
|
||||
" return_train_score=True, verbose=10)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 11
|
||||
"execution_count": 7
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"source": [
|
||||
"GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n",
|
||||
" n_jobs=-1,\n",
|
||||
" param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n",
|
||||
" 'base_estimator__C': [7, 55],\n",
|
||||
" 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n",
|
||||
" 'base_estimator__max_depth': [3, 5],\n",
|
||||
" 'base_estimator__tol': [0.1, 0.01],\n",
|
||||
" 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n",
|
||||
" return_train_score=True, verbose=10)"
|
||||
],
|
||||
"cell_type": "markdown",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"metadata": {
|
||||
"id": "ZjX88NoYDZE8",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
|
||||
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"print(\"Best estimator: \", grid.best_estimator_)\n",
|
||||
"print(\"Best hyperparameters: \", grid.best_params_)\n",
|
||||
"print(\"Best accuracy: \", grid.best_score_)"
|
||||
],
|
||||
"execution_count": 16,
|
||||
"execution_count": 8,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
|
||||
"text": [
|
||||
"Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=7, max_depth=5,\n split_criteria='max_samples',\n tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=7, max_depth=5, split_criteria='max_samples', tol=0.01), 'base_estimator__C': 7, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 5, 'base_estimator__split_criteria': 'max_samples', 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9549825174825175\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"source": [
|
||||
"Best estimator: AdaBoostClassifier(algorithm='SAMME',\n",
|
||||
" base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n",
|
||||
" learning_rate=0.5, n_estimators=25, random_state=2020)\n",
|
||||
"\n",
|
||||
"Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\n",
|
||||
"\n",
|
||||
"Best accuracy: 0.9559440559440558"
|
||||
],
|
||||
"cell_type": "markdown",
|
||||
"metadata": {}
|
||||
},
|
||||
{
|
||||
"source": [
|
||||
"0.9511547662863451"
|
||||
],
|
||||
"cell_type": "markdown",
|
||||
"metadata": {}
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -227,12 +429,12 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6-final"
|
||||
"version": "3.8.4-final"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||
"name": "python38464bitgeneralvenv77203c0a6afd4428bd66253ef62753dc",
|
||||
"display_name": "Python 3.8.4 64-bit ('general': venv)"
|
||||
},
|
||||
"colab": {
|
||||
"name": "gridsearch.ipynb",
|
||||
|
File diff suppressed because one or more lines are too long
@@ -1,5 +1,4 @@
|
||||
numpy
|
||||
scikit-learn
|
||||
scikit-learn==0.23.2
|
||||
pandas
|
||||
matplotlib
|
||||
ipympl
|
6
setup.py
6
setup.py
@@ -1,6 +1,6 @@
|
||||
import setuptools
|
||||
|
||||
__version__ = "0.9rc4"
|
||||
__version__ = "0.9rc6"
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
|
||||
|
||||
@@ -25,12 +25,12 @@ setuptools.setup(
|
||||
classifiers=[
|
||||
"Development Status :: 4 - Beta",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Natural Language :: English",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
"Intended Audience :: Science/Research",
|
||||
],
|
||||
install_requires=["scikit-learn>=0.23.0", "numpy", "matplotlib", "ipympl"],
|
||||
install_requires=["scikit-learn==0.23.2", "numpy", "ipympl"],
|
||||
test_suite="stree.tests",
|
||||
zip_safe=False,
|
||||
)
|
||||
|
636
stree/Strees.py
636
stree/Strees.py
@@ -7,19 +7,23 @@ Build an oblique tree classifier based on SVM Trees
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numbers
|
||||
import random
|
||||
import warnings
|
||||
from math import log, factorial
|
||||
from typing import Optional
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.svm import SVC, LinearSVC
|
||||
from sklearn.utils import check_consistent_length
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils.validation import (
|
||||
check_X_y,
|
||||
check_array,
|
||||
check_is_fitted,
|
||||
_check_sample_weight,
|
||||
)
|
||||
from sklearn.utils.sparsefuncs import count_nonzero
|
||||
from sklearn.metrics._classification import _weighted_sum, _check_targets
|
||||
|
||||
|
||||
@@ -28,7 +32,16 @@ class Snode:
|
||||
dataset assigned to it
|
||||
"""
|
||||
|
||||
def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str):
|
||||
def __init__(
|
||||
self,
|
||||
clf: SVC,
|
||||
X: np.ndarray,
|
||||
y: np.ndarray,
|
||||
features: np.array,
|
||||
impurity: float,
|
||||
title: str,
|
||||
weight: np.ndarray = None,
|
||||
):
|
||||
self._clf = clf
|
||||
self._title = title
|
||||
self._belief = 0.0
|
||||
@@ -38,10 +51,30 @@ class Snode:
|
||||
self._down = None
|
||||
self._up = None
|
||||
self._class = None
|
||||
self._feature = None
|
||||
self._sample_weight = (
|
||||
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
||||
)
|
||||
self._features = features
|
||||
self._impurity = impurity
|
||||
self._partition_column: int = -1
|
||||
|
||||
@classmethod
|
||||
def copy(cls, node: "Snode") -> "Snode":
|
||||
return cls(node._clf, node._X, node._y, node._title)
|
||||
return cls(
|
||||
node._clf,
|
||||
node._X,
|
||||
node._y,
|
||||
node._features,
|
||||
node._impurity,
|
||||
node._title,
|
||||
)
|
||||
|
||||
def set_partition_column(self, col: int):
|
||||
self._partition_column = col
|
||||
|
||||
def get_partition_column(self) -> int:
|
||||
return self._partition_column
|
||||
|
||||
def set_down(self, son):
|
||||
self._down = son
|
||||
@@ -67,9 +100,8 @@ class Snode:
|
||||
classes, card = np.unique(self._y, return_counts=True)
|
||||
if len(classes) > 1:
|
||||
max_card = max(card)
|
||||
min_card = min(card)
|
||||
self._class = classes[card == max_card][0]
|
||||
self._belief = max_card / (max_card + min_card)
|
||||
self._belief = max_card / np.sum(card)
|
||||
else:
|
||||
self._belief = 1
|
||||
try:
|
||||
@@ -78,28 +110,28 @@ class Snode:
|
||||
self._class = None
|
||||
|
||||
def __str__(self) -> str:
|
||||
count_values = np.unique(self._y, return_counts=True)
|
||||
if self.is_leaf():
|
||||
count_values = np.unique(self._y, return_counts=True)
|
||||
result = (
|
||||
return (
|
||||
f"{self._title} - Leaf class={self._class} belief="
|
||||
f"{self._belief: .6f} counts={count_values}"
|
||||
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
||||
f"counts={count_values}"
|
||||
)
|
||||
return result
|
||||
else:
|
||||
return f"{self._title}"
|
||||
return (
|
||||
f"{self._title} feaures={self._features} impurity="
|
||||
f"{self._impurity:.4f} "
|
||||
f"counts={count_values}"
|
||||
)
|
||||
|
||||
|
||||
class Siterator:
|
||||
"""Stree preorder iterator
|
||||
"""
|
||||
"""Stree preorder iterator"""
|
||||
|
||||
def __init__(self, tree: Snode):
|
||||
self._stack = []
|
||||
self._push(tree)
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def _push(self, node: Snode):
|
||||
if node is not None:
|
||||
self._stack.append(node)
|
||||
@@ -113,6 +145,249 @@ class Siterator:
|
||||
return node
|
||||
|
||||
|
||||
class Splitter:
|
||||
def __init__(
|
||||
self,
|
||||
clf: SVC = None,
|
||||
criterion: str = None,
|
||||
splitter_type: str = None,
|
||||
criteria: str = None,
|
||||
min_samples_split: int = None,
|
||||
random_state=None,
|
||||
):
|
||||
self._clf = clf
|
||||
self._random_state = random_state
|
||||
if random_state is not None:
|
||||
random.seed(random_state)
|
||||
self._criterion = criterion
|
||||
self._min_samples_split = min_samples_split
|
||||
self._criteria = criteria
|
||||
self._splitter_type = splitter_type
|
||||
|
||||
if clf is None:
|
||||
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
||||
|
||||
if criterion not in ["gini", "entropy"]:
|
||||
raise ValueError(
|
||||
f"criterion must be gini or entropy got({criterion})"
|
||||
)
|
||||
|
||||
if criteria not in [
|
||||
"max_samples",
|
||||
"impurity",
|
||||
]:
|
||||
raise ValueError(
|
||||
f"criteria has to be max_samples or impurity; got ({criteria})"
|
||||
)
|
||||
|
||||
if splitter_type not in ["random", "best"]:
|
||||
raise ValueError(
|
||||
f"splitter must be either random or best, got({splitter_type})"
|
||||
)
|
||||
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
||||
|
||||
def partition_impurity(self, y: np.array) -> np.array:
|
||||
return self.criterion_function(y)
|
||||
|
||||
@staticmethod
|
||||
def _gini(y: np.array) -> float:
|
||||
_, count = np.unique(y, return_counts=True)
|
||||
return 1 - np.sum(np.square(count / np.sum(count)))
|
||||
|
||||
@staticmethod
|
||||
def _entropy(y: np.array) -> float:
|
||||
n_labels = len(y)
|
||||
if n_labels <= 1:
|
||||
return 0
|
||||
counts = np.bincount(y)
|
||||
proportions = counts / n_labels
|
||||
n_classes = np.count_nonzero(proportions)
|
||||
if n_classes <= 1:
|
||||
return 0
|
||||
entropy = 0.0
|
||||
# Compute standard entropy.
|
||||
for prop in proportions:
|
||||
if prop != 0.0:
|
||||
entropy -= prop * log(prop, n_classes)
|
||||
return entropy
|
||||
|
||||
def information_gain(
|
||||
self, labels: np.array, labels_up: np.array, labels_dn: np.array
|
||||
) -> float:
|
||||
imp_prev = self.criterion_function(labels)
|
||||
card_up = card_dn = imp_up = imp_dn = 0
|
||||
if labels_up is not None:
|
||||
card_up = labels_up.shape[0]
|
||||
imp_up = self.criterion_function(labels_up)
|
||||
if labels_dn is not None:
|
||||
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||
imp_dn = self.criterion_function(labels_dn)
|
||||
samples = card_up + card_dn
|
||||
if samples == 0:
|
||||
return 0.0
|
||||
else:
|
||||
result = (
|
||||
imp_prev
|
||||
- (card_up / samples) * imp_up
|
||||
- (card_dn / samples) * imp_dn
|
||||
)
|
||||
return result
|
||||
|
||||
def _select_best_set(
|
||||
self, dataset: np.array, labels: np.array, features_sets: list
|
||||
) -> list:
|
||||
max_gain = 0
|
||||
selected = None
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
for feature_set in features_sets:
|
||||
self._clf.fit(dataset[:, feature_set], labels)
|
||||
node = Snode(
|
||||
self._clf, dataset, labels, feature_set, 0.0, "subset"
|
||||
)
|
||||
self.partition(dataset, node, train=True)
|
||||
y1, y2 = self.part(labels)
|
||||
gain = self.information_gain(labels, y1, y2)
|
||||
if gain > max_gain:
|
||||
max_gain = gain
|
||||
selected = feature_set
|
||||
return selected if selected is not None else feature_set
|
||||
|
||||
@staticmethod
|
||||
def _generate_spaces(features: int, max_features: int) -> list:
|
||||
comb = set()
|
||||
# Generate at most 5 combinations
|
||||
if max_features == features:
|
||||
set_length = 1
|
||||
else:
|
||||
number = factorial(features) / (
|
||||
factorial(max_features) * factorial(features - max_features)
|
||||
)
|
||||
set_length = min(5, number)
|
||||
while len(comb) < set_length:
|
||||
comb.add(
|
||||
tuple(sorted(random.sample(range(features), max_features)))
|
||||
)
|
||||
return list(comb)
|
||||
|
||||
def _get_subspaces_set(
|
||||
self, dataset: np.array, labels: np.array, max_features: int
|
||||
) -> np.array:
|
||||
features_sets = self._generate_spaces(dataset.shape[1], max_features)
|
||||
if len(features_sets) > 1:
|
||||
if self._splitter_type == "random":
|
||||
index = random.randint(0, len(features_sets) - 1)
|
||||
return features_sets[index]
|
||||
else:
|
||||
return self._select_best_set(dataset, labels, features_sets)
|
||||
else:
|
||||
return features_sets[0]
|
||||
|
||||
def get_subspace(
|
||||
self, dataset: np.array, labels: np.array, max_features: int
|
||||
) -> tuple:
|
||||
"""Return the best/random subspace to make a split"""
|
||||
indices = self._get_subspaces_set(dataset, labels, max_features)
|
||||
return dataset[:, indices], indices
|
||||
|
||||
def _impurity(self, data: np.array, y: np.array) -> np.array:
|
||||
"""return column of dataset to be taken into account to split dataset
|
||||
|
||||
:param data: distances to hyper plane of every class
|
||||
:type data: np.array (m, n_classes)
|
||||
:param y: vector of labels (classes)
|
||||
:type y: np.array (m,)
|
||||
:return: column of dataset to be taken into account to split dataset
|
||||
:rtype: int
|
||||
"""
|
||||
max_gain = 0
|
||||
selected = -1
|
||||
for col in range(data.shape[1]):
|
||||
tup = y[data[:, col] > 0]
|
||||
tdn = y[data[:, col] <= 0]
|
||||
info_gain = self.information_gain(y, tup, tdn)
|
||||
if info_gain > max_gain:
|
||||
selected = col
|
||||
max_gain = info_gain
|
||||
return selected
|
||||
|
||||
@staticmethod
|
||||
def _max_samples(data: np.array, y: np.array) -> np.array:
|
||||
"""return column of dataset to be taken into account to split dataset
|
||||
|
||||
:param data: distances to hyper plane of every class
|
||||
:type data: np.array (m, n_classes)
|
||||
:param y: vector of labels (classes)
|
||||
:type y: np.array (m,)
|
||||
:return: column of dataset to be taken into account to split dataset
|
||||
:rtype: int
|
||||
"""
|
||||
# select the class with max number of samples
|
||||
_, samples = np.unique(y, return_counts=True)
|
||||
return np.argmax(samples)
|
||||
|
||||
def partition(self, samples: np.array, node: Snode, train: bool):
|
||||
"""Set the criteria to split arrays. Compute the indices of the samples
|
||||
that should go to one side of the tree (down)
|
||||
|
||||
"""
|
||||
# data contains the distances of every sample to every class hyperplane
|
||||
# array of (m, nc) nc = # classes
|
||||
data = self._distances(node, samples)
|
||||
if data.shape[0] < self._min_samples_split:
|
||||
# there aren't enough samples to split
|
||||
self._up = np.ones((data.shape[0]), dtype=bool)
|
||||
return
|
||||
if data.ndim > 1:
|
||||
# split criteria for multiclass
|
||||
# Convert data to a (m, 1) array selecting values for samples
|
||||
if train:
|
||||
# in train time we have to compute the column to take into
|
||||
# account to split the dataset
|
||||
col = self.decision_criteria(data, node._y)
|
||||
node.set_partition_column(col)
|
||||
else:
|
||||
# in predcit time just use the column computed in train time
|
||||
# is taking the classifier of class <col>
|
||||
col = node.get_partition_column()
|
||||
if col == -1:
|
||||
# No partition is producing information gain
|
||||
data = np.ones(data.shape)
|
||||
data = data[:, col]
|
||||
self._up = data > 0
|
||||
|
||||
def part(self, origin: np.array) -> list:
|
||||
"""Split an array in two based on indices (down) and its complement
|
||||
partition has to be called first to establish down indices
|
||||
|
||||
:param origin: dataset to split
|
||||
:type origin: np.array
|
||||
:param down: indices to use to split array
|
||||
:type down: np.array
|
||||
:return: list with two splits of the array
|
||||
:rtype: list
|
||||
"""
|
||||
down = ~self._up
|
||||
return [
|
||||
origin[self._up] if any(self._up) else None,
|
||||
origin[down] if any(down) else None,
|
||||
]
|
||||
|
||||
@staticmethod
|
||||
def _distances(node: Snode, data: np.ndarray) -> np.array:
|
||||
"""Compute distances of the samples to the hyperplane of the node
|
||||
|
||||
:param node: node containing the svm classifier
|
||||
:type node: Snode
|
||||
:param data: samples to find out distance to hyperplane
|
||||
:type data: np.ndarray
|
||||
:return: array of shape (m, nc) with the distances of every sample to
|
||||
the hyperplane of every class. nc = # of classes
|
||||
:rtype: np.array
|
||||
"""
|
||||
return node._clf.decision_function(data[:, node._features])
|
||||
|
||||
|
||||
class Stree(BaseEstimator, ClassifierMixin):
|
||||
"""Estimator that is based on binary trees of svm nodes
|
||||
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||
@@ -125,13 +400,17 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
self,
|
||||
C: float = 1.0,
|
||||
kernel: str = "linear",
|
||||
max_iter: int = 1000,
|
||||
max_iter: int = 1e5,
|
||||
random_state: int = None,
|
||||
max_depth: int = None,
|
||||
tol: float = 1e-4,
|
||||
degree: int = 3,
|
||||
gamma="scale",
|
||||
split_criteria: str = "impurity",
|
||||
criterion: str = "entropy",
|
||||
min_samples_split: int = 0,
|
||||
max_features=None,
|
||||
splitter: str = "random",
|
||||
):
|
||||
self.max_iter = max_iter
|
||||
self.C = C
|
||||
@@ -142,63 +421,18 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
self.gamma = gamma
|
||||
self.degree = degree
|
||||
self.min_samples_split = min_samples_split
|
||||
self.split_criteria = split_criteria
|
||||
self.max_features = max_features
|
||||
self.criterion = criterion
|
||||
self.splitter = splitter
|
||||
|
||||
def _more_tags(self) -> dict:
|
||||
"""Required by sklearn to tell that this estimator is a binary classifier
|
||||
"""Required by sklearn to supply features of the classifier
|
||||
|
||||
:return: the tag required
|
||||
:rtype: dict
|
||||
"""
|
||||
return {"binary_only": True, "requires_y": True}
|
||||
|
||||
def _split_array(self, origin: np.array, down: np.array) -> list:
|
||||
"""Split an array in two based on indices passed as down and its complement
|
||||
|
||||
:param origin: dataset to split
|
||||
:type origin: np.array
|
||||
:param down: indices to use to split array
|
||||
:type down: np.array
|
||||
:return: list with two splits of the array
|
||||
:rtype: list
|
||||
"""
|
||||
up = ~down
|
||||
return (
|
||||
origin[up[:, 0]] if any(up) else None,
|
||||
origin[down[:, 0]] if any(down) else None,
|
||||
)
|
||||
|
||||
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
|
||||
"""Compute distances of the samples to the hyperplane of the node
|
||||
|
||||
:param node: node containing the svm classifier
|
||||
:type node: Snode
|
||||
:param data: samples to find out distance to hyperplane
|
||||
:type data: np.ndarray
|
||||
:return: array of shape (m, 1) with the distances of every sample to
|
||||
the hyperplane of the node
|
||||
:rtype: np.array
|
||||
"""
|
||||
res = node._clf.decision_function(data)
|
||||
if res.ndim == 1:
|
||||
return np.expand_dims(res, 1)
|
||||
elif res.shape[1] > 1:
|
||||
# remove multiclass info
|
||||
res = np.delete(res, slice(1, res.shape[1]), axis=1)
|
||||
return res
|
||||
|
||||
def _split_criteria(self, data: np.array) -> np.array:
|
||||
"""Set the criteria to split arrays
|
||||
|
||||
:param data: [description]
|
||||
:type data: np.array
|
||||
:return: [description]
|
||||
:rtype: np.array
|
||||
"""
|
||||
return (
|
||||
data > 0
|
||||
if data.shape[0] >= self.min_samples_split
|
||||
else np.ones((data.shape[0], 1), dtype=bool)
|
||||
)
|
||||
return {"requires_y": True}
|
||||
|
||||
def fit(
|
||||
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
|
||||
@@ -231,53 +465,35 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||
{self.max_depth})"
|
||||
)
|
||||
|
||||
check_classification_targets(y)
|
||||
X, y = check_X_y(X, y)
|
||||
sample_weight = _check_sample_weight(sample_weight, X)
|
||||
sample_weight = _check_sample_weight(
|
||||
sample_weight, X, dtype=np.float64
|
||||
)
|
||||
check_classification_targets(y)
|
||||
# Initialize computed parameters
|
||||
self.splitter_ = Splitter(
|
||||
clf=self._build_clf(),
|
||||
criterion=self.criterion,
|
||||
splitter_type=self.splitter,
|
||||
criteria=self.split_criteria,
|
||||
random_state=self.random_state,
|
||||
min_samples_split=self.min_samples_split,
|
||||
)
|
||||
if self.random_state is not None:
|
||||
random.seed(self.random_state)
|
||||
self.classes_, y = np.unique(y, return_inverse=True)
|
||||
self.n_classes_ = self.classes_.shape[0]
|
||||
self.n_iter_ = self.max_iter
|
||||
self.depth_ = 0
|
||||
self.n_features_ = X.shape[1]
|
||||
self.n_features_in_ = X.shape[1]
|
||||
self.max_features_ = self._initialize_max_features()
|
||||
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
||||
self._build_predictor()
|
||||
return self
|
||||
|
||||
def _build_predictor(self):
|
||||
"""Process the leaves to make them predictors
|
||||
"""
|
||||
|
||||
def run_tree(node: Snode):
|
||||
if node.is_leaf():
|
||||
node.make_predictor()
|
||||
return
|
||||
run_tree(node.get_down())
|
||||
run_tree(node.get_up())
|
||||
|
||||
run_tree(self.tree_)
|
||||
|
||||
def _build_clf(self):
|
||||
""" Build the correct classifier for the node
|
||||
"""
|
||||
return (
|
||||
LinearSVC(
|
||||
max_iter=self.max_iter,
|
||||
random_state=self.random_state,
|
||||
C=self.C,
|
||||
tol=self.tol,
|
||||
)
|
||||
if self.kernel == "linear"
|
||||
else SVC(
|
||||
kernel=self.kernel,
|
||||
max_iter=self.max_iter,
|
||||
tol=self.tol,
|
||||
C=self.C,
|
||||
gamma=self.gamma,
|
||||
degree=self.degree,
|
||||
)
|
||||
)
|
||||
|
||||
def train(
|
||||
self,
|
||||
X: np.ndarray,
|
||||
@@ -285,7 +501,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
sample_weight: np.ndarray,
|
||||
depth: int,
|
||||
title: str,
|
||||
) -> Snode:
|
||||
) -> Optional[Snode]:
|
||||
"""Recursive function to split the original dataset into predictor
|
||||
nodes (leaves)
|
||||
|
||||
@@ -307,24 +523,83 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
return None
|
||||
if np.unique(y).shape[0] == 1:
|
||||
# only 1 class => pure dataset
|
||||
return Snode(None, X, y, title + ", <pure>")
|
||||
return Snode(
|
||||
clf=None,
|
||||
X=X,
|
||||
y=y,
|
||||
features=X.shape[1],
|
||||
impurity=0.0,
|
||||
title=title + ", <pure>",
|
||||
weight=sample_weight,
|
||||
)
|
||||
# Train the model
|
||||
clf = self._build_clf()
|
||||
clf.fit(X, y, sample_weight=sample_weight)
|
||||
tree = Snode(clf, X, y, title)
|
||||
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||
# solve WARNING: class label 0 specified in weight is not found
|
||||
# in bagging
|
||||
if any(sample_weight == 0):
|
||||
indices = sample_weight == 0
|
||||
y_next = y[~indices]
|
||||
# touch weights if removing any class
|
||||
if np.unique(y_next).shape[0] != self.n_classes_:
|
||||
sample_weight += 1e-5
|
||||
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||
impurity = self.splitter_.partition_impurity(y)
|
||||
node = Snode(clf, X, y, features, impurity, title, sample_weight)
|
||||
self.depth_ = max(depth, self.depth_)
|
||||
down = self._split_criteria(self._distances(tree, X))
|
||||
X_U, X_D = self._split_array(X, down)
|
||||
y_u, y_d = self._split_array(y, down)
|
||||
sw_u, sw_d = self._split_array(sample_weight, down)
|
||||
self.splitter_.partition(X, node, True)
|
||||
X_U, X_D = self.splitter_.part(X)
|
||||
y_u, y_d = self.splitter_.part(y)
|
||||
sw_u, sw_d = self.splitter_.part(sample_weight)
|
||||
if X_U is None or X_D is None:
|
||||
# didn't part anything
|
||||
return Snode(clf, X, y, title + ", <cgaf>")
|
||||
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||
return tree
|
||||
return Snode(
|
||||
clf,
|
||||
X,
|
||||
y,
|
||||
features=X.shape[1],
|
||||
impurity=impurity,
|
||||
title=title + ", <cgaf>",
|
||||
weight=sample_weight,
|
||||
)
|
||||
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||
return node
|
||||
|
||||
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
|
||||
def _build_predictor(self):
|
||||
"""Process the leaves to make them predictors"""
|
||||
|
||||
def run_tree(node: Snode):
|
||||
if node.is_leaf():
|
||||
node.make_predictor()
|
||||
return
|
||||
run_tree(node.get_down())
|
||||
run_tree(node.get_up())
|
||||
|
||||
run_tree(self.tree_)
|
||||
|
||||
def _build_clf(self):
|
||||
"""Build the correct classifier for the node"""
|
||||
return (
|
||||
LinearSVC(
|
||||
max_iter=self.max_iter,
|
||||
random_state=self.random_state,
|
||||
C=self.C,
|
||||
tol=self.tol,
|
||||
)
|
||||
if self.kernel == "linear"
|
||||
else SVC(
|
||||
kernel=self.kernel,
|
||||
max_iter=self.max_iter,
|
||||
tol=self.tol,
|
||||
C=self.C,
|
||||
gamma=self.gamma,
|
||||
degree=self.degree,
|
||||
)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _reorder_results(y: np.array, indices: np.array) -> np.array:
|
||||
"""Reorder an array based on the array of indices passed
|
||||
|
||||
:param y: data untidy
|
||||
@@ -334,12 +609,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
:return: array y ordered
|
||||
:rtype: np.array
|
||||
"""
|
||||
if y.ndim > 1 and y.shape[1] > 1:
|
||||
# if predict_proba return np.array of floats
|
||||
y_ordered = np.zeros(y.shape, dtype=float)
|
||||
else:
|
||||
# return array of same type given in y
|
||||
y_ordered = y.copy()
|
||||
# return array of same type given in y
|
||||
y_ordered = y.copy()
|
||||
indices = indices.astype(int)
|
||||
for i, index in enumerate(indices):
|
||||
y_ordered[index] = y[i]
|
||||
@@ -363,17 +634,22 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
# set a class for every sample in dataset
|
||||
prediction = np.full((xp.shape[0], 1), node._class)
|
||||
return prediction, indices
|
||||
down = self._split_criteria(self._distances(node, xp))
|
||||
X_U, X_D = self._split_array(xp, down)
|
||||
i_u, i_d = self._split_array(indices, down)
|
||||
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
|
||||
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
|
||||
self.splitter_.partition(xp, node, train=False)
|
||||
x_u, x_d = self.splitter_.part(xp)
|
||||
i_u, i_d = self.splitter_.part(indices)
|
||||
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
|
||||
prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
|
||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||
|
||||
# sklearn check
|
||||
check_is_fitted(self, ["tree_"])
|
||||
# Input validation
|
||||
X = check_array(X)
|
||||
if X.shape[1] != self.n_features_:
|
||||
raise ValueError(
|
||||
f"Expected {self.n_features_} features but got "
|
||||
f"({X.shape[1]})"
|
||||
)
|
||||
# setup prediction & make it happen
|
||||
indices = np.arange(X.shape[0])
|
||||
result = (
|
||||
@@ -383,68 +659,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
)
|
||||
return self.classes_[result]
|
||||
|
||||
def predict_proba(self, X: np.array) -> np.array:
|
||||
"""Computes an approximation of the probability of samples belonging to
|
||||
class 0 and 1
|
||||
:param X: dataset
|
||||
:type X: np.array
|
||||
:return: array array of shape (m, num_classes), probability of being
|
||||
each class
|
||||
:rtype: np.array
|
||||
"""
|
||||
|
||||
def predict_class(
|
||||
xp: np.array, indices: np.array, dist: np.array, node: Snode
|
||||
) -> np.array:
|
||||
"""Run the tree to compute predictions
|
||||
|
||||
:param xp: subdataset of samples
|
||||
:type xp: np.array
|
||||
:param indices: indices of subdataset samples to rebuild original
|
||||
order
|
||||
:type indices: np.array
|
||||
:param dist: distances of every sample to the hyperplane or the
|
||||
father node
|
||||
:type dist: np.array
|
||||
:param node: node of the leaf with the class
|
||||
:type node: Snode
|
||||
:return: array of labels and distances, array of indices
|
||||
:rtype: np.array
|
||||
"""
|
||||
if xp is None:
|
||||
return [], []
|
||||
if node.is_leaf():
|
||||
# set a class for every sample in dataset
|
||||
prediction = np.full((xp.shape[0], 1), node._class)
|
||||
prediction_proba = dist
|
||||
return np.append(prediction, prediction_proba, axis=1), indices
|
||||
distances = self._distances(node, xp)
|
||||
down = self._split_criteria(distances)
|
||||
X_U, X_D = self._split_array(xp, down)
|
||||
i_u, i_d = self._split_array(indices, down)
|
||||
di_u, di_d = self._split_array(distances, down)
|
||||
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
|
||||
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
|
||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||
|
||||
# sklearn check
|
||||
check_is_fitted(self, ["tree_"])
|
||||
# Input validation
|
||||
X = check_array(X)
|
||||
# setup prediction & make it happen
|
||||
indices = np.arange(X.shape[0])
|
||||
empty_dist = np.empty((X.shape[0], 1), dtype=float)
|
||||
result, indices = predict_class(X, indices, empty_dist, self.tree_)
|
||||
result = result.reshape(X.shape[0], 2)
|
||||
# Turn distances to hyperplane into probabilities based on fitting
|
||||
# distances of samples to its hyperplane that classified them, to the
|
||||
# sigmoid function
|
||||
# Probability of being 1
|
||||
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
|
||||
# Probability of being 0
|
||||
result[:, 0] = 1 - result[:, 1]
|
||||
return self._reorder_results(result, indices)
|
||||
|
||||
def score(
|
||||
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||
) -> float:
|
||||
@@ -466,19 +680,14 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
X, y = check_X_y(X, y)
|
||||
y_pred = self.predict(X).reshape(y.shape)
|
||||
# Compute accuracy for each possible representation
|
||||
y_type, y_true, y_pred = _check_targets(y, y_pred)
|
||||
_, y_true, y_pred = _check_targets(y, y_pred)
|
||||
check_consistent_length(y_true, y_pred, sample_weight)
|
||||
if y_type.startswith("multilabel"):
|
||||
differing_labels = count_nonzero(y_true - y_pred, axis=1)
|
||||
score = differing_labels == 0
|
||||
else:
|
||||
score = y_true == y_pred
|
||||
|
||||
score = y_true == y_pred
|
||||
return _weighted_sum(score, sample_weight, normalize=True)
|
||||
|
||||
def __iter__(self) -> Siterator:
|
||||
"""Create an iterator to be able to visit the nodes of the tree in preorder,
|
||||
can make a list with all the nodes in preorder
|
||||
"""Create an iterator to be able to visit the nodes of the tree in
|
||||
preorder, can make a list with all the nodes in preorder
|
||||
|
||||
:return: an iterator, can for i in... and list(...)
|
||||
:rtype: Siterator
|
||||
@@ -499,3 +708,34 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
for i in self:
|
||||
output += str(i) + "\n"
|
||||
return output
|
||||
|
||||
def _initialize_max_features(self) -> int:
|
||||
if isinstance(self.max_features, str):
|
||||
if self.max_features == "auto":
|
||||
max_features = max(1, int(np.sqrt(self.n_features_)))
|
||||
elif self.max_features == "sqrt":
|
||||
max_features = max(1, int(np.sqrt(self.n_features_)))
|
||||
elif self.max_features == "log2":
|
||||
max_features = max(1, int(np.log2(self.n_features_)))
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid value for max_features. "
|
||||
"Allowed string values are 'auto', "
|
||||
"'sqrt' or 'log2'."
|
||||
)
|
||||
elif self.max_features is None:
|
||||
max_features = self.n_features_
|
||||
elif isinstance(self.max_features, numbers.Integral):
|
||||
max_features = self.max_features
|
||||
else: # float
|
||||
if self.max_features > 0.0:
|
||||
max_features = max(
|
||||
1, int(self.max_features * self.n_features_)
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid value for max_features."
|
||||
"Allowed float must be in range (0, 1] "
|
||||
f"got ({self.max_features})"
|
||||
)
|
||||
return max_features
|
||||
|
@@ -1,205 +0,0 @@
|
||||
"""
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
||||
__license__ = "MIT"
|
||||
__version__ = "0.9"
|
||||
Plot 3D views of nodes in Stree
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from sklearn.decomposition import PCA
|
||||
from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
from .Strees import Stree, Snode, Siterator
|
||||
|
||||
|
||||
class Snode_graph(Snode):
|
||||
def __init__(self, node: Stree):
|
||||
self._plot_size = (8, 8)
|
||||
self._xlimits = (None, None)
|
||||
self._ylimits = (None, None)
|
||||
self._zlimits = (None, None)
|
||||
n = Snode.copy(node)
|
||||
super().__init__(n._clf, n._X, n._y, n._title)
|
||||
|
||||
def set_plot_size(self, size: tuple):
|
||||
self._plot_size = size
|
||||
|
||||
def get_plot_size(self) -> tuple:
|
||||
return self._plot_size
|
||||
|
||||
def _is_pure(self) -> bool:
|
||||
"""is considered pure a leaf node with one label
|
||||
"""
|
||||
if self.is_leaf():
|
||||
return self._belief == 1.0
|
||||
return False
|
||||
|
||||
def set_axis_limits(self, limits: tuple):
|
||||
self._xlimits, self._ylimits, self._zlimits = limits
|
||||
|
||||
def get_axis_limits(self) -> tuple:
|
||||
return self._xlimits, self._ylimits, self._zlimits
|
||||
|
||||
def _set_graphics_axis(self, ax: Axes3D):
|
||||
ax.set_xlim(self._xlimits)
|
||||
ax.set_ylim(self._ylimits)
|
||||
ax.set_zlim(self._zlimits)
|
||||
|
||||
def save_hyperplane(
|
||||
self, save_folder: str = "./", save_prefix: str = "", save_seq: int = 1
|
||||
):
|
||||
_, fig = self.plot_hyperplane()
|
||||
name = os.path.join(save_folder, f"{save_prefix}STnode{save_seq}.png")
|
||||
fig.savefig(name, bbox_inches="tight")
|
||||
plt.close(fig)
|
||||
|
||||
def _get_cmap(self):
|
||||
cmap = "jet"
|
||||
if self._is_pure() and self._class == 1:
|
||||
cmap = "jet_r"
|
||||
return cmap
|
||||
|
||||
def _graph_title(self):
|
||||
n_class, card = np.unique(self._y, return_counts=True)
|
||||
return f"{self._title} {n_class} {card}"
|
||||
|
||||
def plot_hyperplane(self, plot_distribution: bool = True):
|
||||
fig = plt.figure(figsize=self._plot_size)
|
||||
ax = fig.add_subplot(1, 1, 1, projection="3d")
|
||||
if not self._is_pure():
|
||||
# Can't plot hyperplane of leaves with one label because it hasn't
|
||||
# classiffier
|
||||
# get the splitting hyperplane
|
||||
def hyperplane(x, y):
|
||||
return (
|
||||
-self._clf.intercept_
|
||||
- self._clf.coef_[0][0] * x
|
||||
- self._clf.coef_[0][1] * y
|
||||
) / self._clf.coef_[0][2]
|
||||
|
||||
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
|
||||
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
|
||||
xx, yy = np.meshgrid(tmpx, tmpy)
|
||||
ax.plot_surface(
|
||||
xx,
|
||||
yy,
|
||||
hyperplane(xx, yy),
|
||||
alpha=0.5,
|
||||
antialiased=True,
|
||||
rstride=1,
|
||||
cstride=1,
|
||||
cmap="seismic",
|
||||
)
|
||||
self._set_graphics_axis(ax)
|
||||
if plot_distribution:
|
||||
self.plot_distribution(ax)
|
||||
else:
|
||||
plt.title(self._graph_title())
|
||||
plt.show()
|
||||
return ax, fig
|
||||
|
||||
def plot_distribution(self, ax: Axes3D = None):
|
||||
if ax is None:
|
||||
fig = plt.figure(figsize=self._plot_size)
|
||||
ax = fig.add_subplot(1, 1, 1, projection="3d")
|
||||
plt.title(self._graph_title())
|
||||
cmap = self._get_cmap()
|
||||
ax.scatter(
|
||||
self._X[:, 0], self._X[:, 1], self._X[:, 2], c=self._y, cmap=cmap
|
||||
)
|
||||
ax.set_xlabel("X0")
|
||||
ax.set_ylabel("X1")
|
||||
ax.set_zlabel("X2")
|
||||
plt.show()
|
||||
|
||||
|
||||
class Stree_grapher(Stree):
|
||||
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall
|
||||
make its magic
|
||||
"""
|
||||
|
||||
def __init__(self, params: dict):
|
||||
self._plot_size = (8, 8)
|
||||
self._tree_gr = None
|
||||
# make Snode store X's
|
||||
os.environ["TESTING"] = "1"
|
||||
self._fitted = False
|
||||
self._pca = None
|
||||
super().__init__(**params)
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
os.environ.pop("TESTING")
|
||||
except KeyError:
|
||||
pass
|
||||
|
||||
def _copy_tree(self, node: Snode) -> Snode_graph:
|
||||
mirror = Snode_graph(node)
|
||||
# clone node
|
||||
mirror._class = node._class
|
||||
mirror._belief = node._belief
|
||||
if node.get_down() is not None:
|
||||
mirror.set_down(self._copy_tree(node.get_down()))
|
||||
if node.get_up() is not None:
|
||||
mirror.set_up(self._copy_tree(node.get_up()))
|
||||
return mirror
|
||||
|
||||
def fit(
|
||||
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||
) -> "Stree_grapher":
|
||||
"""Fit the Stree and copy the tree in a Snode_graph tree
|
||||
|
||||
:param X: Dataset
|
||||
:type X: np.array
|
||||
:param y: Labels
|
||||
:type y: np.array
|
||||
:return: Stree model
|
||||
:rtype: Stree
|
||||
"""
|
||||
if X.shape[1] != 3:
|
||||
self._pca = PCA(n_components=3)
|
||||
X = self._pca.fit_transform(X)
|
||||
super().fit(X, y, sample_weight=sample_weight)
|
||||
self._tree_gr = self._copy_tree(self.tree_)
|
||||
self._fitted = True
|
||||
return self
|
||||
|
||||
def score(self, X: np.array, y: np.array) -> float:
|
||||
self._check_fitted()
|
||||
if X.shape[1] != 3:
|
||||
X = self._pca.transform(X)
|
||||
return super().score(X, y)
|
||||
|
||||
def _check_fitted(self):
|
||||
if not self._fitted:
|
||||
raise Exception("Have to fit the grapher first!")
|
||||
|
||||
def save_all(self, save_folder: str = "./", save_prefix: str = ""):
|
||||
"""Save all the node plots in png format, each with a sequence number
|
||||
|
||||
:param save_folder: folder where the plots are saved, defaults to './'
|
||||
:type save_folder: str, optional
|
||||
"""
|
||||
self._check_fitted()
|
||||
if not os.path.isdir(save_folder):
|
||||
os.mkdir(save_folder)
|
||||
seq = 1
|
||||
for node in self:
|
||||
node.save_hyperplane(
|
||||
save_folder=save_folder, save_prefix=save_prefix, save_seq=seq
|
||||
)
|
||||
seq += 1
|
||||
|
||||
def plot_all(self):
|
||||
"""Plots all the nodes
|
||||
"""
|
||||
self._check_fitted()
|
||||
for node in self:
|
||||
node.plot_hyperplane()
|
||||
|
||||
def __iter__(self):
|
||||
return Siterator(self._tree_gr)
|
@@ -1,4 +1,3 @@
|
||||
from .Strees import Stree, Snode, Siterator
|
||||
from .Strees_grapher import Stree_grapher, Snode_graph
|
||||
from .Strees import Stree, Snode, Siterator, Splitter
|
||||
|
||||
__all__ = ["Stree", "Snode", "Siterator", "Stree_grapher", "Snode_graph"]
|
||||
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||
|
96
stree/tests/Snode_test.py
Normal file
96
stree/tests/Snode_test.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from stree import Stree, Snode
|
||||
from .utils import load_dataset
|
||||
|
||||
|
||||
class Snode_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._clf = Stree(random_state=self._random_state)
|
||||
self._clf.fit(*load_dataset(self._random_state))
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_attributes_in_leaves(self):
|
||||
"""Check if the attributes in leaves have correct values so they form a
|
||||
predictor
|
||||
"""
|
||||
|
||||
def check_leave(node: Snode):
|
||||
if not node.is_leaf():
|
||||
check_leave(node.get_down())
|
||||
check_leave(node.get_up())
|
||||
return
|
||||
# Check Belief in leave
|
||||
classes, card = np.unique(node._y, return_counts=True)
|
||||
max_card = max(card)
|
||||
min_card = min(card)
|
||||
if len(classes) > 1:
|
||||
belief = max_card / (max_card + min_card)
|
||||
else:
|
||||
belief = 1
|
||||
self.assertEqual(belief, node._belief)
|
||||
# Check Class
|
||||
class_computed = classes[card == max_card]
|
||||
self.assertEqual(class_computed, node._class)
|
||||
# Check Partition column
|
||||
self.assertEqual(node._partition_column, -1)
|
||||
|
||||
check_leave(self._clf.tree_)
|
||||
|
||||
def test_nodes_coefs(self):
|
||||
"""Check if the nodes of the tree have the right attributes filled"""
|
||||
|
||||
def run_tree(node: Snode):
|
||||
if node._belief < 1:
|
||||
# only exclude pure leaves
|
||||
self.assertIsNotNone(node._clf)
|
||||
self.assertIsNotNone(node._clf.coef_)
|
||||
if node.is_leaf():
|
||||
return
|
||||
run_tree(node.get_up())
|
||||
run_tree(node.get_down())
|
||||
|
||||
model = Stree(self._random_state)
|
||||
model.fit(*load_dataset(self._random_state, 3, 4))
|
||||
run_tree(model.tree_)
|
||||
|
||||
def test_make_predictor_on_leaf(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||
test.make_predictor()
|
||||
self.assertEqual(1, test._class)
|
||||
self.assertEqual(0.75, test._belief)
|
||||
self.assertEqual(-1, test._partition_column)
|
||||
|
||||
def test_make_predictor_on_not_leaf(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||
test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
|
||||
test.make_predictor()
|
||||
self.assertIsNone(test._class)
|
||||
self.assertEqual(0, test._belief)
|
||||
self.assertEqual(-1, test._partition_column)
|
||||
self.assertEqual(-1, test.get_up()._partition_column)
|
||||
|
||||
def test_make_predictor_on_leaf_bogus_data(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
|
||||
test.make_predictor()
|
||||
self.assertIsNone(test._class)
|
||||
self.assertEqual(-1, test._partition_column)
|
||||
|
||||
def test_copy_node(self):
|
||||
px = [1, 2, 3, 4]
|
||||
py = [1]
|
||||
test = Snode(Stree(), px, py, [], 0.0, "test")
|
||||
computed = Snode.copy(test)
|
||||
self.assertListEqual(computed._X, px)
|
||||
self.assertListEqual(computed._y, py)
|
||||
self.assertEqual("test", computed._title)
|
||||
self.assertIsInstance(computed._clf, Stree)
|
||||
self.assertEqual(test._partition_column, computed._partition_column)
|
224
stree/tests/Splitter_test.py
Normal file
224
stree/tests/Splitter_test.py
Normal file
@@ -0,0 +1,224 @@
|
||||
import os
|
||||
import unittest
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.datasets import load_wine, load_iris
|
||||
from stree import Splitter
|
||||
|
||||
|
||||
class Splitter_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def build(
|
||||
clf=SVC,
|
||||
min_samples_split=0,
|
||||
splitter_type="random",
|
||||
criterion="gini",
|
||||
criteria="max_samples",
|
||||
random_state=None,
|
||||
):
|
||||
return Splitter(
|
||||
clf=clf(random_state=random_state, kernel="rbf"),
|
||||
min_samples_split=min_samples_split,
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
criteria=criteria,
|
||||
random_state=random_state,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_init(self):
|
||||
with self.assertRaises(ValueError):
|
||||
self.build(criterion="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
self.build(splitter_type="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
self.build(criteria="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
_ = Splitter(clf=None)
|
||||
for splitter_type in ["best", "random"]:
|
||||
for criterion in ["gini", "entropy"]:
|
||||
for criteria in ["max_samples", "impurity"]:
|
||||
tcl = self.build(
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
criteria=criteria,
|
||||
)
|
||||
self.assertEqual(splitter_type, tcl._splitter_type)
|
||||
self.assertEqual(criterion, tcl._criterion)
|
||||
self.assertEqual(criteria, tcl._criteria)
|
||||
|
||||
def test_gini(self):
|
||||
expected_values = [
|
||||
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
|
||||
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
|
||||
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
|
||||
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
|
||||
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
|
||||
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
|
||||
([0], 0),
|
||||
([1, 1, 1, 1], 0),
|
||||
]
|
||||
for labels, expected in expected_values:
|
||||
self.assertAlmostEqual(expected, Splitter._gini(labels))
|
||||
tcl = self.build(criterion="gini")
|
||||
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||
|
||||
def test_entropy(self):
|
||||
expected_values = [
|
||||
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
|
||||
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
|
||||
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
|
||||
([0, 0, 1, 1, 1, 1, 0, 0], 1),
|
||||
([0, 0, 1, 1, 2, 2, 3, 3], 1),
|
||||
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
|
||||
([1], 0),
|
||||
([0, 0, 0, 0], 0),
|
||||
]
|
||||
for labels, expected in expected_values:
|
||||
self.assertAlmostEqual(expected, Splitter._entropy(labels))
|
||||
tcl = self.build(criterion="entropy")
|
||||
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||
|
||||
def test_information_gain(self):
|
||||
expected_values = [
|
||||
(
|
||||
[0, 1, 1, 1, 1, 1],
|
||||
[0, 0, 0, 1],
|
||||
0.16333333333333333,
|
||||
0.25642589168200297,
|
||||
),
|
||||
(
|
||||
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
|
||||
[5, 3, 2, 1, 1],
|
||||
0.007381776239907684,
|
||||
-0.03328610916207225,
|
||||
),
|
||||
([], [], 0.0, 0.0),
|
||||
([1], [], 0.0, 0.0),
|
||||
([], [1], 0.0, 0.0),
|
||||
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
|
||||
([], [1, 1, 1, 2], 0.0, 0.0),
|
||||
(None, [1, 2, 3], 0.0, 0.0),
|
||||
([1, 2, 3], None, 0.0, 0.0),
|
||||
]
|
||||
for yu, yd, expected_gini, expected_entropy in expected_values:
|
||||
yu = np.array(yu, dtype=np.int32) if yu is not None else None
|
||||
yd = np.array(yd, dtype=np.int32) if yd is not None else None
|
||||
if yu is not None and yd is not None:
|
||||
complete = np.append(yu, yd)
|
||||
elif yd is not None:
|
||||
complete = yd
|
||||
else:
|
||||
complete = yu
|
||||
tcl = self.build(criterion="gini")
|
||||
computed = tcl.information_gain(complete, yu, yd)
|
||||
self.assertAlmostEqual(expected_gini, computed)
|
||||
tcl = self.build(criterion="entropy")
|
||||
computed = tcl.information_gain(complete, yu, yd)
|
||||
self.assertAlmostEqual(expected_entropy, computed)
|
||||
|
||||
def test_max_samples(self):
|
||||
tcl = self.build(criteria="max_samples")
|
||||
data = np.array(
|
||||
[
|
||||
[-0.1, 0.2, -0.3],
|
||||
[0.7, 0.01, -0.1],
|
||||
[0.7, -0.9, 0.5],
|
||||
[0.1, 0.2, 0.3],
|
||||
[-0.1, 0.2, 0.3],
|
||||
[-0.1, 0.2, 0.3],
|
||||
]
|
||||
)
|
||||
expected = data[:, 0]
|
||||
y = [1, 2, 1, 0, 0, 0]
|
||||
computed = tcl._max_samples(data, y)
|
||||
self.assertEqual(0, computed)
|
||||
computed_data = data[:, computed]
|
||||
self.assertEqual((6,), computed_data.shape)
|
||||
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
||||
|
||||
def test_impurity(self):
|
||||
tcl = self.build(criteria="impurity")
|
||||
data = np.array(
|
||||
[
|
||||
[-0.1, 0.2, -0.3],
|
||||
[0.7, 0.01, -0.1],
|
||||
[0.7, -0.9, 0.5],
|
||||
[0.1, 0.2, 0.3],
|
||||
[-0.1, 0.2, 0.3],
|
||||
[-0.1, 0.2, 0.3],
|
||||
]
|
||||
)
|
||||
expected = data[:, 2]
|
||||
y = np.array([1, 2, 1, 0, 0, 0])
|
||||
computed = tcl._impurity(data, y)
|
||||
self.assertEqual(2, computed)
|
||||
computed_data = data[:, computed]
|
||||
self.assertEqual((6,), computed_data.shape)
|
||||
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
||||
|
||||
def test_generate_subspaces(self):
|
||||
features = 250
|
||||
for max_features in range(2, features):
|
||||
num = len(Splitter._generate_spaces(features, max_features))
|
||||
self.assertEqual(5, num)
|
||||
self.assertEqual(3, len(Splitter._generate_spaces(3, 2)))
|
||||
self.assertEqual(4, len(Splitter._generate_spaces(4, 3)))
|
||||
|
||||
def test_best_splitter_few_sets(self):
|
||||
X, y = load_iris(return_X_y=True)
|
||||
X = np.delete(X, 3, 1)
|
||||
tcl = self.build(splitter_type="best", random_state=self._random_state)
|
||||
dataset, computed = tcl.get_subspace(X, y, max_features=2)
|
||||
self.assertListEqual([0, 2], list(computed))
|
||||
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
|
||||
|
||||
def test_splitter_parameter(self):
|
||||
expected_values = [
|
||||
[1, 4, 9, 12], # best entropy max_samples
|
||||
[1, 3, 6, 10], # best entropy impurity
|
||||
[6, 8, 10, 12], # best gini max_samples
|
||||
[7, 8, 10, 11], # best gini impurity
|
||||
[0, 3, 8, 12], # random entropy max_samples
|
||||
[0, 3, 9, 11], # random entropy impurity
|
||||
[0, 4, 7, 12], # random gini max_samples
|
||||
[0, 2, 5, 6], # random gini impurity
|
||||
]
|
||||
X, y = load_wine(return_X_y=True)
|
||||
rn = 0
|
||||
for splitter_type in ["best", "random"]:
|
||||
for criterion in ["entropy", "gini"]:
|
||||
for criteria in [
|
||||
"max_samples",
|
||||
"impurity",
|
||||
]:
|
||||
tcl = self.build(
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
criteria=criteria,
|
||||
)
|
||||
expected = expected_values.pop(0)
|
||||
random.seed(rn)
|
||||
rn += 1
|
||||
dataset, computed = tcl.get_subspace(X, y, max_features=4)
|
||||
# print(
|
||||
# "{}, # {:7s}{:8s}{:15s}".format(
|
||||
# list(computed),
|
||||
# splitter_type,
|
||||
# criterion,
|
||||
# criteria,
|
||||
# )
|
||||
# )
|
||||
self.assertListEqual(expected, list(computed))
|
||||
self.assertListEqual(
|
||||
X[:, computed].tolist(), dataset.tolist()
|
||||
)
|
444
stree/tests/Stree_test.py
Normal file
444
stree/tests/Stree_test.py
Normal file
@@ -0,0 +1,444 @@
|
||||
import os
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris, load_wine
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
from stree import Stree, Snode
|
||||
from .utils import load_dataset
|
||||
|
||||
|
||||
class Stree_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._kernels = ["linear", "rbf", "poly"]
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def _check_tree(self, node: Snode):
|
||||
"""Check recursively that the nodes that are not leaves have the
|
||||
correct number of labels and its sons have the right number of elements
|
||||
in their dataset
|
||||
|
||||
Arguments:
|
||||
node {Snode} -- node to check
|
||||
"""
|
||||
if node.is_leaf():
|
||||
return
|
||||
y_prediction = node._clf.predict(node._X)
|
||||
y_down = node.get_down()._y
|
||||
y_up = node.get_up()._y
|
||||
# Is a correct partition in terms of cadinality?
|
||||
# i.e. The partition algorithm didn't forget any sample
|
||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||
_, count_d = np.unique(y_down, return_counts=True)
|
||||
_, count_u = np.unique(y_up, return_counts=True)
|
||||
#
|
||||
for i in unique_y:
|
||||
number_up = count_u[i]
|
||||
try:
|
||||
number_down = count_d[i]
|
||||
except IndexError:
|
||||
number_down = 0
|
||||
self.assertEqual(count_y[i], number_down + number_up)
|
||||
# Is the partition made the same as the prediction?
|
||||
# as the node is not a leaf...
|
||||
_, count_yp = np.unique(y_prediction, return_counts=True)
|
||||
self.assertEqual(count_yp[1], y_up.shape[0])
|
||||
self.assertEqual(count_yp[0], y_down.shape[0])
|
||||
self._check_tree(node.get_down())
|
||||
self._check_tree(node.get_up())
|
||||
|
||||
def test_build_tree(self):
|
||||
"""Check if the tree is built the same way as predictions of models"""
|
||||
warnings.filterwarnings("ignore")
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(*load_dataset(self._random_state))
|
||||
self._check_tree(clf.tree_)
|
||||
|
||||
def test_single_prediction(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||
self.assertEqual(yp[0], y[0])
|
||||
|
||||
def test_multiple_prediction(self):
|
||||
# First 27 elements the predictions are the same as the truth
|
||||
num = 27
|
||||
X, y = load_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
yp = clf.fit(X, y).predict(X[:num, :])
|
||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||
|
||||
def test_single_vs_multiple_prediction(self):
|
||||
"""Check if predicting sample by sample gives the same result as
|
||||
predicting all samples at once
|
||||
"""
|
||||
X, y = load_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(X, y)
|
||||
# Compute prediction line by line
|
||||
yp_line = np.array([], dtype=int)
|
||||
for xp in X:
|
||||
yp_line = np.append(
|
||||
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
|
||||
)
|
||||
# Compute prediction at once
|
||||
yp_once = clf.predict(X)
|
||||
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||
|
||||
def test_iterator_and_str(self):
|
||||
"""Check preorder iterator"""
|
||||
expected = [
|
||||
"root feaures=(0, 1, 2) impurity=1.0000 counts=(array([0, 1]), arr"
|
||||
"ay([750, 750]))",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.37"
|
||||
"22 counts=(array([0, 1]), array([725, 56]))",
|
||||
"root - Up feaures=(0, 1, 2) impurity=0.2178 counts=(array([0, 1])"
|
||||
", array([ 25, 694]))",
|
||||
"root - Up - Down feaures=(0, 1, 2) impurity=0.8454 counts=(array("
|
||||
"[0, 1]), array([8, 3]))",
|
||||
"root - Up - Down - Down, <pure> - Leaf class=0 belief= 1.000000 i"
|
||||
"mpurity=0.0000 counts=(array([0]), array([7]))",
|
||||
"root - Up - Down - Up, <cgaf> - Leaf class=1 belief= 0.750000 imp"
|
||||
"urity=0.8113 counts=(array([0, 1]), array([1, 3]))",
|
||||
"root - Up - Up, <cgaf> - Leaf class=1 belief= 0.975989 impurity=0"
|
||||
".1634 counts=(array([0, 1]), array([ 17, 691]))",
|
||||
]
|
||||
computed = []
|
||||
expected_string = ""
|
||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||
clf.fit(*load_dataset(self._random_state))
|
||||
for node in clf:
|
||||
computed.append(str(node))
|
||||
expected_string += str(node) + "\n"
|
||||
self.assertListEqual(expected, computed)
|
||||
self.assertEqual(expected_string, str(clf))
|
||||
|
||||
@staticmethod
|
||||
def test_is_a_sklearn_classifier():
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
|
||||
check_estimator(Stree())
|
||||
|
||||
def test_exception_if_C_is_negative(self):
|
||||
tclf = Stree(C=-1)
|
||||
with self.assertRaises(ValueError):
|
||||
tclf.fit(*load_dataset(self._random_state))
|
||||
|
||||
def test_exception_if_bogus_split_criteria(self):
|
||||
tclf = Stree(split_criteria="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
tclf.fit(*load_dataset(self._random_state))
|
||||
|
||||
def test_check_max_depth_is_positive_or_None(self):
|
||||
tcl = Stree()
|
||||
self.assertIsNone(tcl.max_depth)
|
||||
tcl = Stree(max_depth=1)
|
||||
self.assertGreaterEqual(1, tcl.max_depth)
|
||||
with self.assertRaises(ValueError):
|
||||
tcl = Stree(max_depth=-1)
|
||||
tcl.fit(*load_dataset(self._random_state))
|
||||
|
||||
def test_check_max_depth(self):
|
||||
depths = (3, 4)
|
||||
for depth in depths:
|
||||
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
||||
tcl.fit(*load_dataset(self._random_state))
|
||||
self.assertEqual(depth, tcl.depth_)
|
||||
|
||||
def test_unfitted_tree_is_iterable(self):
|
||||
tcl = Stree()
|
||||
self.assertEqual(0, len(list(tcl)))
|
||||
|
||||
def test_min_samples_split(self):
|
||||
dataset = [[1], [2], [3]], [1, 1, 0]
|
||||
tcl_split = Stree(min_samples_split=3).fit(*dataset)
|
||||
self.assertIsNotNone(tcl_split.tree_.get_down())
|
||||
self.assertIsNotNone(tcl_split.tree_.get_up())
|
||||
tcl_nosplit = Stree(min_samples_split=4).fit(*dataset)
|
||||
self.assertIsNone(tcl_nosplit.tree_.get_down())
|
||||
self.assertIsNone(tcl_nosplit.tree_.get_up())
|
||||
|
||||
def test_simple_muticlass_dataset(self):
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(
|
||||
kernel=kernel,
|
||||
split_criteria="max_samples",
|
||||
random_state=self._random_state,
|
||||
)
|
||||
px = [[1, 2], [5, 6], [9, 10]]
|
||||
py = [0, 1, 2]
|
||||
clf.fit(px, py)
|
||||
self.assertEqual(1.0, clf.score(px, py))
|
||||
self.assertListEqual(py, clf.predict(px).tolist())
|
||||
self.assertListEqual(py, clf.classes_.tolist())
|
||||
|
||||
def test_muticlass_dataset(self):
|
||||
datasets = {
|
||||
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
||||
"Iris": load_wine(return_X_y=True),
|
||||
}
|
||||
outcomes = {
|
||||
"Synt": {
|
||||
"max_samples linear": 0.9606666666666667,
|
||||
"max_samples rbf": 0.7133333333333334,
|
||||
"max_samples poly": 0.49066666666666664,
|
||||
"impurity linear": 0.9606666666666667,
|
||||
"impurity rbf": 0.7133333333333334,
|
||||
"impurity poly": 0.49066666666666664,
|
||||
},
|
||||
"Iris": {
|
||||
"max_samples linear": 1.0,
|
||||
"max_samples rbf": 0.6910112359550562,
|
||||
"max_samples poly": 0.6966292134831461,
|
||||
"impurity linear": 1,
|
||||
"impurity rbf": 0.6910112359550562,
|
||||
"impurity poly": 0.6966292134831461,
|
||||
},
|
||||
}
|
||||
|
||||
for name, dataset in datasets.items():
|
||||
px, py = dataset
|
||||
for criteria in ["max_samples", "impurity"]:
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(
|
||||
C=55,
|
||||
max_iter=1e5,
|
||||
kernel=kernel,
|
||||
random_state=self._random_state,
|
||||
)
|
||||
clf.fit(px, py)
|
||||
outcome = outcomes[name][f"{criteria} {kernel}"]
|
||||
# print(
|
||||
# f"{name} {criteria} {kernel} {outcome} {clf.score(px"
|
||||
# ", py)}"
|
||||
# )
|
||||
self.assertAlmostEqual(outcome, clf.score(px, py))
|
||||
|
||||
def test_max_features(self):
|
||||
n_features = 16
|
||||
expected_values = [
|
||||
("auto", 4),
|
||||
("log2", 4),
|
||||
("sqrt", 4),
|
||||
(0.5, 8),
|
||||
(3, 3),
|
||||
(None, 16),
|
||||
]
|
||||
clf = Stree()
|
||||
clf.n_features_ = n_features
|
||||
for max_features, expected in expected_values:
|
||||
clf.set_params(**dict(max_features=max_features))
|
||||
computed = clf._initialize_max_features()
|
||||
self.assertEqual(expected, computed)
|
||||
# Check bogus max_features
|
||||
values = ["duck", -0.1, 0.0]
|
||||
for max_features in values:
|
||||
clf.set_params(**dict(max_features=max_features))
|
||||
with self.assertRaises(ValueError):
|
||||
_ = clf._initialize_max_features()
|
||||
|
||||
def test_get_subspaces(self):
|
||||
dataset = np.random.random((10, 16))
|
||||
y = np.random.randint(0, 2, 10)
|
||||
expected_values = [
|
||||
("auto", 4),
|
||||
("log2", 4),
|
||||
("sqrt", 4),
|
||||
(0.5, 8),
|
||||
(3, 3),
|
||||
(None, 16),
|
||||
]
|
||||
clf = Stree()
|
||||
for max_features, expected in expected_values:
|
||||
clf.set_params(**dict(max_features=max_features))
|
||||
clf.fit(dataset, y)
|
||||
computed, indices = clf.splitter_.get_subspace(
|
||||
dataset, y, clf.max_features_
|
||||
)
|
||||
self.assertListEqual(
|
||||
dataset[:, indices].tolist(), computed.tolist()
|
||||
)
|
||||
self.assertEqual(expected, len(indices))
|
||||
|
||||
def test_bogus_criterion(self):
|
||||
clf = Stree(criterion="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
clf.fit(*load_dataset())
|
||||
|
||||
def test_predict_feature_dimensions(self):
|
||||
X = np.random.rand(10, 5)
|
||||
y = np.random.randint(0, 2, 10)
|
||||
clf = Stree()
|
||||
clf.fit(X, y)
|
||||
with self.assertRaises(ValueError):
|
||||
clf.predict(X[:, :3])
|
||||
|
||||
# Tests of score
|
||||
|
||||
def test_score_binary(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
accuracies = [
|
||||
0.9506666666666667,
|
||||
0.9606666666666667,
|
||||
0.9433333333333334,
|
||||
]
|
||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||
clf = Stree(
|
||||
random_state=self._random_state,
|
||||
kernel=kernel,
|
||||
)
|
||||
clf.fit(X, y)
|
||||
accuracy_score = clf.score(X, y)
|
||||
yp = clf.predict(X)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_score_max_features(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
clf = Stree(random_state=self._random_state, max_features=2)
|
||||
clf.fit(X, y)
|
||||
self.assertAlmostEqual(0.9246666666666666, clf.score(X, y))
|
||||
|
||||
def test_bogus_splitter_parameter(self):
|
||||
clf = Stree(splitter="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
clf.fit(*load_dataset())
|
||||
|
||||
def test_weights_removing_class(self):
|
||||
# This patch solves an stderr message from sklearn svm lib
|
||||
# "WARNING: class label x specified in weight is not found"
|
||||
X = np.array(
|
||||
[
|
||||
[0.1, 0.1],
|
||||
[0.1, 0.2],
|
||||
[0.2, 0.1],
|
||||
[5, 6],
|
||||
[8, 9],
|
||||
[6, 7],
|
||||
[0.2, 0.2],
|
||||
]
|
||||
)
|
||||
y = np.array([0, 0, 0, 1, 1, 1, 0])
|
||||
epsilon = 1e-5
|
||||
weights = [1, 1, 1, 0, 0, 0, 1]
|
||||
weights = np.array(weights, dtype="float64")
|
||||
weights_epsilon = [x + epsilon for x in weights]
|
||||
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
|
||||
original = weights_no_zero.copy()
|
||||
clf = Stree()
|
||||
clf.fit(X, y)
|
||||
node = clf.train(
|
||||
X,
|
||||
y,
|
||||
weights,
|
||||
1,
|
||||
"test",
|
||||
)
|
||||
# if a class is lost with zero weights the patch adds epsilon
|
||||
self.assertListEqual(weights.tolist(), weights_epsilon)
|
||||
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
|
||||
# zero weights are ok when they don't erase a class
|
||||
_ = clf.train(X, y, weights_no_zero, 1, "test")
|
||||
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
|
||||
|
||||
def test_multiclass_classifier_integrity(self):
|
||||
"""Checks if the multiclass operation is done right"""
|
||||
X, y = load_iris(return_X_y=True)
|
||||
clf = Stree(random_state=0)
|
||||
clf.fit(X, y)
|
||||
score = clf.score(X, y)
|
||||
# Check accuracy of the whole model
|
||||
self.assertAlmostEquals(0.98, score, 5)
|
||||
svm = LinearSVC(random_state=0)
|
||||
svm.fit(X, y)
|
||||
self.assertAlmostEquals(0.9666666666666667, svm.score(X, y), 5)
|
||||
data = svm.decision_function(X)
|
||||
expected = [
|
||||
0.4444444444444444,
|
||||
0.35777777777777775,
|
||||
0.4569777777777778,
|
||||
]
|
||||
ty = data.copy()
|
||||
ty[data <= 0] = 0
|
||||
ty[data > 0] = 1
|
||||
ty = ty.astype(int)
|
||||
for i in range(3):
|
||||
self.assertAlmostEquals(
|
||||
expected[i],
|
||||
clf.splitter_._gini(ty[:, i]),
|
||||
)
|
||||
# 1st Branch
|
||||
# up has to have 50 samples of class 0
|
||||
# down should have 100 [50, 50]
|
||||
up = data[:, 2] > 0
|
||||
resup = np.unique(y[up], return_counts=True)
|
||||
resdn = np.unique(y[~up], return_counts=True)
|
||||
self.assertListEqual([1, 2], resup[0].tolist())
|
||||
self.assertListEqual([3, 50], resup[1].tolist())
|
||||
self.assertListEqual([0, 1], resdn[0].tolist())
|
||||
self.assertListEqual([50, 47], resdn[1].tolist())
|
||||
# 2nd Branch
|
||||
# up should have 53 samples of classes [1, 2] [3, 50]
|
||||
# down shoud have 47 samples of class 1
|
||||
node_up = clf.tree_.get_down().get_up()
|
||||
node_dn = clf.tree_.get_down().get_down()
|
||||
resup = np.unique(node_up._y, return_counts=True)
|
||||
resdn = np.unique(node_dn._y, return_counts=True)
|
||||
self.assertListEqual([1, 2], resup[0].tolist())
|
||||
self.assertListEqual([3, 50], resup[1].tolist())
|
||||
self.assertListEqual([1], resdn[0].tolist())
|
||||
self.assertListEqual([47], resdn[1].tolist())
|
||||
|
||||
def test_score_multiclass_rbf(self):
|
||||
X, y = load_dataset(
|
||||
random_state=self._random_state,
|
||||
n_classes=3,
|
||||
n_features=5,
|
||||
n_samples=500,
|
||||
)
|
||||
clf = Stree(kernel="rbf", random_state=self._random_state)
|
||||
self.assertEqual(0.824, clf.fit(X, y).score(X, y))
|
||||
X, y = load_wine(return_X_y=True)
|
||||
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
||||
|
||||
def test_score_multiclass_poly(self):
|
||||
X, y = load_dataset(
|
||||
random_state=self._random_state,
|
||||
n_classes=3,
|
||||
n_features=5,
|
||||
n_samples=500,
|
||||
)
|
||||
clf = Stree(
|
||||
kernel="poly", random_state=self._random_state, C=10, degree=5
|
||||
)
|
||||
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
||||
X, y = load_wine(return_X_y=True)
|
||||
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
||||
|
||||
def test_score_multiclass_linear(self):
|
||||
X, y = load_dataset(
|
||||
random_state=self._random_state,
|
||||
n_classes=3,
|
||||
n_features=5,
|
||||
n_samples=1500,
|
||||
)
|
||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
||||
X, y = load_wine(return_X_y=True)
|
||||
self.assertEqual(0.9550561797752809, clf.fit(X, y).score(X, y))
|
@@ -1,211 +0,0 @@
|
||||
import os
|
||||
import imghdr
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
import matplotlib
|
||||
import matplotlib.pyplot as plt
|
||||
import warnings
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
from stree import Stree_grapher, Snode_graph, Snode
|
||||
|
||||
|
||||
def get_dataset(random_state=0, n_features=3):
|
||||
X, y = make_classification(
|
||||
n_samples=1500,
|
||||
n_features=n_features,
|
||||
n_informative=3,
|
||||
n_redundant=0,
|
||||
n_repeated=0,
|
||||
n_classes=2,
|
||||
n_clusters_per_class=2,
|
||||
class_sep=1.5,
|
||||
flip_y=0,
|
||||
weights=[0.5, 0.5],
|
||||
random_state=random_state,
|
||||
)
|
||||
return X, y
|
||||
|
||||
|
||||
class Stree_grapher_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._clf = Stree_grapher(dict(random_state=self._random_state))
|
||||
self._clf.fit(*get_dataset(self._random_state, n_features=4))
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_iterator(self):
|
||||
"""Check preorder iterator
|
||||
"""
|
||||
expected = [
|
||||
"root",
|
||||
"root - Down",
|
||||
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.976023 counts"
|
||||
"=(array([0, 1]), array([ 17, 692]))",
|
||||
"root - Down - Up",
|
||||
"root - Down - Up - Down, <cgaf> - Leaf class=0 belief= 0.500000 "
|
||||
"counts=(array([0, 1]), array([1, 1]))",
|
||||
"root - Down - Up - Up, <cgaf> - Leaf class=0 belief= 0.888889 "
|
||||
"counts=(array([0, 1]), array([8, 1]))",
|
||||
"root - Up, <cgaf> - Leaf class=0 belief= 0.928205 counts=(array("
|
||||
"[0, 1]), array([724, 56]))",
|
||||
]
|
||||
computed = []
|
||||
for node in self._clf:
|
||||
computed.append(str(node))
|
||||
self.assertListEqual(expected, computed)
|
||||
|
||||
def test_score(self):
|
||||
X, y = get_dataset(self._random_state)
|
||||
accuracy_score = self._clf.score(X, y)
|
||||
yp = self._clf.predict(X)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertGreater(accuracy_score, 0.86)
|
||||
|
||||
def test_save_all(self):
|
||||
folder_name = os.path.join(os.sep, "tmp", "stree")
|
||||
if os.path.isdir(folder_name):
|
||||
os.rmdir(folder_name)
|
||||
file_names = [
|
||||
os.path.join(folder_name, f"STnode{i}.png") for i in range(1, 8)
|
||||
]
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
self._clf.save_all(save_folder=folder_name)
|
||||
for file_name in file_names:
|
||||
self.assertTrue(os.path.exists(file_name))
|
||||
self.assertEqual("png", imghdr.what(file_name))
|
||||
os.remove(file_name)
|
||||
os.rmdir(folder_name)
|
||||
|
||||
def test_plot_all(self):
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
num_figures_before = plt.gcf().number
|
||||
self._clf.plot_all()
|
||||
num_figures_after = plt.gcf().number
|
||||
self.assertEqual(7, num_figures_after - num_figures_before)
|
||||
|
||||
|
||||
class Snode_graph_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._clf = Stree_grapher(dict(random_state=self._random_state))
|
||||
self._clf.fit(*get_dataset(self._random_state))
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_plot_size(self):
|
||||
default = self._clf._tree_gr.get_plot_size()
|
||||
expected = (17, 3)
|
||||
self._clf._tree_gr.set_plot_size(expected)
|
||||
self.assertEqual(expected, self._clf._tree_gr.get_plot_size())
|
||||
self._clf._tree_gr.set_plot_size(default)
|
||||
self.assertEqual(default, self._clf._tree_gr.get_plot_size())
|
||||
|
||||
def test_attributes_in_leaves_graph(self):
|
||||
"""Check if the attributes in leaves have correct values so they form a
|
||||
predictor
|
||||
"""
|
||||
|
||||
def check_leave(node: Snode_graph):
|
||||
if not node.is_leaf():
|
||||
check_leave(node.get_down())
|
||||
check_leave(node.get_up())
|
||||
return
|
||||
# Check Belief in leave
|
||||
classes, card = np.unique(node._y, return_counts=True)
|
||||
max_card = max(card)
|
||||
min_card = min(card)
|
||||
if len(classes) > 1:
|
||||
try:
|
||||
belief = max_card / (max_card + min_card)
|
||||
except ZeroDivisionError:
|
||||
belief = 0.0
|
||||
else:
|
||||
belief = 1
|
||||
self.assertEqual(belief, node._belief)
|
||||
# Check Class
|
||||
class_computed = classes[card == max_card]
|
||||
self.assertEqual(class_computed, node._class)
|
||||
|
||||
check_leave(self._clf._tree_gr)
|
||||
|
||||
def test_nodes_graph_coefs(self):
|
||||
"""Check if the nodes of the tree have the right attributes filled
|
||||
"""
|
||||
|
||||
def run_tree(node: Snode_graph):
|
||||
if node._belief < 1:
|
||||
# only exclude pure leaves
|
||||
self.assertIsNotNone(node._clf)
|
||||
self.assertIsNotNone(node._clf.coef_)
|
||||
if node.is_leaf():
|
||||
return
|
||||
run_tree(node.get_down())
|
||||
run_tree(node.get_up())
|
||||
|
||||
run_tree(self._clf._tree_gr)
|
||||
|
||||
def test_save_hyperplane(self):
|
||||
folder_name = "/tmp/"
|
||||
file_name = os.path.join(folder_name, "STnode1.png")
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
self._clf._tree_gr.save_hyperplane(folder_name)
|
||||
self.assertTrue(os.path.exists(file_name))
|
||||
self.assertEqual("png", imghdr.what(file_name))
|
||||
os.remove(file_name)
|
||||
|
||||
def test_plot_hyperplane_with_distribution(self):
|
||||
plt.close()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
num_figures_before = plt.gcf().number
|
||||
self._clf._tree_gr.plot_hyperplane(plot_distribution=True)
|
||||
num_figures_after = plt.gcf().number
|
||||
self.assertEqual(1, num_figures_after - num_figures_before)
|
||||
|
||||
def test_plot_hyperplane_without_distribution(self):
|
||||
plt.close()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
num_figures_before = plt.gcf().number
|
||||
self._clf._tree_gr.plot_hyperplane(plot_distribution=False)
|
||||
num_figures_after = plt.gcf().number
|
||||
self.assertEqual(1, num_figures_after - num_figures_before)
|
||||
|
||||
def test_plot_distribution(self):
|
||||
plt.close()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
matplotlib.use("Agg")
|
||||
num_figures_before = plt.gcf().number
|
||||
self._clf._tree_gr.plot_distribution()
|
||||
num_figures_after = plt.gcf().number
|
||||
self.assertEqual(1, num_figures_after - num_figures_before)
|
||||
|
||||
def test_set_axis_limits(self):
|
||||
node = Snode_graph(Snode(None, None, None, "test"))
|
||||
limits = (-2, 2), (-3, 3), (-4, 4)
|
||||
node.set_axis_limits(limits)
|
||||
computed = node.get_axis_limits()
|
||||
x, y, z = limits
|
||||
xx, yy, zz = computed
|
||||
self.assertEqual(x, xx)
|
||||
self.assertEqual(y, yy)
|
||||
self.assertEqual(z, zz)
|
@@ -1,340 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
from stree import Stree, Snode
|
||||
|
||||
|
||||
def get_dataset(random_state=0):
|
||||
X, y = make_classification(
|
||||
n_samples=1500,
|
||||
n_features=3,
|
||||
n_informative=3,
|
||||
n_redundant=0,
|
||||
n_repeated=0,
|
||||
n_classes=2,
|
||||
n_clusters_per_class=2,
|
||||
class_sep=1.5,
|
||||
flip_y=0,
|
||||
weights=[0.5, 0.5],
|
||||
random_state=random_state,
|
||||
)
|
||||
return X, y
|
||||
|
||||
|
||||
class Stree_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._kernels = ["linear", "rbf", "poly"]
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def _check_tree(self, node: Snode):
|
||||
"""Check recursively that the nodes that are not leaves have the
|
||||
correct number of labels and its sons have the right number of elements
|
||||
in their dataset
|
||||
|
||||
Arguments:
|
||||
node {Snode} -- node to check
|
||||
"""
|
||||
if node.is_leaf():
|
||||
return
|
||||
y_prediction = node._clf.predict(node._X)
|
||||
y_down = node.get_down()._y
|
||||
y_up = node.get_up()._y
|
||||
# Is a correct partition in terms of cadinality?
|
||||
# i.e. The partition algorithm didn't forget any sample
|
||||
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||
_, count_d = np.unique(y_down, return_counts=True)
|
||||
_, count_u = np.unique(y_up, return_counts=True)
|
||||
#
|
||||
for i in unique_y:
|
||||
try:
|
||||
number_down = count_d[i]
|
||||
except IndexError:
|
||||
number_down = 0
|
||||
try:
|
||||
number_up = count_u[i]
|
||||
except IndexError:
|
||||
number_up = 0
|
||||
self.assertEqual(count_y[i], number_down + number_up)
|
||||
# Is the partition made the same as the prediction?
|
||||
# as the node is not a leaf...
|
||||
_, count_yp = np.unique(y_prediction, return_counts=True)
|
||||
self.assertEqual(count_yp[0], y_up.shape[0])
|
||||
self.assertEqual(count_yp[1], y_down.shape[0])
|
||||
self._check_tree(node.get_down())
|
||||
self._check_tree(node.get_up())
|
||||
|
||||
def test_build_tree(self):
|
||||
"""Check if the tree is built the same way as predictions of models
|
||||
"""
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(*get_dataset(self._random_state))
|
||||
self._check_tree(clf.tree_)
|
||||
|
||||
def _find_out(
|
||||
self, px: np.array, x_original: np.array, y_original
|
||||
) -> list:
|
||||
"""Find the original values of y for a given array of samples
|
||||
|
||||
Arguments:
|
||||
px {np.array} -- array of samples to search for
|
||||
x_original {np.array} -- original dataset
|
||||
y_original {[type]} -- original classes
|
||||
|
||||
Returns:
|
||||
np.array -- classes of the given samples
|
||||
"""
|
||||
res = []
|
||||
for needle in px:
|
||||
for row in range(x_original.shape[0]):
|
||||
if all(x_original[row, :] == needle):
|
||||
res.append(y_original[row])
|
||||
return res
|
||||
|
||||
def test_single_prediction(self):
|
||||
probs = [0.29026400766, 0.73105613, 0.0307635]
|
||||
X, y = get_dataset(self._random_state)
|
||||
for kernel, prob in zip(self._kernels, probs):
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||
self.assertEqual(yp[0], y[0])
|
||||
|
||||
def test_multiple_prediction(self):
|
||||
# First 27 elements the predictions are the same as the truth
|
||||
num = 27
|
||||
X, y = get_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
yp = clf.fit(X, y).predict(X[:num, :])
|
||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||
|
||||
def test_score(self):
|
||||
X, y = get_dataset(self._random_state)
|
||||
for kernel, accuracy_expected in zip(
|
||||
self._kernels,
|
||||
[0.9506666666666667, 0.9606666666666667, 0.9433333333333334],
|
||||
):
|
||||
clf = Stree(random_state=self._random_state, kernel=kernel,)
|
||||
clf.fit(X, y)
|
||||
accuracy_score = clf.score(X, y)
|
||||
yp = clf.predict(X)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_single_predict_proba(self):
|
||||
"""Check the element 28 probability of being 1
|
||||
"""
|
||||
decimals = 5
|
||||
element = 28
|
||||
probs = [0.29026400766, 0.73105613, 0.0307635]
|
||||
X, y = get_dataset(self._random_state)
|
||||
self.assertEqual(1, y[element])
|
||||
for kernel, prob in zip(self._kernels, probs):
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
yp = clf.fit(X, y).predict_proba(
|
||||
X[element, :].reshape(-1, X.shape[1])
|
||||
)
|
||||
self.assertAlmostEqual(
|
||||
np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
|
||||
)
|
||||
self.assertAlmostEqual(
|
||||
round(prob, decimals), round(yp[0, 1], decimals), decimals
|
||||
)
|
||||
|
||||
def test_multiple_predict_proba(self):
|
||||
# First 27 elements the predictions are the same as the truth
|
||||
num = 27
|
||||
X, y = get_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(X, y)
|
||||
yp = clf.predict_proba(X[:num, :])
|
||||
self.assertListEqual(
|
||||
y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
|
||||
)
|
||||
|
||||
def test_single_vs_multiple_prediction(self):
|
||||
"""Check if predicting sample by sample gives the same result as
|
||||
predicting all samples at once
|
||||
"""
|
||||
X, y = get_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(X, y)
|
||||
# Compute prediction line by line
|
||||
yp_line = np.array([], dtype=int)
|
||||
for xp in X:
|
||||
yp_line = np.append(
|
||||
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
|
||||
)
|
||||
# Compute prediction at once
|
||||
yp_once = clf.predict(X)
|
||||
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||
|
||||
def test_iterator_and_str(self):
|
||||
"""Check preorder iterator
|
||||
"""
|
||||
expected = [
|
||||
"root",
|
||||
"root - Down",
|
||||
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts"
|
||||
"=(array([0, 1]), array([ 17, 691]))",
|
||||
"root - Down - Up",
|
||||
"root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
|
||||
"counts=(array([0, 1]), array([1, 3]))",
|
||||
"root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
|
||||
"counts=(array([0]), array([7]))",
|
||||
"root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array("
|
||||
"[0, 1]), array([725, 56]))",
|
||||
]
|
||||
computed = []
|
||||
expected_string = ""
|
||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||
clf.fit(*get_dataset(self._random_state))
|
||||
for node in clf:
|
||||
computed.append(str(node))
|
||||
expected_string += str(node) + "\n"
|
||||
self.assertListEqual(expected, computed)
|
||||
self.assertEqual(expected_string, str(clf))
|
||||
|
||||
def test_is_a_sklearn_classifier(self):
|
||||
import warnings
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
|
||||
check_estimator(Stree())
|
||||
|
||||
def test_exception_if_C_is_negative(self):
|
||||
tclf = Stree(C=-1)
|
||||
with self.assertRaises(ValueError):
|
||||
tclf.fit(*get_dataset(self._random_state))
|
||||
|
||||
def test_check_max_depth_is_positive_or_None(self):
|
||||
tcl = Stree()
|
||||
self.assertIsNone(tcl.max_depth)
|
||||
tcl = Stree(max_depth=1)
|
||||
self.assertGreaterEqual(1, tcl.max_depth)
|
||||
with self.assertRaises(ValueError):
|
||||
tcl = Stree(max_depth=-1)
|
||||
tcl.fit(*get_dataset(self._random_state))
|
||||
|
||||
def test_check_max_depth(self):
|
||||
depths = (3, 4)
|
||||
for depth in depths:
|
||||
tcl = Stree(random_state=self._random_state, max_depth=depth)
|
||||
tcl.fit(*get_dataset(self._random_state))
|
||||
self.assertEqual(depth, tcl.depth_)
|
||||
|
||||
def test_unfitted_tree_is_iterable(self):
|
||||
tcl = Stree()
|
||||
self.assertEqual(0, len(list(tcl)))
|
||||
|
||||
def test_min_samples_split(self):
|
||||
tcl_split = Stree(min_samples_split=3)
|
||||
tcl_nosplit = Stree(min_samples_split=4)
|
||||
dataset = [[1], [2], [3]], [1, 1, 0]
|
||||
tcl_split.fit(*dataset)
|
||||
self.assertIsNotNone(tcl_split.tree_.get_down())
|
||||
self.assertIsNotNone(tcl_split.tree_.get_up())
|
||||
tcl_nosplit.fit(*dataset)
|
||||
self.assertIsNone(tcl_nosplit.tree_.get_down())
|
||||
self.assertIsNone(tcl_nosplit.tree_.get_up())
|
||||
|
||||
def test_muticlass_dataset(self):
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
px = [[1, 2], [3, 4], [5, 6]]
|
||||
py = [1, 2, 3]
|
||||
clf.fit(px, py)
|
||||
self.assertEqual(1.0, clf.score(px, py))
|
||||
self.assertListEqual([1, 2, 3], clf.predict(px).tolist())
|
||||
|
||||
|
||||
class Snode_test(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self._random_state = 1
|
||||
self._clf = Stree(random_state=self._random_state)
|
||||
self._clf.fit(*get_dataset(self._random_state))
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@classmethod
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_attributes_in_leaves(self):
|
||||
"""Check if the attributes in leaves have correct values so they form a
|
||||
predictor
|
||||
"""
|
||||
|
||||
def check_leave(node: Snode):
|
||||
if not node.is_leaf():
|
||||
check_leave(node.get_down())
|
||||
check_leave(node.get_up())
|
||||
return
|
||||
# Check Belief in leave
|
||||
classes, card = np.unique(node._y, return_counts=True)
|
||||
max_card = max(card)
|
||||
min_card = min(card)
|
||||
if len(classes) > 1:
|
||||
try:
|
||||
belief = max_card / (max_card + min_card)
|
||||
except ZeroDivisionError:
|
||||
belief = 0.0
|
||||
else:
|
||||
belief = 1
|
||||
self.assertEqual(belief, node._belief)
|
||||
# Check Class
|
||||
class_computed = classes[card == max_card]
|
||||
self.assertEqual(class_computed, node._class)
|
||||
|
||||
check_leave(self._clf.tree_)
|
||||
|
||||
def test_nodes_coefs(self):
|
||||
"""Check if the nodes of the tree have the right attributes filled
|
||||
"""
|
||||
|
||||
def run_tree(node: Snode):
|
||||
if node._belief < 1:
|
||||
# only exclude pure leaves
|
||||
self.assertIsNotNone(node._clf)
|
||||
self.assertIsNotNone(node._clf.coef_)
|
||||
if node.is_leaf():
|
||||
return
|
||||
run_tree(node.get_down())
|
||||
run_tree(node.get_up())
|
||||
|
||||
run_tree(self._clf.tree_)
|
||||
|
||||
def test_make_predictor_on_leaf(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
|
||||
test.make_predictor()
|
||||
self.assertEqual(1, test._class)
|
||||
self.assertEqual(0.75, test._belief)
|
||||
|
||||
def test_make_predictor_on_not_leaf(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test")
|
||||
test.set_up(Snode(None, [1], [1], "another_test"))
|
||||
test.make_predictor()
|
||||
self.assertIsNone(test._class)
|
||||
self.assertEqual(0, test._belief)
|
||||
|
||||
def test_make_predictor_on_leaf_bogus_data(self):
|
||||
test = Snode(None, [1, 2, 3, 4], [], "test")
|
||||
test.make_predictor()
|
||||
self.assertIsNone(test._class)
|
@@ -1,9 +1,5 @@
|
||||
from .Strees_test import Stree_test, Snode_test
|
||||
from .Strees_grapher_test import Stree_grapher_test, Snode_graph_test
|
||||
from .Stree_test import Stree_test
|
||||
from .Snode_test import Snode_test
|
||||
from .Splitter_test import Splitter_test
|
||||
|
||||
__all__ = [
|
||||
"Stree_test",
|
||||
"Snode_test",
|
||||
"Stree_grapher_test",
|
||||
"Snode_graph_test",
|
||||
]
|
||||
__all__ = ["Stree_test", "Snode_test", "Splitter_test"]
|
||||
|
17
stree/tests/utils.py
Normal file
17
stree/tests/utils.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from sklearn.datasets import make_classification
|
||||
|
||||
|
||||
def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
|
||||
X, y = make_classification(
|
||||
n_samples=n_samples,
|
||||
n_features=n_features,
|
||||
n_informative=3,
|
||||
n_redundant=0,
|
||||
n_repeated=0,
|
||||
n_classes=n_classes,
|
||||
n_clusters_per_class=2,
|
||||
class_sep=1.5,
|
||||
flip_y=0,
|
||||
random_state=random_state,
|
||||
)
|
||||
return X, y
|
Reference in New Issue
Block a user