mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
40 Commits
0.9rc4
...
Adding-Git
Author | SHA1 | Date | |
---|---|---|---|
98881cbd45
|
|||
cdb9fd6faa
|
|||
82f7352f9a
|
|||
8359e442e5
|
|||
|
673081cdc5 | ||
|
36816074ff | ||
475ad7e752
|
|||
|
1c869e154e | ||
f5706c3159
|
|||
be552fdd6c
|
|||
5e3a8e3ec5
|
|||
554ec03c32
|
|||
4b7e4a3fb0
|
|||
76723993fd
|
|||
ecd0b86f4d
|
|||
3e52a4746c
|
|||
|
a20e45e8e7 | ||
9334951d1b
|
|||
736ab7ef20
|
|||
c94bc068bd
|
|||
502ee72799
|
|||
f1ee4de37b
|
|||
ae1c199e21
|
|||
1bfe273a70
|
|||
|
647d21bdb5 | ||
1d392d534f
|
|||
f360a2640c
|
|||
|
45510b43bc | ||
286a91a3d7
|
|||
5c31c2b2a5
|
|||
7e932de072
|
|||
26273e936a
|
|||
d7c0bc3bc5
|
|||
3a48d8b405
|
|||
05b462716e
|
|||
b824229121
|
|||
8ba9b1b6a1
|
|||
37577849db
|
|||
cb10aea36e
|
|||
b9f14aec05
|
13
.coveragerc
Normal file
13
.coveragerc
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
[run]
|
||||||
|
branch = True
|
||||||
|
source = stree
|
||||||
|
|
||||||
|
[report]
|
||||||
|
exclude_lines =
|
||||||
|
if self.debug:
|
||||||
|
pragma: no cover
|
||||||
|
raise NotImplementedError
|
||||||
|
if __name__ == .__main__.:
|
||||||
|
ignore_errors = True
|
||||||
|
omit =
|
||||||
|
stree/__init__.py
|
47
.github/workflows/main.yml
vendored
Normal file
47
.github/workflows/main.yml
vendored
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
name: CI
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [master]
|
||||||
|
pull_request:
|
||||||
|
branches: [master]
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
os: [macos-latest, ubuntu-latest]
|
||||||
|
python: [3.8]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
- name: Set up Python ${{ matrix.python }}
|
||||||
|
uses: actions/setup-python@v2
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python }}
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
pip install -q --upgrade pip
|
||||||
|
pip install -q -r requirements.txt
|
||||||
|
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
||||||
|
- name: Lint
|
||||||
|
run: |
|
||||||
|
black --check --diff stree
|
||||||
|
flake8 --count stree
|
||||||
|
- name: Tests
|
||||||
|
run: |
|
||||||
|
coverage run -m unittest -v stree.tests
|
||||||
|
coverage xml
|
||||||
|
- name: Upload coverage to Codecov
|
||||||
|
uses: codecov/codecov-action@v1
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
|
files: ./coverage.xml
|
||||||
|
- name: Run codacy-coverage-reporter
|
||||||
|
if: runner.os == 'Linux'
|
||||||
|
uses: codacy/codacy-coverage-reporter-action@master
|
||||||
|
with:
|
||||||
|
project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
|
||||||
|
coverage-reports: coverage.xml
|
5
.gitignore
vendored
5
.gitignore
vendored
@@ -129,4 +129,7 @@ dmypy.json
|
|||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
.idea
|
.idea
|
||||||
.vscode
|
.vscode
|
||||||
|
.pre-commit-config.yaml
|
||||||
|
|
||||||
|
**.csv
|
13
.travis.yml
13
.travis.yml
@@ -1,13 +0,0 @@
|
|||||||
language: python
|
|
||||||
os: linux
|
|
||||||
dist: xenial
|
|
||||||
install:
|
|
||||||
- pip install -r requirements.txt
|
|
||||||
notifications:
|
|
||||||
email:
|
|
||||||
recipients:
|
|
||||||
- ricardo.montanana@alu.uclm.es
|
|
||||||
on_success: never # default: change
|
|
||||||
on_failure: always # default: always
|
|
||||||
# command to run tests
|
|
||||||
script: python -m unittest stree.tests
|
|
18
README.md
18
README.md
@@ -1,8 +1,10 @@
|
|||||||
[](https://travis-ci.com/Doctorado-ML/STree)
|

|
||||||
|
[](https://codecov.io/gh/doctorado-ml/stree)
|
||||||
|
[](https://www.codacy.com/gh/Doctorado-ML/STree?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/STree&utm_campaign=Badge_Grade)
|
||||||
|
|
||||||
# Stree
|
# Stree
|
||||||
|
|
||||||
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn SVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
@@ -16,17 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
|
|||||||
|
|
||||||
### Jupyter notebooks
|
### Jupyter notebooks
|
||||||
|
|
||||||
##### Slow launch but better integration
|
- [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
* [](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
|
||||||
|
|
||||||
##### Fast launch, but you have to run the first commented-out cell for setup
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
|
||||||
|
|
||||||
* [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook
|
- [](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
|
||||||
|
|
||||||
### Command line
|
### Command line
|
||||||
|
|
||||||
|
12
codecov.yml
Normal file
12
codecov.yml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
coverage:
|
||||||
|
status:
|
||||||
|
project:
|
||||||
|
default:
|
||||||
|
target: 90%
|
||||||
|
comment:
|
||||||
|
layout: "reach, diff, flags, files"
|
||||||
|
behavior: default
|
||||||
|
require_changes: false
|
||||||
|
require_base: yes
|
||||||
|
require_head: yes
|
||||||
|
branches: null
|
60
main.py
60
main.py
@@ -1,57 +1,29 @@
|
|||||||
import time
|
import time
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.datasets import load_iris
|
||||||
from stree import Stree
|
from stree import Stree
|
||||||
|
|
||||||
random_state=1
|
random_state = 1
|
||||||
|
|
||||||
def load_creditcard(n_examples=0):
|
X, y = load_iris(return_X_y=True)
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import random
|
|
||||||
df = pd.read_csv('data/creditcard.csv')
|
|
||||||
print("Fraud: {0:.3f}% {1}".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))
|
|
||||||
print("Valid: {0:.3f}% {1}".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))
|
|
||||||
y = np.expand_dims(df.Class.values, axis=1)
|
|
||||||
X = df.drop(['Class', 'Time', 'Amount'], axis=1).values
|
|
||||||
if n_examples > 0:
|
|
||||||
# Take first n_examples samples
|
|
||||||
X = X[:n_examples, :]
|
|
||||||
y = y[:n_examples, :]
|
|
||||||
else:
|
|
||||||
# Take all the positive samples with a number of random negatives
|
|
||||||
if n_examples < 0:
|
|
||||||
Xt = X[(y == 1).ravel()]
|
|
||||||
yt = y[(y == 1).ravel()]
|
|
||||||
indices = random.sample(range(X.shape[0]), -1 * n_examples)
|
|
||||||
X = np.append(Xt, X[indices], axis=0)
|
|
||||||
y = np.append(yt, y[indices], axis=0)
|
|
||||||
print("X.shape", X.shape, " y.shape", y.shape)
|
|
||||||
print("Fraud: {0:.3f}% {1}".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))
|
|
||||||
print("Valid: {0:.3f}% {1}".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))
|
|
||||||
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)
|
|
||||||
return Xtrain, Xtest, ytrain, ytest
|
|
||||||
|
|
||||||
# data = load_creditcard(-5000) # Take all true samples + 5000 of the others
|
Xtrain, Xtest, ytrain, ytest = train_test_split(
|
||||||
# data = load_creditcard(5000) # Take the first 5000 samples
|
X, y, test_size=0.2, random_state=random_state
|
||||||
data = load_creditcard() # Take all the samples
|
)
|
||||||
|
|
||||||
Xtrain = data[0]
|
|
||||||
Xtest = data[1]
|
|
||||||
ytrain = data[2]
|
|
||||||
ytest = data[3]
|
|
||||||
|
|
||||||
now = time.time()
|
now = time.time()
|
||||||
clf = Stree(C=.01, random_state=random_state)
|
print("Predicting with max_features=sqrt(n_features)")
|
||||||
|
clf = Stree(C=0.01, random_state=random_state, max_features="auto")
|
||||||
|
clf.fit(Xtrain, ytrain)
|
||||||
|
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||||
|
print(clf)
|
||||||
|
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||||
|
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||||
|
print("=" * 40)
|
||||||
|
print("Predicting with max_features=n_features")
|
||||||
|
clf = Stree(C=0.01, random_state=random_state)
|
||||||
clf.fit(Xtrain, ytrain)
|
clf.fit(Xtrain, ytrain)
|
||||||
print(f"Took {time.time() - now:.2f} seconds to train")
|
print(f"Took {time.time() - now:.2f} seconds to train")
|
||||||
print(clf)
|
print(clf)
|
||||||
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
|
||||||
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
|
||||||
proba = clf.predict_proba(Xtest)
|
|
||||||
print("Checking that we have correct probabilities, these are probabilities of sample belonging to class 1")
|
|
||||||
res0 = proba[proba[:, 0] == 0]
|
|
||||||
res1 = proba[proba[:, 0] == 1]
|
|
||||||
print("++++++++++res0 > .8++++++++++++")
|
|
||||||
print(res0[res0[:, 1] > .8])
|
|
||||||
print("**********res1 < .4************")
|
|
||||||
print(res1[res1[:, 1] < .4])
|
|
588
notebooks/benchmark.ipynb
Normal file
588
notebooks/benchmark.ipynb
Normal file
@@ -0,0 +1,588 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Compare STree with different estimators"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import datetime, time\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from sklearn import tree\n",
|
||||||
|
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
|
||||||
|
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
|
||||||
|
"from stree import Stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"2020-11-01 11:14:06\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(datetime.date.today(), time.strftime(\"%H:%M:%S\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Load dataset and normalize values"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load Dataset\n",
|
||||||
|
"df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
"df.shape\n",
|
||||||
|
"random_state = 2020"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
"print(\"Valid: {0:.3f}% {1:,}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Normalize Amount\n",
|
||||||
|
"from sklearn.preprocessing import RobustScaler\n",
|
||||||
|
"values = RobustScaler().fit_transform(df.Amount.values.reshape(-1, 1))\n",
|
||||||
|
"df['Amount_Scaled'] = values"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"X shape: (284807, 29)\ny shape: (284807,)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Remove unneeded features\n",
|
||||||
|
"y = df.Class.values\n",
|
||||||
|
"X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
"print(f\"X shape: {X.shape}\\ny shape: {y.shape}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build the models"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Divide dataset\n",
|
||||||
|
"train_size = .7\n",
|
||||||
|
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=train_size, shuffle=True, random_state=random_state, stratify=y)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Linear Tree\n",
|
||||||
|
"linear_tree = tree.DecisionTreeClassifier(random_state=random_state)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Random Forest\n",
|
||||||
|
"random_forest = RandomForestClassifier(random_state=random_state)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Stree\n",
|
||||||
|
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# AdaBoost\n",
|
||||||
|
"adaboost = AdaBoostClassifier(random_state=random_state)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Bagging\n",
|
||||||
|
"bagging = BaggingClassifier(random_state=random_state)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Do the test"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def try_model(name, model):\n",
|
||||||
|
" print(f\"************************** {name} **********************\")\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" model.fit(Xtrain, ytrain)\n",
|
||||||
|
" spent = time.time() - now\n",
|
||||||
|
" print(f\"Train Model {name} took: {spent:.4} seconds\")\n",
|
||||||
|
" predict = model.predict(Xtrain)\n",
|
||||||
|
" predictt = model.predict(Xtest)\n",
|
||||||
|
" print(f\"=========== {name} - Train {Xtrain.shape[0]:,} samples =============\",)\n",
|
||||||
|
" print(classification_report(ytrain, predict, digits=6))\n",
|
||||||
|
" print(f\"=========== {name} - Test {Xtest.shape[0]:,} samples =============\")\n",
|
||||||
|
" print(classification_report(ytest, predictt, digits=6))\n",
|
||||||
|
" print(\"Confusion Matrix in Train\")\n",
|
||||||
|
" print(confusion_matrix(ytrain, predict))\n",
|
||||||
|
" print(\"Confusion Matrix in Test\")\n",
|
||||||
|
" print(confusion_matrix(ytest, predictt))\n",
|
||||||
|
" return f1_score(ytest, predictt), spent"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"************************** Linear Tree **********************\n",
|
||||||
|
"Train Model Linear Tree took: 15.14 seconds\n",
|
||||||
|
"=========== Linear Tree - Train 199,364 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 1.000000 1.000000 1.000000 199020\n",
|
||||||
|
" 1 1.000000 1.000000 1.000000 344\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 1.000000 199364\n",
|
||||||
|
" macro avg 1.000000 1.000000 1.000000 199364\n",
|
||||||
|
"weighted avg 1.000000 1.000000 1.000000 199364\n",
|
||||||
|
"\n",
|
||||||
|
"=========== Linear Tree - Test 85,443 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999578 0.999613 0.999596 85295\n",
|
||||||
|
" 1 0.772414 0.756757 0.764505 148\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999192 85443\n",
|
||||||
|
" macro avg 0.885996 0.878185 0.882050 85443\n",
|
||||||
|
"weighted avg 0.999184 0.999192 0.999188 85443\n",
|
||||||
|
"\n",
|
||||||
|
"Confusion Matrix in Train\n",
|
||||||
|
"[[199020 0]\n",
|
||||||
|
" [ 0 344]]\n",
|
||||||
|
"Confusion Matrix in Test\n",
|
||||||
|
"[[85262 33]\n",
|
||||||
|
" [ 36 112]]\n",
|
||||||
|
"************************** Random Forest **********************\n",
|
||||||
|
"Train Model Random Forest took: 181.1 seconds\n",
|
||||||
|
"=========== Random Forest - Train 199,364 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 1.000000 1.000000 1.000000 199020\n",
|
||||||
|
" 1 1.000000 1.000000 1.000000 344\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 1.000000 199364\n",
|
||||||
|
" macro avg 1.000000 1.000000 1.000000 199364\n",
|
||||||
|
"weighted avg 1.000000 1.000000 1.000000 199364\n",
|
||||||
|
"\n",
|
||||||
|
"=========== Random Forest - Test 85,443 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999660 0.999965 0.999812 85295\n",
|
||||||
|
" 1 0.975410 0.804054 0.881481 148\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999625 85443\n",
|
||||||
|
" macro avg 0.987535 0.902009 0.940647 85443\n",
|
||||||
|
"weighted avg 0.999618 0.999625 0.999607 85443\n",
|
||||||
|
"\n",
|
||||||
|
"Confusion Matrix in Train\n",
|
||||||
|
"[[199020 0]\n",
|
||||||
|
" [ 0 344]]\n",
|
||||||
|
"Confusion Matrix in Test\n",
|
||||||
|
"[[85292 3]\n",
|
||||||
|
" [ 29 119]]\n",
|
||||||
|
"************************** Stree (SVM Tree) **********************\n",
|
||||||
|
"Train Model Stree (SVM Tree) took: 36.6 seconds\n",
|
||||||
|
"=========== Stree (SVM Tree) - Train 199,364 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999623 0.999864 0.999744 199020\n",
|
||||||
|
" 1 0.908784 0.781977 0.840625 344\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999488 199364\n",
|
||||||
|
" macro avg 0.954204 0.890921 0.920184 199364\n",
|
||||||
|
"weighted avg 0.999467 0.999488 0.999469 199364\n",
|
||||||
|
"\n",
|
||||||
|
"=========== Stree (SVM Tree) - Test 85,443 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999637 0.999918 0.999777 85295\n",
|
||||||
|
" 1 0.943548 0.790541 0.860294 148\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999555 85443\n",
|
||||||
|
" macro avg 0.971593 0.895229 0.930036 85443\n",
|
||||||
|
"weighted avg 0.999540 0.999555 0.999536 85443\n",
|
||||||
|
"\n",
|
||||||
|
"Confusion Matrix in Train\n",
|
||||||
|
"[[198993 27]\n",
|
||||||
|
" [ 75 269]]\n",
|
||||||
|
"Confusion Matrix in Test\n",
|
||||||
|
"[[85288 7]\n",
|
||||||
|
" [ 31 117]]\n",
|
||||||
|
"************************** AdaBoost model **********************\n",
|
||||||
|
"Train Model AdaBoost model took: 46.14 seconds\n",
|
||||||
|
"=========== AdaBoost model - Train 199,364 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999392 0.999678 0.999535 199020\n",
|
||||||
|
" 1 0.777003 0.648256 0.706815 344\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999072 199364\n",
|
||||||
|
" macro avg 0.888198 0.823967 0.853175 199364\n",
|
||||||
|
"weighted avg 0.999008 0.999072 0.999030 199364\n",
|
||||||
|
"\n",
|
||||||
|
"=========== AdaBoost model - Test 85,443 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999484 0.999707 0.999596 85295\n",
|
||||||
|
" 1 0.806202 0.702703 0.750903 148\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999192 85443\n",
|
||||||
|
" macro avg 0.902843 0.851205 0.875249 85443\n",
|
||||||
|
"weighted avg 0.999149 0.999192 0.999165 85443\n",
|
||||||
|
"\n",
|
||||||
|
"Confusion Matrix in Train\n",
|
||||||
|
"[[198956 64]\n",
|
||||||
|
" [ 121 223]]\n",
|
||||||
|
"Confusion Matrix in Test\n",
|
||||||
|
"[[85270 25]\n",
|
||||||
|
" [ 44 104]]\n",
|
||||||
|
"************************** Bagging model **********************\n",
|
||||||
|
"Train Model Bagging model took: 77.73 seconds\n",
|
||||||
|
"=========== Bagging model - Train 199,364 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999864 1.000000 0.999932 199020\n",
|
||||||
|
" 1 1.000000 0.921512 0.959153 344\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999865 199364\n",
|
||||||
|
" macro avg 0.999932 0.960756 0.979542 199364\n",
|
||||||
|
"weighted avg 0.999865 0.999865 0.999862 199364\n",
|
||||||
|
"\n",
|
||||||
|
"=========== Bagging model - Test 85,443 samples =============\n",
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" 0 0.999637 0.999953 0.999795 85295\n",
|
||||||
|
" 1 0.966942 0.790541 0.869888 148\n",
|
||||||
|
"\n",
|
||||||
|
" accuracy 0.999590 85443\n",
|
||||||
|
" macro avg 0.983289 0.895247 0.934842 85443\n",
|
||||||
|
"weighted avg 0.999580 0.999590 0.999570 85443\n",
|
||||||
|
"\n",
|
||||||
|
"Confusion Matrix in Train\n",
|
||||||
|
"[[199020 0]\n",
|
||||||
|
" [ 27 317]]\n",
|
||||||
|
"Confusion Matrix in Test\n",
|
||||||
|
"[[85291 4]\n",
|
||||||
|
" [ 31 117]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Train & Test models\n",
|
||||||
|
"models = {\n",
|
||||||
|
" 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n",
|
||||||
|
" 'AdaBoost model': adaboost, 'Bagging model': bagging\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"best_f1 = 0\n",
|
||||||
|
"outcomes = []\n",
|
||||||
|
"for name, model in models.items():\n",
|
||||||
|
" f1, time_spent = try_model(name, model)\n",
|
||||||
|
" outcomes.append((name, f1, time_spent))\n",
|
||||||
|
" if f1 > best_f1:\n",
|
||||||
|
" best_model = name\n",
|
||||||
|
" best_time = time_spent\n",
|
||||||
|
" best_f1 = f1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 181.07 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 15.14 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 181.07 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 36.60 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 46.14 seconds\t f1: 0.7509\nModel: Bagging model\t Time: 77.73 seconds\t f1: 0.8699\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\"*\"*110)\n",
|
||||||
|
"print(f\"*The best f1 model is {best_model}, with a f1 score: {best_f1:.4} in {best_time:.6} seconds with {train_size:,} samples in train dataset\")\n",
|
||||||
|
"print(\"*\"*110)\n",
|
||||||
|
"for name, f1, time_spent in outcomes:\n",
|
||||||
|
" print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**************************************************************************************************************\n",
|
||||||
|
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
|
||||||
|
"**************************************************************************************************************\n",
|
||||||
|
"Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n",
|
||||||
|
"Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
|
||||||
|
"Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n",
|
||||||
|
"Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n",
|
||||||
|
"Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"```\n",
|
||||||
|
"******************************************************************************************************************\n",
|
||||||
|
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n",
|
||||||
|
"******************************************************************************************************************\n",
|
||||||
|
"Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n",
|
||||||
|
"Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
|
||||||
|
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8603\n",
|
||||||
|
"Model: AdaBoost model\t Time: 73.83 seconds\t f1: 0.7509\n",
|
||||||
|
"Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n",
|
||||||
|
"Model: Bagging model\t Time: 77.93 seconds\t f1: 0.8699\n",
|
||||||
|
"\n",
|
||||||
|
"```"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "execute_result",
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'C': 0.01,\n",
|
||||||
|
" 'criterion': 'entropy',\n",
|
||||||
|
" 'degree': 3,\n",
|
||||||
|
" 'gamma': 'scale',\n",
|
||||||
|
" 'kernel': 'linear',\n",
|
||||||
|
" 'max_depth': None,\n",
|
||||||
|
" 'max_features': None,\n",
|
||||||
|
" 'max_iter': 1000.0,\n",
|
||||||
|
" 'min_samples_split': 0,\n",
|
||||||
|
" 'random_state': 2020,\n",
|
||||||
|
" 'split_criteria': 'impurity',\n",
|
||||||
|
" 'splitter': 'random',\n",
|
||||||
|
" 'tol': 0.0001}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"execution_count": 18
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"stree.get_params()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"hide_input": false,
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.8.4 64-bit ('general': venv)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python38464bitgeneralvenv77203c0a6afd4428bd66253ef62753dc"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.4-final"
|
||||||
|
},
|
||||||
|
"toc": {
|
||||||
|
"base_numbering": 1,
|
||||||
|
"nav_menu": {},
|
||||||
|
"number_sections": true,
|
||||||
|
"sideBar": true,
|
||||||
|
"skip_h1_title": false,
|
||||||
|
"title_cell": "Table of Contents",
|
||||||
|
"title_sidebar": "Contents",
|
||||||
|
"toc_cell": false,
|
||||||
|
"toc_position": {},
|
||||||
|
"toc_section_display": true,
|
||||||
|
"toc_window_display": false
|
||||||
|
},
|
||||||
|
"varInspector": {
|
||||||
|
"cols": {
|
||||||
|
"lenName": 16,
|
||||||
|
"lenType": 16,
|
||||||
|
"lenVar": 40
|
||||||
|
},
|
||||||
|
"kernels_config": {
|
||||||
|
"python": {
|
||||||
|
"delete_cmd_postfix": "",
|
||||||
|
"delete_cmd_prefix": "del ",
|
||||||
|
"library": "var_list.py",
|
||||||
|
"varRefreshCmd": "print(var_dic_list())"
|
||||||
|
},
|
||||||
|
"r": {
|
||||||
|
"delete_cmd_postfix": ") ",
|
||||||
|
"delete_cmd_prefix": "rm(",
|
||||||
|
"library": "var_list.r",
|
||||||
|
"varRefreshCmd": "cat(var_dic_list()) "
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"position": {
|
||||||
|
"height": "392px",
|
||||||
|
"left": "1518px",
|
||||||
|
"right": "20px",
|
||||||
|
"top": "40px",
|
||||||
|
"width": "392px"
|
||||||
|
},
|
||||||
|
"types_to_exclude": [
|
||||||
|
"module",
|
||||||
|
"function",
|
||||||
|
"builtin_function_or_method",
|
||||||
|
"instance",
|
||||||
|
"_Feature"
|
||||||
|
],
|
||||||
|
"window_display": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
@@ -1,18 +1,30 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test Stree with AdaBoost and Bagging with different configurations"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import time\n",
|
"#\n",
|
||||||
"from sklearn.ensemble import AdaBoostClassifier\n",
|
"# Google Colab setup\n",
|
||||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
"#\n",
|
||||||
"from sklearn.svm import LinearSVC\n",
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
|
||||||
"from sklearn.datasets import load_iris\n",
|
|
||||||
"from stree import Stree"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -20,6 +32,18 @@
|
|||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import time\n",
|
||||||
|
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from stree import Stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"import os\n",
|
"import os\n",
|
||||||
"if not os.path.isfile('data/creditcard.csv'):\n",
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
@@ -29,13 +53,21 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
|
"text": [
|
||||||
|
"Fraud: 0.173% 492\n",
|
||||||
|
"Valid: 99.827% 284315\n",
|
||||||
|
"X.shape (100492, 28) y.shape (100492,)\n",
|
||||||
|
"Fraud: 0.652% 655\n",
|
||||||
|
"Valid: 99.348% 99837\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
@@ -68,9 +100,10 @@
|
|||||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
"# data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
|
||||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
"# data = load_creditcard(0) # Take all the samples\n",
|
"# data = load_creditcard(0) # Take all the samples\n",
|
||||||
|
"data = load_creditcard(-100000)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Xtrain = data[0]\n",
|
"Xtrain = data[0]\n",
|
||||||
"Xtest = data[1]\n",
|
"Xtest = data[1]\n",
|
||||||
@@ -79,19 +112,37 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## STree alone with 100.000 samples and linear kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n"
|
"text": [
|
||||||
|
"Score Train: 0.9985073353804162\nScore Test: 0.9983746848878864\nTook 35.80 seconds\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"now = time.time()\n",
|
||||||
"clf = Stree(max_depth=3, random_state=random_state)\n",
|
"clf = Stree(max_depth=3, random_state=random_state, max_iter=1e3)\n",
|
||||||
"clf.fit(Xtrain, ytrain)\n",
|
"clf.fit(Xtrain, ytrain)\n",
|
||||||
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
|
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
|
||||||
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
|
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
|
||||||
@@ -99,71 +150,94 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"## Adaboost"
|
||||||
"clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
|
|
||||||
"clf2.fit(Xtrain, ytrain)\n",
|
|
||||||
"print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
|
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"n_estimators = 10\n",
|
||||||
"clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
|
"C = 7\n",
|
||||||
"clf3.fit(Xtrain, ytrain)\n",
|
"max_depth = 3"
|
||||||
"print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
|
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n"
|
"text": [
|
||||||
|
"Kernel: linear\tTime: 49.66 seconds\tScore Train: 0.9983225\tScore Test: 0.9983083\n",
|
||||||
|
"Kernel: rbf\tTime: 12.73 seconds\tScore Train: 0.9934891\tScore Test: 0.9934656\n",
|
||||||
|
"Kernel: poly\tTime: 76.24 seconds\tScore Train: 0.9972706\tScore Test: 0.9969152\n"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"now = time.time()\n",
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
"clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
|
" now = time.time()\n",
|
||||||
"clf4.fit(Xtrain, ytrain)\n",
|
" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
|
||||||
"print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
"print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
"print(f\"Took {time.time() - now:.2f} seconds\")"
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Bagging"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": []
|
"source": [
|
||||||
|
"n_estimators = 10\n",
|
||||||
|
"C = 7\n",
|
||||||
|
"max_depth = 3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Kernel: linear\tTime: 231.51 seconds\tScore Train: 0.9984931\tScore Test: 0.9983083\n",
|
||||||
|
"Kernel: rbf\tTime: 114.77 seconds\tScore Train: 0.9992323\tScore Test: 0.9983083\n",
|
||||||
|
"Kernel: poly\tTime: 67.87 seconds\tScore Train: 0.9993319\tScore Test: 0.9985074\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), n_estimators=n_estimators, random_state=random_state)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" score_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@@ -177,12 +251,12 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.7.6-final"
|
"version": "3.8.4-final"
|
||||||
},
|
},
|
||||||
"orig_nbformat": 2,
|
"orig_nbformat": 2,
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
"name": "python38464bitgeneralf6de308d3831407c8bd68d4a5e328a38",
|
||||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
"display_name": "Python 3.8.4 64-bit ('general')"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
587
notebooks/features.ipynb
Normal file
587
notebooks/features.ipynb
Normal file
@@ -0,0 +1,587 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Test sample_weight, kernels, C, sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"Uncomment the next cell if STree is not already installed"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#\n",
|
||||||
|
"# Google Colab setup\n",
|
||||||
|
"#\n",
|
||||||
|
"#!pip install git+https://github.com/doctorado-ml/stree"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.svm import SVC\n",
|
||||||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||||
|
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||||
|
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"from stree import Stree\n",
|
||||||
|
"import time"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"if not os.path.isfile('data/creditcard.csv'):\n",
|
||||||
|
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
||||||
|
" !tar xzf creditcard.tgz"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (5492, 28) y.shape (5492,)\nFraud: 9.141% 502\nValid: 90.859% 4990\n[0.09183143 0.09183143 0.09183143 0.09183143] [0.09041262 0.09041262 0.09041262 0.09041262]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"\n",
|
||||||
|
"def load_creditcard(n_examples=0):\n",
|
||||||
|
" import pandas as pd\n",
|
||||||
|
" import numpy as np\n",
|
||||||
|
" import random\n",
|
||||||
|
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||||
|
" y = df.Class\n",
|
||||||
|
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||||
|
" if n_examples > 0:\n",
|
||||||
|
" # Take first n_examples samples\n",
|
||||||
|
" X = X[:n_examples, :]\n",
|
||||||
|
" y = y[:n_examples, :]\n",
|
||||||
|
" else:\n",
|
||||||
|
" # Take all the positive samples with a number of random negatives\n",
|
||||||
|
" if n_examples < 0:\n",
|
||||||
|
" Xt = X[(y == 1).ravel()]\n",
|
||||||
|
" yt = y[(y == 1).ravel()]\n",
|
||||||
|
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||||
|
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||||
|
" y = np.append(yt, y[indices], axis=0)\n",
|
||||||
|
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||||
|
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||||
|
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||||
|
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state)\n",
|
||||||
|
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||||
|
"\n",
|
||||||
|
"data = load_creditcard(-5000) # Take all true samples with up to 5000 of the others\n",
|
||||||
|
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||||
|
"# data = load_creditcard(-1000) # Take 1000 samples\n",
|
||||||
|
"\n",
|
||||||
|
"Xtrain = data[0]\n",
|
||||||
|
"Xtest = data[1]\n",
|
||||||
|
"ytrain = data[2]\n",
|
||||||
|
"ytest = data[3]\n",
|
||||||
|
"_, data = np.unique(ytrain, return_counts=True)\n",
|
||||||
|
"wtrain = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
|
||||||
|
"_, data = np.unique(ytest, return_counts=True)\n",
|
||||||
|
"wtest = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
|
||||||
|
"# Set weights inverse to its count class in dataset\n",
|
||||||
|
"weights = np.ones(Xtrain.shape[0],)\n",
|
||||||
|
"weights[ytrain==0] = wtrain[0]\n",
|
||||||
|
"weights[ytrain==1] = wtrain[1]\n",
|
||||||
|
"weights_test = np.ones(Xtest.shape[0],)\n",
|
||||||
|
"weights_test[ytest==0] = wtest[0]\n",
|
||||||
|
"weights_test[ytest==1] = wtest[1]\n",
|
||||||
|
"print(weights[:4], weights_test[:4])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tests"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test sample_weights\n",
|
||||||
|
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Accuracy of Train without weights 0.9851716961498439\n",
|
||||||
|
"Accuracy of Train with weights 0.986732570239334\n",
|
||||||
|
"Accuracy of Tests without weights 0.9866504854368932\n",
|
||||||
|
"Accuracy of Tests with weights 0.9781553398058253\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"C = 23\n",
|
||||||
|
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
|
||||||
|
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test accuracy with different kernels\n",
|
||||||
|
"Compute accuracy on train and test set with default hyperparmeters of every kernel"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Time: 26.03s\tKernel: linear\tAccuracy_train: 0.9851716961498439\tAccuracy_test: 0.9866504854368932\n",
|
||||||
|
"Time: 0.54s\tKernel: rbf\tAccuracy_train: 0.9947970863683663\tAccuracy_test: 0.9878640776699029\n",
|
||||||
|
"Time: 0.43s\tKernel: poly\tAccuracy_train: 0.9960978147762747\tAccuracy_test: 0.9854368932038835\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"random_state=1\n",
|
||||||
|
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_train = clf.score(Xtrain, ytrain)\n",
|
||||||
|
" accuracy_test = clf.score(Xtest, ytest)\n",
|
||||||
|
" time_spent = time.time() - now\n",
|
||||||
|
" print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test diferent values of C"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"************** C=0.001 ****************************\n",
|
||||||
|
"Classifier's accuracy (train): 0.9828\n",
|
||||||
|
"Classifier's accuracy (test) : 0.9848\n",
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||||
|
"root - Down, <cgaf> - Leaf class=0 belief= 0.981716 impurity=0.1317 counts=(array([0, 1]), array([3490, 65]))\n",
|
||||||
|
"root - Up, <cgaf> - Leaf class=1 belief= 0.996540 impurity=0.0333 counts=(array([0, 1]), array([ 1, 288]))\n",
|
||||||
|
"\n",
|
||||||
|
"**************************************************\n",
|
||||||
|
"************** C=0.01 ****************************\n",
|
||||||
|
"Classifier's accuracy (train): 0.9834\n",
|
||||||
|
"Classifier's accuracy (test) : 0.9854\n",
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||||
|
"root - Down, <cgaf> - Leaf class=0 belief= 0.982269 impurity=0.1285 counts=(array([0, 1]), array([3490, 63]))\n",
|
||||||
|
"root - Up, <cgaf> - Leaf class=1 belief= 0.996564 impurity=0.0331 counts=(array([0, 1]), array([ 1, 290]))\n",
|
||||||
|
"\n",
|
||||||
|
"**************************************************\n",
|
||||||
|
"************** C=1 ****************************\n",
|
||||||
|
"Classifier's accuracy (train): 0.9847\n",
|
||||||
|
"Classifier's accuracy (test) : 0.9867\n",
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||||
|
"root - Down, <cgaf> - Leaf class=0 belief= 0.983371 impurity=0.1221 counts=(array([0, 1]), array([3489, 59]))\n",
|
||||||
|
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0584 counts=(array([0, 1]), array([ 2, 294]))\n",
|
||||||
|
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([2]))\n",
|
||||||
|
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([294]))\n",
|
||||||
|
"\n",
|
||||||
|
"**************************************************\n",
|
||||||
|
"************** C=5 ****************************\n",
|
||||||
|
"Classifier's accuracy (train): 0.9852\n",
|
||||||
|
"Classifier's accuracy (test) : 0.9867\n",
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||||
|
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||||
|
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||||
|
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||||
|
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||||
|
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||||
|
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||||
|
"\n",
|
||||||
|
"**************************************************\n",
|
||||||
|
"************** C=17 ****************************\n",
|
||||||
|
"Classifier's accuracy (train): 0.9852\n",
|
||||||
|
"Classifier's accuracy (test) : 0.9867\n",
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||||
|
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||||
|
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||||
|
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||||
|
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||||
|
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||||
|
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||||
|
"\n",
|
||||||
|
"**************************************************\n",
|
||||||
|
"64.5792 secs\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"t = time.time()\n",
|
||||||
|
"for C in (.001, .01, 1, 5, 17):\n",
|
||||||
|
" clf = Stree(C=C, kernel=\"linear\", random_state=random_state)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" print(f\"************** C={C} ****************************\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
||||||
|
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
||||||
|
" print(clf)\n",
|
||||||
|
" print(f\"**************************************************\")\n",
|
||||||
|
"print(f\"{time.time() - t:.4f} secs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test iterator\n",
|
||||||
|
"Check different weays of using the iterator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator\n",
|
||||||
|
"for i in list(clf):\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#check iterator again\n",
|
||||||
|
"for i in clf:\n",
|
||||||
|
" print(i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test STree is a sklearn estimator"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"1 functools.partial(<function check_no_attributes_set_in_init at 0x125acaee0>, 'Stree')\n",
|
||||||
|
"2 functools.partial(<function check_estimators_dtypes at 0x125ac7040>, 'Stree')\n",
|
||||||
|
"3 functools.partial(<function check_fit_score_takes_y at 0x125ac2ee0>, 'Stree')\n",
|
||||||
|
"4 functools.partial(<function check_sample_weights_pandas_series at 0x125ac0820>, 'Stree')\n",
|
||||||
|
"5 functools.partial(<function check_sample_weights_not_an_array at 0x125ac0940>, 'Stree')\n",
|
||||||
|
"6 functools.partial(<function check_sample_weights_list at 0x125ac0a60>, 'Stree')\n",
|
||||||
|
"7 functools.partial(<function check_sample_weights_shape at 0x125ac0b80>, 'Stree')\n",
|
||||||
|
"8 functools.partial(<function check_sample_weights_invariance at 0x125ac0ca0>, 'Stree')\n",
|
||||||
|
"9 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree')\n",
|
||||||
|
"10 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree', readonly_memmap=True)\n",
|
||||||
|
"11 functools.partial(<function check_complex_data at 0x125ac0e50>, 'Stree')\n",
|
||||||
|
"12 functools.partial(<function check_dtype_object at 0x125ac0dc0>, 'Stree')\n",
|
||||||
|
"13 functools.partial(<function check_estimators_empty_data_messages at 0x125ac7160>, 'Stree')\n",
|
||||||
|
"14 functools.partial(<function check_pipeline_consistency at 0x125ac2dc0>, 'Stree')\n",
|
||||||
|
"15 functools.partial(<function check_estimators_nan_inf at 0x125ac7280>, 'Stree')\n",
|
||||||
|
"16 functools.partial(<function check_estimators_overwrite_params at 0x125acadc0>, 'Stree')\n",
|
||||||
|
"17 functools.partial(<function check_estimator_sparse_data at 0x125ac0700>, 'Stree')\n",
|
||||||
|
"18 functools.partial(<function check_estimators_pickle at 0x125ac74c0>, 'Stree')\n",
|
||||||
|
"19 functools.partial(<function check_classifier_data_not_an_array at 0x125acd160>, 'Stree')\n",
|
||||||
|
"20 functools.partial(<function check_classifiers_one_label at 0x125ac7b80>, 'Stree')\n",
|
||||||
|
"21 functools.partial(<function check_classifiers_classes at 0x125aca5e0>, 'Stree')\n",
|
||||||
|
"22 functools.partial(<function check_estimators_partial_fit_n_features at 0x125ac75e0>, 'Stree')\n",
|
||||||
|
"23 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree')\n",
|
||||||
|
"24 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True)\n",
|
||||||
|
"25 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n",
|
||||||
|
"26 functools.partial(<function check_classifiers_regression_target at 0x125acdc10>, 'Stree')\n",
|
||||||
|
"27 functools.partial(<function check_supervised_y_no_nan at 0x125aab790>, 'Stree')\n",
|
||||||
|
"28 functools.partial(<function check_supervised_y_2d at 0x125aca280>, 'Stree')\n",
|
||||||
|
"29 functools.partial(<function check_estimators_unfitted at 0x125aca160>, 'Stree')\n",
|
||||||
|
"30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x125acd790>, 'Stree')\n",
|
||||||
|
"31 functools.partial(<function check_decision_proba_consistency at 0x125acdd30>, 'Stree')\n",
|
||||||
|
"32 functools.partial(<function check_fit2d_predict1d at 0x125ac23a0>, 'Stree')\n",
|
||||||
|
"33 functools.partial(<function check_methods_subset_invariance at 0x125ac2550>, 'Stree')\n",
|
||||||
|
"34 functools.partial(<function check_fit2d_1sample at 0x125ac2670>, 'Stree')\n",
|
||||||
|
"35 functools.partial(<function check_fit2d_1feature at 0x125ac2790>, 'Stree')\n",
|
||||||
|
"36 functools.partial(<function check_fit1d at 0x125ac28b0>, 'Stree')\n",
|
||||||
|
"37 functools.partial(<function check_get_params_invariance at 0x125acd9d0>, 'Stree')\n",
|
||||||
|
"38 functools.partial(<function check_set_params at 0x125acdaf0>, 'Stree')\n",
|
||||||
|
"39 functools.partial(<function check_dict_unchanged at 0x125ac0f70>, 'Stree')\n",
|
||||||
|
"40 functools.partial(<function check_dont_overwrite_parameters at 0x125ac2280>, 'Stree')\n",
|
||||||
|
"41 functools.partial(<function check_fit_idempotent at 0x125acdee0>, 'Stree')\n",
|
||||||
|
"42 functools.partial(<function check_n_features_in at 0x125acdf70>, 'Stree')\n",
|
||||||
|
"43 functools.partial(<function check_requires_y_none at 0x125ad1040>, 'Stree')\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Make checks one by one\n",
|
||||||
|
"c = 0\n",
|
||||||
|
"checks = check_estimator(Stree(), generate_only=True)\n",
|
||||||
|
"for check in checks:\n",
|
||||||
|
" c += 1\n",
|
||||||
|
" print(c, check[1])\n",
|
||||||
|
" check[1](check[0])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Check if the classifier is a sklearn estimator\n",
|
||||||
|
"check_estimator(Stree())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Compare to SVM"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"== Not Weighted ===\n",
|
||||||
|
"SVC train score ..: 0.9825702393340271\n",
|
||||||
|
"STree train score : 0.9841311134235172\n",
|
||||||
|
"SVC test score ...: 0.9830097087378641\n",
|
||||||
|
"STree test score .: 0.9848300970873787\n",
|
||||||
|
"==== Weighted =====\n",
|
||||||
|
"SVC train score ..: 0.9786680541103018\n",
|
||||||
|
"STree train score : 0.9802289281997919\n",
|
||||||
|
"SVC test score ...: 0.9805825242718447\n",
|
||||||
|
"STree test score .: 0.9817961165048543\n",
|
||||||
|
"*SVC test score ..: 0.9439939825655582\n",
|
||||||
|
"*STree test score : 0.9476832429673473\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)\n",
|
||||||
|
"svc.fit(Xtrain, ytrain)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain)\n",
|
||||||
|
"print(\"== Not Weighted ===\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"svc.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"clf.fit(Xtrain, ytrain, weights)\n",
|
||||||
|
"print(\"==== Weighted =====\")\n",
|
||||||
|
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
|
||||||
|
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
"print(\"*SVC test score ..:\", svc.score(Xtest, ytest, weights_test))\n",
|
||||||
|
"print(\"*STree test score :\", clf.score(Xtest, ytest, weights_test))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down, <cgaf> - Leaf class=0 belief= 0.990520 impurity=0.0773 counts=(array([0, 1]), array([3448, 33]))\nroot - Up, <cgaf> - Leaf class=1 belief= 0.881543 impurity=0.5249 counts=(array([0, 1]), array([ 43, 320]))\n\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(clf)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test max_features"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features None = 28\n",
|
||||||
|
"Train score : 0.9846514047866806\n",
|
||||||
|
"Test score .: 0.9866504854368932\n",
|
||||||
|
"Took 10.18 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features auto = 5\n",
|
||||||
|
"Train score : 0.9836108220603538\n",
|
||||||
|
"Test score .: 0.9842233009708737\n",
|
||||||
|
"Took 5.22 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features log2 = 4\n",
|
||||||
|
"Train score : 0.9791883454734651\n",
|
||||||
|
"Test score .: 0.9793689320388349\n",
|
||||||
|
"Took 2.05 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features 7 = 7\n",
|
||||||
|
"Train score : 0.9737252861602498\n",
|
||||||
|
"Test score .: 0.9739077669902912\n",
|
||||||
|
"Took 2.86 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features 0.5 = 14\n",
|
||||||
|
"Train score : 0.981789802289282\n",
|
||||||
|
"Test score .: 0.9824029126213593\n",
|
||||||
|
"Took 48.35 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features 0.1 = 2\n",
|
||||||
|
"Train score : 0.9638397502601457\n",
|
||||||
|
"Test score .: 0.9648058252427184\n",
|
||||||
|
"Took 0.35 seconds\n",
|
||||||
|
"****************************************\n",
|
||||||
|
"max_features 0.7 = 19\n",
|
||||||
|
"Train score : 0.9841311134235172\n",
|
||||||
|
"Test score .: 0.9860436893203883\n",
|
||||||
|
"Took 20.89 seconds\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for max_features in [None, \"auto\", \"log2\", 7, .5, .1, .7]:\n",
|
||||||
|
" now = time.time()\n",
|
||||||
|
" print(\"*\"*40)\n",
|
||||||
|
" clf = Stree(random_state=random_state, max_features=max_features)\n",
|
||||||
|
" clf.fit(Xtrain, ytrain)\n",
|
||||||
|
" print(f\"max_features {max_features} = {clf.max_features_}\")\n",
|
||||||
|
" print(\"Train score :\", clf.score(Xtrain, ytrain))\n",
|
||||||
|
" print(\"Test score .:\", clf.score(Xtest, ytest))\n",
|
||||||
|
" print(f\"Took {time.time() - now:.2f} seconds\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.4-final"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -1,225 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#\n",
|
|
||||||
"# Google Colab setup\n",
|
|
||||||
"#\n",
|
|
||||||
"#!pip install git+https://github.com/doctorado-ml/stree"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
|
||||||
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
|
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
|
||||||
"from stree import Stree\n",
|
|
||||||
"import time"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import os\n",
|
|
||||||
"if not os.path.isfile('data/creditcard.csv'):\n",
|
|
||||||
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
|
|
||||||
" !tar xzf creditcard.tgz"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 19,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"random_state=1\n",
|
|
||||||
"\n",
|
|
||||||
"def load_creditcard(n_examples=0):\n",
|
|
||||||
" import pandas as pd\n",
|
|
||||||
" import numpy as np\n",
|
|
||||||
" import random\n",
|
|
||||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
|
||||||
" y = df.Class\n",
|
|
||||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
|
||||||
" if n_examples > 0:\n",
|
|
||||||
" # Take first n_examples samples\n",
|
|
||||||
" X = X[:n_examples, :]\n",
|
|
||||||
" y = y[:n_examples, :]\n",
|
|
||||||
" else:\n",
|
|
||||||
" # Take all the positive samples with a number of random negatives\n",
|
|
||||||
" if n_examples < 0:\n",
|
|
||||||
" Xt = X[(y == 1).ravel()]\n",
|
|
||||||
" yt = y[(y == 1).ravel()]\n",
|
|
||||||
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
|
||||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
|
||||||
" y = np.append(yt, y[indices], axis=0)\n",
|
|
||||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
|
||||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
|
||||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
|
||||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
|
||||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
|
||||||
"\n",
|
|
||||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
|
||||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
|
||||||
"data = load_creditcard(-1000) # Take all true samples + 1000 random samples\n",
|
|
||||||
"\n",
|
|
||||||
"Xtrain = data[0]\n",
|
|
||||||
"Xtest = data[1]\n",
|
|
||||||
"ytrain = data[2]\n",
|
|
||||||
"ytest = data[3]\n",
|
|
||||||
"# Set sample weights inversely related to each class's frequency in the dataset\n",
|
|
||||||
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
|
|
||||||
"weights[ytrain==1] = 1.99755 "
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 21,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "stream",
|
|
||||||
"name": "stdout",
|
|
||||||
"text": "Accuracy of Train without weights 0.996415770609319\nAccuracy of Train with weights 0.994026284348865\nAccuracy of Tests without weights 0.9665738161559888\nAccuracy of Tests with weights 0.9721448467966574\n"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"C = 23\n",
|
|
||||||
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
|
|
||||||
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
|
|
||||||
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"tags": [
|
|
||||||
"outputPrepend"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"t = time.time()\n",
|
|
||||||
"for C in (.001, .01, 1, 5, 17):\n",
|
|
||||||
" clf = Stree(C=C, random_state=random_state)\n",
|
|
||||||
" clf.fit(Xtrain, ytrain)\n",
|
|
||||||
" print(f\"************** C={C} ****************************\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
|
||||||
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
|
||||||
" print(clf)\n",
|
|
||||||
" print(f\"**************************************************\")\n",
|
|
||||||
"print(f\"{time.time() - t:.4f} secs\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from sklearn.calibration import CalibratedClassifierCV\n",
|
|
||||||
"scaler = StandardScaler()\n",
|
|
||||||
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
|
|
||||||
"cclf.fit(Xtrain, ytrain)\n",
|
|
||||||
"res = cclf.predict_proba(Xtest)\n",
|
|
||||||
"print(res[:4, :])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#check iterator\n",
|
|
||||||
"for i in list(clf):\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#check iterator again\n",
|
|
||||||
"for i in clf:\n",
|
|
||||||
" print(i)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Check if the classifier is a sklearn estimator\n",
|
|
||||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
|
||||||
"check_estimator(Stree())"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Make checks one by one\n",
|
|
||||||
"c = 0\n",
|
|
||||||
"checks = check_estimator(Stree(), generate_only=True)\n",
|
|
||||||
"for check in checks:\n",
|
|
||||||
" c += 1\n",
|
|
||||||
" print(c, check[1])\n",
|
|
||||||
" check[1](check[0])"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.7.6-final"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
@@ -1,197 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"#\n",
|
|
||||||
"# Google Colab setup\n",
|
|
||||||
"#\n",
|
|
||||||
"#!pip install git+https://github.com/doctorado-ml/stree"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "ModuleNotFoundError",
|
|
||||||
"evalue": "No module named 'stree'",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-12-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import time\n",
|
|
||||||
"import random\n",
|
|
||||||
"import numpy as np\n",
|
|
||||||
"from sklearn.datasets import make_blobs\n",
|
|
||||||
"from sklearn.svm import LinearSVC\n",
|
|
||||||
"from stree import Stree, Stree_grapher"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def build_data(random_state):\n",
|
|
||||||
" random.seed(random_state)\n",
|
|
||||||
" X, y = make_blobs(centers=10, n_features=3, n_samples=500, random_state=random_state)\n",
|
|
||||||
" def make_binary(y):\n",
|
|
||||||
" for i in range(2, 10):\n",
|
|
||||||
" y[y==i] = random.randint(0, 1)\n",
|
|
||||||
" return y\n",
|
|
||||||
" y = make_binary(y)\n",
|
|
||||||
" #print(X.shape, np.unique(y), y[y==0].shape, y[y==1].shape)\n",
|
|
||||||
" return X, y"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'Stree_grapher' is not defined",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-4-b909470cb406>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m.01\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m#gr.save_all(save_folder='data/', save_prefix='7')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'Stree_grapher' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"X, y = build_data(10)\n",
|
|
||||||
"gr = Stree_grapher(dict(C=.01, max_iter=200))\n",
|
|
||||||
"gr.fit(X, y)\n",
|
|
||||||
"#gr.save_all(save_folder='data/', save_prefix='7')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'gr' is not defined",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-5-efa3db892bfd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"print(gr)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 6,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'gr' is not defined",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-6-0e62f081c9aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Agg'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_folder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'data/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import matplotlib\n",
|
|
||||||
"matplotlib.use('Agg')\n",
|
|
||||||
"gr.save_all(save_folder='data/')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'gr' is not defined",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-7-b0484cfe9d26>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m#%matplotlib inline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'widget'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tree_gr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_hyperplane\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
|
|
||||||
"#%matplotlib inline\n",
|
|
||||||
"%matplotlib widget\n",
|
|
||||||
"gr._tree_gr.plot_hyperplane()"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 8,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"output_type": "error",
|
|
||||||
"ename": "NameError",
|
|
||||||
"evalue": "name 'gr' is not defined",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
|
||||||
"\u001b[0;32m<ipython-input-8-4277c1aacbe2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'inline'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m#%matplotlib widget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|
||||||
"\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
|
|
||||||
"%matplotlib inline\n",
|
|
||||||
"#%matplotlib widget\n",
|
|
||||||
"gr.plot_all()"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.7.6-final"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
16
pyproject.toml
Normal file
16
pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
[tool.black]
|
||||||
|
line-length = 79
|
||||||
|
include = '\.pyi?$'
|
||||||
|
exclude = '''
|
||||||
|
/(
|
||||||
|
\.git
|
||||||
|
| \.hg
|
||||||
|
| \.mypy_cache
|
||||||
|
| \.tox
|
||||||
|
| \.venv
|
||||||
|
| _build
|
||||||
|
| buck-out
|
||||||
|
| build
|
||||||
|
| dist
|
||||||
|
)/
|
||||||
|
'''
|
@@ -1,5 +1,4 @@
|
|||||||
numpy
|
numpy
|
||||||
scikit-learn
|
scikit-learn==0.23.2
|
||||||
pandas
|
pandas
|
||||||
matplotlib
|
|
||||||
ipympl
|
ipympl
|
41
setup.py
41
setup.py
@@ -1,41 +1,36 @@
|
|||||||
import setuptools
|
import setuptools
|
||||||
|
|
||||||
__version__ = "0.9rc4"
|
__version__ = "0.9rc6"
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
|
|
||||||
|
|
||||||
def readme():
|
def readme():
|
||||||
with open('README.md') as f:
|
with open("README.md") as f:
|
||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='STree',
|
name="STree",
|
||||||
version=__version__,
|
version=__version__,
|
||||||
license='MIT License',
|
license="MIT License",
|
||||||
description='Oblique decision tree with svm nodes',
|
description="Oblique decision tree with svm nodes",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type='text/markdown',
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
url='https://github.com/doctorado-ml/stree',
|
url="https://github.com/doctorado-ml/stree",
|
||||||
author=__author__,
|
author=__author__,
|
||||||
author_email='ricardo.montanana@alu.uclm.es',
|
author_email="ricardo.montanana@alu.uclm.es",
|
||||||
keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\
|
keywords="scikit-learn oblique-classifier oblique-decision-tree decision-\
|
||||||
tree svm svc',
|
tree svm svc",
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
"Development Status :: 4 - Beta",
|
||||||
'License :: OSI Approved :: MIT License',
|
"License :: OSI Approved :: MIT License",
|
||||||
'Programming Language :: Python :: 3.7',
|
"Programming Language :: Python :: 3.8",
|
||||||
'Natural Language :: English',
|
"Natural Language :: English",
|
||||||
'Topic :: Scientific/Engineering :: Artificial Intelligence',
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
'Intended Audience :: Science/Research'
|
"Intended Audience :: Science/Research",
|
||||||
],
|
|
||||||
install_requires=[
|
|
||||||
'scikit-learn>=0.23.0',
|
|
||||||
'numpy',
|
|
||||||
'matplotlib',
|
|
||||||
'ipympl'
|
|
||||||
],
|
],
|
||||||
|
install_requires=["scikit-learn==0.23.2", "numpy", "ipympl"],
|
||||||
test_suite="stree.tests",
|
test_suite="stree.tests",
|
||||||
zip_safe=False
|
zip_safe=False,
|
||||||
)
|
)
|
||||||
|
737
stree/Strees.py
737
stree/Strees.py
@@ -1,20 +1,30 @@
|
|||||||
'''
|
"""
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
__version__ = "0.9"
|
__version__ = "0.9"
|
||||||
Build an oblique tree classifier based on SVM Trees
|
Build an oblique tree classifier based on SVM Trees
|
||||||
Uses LinearSVC
|
"""
|
||||||
'''
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import numbers
|
||||||
|
import random
|
||||||
|
import warnings
|
||||||
|
from math import log, factorial
|
||||||
|
from typing import Optional
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
|
from sklearn.utils import check_consistent_length
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
_check_sample_weight
|
from sklearn.utils.validation import (
|
||||||
|
check_X_y,
|
||||||
|
check_array,
|
||||||
|
check_is_fitted,
|
||||||
|
_check_sample_weight,
|
||||||
|
)
|
||||||
|
from sklearn.metrics._classification import _weighted_sum, _check_targets
|
||||||
|
|
||||||
|
|
||||||
class Snode:
|
class Snode:
|
||||||
@@ -22,23 +32,49 @@ class Snode:
|
|||||||
dataset assigned to it
|
dataset assigned to it
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray,
|
def __init__(
|
||||||
title: str):
|
self,
|
||||||
|
clf: SVC,
|
||||||
|
X: np.ndarray,
|
||||||
|
y: np.ndarray,
|
||||||
|
features: np.array,
|
||||||
|
impurity: float,
|
||||||
|
title: str,
|
||||||
|
weight: np.ndarray = None,
|
||||||
|
):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._vector = None if clf is None else clf.coef_
|
|
||||||
self._interceptor = 0. if clf is None else clf.intercept_
|
|
||||||
self._title = title
|
self._title = title
|
||||||
self._belief = 0.
|
self._belief = 0.0
|
||||||
# Only store dataset in Testing
|
# Only store dataset in Testing
|
||||||
self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None
|
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
self._y = y
|
self._y = y
|
||||||
self._down = None
|
self._down = None
|
||||||
self._up = None
|
self._up = None
|
||||||
self._class = None
|
self._class = None
|
||||||
|
self._feature = None
|
||||||
|
self._sample_weight = (
|
||||||
|
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
||||||
|
)
|
||||||
|
self._features = features
|
||||||
|
self._impurity = impurity
|
||||||
|
self._partition_column: int = -1
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def copy(cls, node: 'Snode') -> 'Snode':
|
def copy(cls, node: "Snode") -> "Snode":
|
||||||
return cls(node._clf, node._X, node._y, node._title)
|
return cls(
|
||||||
|
node._clf,
|
||||||
|
node._X,
|
||||||
|
node._y,
|
||||||
|
node._features,
|
||||||
|
node._impurity,
|
||||||
|
node._title,
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_partition_column(self, col: int):
|
||||||
|
self._partition_column = col
|
||||||
|
|
||||||
|
def get_partition_column(self) -> int:
|
||||||
|
return self._partition_column
|
||||||
|
|
||||||
def set_down(self, son):
|
def set_down(self, son):
|
||||||
self._down = son
|
self._down = son
|
||||||
@@ -49,10 +85,10 @@ class Snode:
|
|||||||
def is_leaf(self) -> bool:
|
def is_leaf(self) -> bool:
|
||||||
return self._up is None and self._down is None
|
return self._up is None and self._down is None
|
||||||
|
|
||||||
def get_down(self) -> 'Snode':
|
def get_down(self) -> "Snode":
|
||||||
return self._down
|
return self._down
|
||||||
|
|
||||||
def get_up(self) -> 'Snode':
|
def get_up(self) -> "Snode":
|
||||||
return self._up
|
return self._up
|
||||||
|
|
||||||
def make_predictor(self):
|
def make_predictor(self):
|
||||||
@@ -64,37 +100,38 @@ class Snode:
|
|||||||
classes, card = np.unique(self._y, return_counts=True)
|
classes, card = np.unique(self._y, return_counts=True)
|
||||||
if len(classes) > 1:
|
if len(classes) > 1:
|
||||||
max_card = max(card)
|
max_card = max(card)
|
||||||
min_card = min(card)
|
|
||||||
try:
|
|
||||||
self._belief = max_card / (max_card + min_card)
|
|
||||||
except ZeroDivisionError:
|
|
||||||
self._belief = 0.
|
|
||||||
self._class = classes[card == max_card][0]
|
self._class = classes[card == max_card][0]
|
||||||
|
self._belief = max_card / np.sum(card)
|
||||||
else:
|
else:
|
||||||
self._belief = 1
|
self._belief = 1
|
||||||
self._class = classes[0]
|
try:
|
||||||
|
self._class = classes[0]
|
||||||
|
except IndexError:
|
||||||
|
self._class = None
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
|
count_values = np.unique(self._y, return_counts=True)
|
||||||
if self.is_leaf():
|
if self.is_leaf():
|
||||||
count_values = np.unique(self._y, return_counts=True)
|
return (
|
||||||
result = f"{self._title} - Leaf class={self._class} belief="\
|
f"{self._title} - Leaf class={self._class} belief="
|
||||||
f"{self._belief: .6f} counts={count_values}"
|
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
||||||
return result
|
f"counts={count_values}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
return f"{self._title}"
|
return (
|
||||||
|
f"{self._title} feaures={self._features} impurity="
|
||||||
|
f"{self._impurity:.4f} "
|
||||||
|
f"counts={count_values}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Siterator:
|
class Siterator:
|
||||||
"""Stree preorder iterator
|
"""Stree preorder iterator"""
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, tree: Snode):
|
def __init__(self, tree: Snode):
|
||||||
self._stack = []
|
self._stack = []
|
||||||
self._push(tree)
|
self._push(tree)
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return self
|
|
||||||
|
|
||||||
def _push(self, node: Snode):
|
def _push(self, node: Snode):
|
||||||
if node is not None:
|
if node is not None:
|
||||||
self._stack.append(node)
|
self._stack.append(node)
|
||||||
@@ -108,49 +145,220 @@ class Siterator:
|
|||||||
return node
|
return node
|
||||||
|
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
class Splitter:
|
||||||
"""Estimator that is based on binary trees of svm nodes
|
def __init__(
|
||||||
can deal with sample_weights in predict, used in boosting sklearn methods
|
self,
|
||||||
inheriting from BaseEstimator implements get_params and set_params methods
|
clf: SVC = None,
|
||||||
inheriting from ClassifierMixin implement the attribute _estimator_type
|
criterion: str = None,
|
||||||
with "classifier" as value
|
splitter_type: str = None,
|
||||||
"""
|
criteria: str = None,
|
||||||
|
min_samples_split: int = None,
|
||||||
|
random_state=None,
|
||||||
|
):
|
||||||
|
self._clf = clf
|
||||||
|
self._random_state = random_state
|
||||||
|
if random_state is not None:
|
||||||
|
random.seed(random_state)
|
||||||
|
self._criterion = criterion
|
||||||
|
self._min_samples_split = min_samples_split
|
||||||
|
self._criteria = criteria
|
||||||
|
self._splitter_type = splitter_type
|
||||||
|
|
||||||
def __init__(self, C: float = 1.0, max_iter: int = 1000,
|
if clf is None:
|
||||||
random_state: int = None, max_depth: int = None,
|
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
||||||
tol: float = 1e-4, use_predictions: bool = False):
|
|
||||||
self.max_iter = max_iter
|
|
||||||
self.C = C
|
|
||||||
self.random_state = random_state
|
|
||||||
self.use_predictions = use_predictions
|
|
||||||
self.max_depth = max_depth
|
|
||||||
self.tol = tol
|
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
if criterion not in ["gini", "entropy"]:
|
||||||
"""Required by sklearn to tell that this estimator is a binary classifier
|
raise ValueError(
|
||||||
|
f"criterion must be gini or entropy got({criterion})"
|
||||||
|
)
|
||||||
|
|
||||||
:return: the tag required
|
if criteria not in [
|
||||||
:rtype: dict
|
"max_samples",
|
||||||
|
"impurity",
|
||||||
|
]:
|
||||||
|
raise ValueError(
|
||||||
|
f"criteria has to be max_samples or impurity; got ({criteria})"
|
||||||
|
)
|
||||||
|
|
||||||
|
if splitter_type not in ["random", "best"]:
|
||||||
|
raise ValueError(
|
||||||
|
f"splitter must be either random or best, got({splitter_type})"
|
||||||
|
)
|
||||||
|
self.criterion_function = getattr(self, f"_{self._criterion}")
|
||||||
|
self.decision_criteria = getattr(self, f"_{self._criteria}")
|
||||||
|
|
||||||
|
def partition_impurity(self, y: np.array) -> np.array:
|
||||||
|
return self.criterion_function(y)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _gini(y: np.array) -> float:
|
||||||
|
_, count = np.unique(y, return_counts=True)
|
||||||
|
return 1 - np.sum(np.square(count / np.sum(count)))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _entropy(y: np.array) -> float:
|
||||||
|
n_labels = len(y)
|
||||||
|
if n_labels <= 1:
|
||||||
|
return 0
|
||||||
|
counts = np.bincount(y)
|
||||||
|
proportions = counts / n_labels
|
||||||
|
n_classes = np.count_nonzero(proportions)
|
||||||
|
if n_classes <= 1:
|
||||||
|
return 0
|
||||||
|
entropy = 0.0
|
||||||
|
# Compute standard entropy.
|
||||||
|
for prop in proportions:
|
||||||
|
if prop != 0.0:
|
||||||
|
entropy -= prop * log(prop, n_classes)
|
||||||
|
return entropy
|
||||||
|
|
||||||
|
def information_gain(
|
||||||
|
self, labels: np.array, labels_up: np.array, labels_dn: np.array
|
||||||
|
) -> float:
|
||||||
|
imp_prev = self.criterion_function(labels)
|
||||||
|
card_up = card_dn = imp_up = imp_dn = 0
|
||||||
|
if labels_up is not None:
|
||||||
|
card_up = labels_up.shape[0]
|
||||||
|
imp_up = self.criterion_function(labels_up)
|
||||||
|
if labels_dn is not None:
|
||||||
|
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||||
|
imp_dn = self.criterion_function(labels_dn)
|
||||||
|
samples = card_up + card_dn
|
||||||
|
if samples == 0:
|
||||||
|
return 0.0
|
||||||
|
else:
|
||||||
|
result = (
|
||||||
|
imp_prev
|
||||||
|
- (card_up / samples) * imp_up
|
||||||
|
- (card_dn / samples) * imp_dn
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _select_best_set(
|
||||||
|
self, dataset: np.array, labels: np.array, features_sets: list
|
||||||
|
) -> list:
|
||||||
|
max_gain = 0
|
||||||
|
selected = None
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
for feature_set in features_sets:
|
||||||
|
self._clf.fit(dataset[:, feature_set], labels)
|
||||||
|
node = Snode(
|
||||||
|
self._clf, dataset, labels, feature_set, 0.0, "subset"
|
||||||
|
)
|
||||||
|
self.partition(dataset, node, train=True)
|
||||||
|
y1, y2 = self.part(labels)
|
||||||
|
gain = self.information_gain(labels, y1, y2)
|
||||||
|
if gain > max_gain:
|
||||||
|
max_gain = gain
|
||||||
|
selected = feature_set
|
||||||
|
return selected if selected is not None else feature_set
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _generate_spaces(features: int, max_features: int) -> list:
|
||||||
|
comb = set()
|
||||||
|
# Generate at most 5 combinations
|
||||||
|
if max_features == features:
|
||||||
|
set_length = 1
|
||||||
|
else:
|
||||||
|
number = factorial(features) / (
|
||||||
|
factorial(max_features) * factorial(features - max_features)
|
||||||
|
)
|
||||||
|
set_length = min(5, number)
|
||||||
|
while len(comb) < set_length:
|
||||||
|
comb.add(
|
||||||
|
tuple(sorted(random.sample(range(features), max_features)))
|
||||||
|
)
|
||||||
|
return list(comb)
|
||||||
|
|
||||||
|
def _get_subspaces_set(
|
||||||
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> np.array:
|
||||||
|
features_sets = self._generate_spaces(dataset.shape[1], max_features)
|
||||||
|
if len(features_sets) > 1:
|
||||||
|
if self._splitter_type == "random":
|
||||||
|
index = random.randint(0, len(features_sets) - 1)
|
||||||
|
return features_sets[index]
|
||||||
|
else:
|
||||||
|
return self._select_best_set(dataset, labels, features_sets)
|
||||||
|
else:
|
||||||
|
return features_sets[0]
|
||||||
|
|
||||||
|
def get_subspace(
|
||||||
|
self, dataset: np.array, labels: np.array, max_features: int
|
||||||
|
) -> tuple:
|
||||||
|
"""Return the best/random subspace to make a split"""
|
||||||
|
indices = self._get_subspaces_set(dataset, labels, max_features)
|
||||||
|
return dataset[:, indices], indices
|
||||||
|
|
||||||
|
def _impurity(self, data: np.array, y: np.array) -> np.array:
|
||||||
|
"""return column of dataset to be taken into account to split dataset
|
||||||
|
|
||||||
|
:param data: distances to hyper plane of every class
|
||||||
|
:type data: np.array (m, n_classes)
|
||||||
|
:param y: vector of labels (classes)
|
||||||
|
:type y: np.array (m,)
|
||||||
|
:return: column of dataset to be taken into account to split dataset
|
||||||
|
:rtype: int
|
||||||
"""
|
"""
|
||||||
return {'binary_only': True, 'requires_y': True}
|
max_gain = 0
|
||||||
|
selected = -1
|
||||||
|
for col in range(data.shape[1]):
|
||||||
|
tup = y[data[:, col] > 0]
|
||||||
|
tdn = y[data[:, col] <= 0]
|
||||||
|
info_gain = self.information_gain(y, tup, tdn)
|
||||||
|
if info_gain > max_gain:
|
||||||
|
selected = col
|
||||||
|
max_gain = info_gain
|
||||||
|
return selected
|
||||||
|
|
||||||
def _linear_function(self, data: np.array, node: Snode) -> np.array:
|
@staticmethod
|
||||||
"""Compute the distance of set of samples to a hyperplane, in
|
def _max_samples(data: np.array, y: np.array) -> np.array:
|
||||||
multiclass classification it should compute the distance to a
|
"""return column of dataset to be taken into account to split dataset
|
||||||
hyperplane of each class
|
|
||||||
|
|
||||||
:param data: dataset of samples
|
:param data: distances to hyper plane of every class
|
||||||
:type data: np.array
|
:type data: np.array (m, n_classes)
|
||||||
:param node: the node that contains the hyperplane coefficients
|
:param y: vector of labels (classes)
|
||||||
:type node: Snode
|
:type y: np.array (m,)
|
||||||
:return: array of distances of each sample to the hyperplane
|
:return: column of dataset to be taken into account to split dataset
|
||||||
:rtype: np.array
|
:rtype: int
|
||||||
"""
|
"""
|
||||||
coef = node._vector[0, :].reshape(-1, data.shape[1])
|
# select the class with max number of samples
|
||||||
return data.dot(coef.T) + node._interceptor[0]
|
_, samples = np.unique(y, return_counts=True)
|
||||||
|
return np.argmax(samples)
|
||||||
|
|
||||||
def _split_array(self, origin: np.array, down: np.array) -> list:
|
def partition(self, samples: np.array, node: Snode, train: bool):
|
||||||
"""Split an array in two based on indices passed as down and its complement
|
"""Set the criteria to split arrays. Compute the indices of the samples
|
||||||
|
that should go to one side of the tree (up)
|
||||||
|
|
||||||
|
"""
|
||||||
|
# data contains the distances of every sample to every class hyperplane
|
||||||
|
# array of (m, nc) nc = # classes
|
||||||
|
data = self._distances(node, samples)
|
||||||
|
if data.shape[0] < self._min_samples_split:
|
||||||
|
# there aren't enough samples to split
|
||||||
|
self._up = np.ones((data.shape[0]), dtype=bool)
|
||||||
|
return
|
||||||
|
if data.ndim > 1:
|
||||||
|
# split criteria for multiclass
|
||||||
|
# Convert data to a (m, 1) array selecting values for samples
|
||||||
|
if train:
|
||||||
|
# in train time we have to compute the column to take into
|
||||||
|
# account to split the dataset
|
||||||
|
col = self.decision_criteria(data, node._y)
|
||||||
|
node.set_partition_column(col)
|
||||||
|
else:
|
||||||
|
# in predict time just use the column computed in train time
|
||||||
|
# is taking the classifier of class <col>
|
||||||
|
col = node.get_partition_column()
|
||||||
|
if col == -1:
|
||||||
|
# No partition is producing information gain
|
||||||
|
data = np.ones(data.shape)
|
||||||
|
data = data[:, col]
|
||||||
|
self._up = data > 0
|
||||||
|
|
||||||
|
def part(self, origin: np.array) -> list:
|
||||||
|
"""Split an array in two based on indices (up) and its complement
|
||||||
|
partition has to be called first to establish up indices
|
||||||
|
|
||||||
:param origin: dataset to split
|
:param origin: dataset to split
|
||||||
:type origin: np.array
|
:type origin: np.array
|
||||||
@@ -159,88 +367,141 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:return: list with two splits of the array
|
:return: list with two splits of the array
|
||||||
:rtype: list
|
:rtype: list
|
||||||
"""
|
"""
|
||||||
up = ~down
|
down = ~self._up
|
||||||
return origin[up[:, 0]] if any(up) else None, \
|
return [
|
||||||
origin[down[:, 0]] if any(down) else None
|
origin[self._up] if any(self._up) else None,
|
||||||
|
origin[down] if any(down) else None,
|
||||||
|
]
|
||||||
|
|
||||||
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
|
@staticmethod
|
||||||
|
def _distances(node: Snode, data: np.ndarray) -> np.array:
|
||||||
"""Compute distances of the samples to the hyperplane of the node
|
"""Compute distances of the samples to the hyperplane of the node
|
||||||
|
|
||||||
:param node: node containing the svm classifier
|
:param node: node containing the svm classifier
|
||||||
:type node: Snode
|
:type node: Snode
|
||||||
:param data: samples to find out distance to hyperplane
|
:param data: samples to find out distance to hyperplane
|
||||||
:type data: np.ndarray
|
:type data: np.ndarray
|
||||||
:return: array of shape (m, 1) with the distances of every sample to
|
:return: array of shape (m, nc) with the distances of every sample to
|
||||||
the hyperplane of the node
|
the hyperplane of every class. nc = # of classes
|
||||||
:rtype: np.array
|
:rtype: np.array
|
||||||
"""
|
"""
|
||||||
if self.use_predictions:
|
return node._clf.decision_function(data[:, node._features])
|
||||||
res = np.expand_dims(node._clf.decision_function(data), 1)
|
|
||||||
else:
|
|
||||||
"""doesn't work with multiclass as each sample has to do inner
|
|
||||||
product with its own coefficients computes position of every
|
|
||||||
sample is w.r.t. the hyperplane
|
|
||||||
"""
|
|
||||||
res = self._linear_function(data, node)
|
|
||||||
return res
|
|
||||||
|
|
||||||
def _split_criteria(self, data: np.array) -> np.array:
|
|
||||||
"""Set the criteria to split arrays
|
|
||||||
|
|
||||||
:param data: [description]
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
:type data: np.array
|
"""Estimator that is based on binary trees of svm nodes
|
||||||
:return: [description]
|
can deal with sample_weights in predict, used in boosting sklearn methods
|
||||||
:rtype: np.array
|
inheriting from BaseEstimator implements get_params and set_params methods
|
||||||
|
inheriting from ClassifierMixin implement the attribute _estimator_type
|
||||||
|
with "classifier" as value
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
C: float = 1.0,
|
||||||
|
kernel: str = "linear",
|
||||||
|
max_iter: int = 1e5,
|
||||||
|
random_state: int = None,
|
||||||
|
max_depth: int = None,
|
||||||
|
tol: float = 1e-4,
|
||||||
|
degree: int = 3,
|
||||||
|
gamma="scale",
|
||||||
|
split_criteria: str = "impurity",
|
||||||
|
criterion: str = "entropy",
|
||||||
|
min_samples_split: int = 0,
|
||||||
|
max_features=None,
|
||||||
|
splitter: str = "random",
|
||||||
|
):
|
||||||
|
self.max_iter = max_iter
|
||||||
|
self.C = C
|
||||||
|
self.kernel = kernel
|
||||||
|
self.random_state = random_state
|
||||||
|
self.max_depth = max_depth
|
||||||
|
self.tol = tol
|
||||||
|
self.gamma = gamma
|
||||||
|
self.degree = degree
|
||||||
|
self.min_samples_split = min_samples_split
|
||||||
|
self.split_criteria = split_criteria
|
||||||
|
self.max_features = max_features
|
||||||
|
self.criterion = criterion
|
||||||
|
self.splitter = splitter
|
||||||
|
|
||||||
|
def _more_tags(self) -> dict:
|
||||||
|
"""Required by sklearn to supply features of the classifier
|
||||||
|
|
||||||
|
:return: the tag required
|
||||||
|
:rtype: dict
|
||||||
"""
|
"""
|
||||||
return data > 0
|
return {"requires_y": True}
|
||||||
|
|
||||||
def fit(self, X: np.ndarray, y: np.ndarray,
|
def fit(
|
||||||
sample_weight: np.array = None) -> 'Stree':
|
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
|
||||||
|
) -> "Stree":
|
||||||
"""Build the tree based on the dataset of samples and its labels
|
"""Build the tree based on the dataset of samples and its labels
|
||||||
|
|
||||||
|
:param X: dataset of samples to make predictions
|
||||||
|
:type X: np.array
|
||||||
|
:param y: samples labels
|
||||||
|
:type y: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Higher weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
:raises ValueError: if parameters C or max_depth are out of bounds
|
:raises ValueError: if parameters C or max_depth are out of bounds
|
||||||
:return: itself to be able to chain actions: fit().predict() ...
|
:return: itself to be able to chain actions: fit().predict() ...
|
||||||
:rtype: Stree
|
:rtype: Stree
|
||||||
"""
|
"""
|
||||||
# Check parameters are Ok.
|
# Check parameters are Ok.
|
||||||
if type(y).__name__ == 'np.ndarray':
|
|
||||||
y = y.ravel()
|
|
||||||
if self.C < 0:
|
if self.C < 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Penalty term must be positive... got (C={self.C:f})")
|
f"Penalty term must be positive... got (C={self.C:f})"
|
||||||
self.__max_depth = np.iinfo(
|
)
|
||||||
np.int32).max if self.max_depth is None else self.max_depth
|
self.__max_depth = (
|
||||||
|
np.iinfo(np.int32).max
|
||||||
|
if self.max_depth is None
|
||||||
|
else self.max_depth
|
||||||
|
)
|
||||||
if self.__max_depth < 1:
|
if self.__max_depth < 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||||
{self.max_depth})")
|
{self.max_depth})"
|
||||||
|
)
|
||||||
|
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
X, y = check_X_y(X, y)
|
X, y = check_X_y(X, y)
|
||||||
sample_weight = _check_sample_weight(sample_weight, X)
|
sample_weight = _check_sample_weight(
|
||||||
|
sample_weight, X, dtype=np.float64
|
||||||
|
)
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
# Initialize computed parameters
|
# Initialize computed parameters
|
||||||
|
self.splitter_ = Splitter(
|
||||||
|
clf=self._build_clf(),
|
||||||
|
criterion=self.criterion,
|
||||||
|
splitter_type=self.splitter,
|
||||||
|
criteria=self.split_criteria,
|
||||||
|
random_state=self.random_state,
|
||||||
|
min_samples_split=self.min_samples_split,
|
||||||
|
)
|
||||||
|
if self.random_state is not None:
|
||||||
|
random.seed(self.random_state)
|
||||||
self.classes_, y = np.unique(y, return_inverse=True)
|
self.classes_, y = np.unique(y, return_inverse=True)
|
||||||
|
self.n_classes_ = self.classes_.shape[0]
|
||||||
self.n_iter_ = self.max_iter
|
self.n_iter_ = self.max_iter
|
||||||
self.depth_ = 0
|
self.depth_ = 0
|
||||||
|
self.n_features_ = X.shape[1]
|
||||||
self.n_features_in_ = X.shape[1]
|
self.n_features_in_ = X.shape[1]
|
||||||
self.tree_ = self.train(X, y, sample_weight, 1, 'root')
|
self.max_features_ = self._initialize_max_features()
|
||||||
|
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
||||||
self._build_predictor()
|
self._build_predictor()
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _build_predictor(self):
|
def train(
|
||||||
"""Process the leaves to make them predictors
|
self,
|
||||||
"""
|
X: np.ndarray,
|
||||||
def run_tree(node: Snode):
|
y: np.ndarray,
|
||||||
if node.is_leaf():
|
sample_weight: np.ndarray,
|
||||||
node.make_predictor()
|
depth: int,
|
||||||
return
|
title: str,
|
||||||
run_tree(node.get_down())
|
) -> Optional[Snode]:
|
||||||
run_tree(node.get_up())
|
|
||||||
|
|
||||||
run_tree(self.tree_)
|
|
||||||
|
|
||||||
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray,
|
|
||||||
depth: int, title: str) -> Snode:
|
|
||||||
"""Recursive function to split the original dataset into predictor
|
"""Recursive function to split the original dataset into predictor
|
||||||
nodes (leaves)
|
nodes (leaves)
|
||||||
|
|
||||||
@@ -248,7 +509,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:type X: np.ndarray
|
:type X: np.ndarray
|
||||||
:param y: samples labels
|
:param y: samples labels
|
||||||
:type y: np.ndarray
|
:type y: np.ndarray
|
||||||
:param sample_weight: weight of samples (used in boosting)
|
:param sample_weight: weight of samples. Rescale C per sample.
|
||||||
|
Higher weights force the classifier to put more emphasis on these points.
|
||||||
:type sample_weight: np.ndarray
|
:type sample_weight: np.ndarray
|
||||||
:param depth: actual depth in the tree
|
:param depth: actual depth in the tree
|
||||||
:type depth: int
|
:type depth: int
|
||||||
@@ -261,25 +523,83 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
return None
|
return None
|
||||||
if np.unique(y).shape[0] == 1:
|
if np.unique(y).shape[0] == 1:
|
||||||
# only 1 class => pure dataset
|
# only 1 class => pure dataset
|
||||||
return Snode(None, X, y, title + ', <pure>')
|
return Snode(
|
||||||
|
clf=None,
|
||||||
|
X=X,
|
||||||
|
y=y,
|
||||||
|
features=X.shape[1],
|
||||||
|
impurity=0.0,
|
||||||
|
title=title + ", <pure>",
|
||||||
|
weight=sample_weight,
|
||||||
|
)
|
||||||
# Train the model
|
# Train the model
|
||||||
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
|
clf = self._build_clf()
|
||||||
C=self.C) # , sample_weight=sample_weight)
|
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||||
clf.fit(X, y, sample_weight=sample_weight)
|
# solve WARNING: class label 0 specified in weight is not found
|
||||||
tree = Snode(clf, X, y, title)
|
# in bagging
|
||||||
|
if any(sample_weight == 0):
|
||||||
|
indices = sample_weight == 0
|
||||||
|
y_next = y[~indices]
|
||||||
|
# touch weights if removing any class
|
||||||
|
if np.unique(y_next).shape[0] != self.n_classes_:
|
||||||
|
sample_weight += 1e-5
|
||||||
|
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||||
|
impurity = self.splitter_.partition_impurity(y)
|
||||||
|
node = Snode(clf, X, y, features, impurity, title, sample_weight)
|
||||||
self.depth_ = max(depth, self.depth_)
|
self.depth_ = max(depth, self.depth_)
|
||||||
down = self._split_criteria(self._distances(tree, X))
|
self.splitter_.partition(X, node, True)
|
||||||
X_U, X_D = self._split_array(X, down)
|
X_U, X_D = self.splitter_.part(X)
|
||||||
y_u, y_d = self._split_array(y, down)
|
y_u, y_d = self.splitter_.part(y)
|
||||||
sw_u, sw_d = self._split_array(sample_weight, down)
|
sw_u, sw_d = self.splitter_.part(sample_weight)
|
||||||
if X_U is None or X_D is None:
|
if X_U is None or X_D is None:
|
||||||
# didn't part anything
|
# didn't part anything
|
||||||
return Snode(clf, X, y, title + ', <cgaf>')
|
return Snode(
|
||||||
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
|
clf,
|
||||||
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
|
X,
|
||||||
return tree
|
y,
|
||||||
|
features=X.shape[1],
|
||||||
|
impurity=impurity,
|
||||||
|
title=title + ", <cgaf>",
|
||||||
|
weight=sample_weight,
|
||||||
|
)
|
||||||
|
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||||
|
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||||
|
return node
|
||||||
|
|
||||||
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
|
def _build_predictor(self):
|
||||||
|
"""Process the leaves to make them predictors"""
|
||||||
|
|
||||||
|
def run_tree(node: Snode):
|
||||||
|
if node.is_leaf():
|
||||||
|
node.make_predictor()
|
||||||
|
return
|
||||||
|
run_tree(node.get_down())
|
||||||
|
run_tree(node.get_up())
|
||||||
|
|
||||||
|
run_tree(self.tree_)
|
||||||
|
|
||||||
|
def _build_clf(self):
|
||||||
|
"""Build the correct classifier for the node"""
|
||||||
|
return (
|
||||||
|
LinearSVC(
|
||||||
|
max_iter=self.max_iter,
|
||||||
|
random_state=self.random_state,
|
||||||
|
C=self.C,
|
||||||
|
tol=self.tol,
|
||||||
|
)
|
||||||
|
if self.kernel == "linear"
|
||||||
|
else SVC(
|
||||||
|
kernel=self.kernel,
|
||||||
|
max_iter=self.max_iter,
|
||||||
|
tol=self.tol,
|
||||||
|
C=self.C,
|
||||||
|
gamma=self.gamma,
|
||||||
|
degree=self.degree,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _reorder_results(y: np.array, indices: np.array) -> np.array:
|
||||||
"""Reorder an array based on the array of indices passed
|
"""Reorder an array based on the array of indices passed
|
||||||
|
|
||||||
:param y: data untidy
|
:param y: data untidy
|
||||||
@@ -289,12 +609,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:return: array y ordered
|
:return: array y ordered
|
||||||
:rtype: np.array
|
:rtype: np.array
|
||||||
"""
|
"""
|
||||||
if y.ndim > 1 and y.shape[1] > 1:
|
# return array of same type given in y
|
||||||
# if predict_proba return np.array of floats
|
y_ordered = y.copy()
|
||||||
y_ordered = np.zeros(y.shape, dtype=float)
|
|
||||||
else:
|
|
||||||
# return array of same type given in y
|
|
||||||
y_ordered = y.copy()
|
|
||||||
indices = indices.astype(int)
|
indices = indices.astype(int)
|
||||||
for i, index in enumerate(indices):
|
for i, index in enumerate(indices):
|
||||||
y_ordered[index] = y[i]
|
y_ordered[index] = y[i]
|
||||||
@@ -308,108 +624,70 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:return: array of labels
|
:return: array of labels
|
||||||
:rtype: np.array
|
:rtype: np.array
|
||||||
"""
|
"""
|
||||||
def predict_class(xp: np.array, indices: np.array,
|
|
||||||
node: Snode) -> np.array:
|
def predict_class(
|
||||||
|
xp: np.array, indices: np.array, node: Snode
|
||||||
|
) -> np.array:
|
||||||
if xp is None:
|
if xp is None:
|
||||||
return [], []
|
return [], []
|
||||||
if node.is_leaf():
|
if node.is_leaf():
|
||||||
# set a class for every sample in dataset
|
# set a class for every sample in dataset
|
||||||
prediction = np.full((xp.shape[0], 1), node._class)
|
prediction = np.full((xp.shape[0], 1), node._class)
|
||||||
return prediction, indices
|
return prediction, indices
|
||||||
down = self._split_criteria(self._distances(node, xp))
|
self.splitter_.partition(xp, node, train=False)
|
||||||
X_U, X_D = self._split_array(xp, down)
|
x_u, x_d = self.splitter_.part(xp)
|
||||||
i_u, i_d = self._split_array(indices, down)
|
i_u, i_d = self.splitter_.part(indices)
|
||||||
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
|
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
|
||||||
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
|
prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
|
||||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||||
|
|
||||||
# sklearn check
|
# sklearn check
|
||||||
check_is_fitted(self, ['tree_'])
|
check_is_fitted(self, ["tree_"])
|
||||||
# Input validation
|
# Input validation
|
||||||
X = check_array(X)
|
X = check_array(X)
|
||||||
|
if X.shape[1] != self.n_features_:
|
||||||
|
raise ValueError(
|
||||||
|
f"Expected {self.n_features_} features but got "
|
||||||
|
f"({X.shape[1]})"
|
||||||
|
)
|
||||||
# setup prediction & make it happen
|
# setup prediction & make it happen
|
||||||
indices = np.arange(X.shape[0])
|
indices = np.arange(X.shape[0])
|
||||||
result = self._reorder_results(
|
result = (
|
||||||
*predict_class(X, indices, self.tree_)).astype(int).ravel()
|
self._reorder_results(*predict_class(X, indices, self.tree_))
|
||||||
|
.astype(int)
|
||||||
|
.ravel()
|
||||||
|
)
|
||||||
return self.classes_[result]
|
return self.classes_[result]
|
||||||
|
|
||||||
def predict_proba(self, X: np.array) -> np.array:
|
def score(
|
||||||
"""Computes an approximation of the probability of samples belonging to
|
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||||
class 0 and 1
|
) -> float:
|
||||||
:param X: dataset
|
|
||||||
:type X: np.array
|
|
||||||
:return: array of shape (m, num_classes), probability of being
|
|
||||||
each class
|
|
||||||
:rtype: np.array
|
|
||||||
"""
|
|
||||||
def predict_class(xp: np.array, indices: np.array, dist: np.array,
|
|
||||||
node: Snode) -> np.array:
|
|
||||||
"""Run the tree to compute predictions
|
|
||||||
|
|
||||||
:param xp: subdataset of samples
|
|
||||||
:type xp: np.array
|
|
||||||
:param indices: indices of subdataset samples to rebuild original
|
|
||||||
order
|
|
||||||
:type indices: np.array
|
|
||||||
:param dist: distances of every sample to the hyperplane or the
|
|
||||||
father node
|
|
||||||
:type dist: np.array
|
|
||||||
:param node: node of the leaf with the class
|
|
||||||
:type node: Snode
|
|
||||||
:return: array of labels and distances, array of indices
|
|
||||||
:rtype: np.array
|
|
||||||
"""
|
|
||||||
if xp is None:
|
|
||||||
return [], []
|
|
||||||
if node.is_leaf():
|
|
||||||
# set a class for every sample in dataset
|
|
||||||
prediction = np.full((xp.shape[0], 1), node._class)
|
|
||||||
prediction_proba = dist
|
|
||||||
return np.append(prediction, prediction_proba, axis=1), indices
|
|
||||||
distances = self._distances(node, xp)
|
|
||||||
down = self._split_criteria(distances)
|
|
||||||
X_U, X_D = self._split_array(xp, down)
|
|
||||||
i_u, i_d = self._split_array(indices, down)
|
|
||||||
di_u, di_d = self._split_array(distances, down)
|
|
||||||
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
|
|
||||||
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
|
|
||||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
|
||||||
|
|
||||||
# sklearn check
|
|
||||||
check_is_fitted(self, ['tree_'])
|
|
||||||
# Input validation
|
|
||||||
X = check_array(X)
|
|
||||||
# setup prediction & make it happen
|
|
||||||
indices = np.arange(X.shape[0])
|
|
||||||
empty_dist = np.empty((X.shape[0], 1), dtype=float)
|
|
||||||
result, indices = predict_class(X, indices, empty_dist, self.tree_)
|
|
||||||
result = result.reshape(X.shape[0], 2)
|
|
||||||
# Turn distances to hyperplane into probabilities based on fitting
|
|
||||||
# distances of samples to its hyperplane that classified them, to the
|
|
||||||
# sigmoid function
|
|
||||||
# Probability of being 1
|
|
||||||
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
|
|
||||||
# Probability of being 0
|
|
||||||
result[:, 0] = 1 - result[:, 1]
|
|
||||||
return self._reorder_results(result, indices)
|
|
||||||
|
|
||||||
def score(self, X: np.array, y: np.array) -> float:
|
|
||||||
"""Compute accuracy of the prediction
|
"""Compute accuracy of the prediction
|
||||||
|
|
||||||
:param X: dataset of samples to make predictions
|
:param X: dataset of samples to make predictions
|
||||||
:type X: np.array
|
:type X: np.array
|
||||||
:param y: samples labels
|
:param y: samples labels
|
||||||
:type y: np.array
|
:type y: np.array
|
||||||
|
:param sample_weight: weights of the samples. Rescale C per sample.
|
||||||
|
Higher weights force the classifier to put more emphasis on these points
|
||||||
|
:type sample_weight: np.array optional
|
||||||
:return: accuracy of the prediction
|
:return: accuracy of the prediction
|
||||||
:rtype: float
|
:rtype: float
|
||||||
"""
|
"""
|
||||||
# sklearn check
|
# sklearn check
|
||||||
check_is_fitted(self)
|
check_is_fitted(self)
|
||||||
yp = self.predict(X).reshape(y.shape)
|
check_classification_targets(y)
|
||||||
return np.mean(yp == y)
|
X, y = check_X_y(X, y)
|
||||||
|
y_pred = self.predict(X).reshape(y.shape)
|
||||||
|
# Compute accuracy for each possible representation
|
||||||
|
_, y_true, y_pred = _check_targets(y, y_pred)
|
||||||
|
check_consistent_length(y_true, y_pred, sample_weight)
|
||||||
|
score = y_true == y_pred
|
||||||
|
return _weighted_sum(score, sample_weight, normalize=True)
|
||||||
|
|
||||||
def __iter__(self) -> Siterator:
|
def __iter__(self) -> Siterator:
|
||||||
"""Create an iterator to be able to visit the nodes of the tree in preorder,
|
"""Create an iterator to be able to visit the nodes of the tree in
|
||||||
can make a list with all the nodes in preorder
|
preorder, can make a list with all the nodes in preorder
|
||||||
|
|
||||||
:return: an iterator, can for i in... and list(...)
|
:return: an iterator, can for i in... and list(...)
|
||||||
:rtype: Siterator
|
:rtype: Siterator
|
||||||
@@ -426,7 +704,38 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
:return: description of nodes in the tree in preorder
|
:return: description of nodes in the tree in preorder
|
||||||
:rtype: str
|
:rtype: str
|
||||||
"""
|
"""
|
||||||
output = ''
|
output = ""
|
||||||
for i in self:
|
for i in self:
|
||||||
output += str(i) + '\n'
|
output += str(i) + "\n"
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
def _initialize_max_features(self) -> int:
|
||||||
|
if isinstance(self.max_features, str):
|
||||||
|
if self.max_features == "auto":
|
||||||
|
max_features = max(1, int(np.sqrt(self.n_features_)))
|
||||||
|
elif self.max_features == "sqrt":
|
||||||
|
max_features = max(1, int(np.sqrt(self.n_features_)))
|
||||||
|
elif self.max_features == "log2":
|
||||||
|
max_features = max(1, int(np.log2(self.n_features_)))
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid value for max_features. "
|
||||||
|
"Allowed string values are 'auto', "
|
||||||
|
"'sqrt' or 'log2'."
|
||||||
|
)
|
||||||
|
elif self.max_features is None:
|
||||||
|
max_features = self.n_features_
|
||||||
|
elif isinstance(self.max_features, numbers.Integral):
|
||||||
|
max_features = self.max_features
|
||||||
|
else: # float
|
||||||
|
if self.max_features > 0.0:
|
||||||
|
max_features = max(
|
||||||
|
1, int(self.max_features * self.n_features_)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid value for max_features."
|
||||||
|
"Allowed float must be in range (0, 1] "
|
||||||
|
f"got ({self.max_features})"
|
||||||
|
)
|
||||||
|
return max_features
|
||||||
|
@@ -1,189 +0,0 @@
|
|||||||
'''
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
|
||||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
|
||||||
__license__ = "MIT"
|
|
||||||
__version__ = "0.9"
|
|
||||||
Plot 3D views of nodes in Stree
|
|
||||||
'''
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.decomposition import PCA
|
|
||||||
from mpl_toolkits.mplot3d import Axes3D
|
|
||||||
|
|
||||||
from .Strees import Stree, Snode, Siterator
|
|
||||||
|
|
||||||
|
|
||||||
class Snode_graph(Snode):
|
|
||||||
|
|
||||||
def __init__(self, node: Stree):
|
|
||||||
self._plot_size = (8, 8)
|
|
||||||
self._xlimits = (None, None)
|
|
||||||
self._ylimits = (None, None)
|
|
||||||
self._zlimits = (None, None)
|
|
||||||
n = Snode.copy(node)
|
|
||||||
super().__init__(n._clf, n._X, n._y, n._title)
|
|
||||||
|
|
||||||
def set_plot_size(self, size: tuple):
|
|
||||||
self._plot_size = size
|
|
||||||
|
|
||||||
def _is_pure(self) -> bool:
|
|
||||||
"""A leaf node with only one label is considered pure
|
|
||||||
"""
|
|
||||||
if self.is_leaf():
|
|
||||||
return self._belief == 1.
|
|
||||||
return False
|
|
||||||
|
|
||||||
def set_axis_limits(self, limits: tuple):
|
|
||||||
self._xlimits = limits[0]
|
|
||||||
self._ylimits = limits[1]
|
|
||||||
self._zlimits = limits[2]
|
|
||||||
|
|
||||||
def _set_graphics_axis(self, ax: Axes3D):
|
|
||||||
ax.set_xlim(self._xlimits)
|
|
||||||
ax.set_ylim(self._ylimits)
|
|
||||||
ax.set_zlim(self._zlimits)
|
|
||||||
|
|
||||||
def save_hyperplane(self, save_folder: str = './', save_prefix: str = '',
|
|
||||||
save_seq: int = 1):
|
|
||||||
_, fig = self.plot_hyperplane()
|
|
||||||
name = f"{save_folder}{save_prefix}STnode{save_seq}.png"
|
|
||||||
fig.savefig(name, bbox_inches='tight')
|
|
||||||
plt.close(fig)
|
|
||||||
|
|
||||||
def _get_cmap(self):
|
|
||||||
cmap = 'jet'
|
|
||||||
if self._is_pure() and self._class == 1:
|
|
||||||
cmap = 'jet_r'
|
|
||||||
return cmap
|
|
||||||
|
|
||||||
def _graph_title(self):
|
|
||||||
n_class, card = np.unique(self._y, return_counts=True)
|
|
||||||
return f"{self._title} {n_class} {card}"
|
|
||||||
|
|
||||||
def plot_hyperplane(self, plot_distribution: bool = True):
|
|
||||||
fig = plt.figure(figsize=self._plot_size)
|
|
||||||
ax = fig.add_subplot(1, 1, 1, projection='3d')
|
|
||||||
if not self._is_pure():
|
|
||||||
# Can't plot hyperplane of leaves with one label because it hasn't
|
|
||||||
# classifier
|
|
||||||
# get the splitting hyperplane
|
|
||||||
def hyperplane(x, y): return (-self._interceptor
|
|
||||||
- self._vector[0][0] * x
|
|
||||||
- self._vector[0][1] * y) \
|
|
||||||
/ self._vector[0][2]
|
|
||||||
|
|
||||||
tmpx = np.linspace(self._X[:, 0].min(), self._X[:, 0].max())
|
|
||||||
tmpy = np.linspace(self._X[:, 1].min(), self._X[:, 1].max())
|
|
||||||
xx, yy = np.meshgrid(tmpx, tmpy)
|
|
||||||
ax.plot_surface(xx, yy, hyperplane(xx, yy), alpha=.5,
|
|
||||||
antialiased=True, rstride=1, cstride=1,
|
|
||||||
cmap='seismic')
|
|
||||||
self._set_graphics_axis(ax)
|
|
||||||
if plot_distribution:
|
|
||||||
self.plot_distribution(ax)
|
|
||||||
else:
|
|
||||||
plt.title(self._graph_title())
|
|
||||||
plt.show()
|
|
||||||
return ax, fig
|
|
||||||
|
|
||||||
def plot_distribution(self, ax: Axes3D = None):
|
|
||||||
if ax is None:
|
|
||||||
fig = plt.figure(figsize=self._plot_size)
|
|
||||||
ax = fig.add_subplot(1, 1, 1, projection='3d')
|
|
||||||
plt.title(self._graph_title())
|
|
||||||
cmap = self._get_cmap()
|
|
||||||
ax.scatter(self._X[:, 0], self._X[:, 1],
|
|
||||||
self._X[:, 2], c=self._y, cmap=cmap)
|
|
||||||
ax.set_xlabel('X0')
|
|
||||||
ax.set_ylabel('X1')
|
|
||||||
ax.set_zlabel('X2')
|
|
||||||
plt.show()
|
|
||||||
|
|
||||||
|
|
||||||
class Stree_grapher(Stree):
|
|
||||||
"""Build 3d graphs of any dataset, if it's more than 3 features PCA shall
|
|
||||||
make its magic
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, params: dict):
|
|
||||||
self._plot_size = (8, 8)
|
|
||||||
self._tree_gr = None
|
|
||||||
# make Snode store X's
|
|
||||||
os.environ['TESTING'] = '1'
|
|
||||||
self._fitted = False
|
|
||||||
self._pca = None
|
|
||||||
super().__init__(**params)
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
try:
|
|
||||||
os.environ.pop('TESTING')
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
plt.close('all')
|
|
||||||
|
|
||||||
def _copy_tree(self, node: Snode) -> Snode_graph:
|
|
||||||
mirror = Snode_graph(node)
|
|
||||||
# clone node
|
|
||||||
mirror._class = node._class
|
|
||||||
mirror._belief = node._belief
|
|
||||||
if node.get_down() is not None:
|
|
||||||
mirror.set_down(self._copy_tree(node.get_down()))
|
|
||||||
if node.get_up() is not None:
|
|
||||||
mirror.set_up(self._copy_tree(node.get_up()))
|
|
||||||
return mirror
|
|
||||||
|
|
||||||
def fit(self, X: np.array, y: np.array) -> Stree:
|
|
||||||
"""Fit the Stree and copy the tree in a Snode_graph tree
|
|
||||||
|
|
||||||
:param X: Dataset
|
|
||||||
:type X: np.array
|
|
||||||
:param y: Labels
|
|
||||||
:type y: np.array
|
|
||||||
:return: Stree model
|
|
||||||
:rtype: Stree
|
|
||||||
"""
|
|
||||||
if X.shape[1] != 3:
|
|
||||||
self._pca = PCA(n_components=3)
|
|
||||||
X = self._pca.fit_transform(X)
|
|
||||||
res = super().fit(X, y)
|
|
||||||
self._tree_gr = self._copy_tree(self.tree_)
|
|
||||||
self._fitted = True
|
|
||||||
return res
|
|
||||||
|
|
||||||
def score(self, X: np.array, y: np.array) -> float:
|
|
||||||
self._check_fitted()
|
|
||||||
if X.shape[1] != 3:
|
|
||||||
X = self._pca.transform(X)
|
|
||||||
return super().score(X, y)
|
|
||||||
|
|
||||||
def _check_fitted(self):
|
|
||||||
if not self._fitted:
|
|
||||||
raise Exception('Have to fit the grapher first!')
|
|
||||||
|
|
||||||
def save_all(self, save_folder: str = './', save_prefix: str = ''):
|
|
||||||
"""Save all the node plots in png format, each with a sequence number
|
|
||||||
|
|
||||||
:param save_folder: folder where the plots are saved, defaults to './'
|
|
||||||
:type save_folder: str, optional
|
|
||||||
"""
|
|
||||||
self._check_fitted()
|
|
||||||
if not os.path.isdir(save_folder):
|
|
||||||
os.mkdir(save_folder)
|
|
||||||
seq = 1
|
|
||||||
for node in self:
|
|
||||||
node.save_hyperplane(save_folder=save_folder,
|
|
||||||
save_prefix=save_prefix, save_seq=seq)
|
|
||||||
seq += 1
|
|
||||||
|
|
||||||
def plot_all(self):
|
|
||||||
"""Plots all the nodes
|
|
||||||
"""
|
|
||||||
self._check_fitted()
|
|
||||||
for node in self:
|
|
||||||
node.plot_hyperplane()
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return Siterator(self._tree_gr)
|
|
@@ -1,2 +1,3 @@
|
|||||||
from .Strees import Stree, Snode, Siterator
|
from .Strees import Stree, Snode, Siterator, Splitter
|
||||||
from .Strees_grapher import Stree_grapher, Snode_graph
|
|
||||||
|
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||||
|
96
stree/tests/Snode_test.py
Normal file
96
stree/tests/Snode_test.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from stree import Stree, Snode
|
||||||
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
|
class Snode_test(unittest.TestCase):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self._random_state = 1
|
||||||
|
self._clf = Stree(random_state=self._random_state)
|
||||||
|
self._clf.fit(*load_dataset(self._random_state))
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUp(cls):
|
||||||
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
|
def test_attributes_in_leaves(self):
|
||||||
|
"""Check if the attributes in leaves have correct values so they form a
|
||||||
|
predictor
|
||||||
|
"""
|
||||||
|
|
||||||
|
def check_leave(node: Snode):
|
||||||
|
if not node.is_leaf():
|
||||||
|
check_leave(node.get_down())
|
||||||
|
check_leave(node.get_up())
|
||||||
|
return
|
||||||
|
# Check Belief in leaf
|
||||||
|
classes, card = np.unique(node._y, return_counts=True)
|
||||||
|
max_card = max(card)
|
||||||
|
min_card = min(card)
|
||||||
|
if len(classes) > 1:
|
||||||
|
belief = max_card / (max_card + min_card)
|
||||||
|
else:
|
||||||
|
belief = 1
|
||||||
|
self.assertEqual(belief, node._belief)
|
||||||
|
# Check Class
|
||||||
|
class_computed = classes[card == max_card]
|
||||||
|
self.assertEqual(class_computed, node._class)
|
||||||
|
# Check Partition column
|
||||||
|
self.assertEqual(node._partition_column, -1)
|
||||||
|
|
||||||
|
check_leave(self._clf.tree_)
|
||||||
|
|
||||||
|
def test_nodes_coefs(self):
|
||||||
|
"""Check if the nodes of the tree have the right attributes filled"""
|
||||||
|
|
||||||
|
def run_tree(node: Snode):
|
||||||
|
if node._belief < 1:
|
||||||
|
# only exclude pure leaves
|
||||||
|
self.assertIsNotNone(node._clf)
|
||||||
|
self.assertIsNotNone(node._clf.coef_)
|
||||||
|
if node.is_leaf():
|
||||||
|
return
|
||||||
|
run_tree(node.get_up())
|
||||||
|
run_tree(node.get_down())
|
||||||
|
|
||||||
|
model = Stree(self._random_state)
|
||||||
|
model.fit(*load_dataset(self._random_state, 3, 4))
|
||||||
|
run_tree(model.tree_)
|
||||||
|
|
||||||
|
def test_make_predictor_on_leaf(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||||
|
test.make_predictor()
|
||||||
|
self.assertEqual(1, test._class)
|
||||||
|
self.assertEqual(0.75, test._belief)
|
||||||
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
|
||||||
|
def test_make_predictor_on_not_leaf(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
|
||||||
|
test.set_up(Snode(None, [1], [1], [], 0.0, "another_test"))
|
||||||
|
test.make_predictor()
|
||||||
|
self.assertIsNone(test._class)
|
||||||
|
self.assertEqual(0, test._belief)
|
||||||
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
self.assertEqual(-1, test.get_up()._partition_column)
|
||||||
|
|
||||||
|
def test_make_predictor_on_leaf_bogus_data(self):
|
||||||
|
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
|
||||||
|
test.make_predictor()
|
||||||
|
self.assertIsNone(test._class)
|
||||||
|
self.assertEqual(-1, test._partition_column)
|
||||||
|
|
||||||
|
def test_copy_node(self):
|
||||||
|
px = [1, 2, 3, 4]
|
||||||
|
py = [1]
|
||||||
|
test = Snode(Stree(), px, py, [], 0.0, "test")
|
||||||
|
computed = Snode.copy(test)
|
||||||
|
self.assertListEqual(computed._X, px)
|
||||||
|
self.assertListEqual(computed._y, py)
|
||||||
|
self.assertEqual("test", computed._title)
|
||||||
|
self.assertIsInstance(computed._clf, Stree)
|
||||||
|
self.assertEqual(test._partition_column, computed._partition_column)
|
224
stree/tests/Splitter_test.py
Normal file
224
stree/tests/Splitter_test.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.datasets import load_wine, load_iris
|
||||||
|
from stree import Splitter
|
||||||
|
|
||||||
|
|
||||||
|
class Splitter_test(unittest.TestCase):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self._random_state = 1
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build(
|
||||||
|
clf=SVC,
|
||||||
|
min_samples_split=0,
|
||||||
|
splitter_type="random",
|
||||||
|
criterion="gini",
|
||||||
|
criteria="max_samples",
|
||||||
|
random_state=None,
|
||||||
|
):
|
||||||
|
return Splitter(
|
||||||
|
clf=clf(random_state=random_state, kernel="rbf"),
|
||||||
|
min_samples_split=min_samples_split,
|
||||||
|
splitter_type=splitter_type,
|
||||||
|
criterion=criterion,
|
||||||
|
criteria=criteria,
|
||||||
|
random_state=random_state,
|
||||||
|
)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUp(cls):
|
||||||
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
|
def test_init(self):
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
self.build(criterion="duck")
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
self.build(splitter_type="duck")
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
self.build(criteria="duck")
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
_ = Splitter(clf=None)
|
||||||
|
for splitter_type in ["best", "random"]:
|
||||||
|
for criterion in ["gini", "entropy"]:
|
||||||
|
for criteria in ["max_samples", "impurity"]:
|
||||||
|
tcl = self.build(
|
||||||
|
splitter_type=splitter_type,
|
||||||
|
criterion=criterion,
|
||||||
|
criteria=criteria,
|
||||||
|
)
|
||||||
|
self.assertEqual(splitter_type, tcl._splitter_type)
|
||||||
|
self.assertEqual(criterion, tcl._criterion)
|
||||||
|
self.assertEqual(criteria, tcl._criteria)
|
||||||
|
|
||||||
|
def test_gini(self):
|
||||||
|
expected_values = [
|
||||||
|
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
|
||||||
|
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
|
||||||
|
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
|
||||||
|
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
|
||||||
|
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
|
||||||
|
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
|
||||||
|
([0], 0),
|
||||||
|
([1, 1, 1, 1], 0),
|
||||||
|
]
|
||||||
|
for labels, expected in expected_values:
|
||||||
|
self.assertAlmostEqual(expected, Splitter._gini(labels))
|
||||||
|
tcl = self.build(criterion="gini")
|
||||||
|
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||||
|
|
||||||
|
def test_entropy(self):
|
||||||
|
expected_values = [
|
||||||
|
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
|
||||||
|
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
|
||||||
|
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
|
||||||
|
([0, 0, 1, 1, 1, 1, 0, 0], 1),
|
||||||
|
([0, 0, 1, 1, 2, 2, 3, 3], 1),
|
||||||
|
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
|
||||||
|
([1], 0),
|
||||||
|
([0, 0, 0, 0], 0),
|
||||||
|
]
|
||||||
|
for labels, expected in expected_values:
|
||||||
|
self.assertAlmostEqual(expected, Splitter._entropy(labels))
|
||||||
|
tcl = self.build(criterion="entropy")
|
||||||
|
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||||
|
|
||||||
|
def test_information_gain(self):
|
||||||
|
expected_values = [
|
||||||
|
(
|
||||||
|
[0, 1, 1, 1, 1, 1],
|
||||||
|
[0, 0, 0, 1],
|
||||||
|
0.16333333333333333,
|
||||||
|
0.25642589168200297,
|
||||||
|
),
|
||||||
|
(
|
||||||
|
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
|
||||||
|
[5, 3, 2, 1, 1],
|
||||||
|
0.007381776239907684,
|
||||||
|
-0.03328610916207225,
|
||||||
|
),
|
||||||
|
([], [], 0.0, 0.0),
|
||||||
|
([1], [], 0.0, 0.0),
|
||||||
|
([], [1], 0.0, 0.0),
|
||||||
|
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
|
||||||
|
([], [1, 1, 1, 2], 0.0, 0.0),
|
||||||
|
(None, [1, 2, 3], 0.0, 0.0),
|
||||||
|
([1, 2, 3], None, 0.0, 0.0),
|
||||||
|
]
|
||||||
|
for yu, yd, expected_gini, expected_entropy in expected_values:
|
||||||
|
yu = np.array(yu, dtype=np.int32) if yu is not None else None
|
||||||
|
yd = np.array(yd, dtype=np.int32) if yd is not None else None
|
||||||
|
if yu is not None and yd is not None:
|
||||||
|
complete = np.append(yu, yd)
|
||||||
|
elif yd is not None:
|
||||||
|
complete = yd
|
||||||
|
else:
|
||||||
|
complete = yu
|
||||||
|
tcl = self.build(criterion="gini")
|
||||||
|
computed = tcl.information_gain(complete, yu, yd)
|
||||||
|
self.assertAlmostEqual(expected_gini, computed)
|
||||||
|
tcl = self.build(criterion="entropy")
|
||||||
|
computed = tcl.information_gain(complete, yu, yd)
|
||||||
|
self.assertAlmostEqual(expected_entropy, computed)
|
||||||
|
|
||||||
|
def test_max_samples(self):
|
||||||
|
tcl = self.build(criteria="max_samples")
|
||||||
|
data = np.array(
|
||||||
|
[
|
||||||
|
[-0.1, 0.2, -0.3],
|
||||||
|
[0.7, 0.01, -0.1],
|
||||||
|
[0.7, -0.9, 0.5],
|
||||||
|
[0.1, 0.2, 0.3],
|
||||||
|
[-0.1, 0.2, 0.3],
|
||||||
|
[-0.1, 0.2, 0.3],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
expected = data[:, 0]
|
||||||
|
y = [1, 2, 1, 0, 0, 0]
|
||||||
|
computed = tcl._max_samples(data, y)
|
||||||
|
self.assertEqual(0, computed)
|
||||||
|
computed_data = data[:, computed]
|
||||||
|
self.assertEqual((6,), computed_data.shape)
|
||||||
|
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
||||||
|
|
||||||
|
def test_impurity(self):
|
||||||
|
tcl = self.build(criteria="impurity")
|
||||||
|
data = np.array(
|
||||||
|
[
|
||||||
|
[-0.1, 0.2, -0.3],
|
||||||
|
[0.7, 0.01, -0.1],
|
||||||
|
[0.7, -0.9, 0.5],
|
||||||
|
[0.1, 0.2, 0.3],
|
||||||
|
[-0.1, 0.2, 0.3],
|
||||||
|
[-0.1, 0.2, 0.3],
|
||||||
|
]
|
||||||
|
)
|
||||||
|
expected = data[:, 2]
|
||||||
|
y = np.array([1, 2, 1, 0, 0, 0])
|
||||||
|
computed = tcl._impurity(data, y)
|
||||||
|
self.assertEqual(2, computed)
|
||||||
|
computed_data = data[:, computed]
|
||||||
|
self.assertEqual((6,), computed_data.shape)
|
||||||
|
self.assertListEqual(expected.tolist(), computed_data.tolist())
|
||||||
|
|
||||||
|
def test_generate_subspaces(self):
|
||||||
|
features = 250
|
||||||
|
for max_features in range(2, features):
|
||||||
|
num = len(Splitter._generate_spaces(features, max_features))
|
||||||
|
self.assertEqual(5, num)
|
||||||
|
self.assertEqual(3, len(Splitter._generate_spaces(3, 2)))
|
||||||
|
self.assertEqual(4, len(Splitter._generate_spaces(4, 3)))
|
||||||
|
|
||||||
|
def test_best_splitter_few_sets(self):
|
||||||
|
X, y = load_iris(return_X_y=True)
|
||||||
|
X = np.delete(X, 3, 1)
|
||||||
|
tcl = self.build(splitter_type="best", random_state=self._random_state)
|
||||||
|
dataset, computed = tcl.get_subspace(X, y, max_features=2)
|
||||||
|
self.assertListEqual([0, 2], list(computed))
|
||||||
|
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
|
||||||
|
|
||||||
|
def test_splitter_parameter(self):
|
||||||
|
expected_values = [
|
||||||
|
[1, 4, 9, 12], # best entropy max_samples
|
||||||
|
[1, 3, 6, 10], # best entropy impurity
|
||||||
|
[6, 8, 10, 12], # best gini max_samples
|
||||||
|
[7, 8, 10, 11], # best gini impurity
|
||||||
|
[0, 3, 8, 12], # random entropy max_samples
|
||||||
|
[0, 3, 9, 11], # random entropy impurity
|
||||||
|
[0, 4, 7, 12], # random gini max_samples
|
||||||
|
[0, 2, 5, 6], # random gini impurity
|
||||||
|
]
|
||||||
|
X, y = load_wine(return_X_y=True)
|
||||||
|
rn = 0
|
||||||
|
for splitter_type in ["best", "random"]:
|
||||||
|
for criterion in ["entropy", "gini"]:
|
||||||
|
for criteria in [
|
||||||
|
"max_samples",
|
||||||
|
"impurity",
|
||||||
|
]:
|
||||||
|
tcl = self.build(
|
||||||
|
splitter_type=splitter_type,
|
||||||
|
criterion=criterion,
|
||||||
|
criteria=criteria,
|
||||||
|
)
|
||||||
|
expected = expected_values.pop(0)
|
||||||
|
random.seed(rn)
|
||||||
|
rn += 1
|
||||||
|
dataset, computed = tcl.get_subspace(X, y, max_features=4)
|
||||||
|
# print(
|
||||||
|
# "{}, # {:7s}{:8s}{:15s}".format(
|
||||||
|
# list(computed),
|
||||||
|
# splitter_type,
|
||||||
|
# criterion,
|
||||||
|
# criteria,
|
||||||
|
# )
|
||||||
|
# )
|
||||||
|
self.assertListEqual(expected, list(computed))
|
||||||
|
self.assertListEqual(
|
||||||
|
X[:, computed].tolist(), dataset.tolist()
|
||||||
|
)
|
444
stree/tests/Stree_test.py
Normal file
444
stree/tests/Stree_test.py
Normal file
@@ -0,0 +1,444 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.datasets import load_iris, load_wine
|
||||||
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
|
from stree import Stree, Snode
|
||||||
|
from .utils import load_dataset
|
||||||
|
|
||||||
|
|
||||||
|
class Stree_test(unittest.TestCase):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
self._random_state = 1
|
||||||
|
self._kernels = ["linear", "rbf", "poly"]
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def setUp(cls):
|
||||||
|
os.environ["TESTING"] = "1"
|
||||||
|
|
||||||
|
def _check_tree(self, node: Snode):
|
||||||
|
"""Check recursively that the nodes that are not leaves have the
|
||||||
|
correct number of labels and its sons have the right number of elements
|
||||||
|
in their dataset
|
||||||
|
|
||||||
|
Arguments:
|
||||||
|
node {Snode} -- node to check
|
||||||
|
"""
|
||||||
|
if node.is_leaf():
|
||||||
|
return
|
||||||
|
y_prediction = node._clf.predict(node._X)
|
||||||
|
y_down = node.get_down()._y
|
||||||
|
y_up = node.get_up()._y
|
||||||
|
# Is it a correct partition in terms of cardinality?
|
||||||
|
# i.e. The partition algorithm didn't forget any sample
|
||||||
|
self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
|
||||||
|
unique_y, count_y = np.unique(node._y, return_counts=True)
|
||||||
|
_, count_d = np.unique(y_down, return_counts=True)
|
||||||
|
_, count_u = np.unique(y_up, return_counts=True)
|
||||||
|
#
|
||||||
|
for i in unique_y:
|
||||||
|
number_up = count_u[i]
|
||||||
|
try:
|
||||||
|
number_down = count_d[i]
|
||||||
|
except IndexError:
|
||||||
|
number_down = 0
|
||||||
|
self.assertEqual(count_y[i], number_down + number_up)
|
||||||
|
# Is the partition made the same as the prediction?
|
||||||
|
# as the node is not a leaf...
|
||||||
|
_, count_yp = np.unique(y_prediction, return_counts=True)
|
||||||
|
self.assertEqual(count_yp[1], y_up.shape[0])
|
||||||
|
self.assertEqual(count_yp[0], y_down.shape[0])
|
||||||
|
self._check_tree(node.get_down())
|
||||||
|
self._check_tree(node.get_up())
|
||||||
|
|
||||||
|
def test_build_tree(self):
|
||||||
|
"""Check if the tree is built the same way as predictions of models"""
|
||||||
|
warnings.filterwarnings("ignore")
|
||||||
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
clf.fit(*load_dataset(self._random_state))
|
||||||
|
self._check_tree(clf.tree_)
|
||||||
|
|
||||||
|
def test_single_prediction(self):
|
||||||
|
X, y = load_dataset(self._random_state)
|
||||||
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
|
||||||
|
self.assertEqual(yp[0], y[0])
|
||||||
|
|
||||||
|
def test_multiple_prediction(self):
|
||||||
|
# For the first 27 elements the predictions are the same as the truth
|
||||||
|
num = 27
|
||||||
|
X, y = load_dataset(self._random_state)
|
||||||
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
yp = clf.fit(X, y).predict(X[:num, :])
|
||||||
|
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||||
|
|
||||||
|
def test_single_vs_multiple_prediction(self):
|
||||||
|
"""Check if predicting sample by sample gives the same result as
|
||||||
|
predicting all samples at once
|
||||||
|
"""
|
||||||
|
X, y = load_dataset(self._random_state)
|
||||||
|
for kernel in self._kernels:
|
||||||
|
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||||
|
clf.fit(X, y)
|
||||||
|
# Compute prediction line by line
|
||||||
|
yp_line = np.array([], dtype=int)
|
||||||
|
for xp in X:
|
||||||
|
yp_line = np.append(
|
||||||
|
yp_line, clf.predict(xp.reshape(-1, X.shape[1]))
|
||||||
|
)
|
||||||
|
# Compute prediction at once
|
||||||
|
yp_once = clf.predict(X)
|
||||||
|
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||||
|
|
||||||
|
def test_iterator_and_str(self):
|
||||||
|
"""Check preorder iterator"""
|
||||||
|
expected = [
|
||||||
|
"root feaures=(0, 1, 2) impurity=1.0000 counts=(array([0, 1]), arr"
|
||||||
|
"ay([750, 750]))",
|
||||||
|
"root - Down, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.37"
|
||||||
|
"22 counts=(array([0, 1]), array([725, 56]))",
|
||||||
|
"root - Up feaures=(0, 1, 2) impurity=0.2178 counts=(array([0, 1])"
|
||||||
|
", array([ 25, 694]))",
|
||||||
|
"root - Up - Down feaures=(0, 1, 2) impurity=0.8454 counts=(array("
|
||||||
|
"[0, 1]), array([8, 3]))",
|
||||||
|
"root - Up - Down - Down, <pure> - Leaf class=0 belief= 1.000000 i"
|
||||||
|
"mpurity=0.0000 counts=(array([0]), array([7]))",
|
||||||
|
"root - Up - Down - Up, <cgaf> - Leaf class=1 belief= 0.750000 imp"
|
||||||
|
"urity=0.8113 counts=(array([0, 1]), array([1, 3]))",
|
||||||
|
"root - Up - Up, <cgaf> - Leaf class=1 belief= 0.975989 impurity=0"
|
||||||
|
".1634 counts=(array([0, 1]), array([ 17, 691]))",
|
||||||
|
]
|
||||||
|
computed = []
|
||||||
|
expected_string = ""
|
||||||
|
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||||
|
clf.fit(*load_dataset(self._random_state))
|
||||||
|
for node in clf:
|
||||||
|
computed.append(str(node))
|
||||||
|
expected_string += str(node) + "\n"
|
||||||
|
self.assertListEqual(expected, computed)
|
||||||
|
self.assertEqual(expected_string, str(clf))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def test_is_a_sklearn_classifier():
|
||||||
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
|
from sklearn.utils.estimator_checks import check_estimator
|
||||||
|
|
||||||
|
check_estimator(Stree())
|
||||||
|
|
||||||
|
def test_exception_if_C_is_negative(self):
|
||||||
|
tclf = Stree(C=-1)
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
tclf.fit(*load_dataset(self._random_state))
|
||||||
|
|
||||||
|
def test_exception_if_bogus_split_criteria(self):
    """Fitting with an unknown ``split_criteria`` must raise ValueError."""
    bogus_clf = Stree(split_criteria="duck")
    self.assertRaises(
        ValueError,
        lambda: bogus_clf.fit(*load_dataset(self._random_state)),
    )
|
||||||
|
|
||||||
|
def test_check_max_depth_is_positive_or_None(self):
    """``max_depth`` defaults to None, keeps 1, and -1 fails on fit."""
    self.assertIsNone(Stree().max_depth)
    self.assertGreaterEqual(1, Stree(max_depth=1).max_depth)

    def build_and_fit_negative_depth():
        # Construction and fit both happen inside the checked callable.
        Stree(max_depth=-1).fit(*load_dataset(self._random_state))

    self.assertRaises(ValueError, build_and_fit_negative_depth)
|
||||||
|
|
||||||
|
def test_check_max_depth(self):
    """The fitted ``depth_`` equals the requested ``max_depth`` (3, 4)."""
    for requested_depth in (3, 4):
        fitted = Stree(
            random_state=self._random_state, max_depth=requested_depth
        )
        fitted.fit(*load_dataset(self._random_state))
        self.assertEqual(requested_depth, fitted.depth_)
|
||||||
|
|
||||||
|
def test_unfitted_tree_is_iterable(self):
    """Iterating a classifier that was never fitted yields no nodes."""
    nodes_before_fit = list(Stree())
    self.assertEqual(0, len(nodes_before_fit))
|
||||||
|
|
||||||
|
def test_min_samples_split(self):
    """Three samples split the root with threshold 3 but not with 4."""
    samples, labels = [[1], [2], [3]], [1, 1, 0]
    # (min_samples_split, assertion applied to both children of the root)
    scenarios = (
        (3, self.assertIsNotNone),
        (4, self.assertIsNone),
    )
    for threshold, check_child in scenarios:
        root = Stree(min_samples_split=threshold).fit(samples, labels).tree_
        check_child(root.get_down())
        check_child(root.get_up())
|
||||||
|
|
||||||
|
def test_simple_muticlass_dataset(self):
    """Every kernel must fit a three-sample, three-class toy set exactly."""
    px = [[1, 2], [5, 6], [9, 10]]
    py = [0, 1, 2]
    for kernel in self._kernels:
        model = Stree(
            kernel=kernel,
            split_criteria="max_samples",
            random_state=self._random_state,
        )
        model.fit(px, py)
        self.assertEqual(1.0, model.score(px, py))
        predicted = model.predict(px).tolist()
        self.assertListEqual(py, predicted)
        known_classes = model.classes_.tolist()
        self.assertListEqual(py, known_classes)
|
||||||
|
|
||||||
|
def test_muticlass_dataset(self):
    """Check training accuracy on multiclass data for every kernel.

    Two datasets are used: a synthetic three-class set from
    ``load_dataset`` and scikit-learn's wine dataset. Each
    criteria/kernel combination runs in its own ``subTest`` so that a
    failure reports which combination broke and the rest still run.
    """
    datasets = {
        "Synt": load_dataset(random_state=self._random_state, n_classes=3),
        # Loaded with load_wine, so it is keyed as such.
        "Wine": load_wine(return_X_y=True),
    }
    outcomes = {
        "Synt": {
            "max_samples linear": 0.9606666666666667,
            "max_samples rbf": 0.7133333333333334,
            "max_samples poly": 0.49066666666666664,
            "impurity linear": 0.9606666666666667,
            "impurity rbf": 0.7133333333333334,
            "impurity poly": 0.49066666666666664,
        },
        "Wine": {
            "max_samples linear": 1.0,
            "max_samples rbf": 0.6910112359550562,
            "max_samples poly": 0.6966292134831461,
            "impurity linear": 1,
            "impurity rbf": 0.6910112359550562,
            "impurity poly": 0.6966292134831461,
        },
    }

    for name, (px, py) in datasets.items():
        for criteria in ["max_samples", "impurity"]:
            for kernel in self._kernels:
                # NOTE(review): ``criteria`` only selects the expected
                # value; it is never passed to Stree (split_criteria), so
                # both criteria exercise the same model. Confirm whether
                # split_criteria=criteria was intended; the expected
                # scores may need regenerating if it is added.
                with self.subTest(
                    dataset=name, criteria=criteria, kernel=kernel
                ):
                    clf = Stree(
                        C=55,
                        max_iter=1e5,
                        kernel=kernel,
                        random_state=self._random_state,
                    )
                    clf.fit(px, py)
                    outcome = outcomes[name][f"{criteria} {kernel}"]
                    self.assertAlmostEqual(outcome, clf.score(px, py))
|
||||||
|
|
||||||
|
def test_max_features(self):
    """Map each ``max_features`` setting to a feature count out of 16."""
    total_features = 16
    # (max_features setting, number of features expected)
    valid_cases = [
        ("auto", 4),
        ("log2", 4),
        ("sqrt", 4),
        (0.5, 8),
        (3, 3),
        (None, 16),
    ]
    clf = Stree()
    clf.n_features_ = total_features
    for setting, wanted in valid_cases:
        clf.set_params(max_features=setting)
        self.assertEqual(wanted, clf._initialize_max_features())
    # Check bogus max_features
    for bogus in ("duck", -0.1, 0.0):
        clf.set_params(max_features=bogus)
        self.assertRaises(ValueError, clf._initialize_max_features)
|
||||||
|
|
||||||
|
def test_get_subspaces(self):
    """Check the size and content of the feature subspace per setting.

    The random input is drawn from a generator seeded with
    ``self._random_state`` so that every run sees the same data.
    """
    rng = np.random.RandomState(self._random_state)
    dataset = rng.random_sample((10, 16))
    y = rng.randint(0, 2, 10)
    # (max_features setting, number of columns expected in the subspace)
    expected_values = [
        ("auto", 4),
        ("log2", 4),
        ("sqrt", 4),
        (0.5, 8),
        (3, 3),
        (None, 16),
    ]
    clf = Stree()
    for max_features, expected in expected_values:
        clf.set_params(max_features=max_features)
        clf.fit(dataset, y)
        computed, indices = clf.splitter_.get_subspace(
            dataset, y, clf.max_features_
        )
        # The returned data must be exactly the selected columns.
        self.assertListEqual(
            dataset[:, indices].tolist(), computed.tolist()
        )
        self.assertEqual(expected, len(indices))
|
||||||
|
|
||||||
|
def test_bogus_criterion(self):
    """Fitting with an unknown ``criterion`` must raise ValueError."""
    bogus_clf = Stree(criterion="duck")
    self.assertRaises(ValueError, lambda: bogus_clf.fit(*load_dataset()))
|
||||||
|
|
||||||
|
def test_predict_feature_dimensions(self):
    """Predicting with fewer columns than were fitted raises ValueError.

    The random input is drawn from a generator seeded with
    ``self._random_state`` so that every run sees the same data.
    """
    rng = np.random.RandomState(self._random_state)
    X = rng.random_sample((10, 5))
    y = rng.randint(0, 2, 10)
    clf = Stree()
    clf.fit(X, y)
    with self.assertRaises(ValueError):
        # Only 3 of the 5 fitted features are supplied.
        clf.predict(X[:, :3])
|
||||||
|
|
||||||
|
# Tests of score
|
||||||
|
|
||||||
|
def test_score_binary(self):
    """``score`` equals the mean of correct predictions, per kernel."""
    X, y = load_dataset(self._random_state)
    # Expected accuracies, in the same order as self._kernels.
    reference_accuracies = [
        0.9506666666666667,
        0.9606666666666667,
        0.9433333333333334,
    ]
    for kernel, reference in zip(self._kernels, reference_accuracies):
        model = Stree(random_state=self._random_state, kernel=kernel)
        model.fit(X, y)
        reported = model.score(X, y)
        hits = model.predict(X) == y
        self.assertEqual(reported, np.mean(hits))
        self.assertAlmostEqual(reference, reported)
|
||||||
|
|
||||||
|
def test_score_max_features(self):
    """Check the training accuracy obtained with ``max_features=2``."""
    features, labels = load_dataset(self._random_state)
    model = Stree(random_state=self._random_state, max_features=2)
    model.fit(features, labels)
    accuracy = model.score(features, labels)
    self.assertAlmostEqual(0.9246666666666666, accuracy)
|
||||||
|
|
||||||
|
def test_bogus_splitter_parameter(self):
    """Fitting with an unknown ``splitter`` must raise ValueError."""
    bogus_clf = Stree(splitter="duck")
    self.assertRaises(ValueError, lambda: bogus_clf.fit(*load_dataset()))
|
||||||
|
|
||||||
|
def test_weights_removing_class(self):
    """Zero weights that wipe out a class get epsilon added by ``train``."""
    # This patch solves an stderr message from sklearn svm lib
    # "WARNING: class label x specified in weight is not found"
    X = np.array(
        [
            [0.1, 0.1],
            [0.1, 0.2],
            [0.2, 0.1],
            [5, 6],
            [8, 9],
            [6, 7],
            [0.2, 0.2],
        ]
    )
    y = np.array([0, 0, 0, 1, 1, 1, 0])
    epsilon = 1e-5
    # Every sample of class 1 has weight zero here.
    weights = np.array([1, 1, 1, 0, 0, 0, 1], dtype="float64")
    weights_epsilon = (weights + epsilon).tolist()
    # Class 1 keeps one non-zero weight here.
    weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
    untouched = weights_no_zero.copy()
    clf = Stree()
    clf.fit(X, y)
    node = clf.train(X, y, weights, 1, "test")
    # if a class is lost with zero weights the patch adds epsilon
    self.assertListEqual(weights.tolist(), weights_epsilon)
    self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
    # zero weights are ok when they don't erase a class
    clf.train(X, y, weights_no_zero, 1, "test")
    self.assertListEqual(weights_no_zero.tolist(), untouched.tolist())
|
||||||
|
|
||||||
|
def test_multiclass_classifier_integrity(self):
    """Checks if the multiclass operation is done right

    Compares the first splits of an Stree fitted on iris with the
    partition given by a LinearSVC decision function on the same data.
    """
    X, y = load_iris(return_X_y=True)
    clf = Stree(random_state=0)
    clf.fit(X, y)
    score = clf.score(X, y)
    # Check accuracy of the whole model
    self.assertAlmostEqual(0.98, score, 5)
    svm = LinearSVC(random_state=0)
    svm.fit(X, y)
    self.assertAlmostEqual(0.9666666666666667, svm.score(X, y), 5)
    data = svm.decision_function(X)
    # Expected gini value for each column of the binarized decisions.
    expected = [
        0.4444444444444444,
        0.35777777777777775,
        0.4569777777777778,
    ]
    # Binarize the decision function: positive side -> 1, otherwise 0.
    ty = data.copy()
    ty[data <= 0] = 0
    ty[data > 0] = 1
    ty = ty.astype(int)
    for column, gini_expected in enumerate(expected):
        self.assertAlmostEqual(
            gini_expected,
            clf.splitter_._gini(ty[:, column]),
        )
    # 1st Branch
    # up should have 53 samples of classes [1, 2] [3, 50]
    # down should have 97 samples of classes [0, 1] [50, 47]
    up = data[:, 2] > 0
    resup = np.unique(y[up], return_counts=True)
    resdn = np.unique(y[~up], return_counts=True)
    self.assertListEqual([1, 2], resup[0].tolist())
    self.assertListEqual([3, 50], resup[1].tolist())
    self.assertListEqual([0, 1], resdn[0].tolist())
    self.assertListEqual([50, 47], resdn[1].tolist())
    # 2nd Branch
    # up should have 53 samples of classes [1, 2] [3, 50]
    # down should have 47 samples of class 1
    node_up = clf.tree_.get_down().get_up()
    node_dn = clf.tree_.get_down().get_down()
    resup = np.unique(node_up._y, return_counts=True)
    resdn = np.unique(node_dn._y, return_counts=True)
    self.assertListEqual([1, 2], resup[0].tolist())
    self.assertListEqual([3, 50], resup[1].tolist())
    self.assertListEqual([1], resdn[0].tolist())
    self.assertListEqual([47], resdn[1].tolist())
|
||||||
|
|
||||||
|
def test_score_multiclass_rbf(self):
    """Check rbf-kernel training accuracy on synthetic and wine data.

    Scores are compared with ``assertAlmostEqual`` (7 decimal places),
    as the other score tests do, instead of exact float equality.
    """
    X, y = load_dataset(
        random_state=self._random_state,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    clf = Stree(kernel="rbf", random_state=self._random_state)
    self.assertAlmostEqual(0.824, clf.fit(X, y).score(X, y))
    # The same estimator is refitted on the wine dataset.
    X, y = load_wine(return_X_y=True)
    self.assertAlmostEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_poly(self):
    """Check poly-kernel training accuracy on synthetic and wine data.

    Scores are compared with ``assertAlmostEqual`` (7 decimal places),
    as the other score tests do, instead of exact float equality.
    """
    X, y = load_dataset(
        random_state=self._random_state,
        n_classes=3,
        n_features=5,
        n_samples=500,
    )
    clf = Stree(
        kernel="poly", random_state=self._random_state, C=10, degree=5
    )
    self.assertAlmostEqual(0.786, clf.fit(X, y).score(X, y))
    # The same estimator is refitted on the wine dataset.
    X, y = load_wine(return_X_y=True)
    self.assertAlmostEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
||||||
|
|
||||||
|
def test_score_multiclass_linear(self):
    """Check linear-kernel training accuracy on synthetic and wine data.

    Scores are compared with ``assertAlmostEqual`` (7 decimal places),
    as the other score tests do, instead of exact float equality.
    """
    X, y = load_dataset(
        random_state=self._random_state,
        n_classes=3,
        n_features=5,
        n_samples=1500,
    )
    clf = Stree(kernel="linear", random_state=self._random_state)
    self.assertAlmostEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
    # The same estimator is refitted on the wine dataset.
    X, y = load_wine(return_X_y=True)
    self.assertAlmostEqual(0.9550561797752809, clf.fit(X, y).score(X, y))
|
@@ -1,357 +0,0 @@
|
|||||||
import os
|
|
||||||
import unittest
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.datasets import make_classification
|
|
||||||
|
|
||||||
from stree import Stree, Snode
|
|
||||||
|
|
||||||
|
|
||||||
class Stree_test(unittest.TestCase):
    """Tests for the Stree classifier on a synthetic binary dataset."""

    def __init__(self, *args, **kwargs):
        """Set the TESTING environment flag and fit the shared classifier."""
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=False)
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        """Remove the TESTING environment flag, if it is still set."""
        os.environ.pop('TESTING', None)

    def _get_Xy(self):
        """Return the synthetic two-class dataset (1500 samples, 3 features).

        Returns:
            tuple -- samples and labels from make_classification
        """
        X, y = make_classification(n_samples=1500, n_features=3,
                                   n_informative=3, n_redundant=0,
                                   n_repeated=0, n_classes=2,
                                   n_clusters_per_class=2, class_sep=1.5,
                                   flip_y=0, weights=[0.5, 0.5],
                                   random_state=self._random_state)
        return X, y

    def _check_tree(self, node: Snode):
        """Check recursively that the nodes that are not leaves have the
        correct number of labels and its sons have the right number of elements
        in their dataset

        Arguments:
            node {Snode} -- node to check
        """
        if node.is_leaf():
            return
        y_prediction = node._clf.predict(node._X)
        y_down = node.get_down()._y
        y_up = node.get_up()._y
        # Is a correct partition in terms of cardinality?
        # i.e. The partition algorithm didn't forget any sample
        self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
        labels, totals = np.unique(node._y, return_counts=True)
        # np.unique returns counts positionally aligned with the labels it
        # found, so map label -> count instead of indexing counts by label.
        down_counts = dict(zip(*np.unique(y_down, return_counts=True)))
        up_counts = dict(zip(*np.unique(y_up, return_counts=True)))
        for label, total in zip(labels, totals):
            self.assertEqual(
                total, down_counts.get(label, 0) + up_counts.get(label, 0))
        # Is the partition made the same as the prediction?
        # as the node is not a leaf...
        _, count_yp = np.unique(y_prediction, return_counts=True)
        self.assertEqual(count_yp[0], y_up.shape[0])
        self.assertEqual(count_yp[1], y_down.shape[0])
        self._check_tree(node.get_down())
        self._check_tree(node.get_up())

    def test_build_tree(self):
        """Check if the tree is built the same way as predictions of models
        """
        self._check_tree(self._clf.tree_)

    def _get_file_data(self, file_name: str) -> tuple:
        """Return X, y from data, y is the last column in array

        Arguments:
            file_name {str} -- the file name

        Returns:
            tuple -- tuple with samples, categories
        """
        data = np.genfromtxt(file_name, delimiter=',')
        column_y = data.shape[1] - 1
        fy = data[:, column_y]
        fx = np.delete(data, column_y, axis=1)
        return fx, fy

    def _find_out(self, px: np.ndarray, x_original: np.ndarray,
                  y_original) -> list:
        """Find the original values of y for a given array of samples

        Arguments:
            px {np.ndarray} -- array of samples to search for
            x_original {np.ndarray} -- original dataset
            y_original -- original classes, indexable by row number

        Returns:
            list -- classes of the given samples; a sample that matches
            several rows contributes one entry per matching row
        """
        res = []
        for needle in px:
            for row in range(x_original.shape[0]):
                if all(x_original[row, :] == needle):
                    res.append(y_original[row])
        return res

    def test_single_prediction(self):
        """The first sample is predicted with its true label."""
        X, y = self._get_Xy()
        yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
        self.assertEqual(yp[0], y[0])

    def test_multiple_prediction(self):
        """The first 27 samples are predicted with their true labels."""
        # First 27 elements the predictions are the same as the truth
        num = 27
        X, y = self._get_Xy()
        yp = self._clf.predict(X[:num, :])
        self.assertListEqual(y[:num].tolist(), yp.tolist())

    def test_score(self):
        """score() equals the mean of correct predictions and is over 0.9."""
        X, y = self._get_Xy()
        accuracy_score = self._clf.score(X, y)
        yp = self._clf.predict(X)
        accuracy_computed = np.mean(yp == y)
        self.assertEqual(accuracy_score, accuracy_computed)
        self.assertGreater(accuracy_score, 0.9)

    def test_single_predict_proba(self):
        """Check that element 28 has a prediction different from its
        current label
        """
        # Element 28 has a different prediction than the truth
        decimals = 5
        prob = 0.29026400766
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1]))
        # Compare scalars rather than a scalar with a one-element array.
        self.assertEqual(np.round(1 - prob, decimals),
                         np.round(yp[0, 0], decimals))
        self.assertEqual(1, y[28])

        self.assertAlmostEqual(
            round(prob, decimals),
            round(yp[0, 1], decimals),
            decimals
        )

    def test_multiple_predict_proba(self):
        """Check the class-1 probabilities of the first 27 samples."""
        # First 27 elements the predictions are the same as the truth
        num = 27
        decimals = 5
        X, y = self._get_Xy()
        yp = self._clf.predict_proba(X[:num, :])
        self.assertListEqual(
            y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist())
        expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833,
                          0.14269291, 0.85193236,
                          0.29876058, 0.7282164, 0.85958616, 0.89517877,
                          0.99745224, 0.18860349,
                          0.30756427, 0.8318412, 0.18981198, 0.15564624,
                          0.25740655, 0.22923355,
                          0.87365959, 0.49928689, 0.95574351, 0.28761257,
                          0.28906333, 0.32643692,
                          0.29788483, 0.01657364, 0.81149083]
        expected = np.round(expected_proba, decimals=decimals).tolist()
        computed = np.round(yp[:, 1], decimals=decimals).tolist()
        # zip() would silently truncate, so pin the lengths first.
        self.assertEqual(len(expected), len(computed))
        for wanted, got in zip(expected, computed):
            self.assertAlmostEqual(wanted, got, decimals)

    def build_models(self):
        """Build and train two models, model_clf will use the sklearn
        classifier to compute predictions and split data. model_computed will
        use vector of coefficients to compute both predictions and splitted
        data
        """
        model_clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        model_computed = Stree(random_state=self._random_state,
                               use_predictions=False)
        X, y = self._get_Xy()
        model_clf.fit(X, y)
        model_computed.fit(X, y)
        return model_clf, model_computed, X, y

    def test_use_model_predict(self):
        """Check that we get the same results whether we use the estimator in
        nodes to compute labels or we use the hyperplane and the position of
        samples wrt to it
        """
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict(X).tolist(),
            use_math.predict(X).tolist()
        )

    def test_use_model_score(self):
        """Both models score the same, and above 0.95."""
        use_clf, use_math, X, y = self.build_models()
        b = use_math.score(X, y)
        self.assertEqual(
            use_clf.score(X, y),
            b
        )
        self.assertGreater(b, .95)

    def test_use_model_predict_proba(self):
        """Both models return the same class probabilities."""
        use_clf, use_math, X, _ = self.build_models()
        self.assertListEqual(
            use_clf.predict_proba(X).tolist(),
            use_math.predict_proba(X).tolist()
        )

    def test_single_vs_multiple_prediction(self):
        """Check if predicting sample by sample gives the same result as
        predicting all samples at once
        """
        X, _ = self._get_Xy()
        # Compute prediction line by line
        yp_line = np.array([], dtype=int)
        for xp in X:
            yp_line = np.append(yp_line, self._clf.predict(
                xp.reshape(-1, X.shape[1])))
        # Compute prediction at once
        yp_once = self._clf.predict(X)
        self.assertListEqual(yp_line.tolist(), yp_once.tolist())

    def test_iterator(self):
        """Check preorder iterator
        """
        expected = [
            'root',
            'root - Down',
            'root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 counts'
            '=(array([0, 1]), array([ 17, 691]))',
            'root - Down - Up',
            'root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 '
            'counts=(array([0, 1]), array([1, 3]))',
            'root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 '
            'counts=(array([0]), array([7]))',
            'root - Up, <cgaf> - Leaf class=0 belief= 0.928297 counts=(array('
            '[0, 1]), array([725, 56]))',
        ]
        computed = [str(node) for node in self._clf]
        self.assertListEqual(expected, computed)

    def test_is_a_sklearn_classifier(self):
        """Run scikit-learn's estimator compliance checks against Stree."""
        import warnings
        from sklearn.exceptions import ConvergenceWarning
        from sklearn.utils.estimator_checks import check_estimator
        # catch_warnings restores the filters on exit, so the suppression
        # does not leak into the rest of the test run.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=ConvergenceWarning)
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            check_estimator(Stree())

    def test_exception_if_C_is_negative(self):
        """Fitting with a negative C raises ValueError."""
        tclf = Stree(C=-1)
        with self.assertRaises(ValueError):
            tclf.fit(*self._get_Xy())

    def test_check_max_depth_is_positive_or_None(self):
        """max_depth defaults to None, keeps 1, and -1 fails on fit."""
        tcl = Stree()
        self.assertIsNone(tcl.max_depth)
        tcl = Stree(max_depth=1)
        self.assertGreaterEqual(1, tcl.max_depth)
        with self.assertRaises(ValueError):
            tcl = Stree(max_depth=-1)
            tcl.fit(*self._get_Xy())

    def test_check_max_depth(self):
        """The fitted depth_ equals the requested max_depth."""
        depth = 3
        tcl = Stree(random_state=self._random_state, max_depth=depth)
        tcl.fit(*self._get_Xy())
        self.assertEqual(depth, tcl.depth_)

    def test_unfitted_tree_is_iterable(self):
        """Iterating an unfitted classifier yields no nodes."""
        tcl = Stree()
        self.assertEqual(0, len(list(tcl)))
|
|
||||||
|
|
||||||
|
|
||||||
class Snode_test(unittest.TestCase):
    """Tests for the Snode objects of a fitted Stree."""

    def __init__(self, *args, **kwargs):
        """Set the TESTING environment flag and fit the shared classifier."""
        os.environ['TESTING'] = '1'
        self._random_state = 1
        self._clf = Stree(random_state=self._random_state,
                          use_predictions=True)
        self._clf.fit(*self._get_Xy())
        super().__init__(*args, **kwargs)

    @classmethod
    def tearDownClass(cls):
        """Remove the TESTING environment flag, if it is still set."""
        os.environ.pop('TESTING', None)

    def _get_Xy(self):
        """Return the synthetic two-class dataset (1500 samples, 3 features).

        Returns:
            tuple -- samples and labels from make_classification
        """
        X, y = make_classification(n_samples=1500, n_features=3,
                                   n_informative=3, n_redundant=0, n_classes=2,
                                   n_repeated=0, n_clusters_per_class=2,
                                   class_sep=1.5, flip_y=0, weights=[0.5, 0.5],
                                   random_state=self._random_state)
        return X, y

    def test_attributes_in_leaves(self):
        """Check if the attributes in leaves have correct values so they form a
        predictor
        """

        def check_leave(node: Snode):
            if not node.is_leaf():
                check_leave(node.get_down())
                check_leave(node.get_up())
                return
            # Check Belief in leave
            classes, card = np.unique(node._y, return_counts=True)
            max_card = max(card)
            min_card = min(card)
            if len(classes) > 1:
                # Counts from np.unique are at least 1, so the denominator
                # is never zero and needs no ZeroDivisionError guard.
                belief = max_card / (max_card + min_card)
            else:
                belief = 1
            self.assertEqual(belief, node._belief)
            # Check Class
            class_computed = classes[card == max_card]
            self.assertEqual(class_computed, node._class)

        check_leave(self._clf.tree_)

    def test_nodes_coefs(self):
        """Check if the nodes of the tree have the right attributes filled
        """

        def run_tree(node: Snode):
            if node._belief < 1:
                # only exclude pure leaves
                self.assertIsNotNone(node._clf)
                self.assertIsNotNone(node._clf.coef_)
                self.assertIsNotNone(node._vector)
                self.assertIsNotNone(node._interceptor)
            if node.is_leaf():
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self._clf.tree_)
|
|
@@ -1 +1,5 @@
|
|||||||
from .Strees_test import Stree_test, Snode_test
|
from .Stree_test import Stree_test
|
||||||
|
from .Snode_test import Snode_test
|
||||||
|
from .Splitter_test import Splitter_test
|
||||||
|
|
||||||
|
__all__ = ["Stree_test", "Snode_test", "Splitter_test"]
|
||||||
|
17
stree/tests/utils.py
Normal file
17
stree/tests/utils.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from sklearn.datasets import make_classification
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
    """Build a synthetic classification dataset for the tests.

    All arguments are forwarded to ``make_classification``; the remaining
    generator settings are fixed (3 informative features, no redundant or
    repeated ones, 2 clusters per class, class_sep=1.5, flip_y=0).

    Returns the ``(X, y)`` pair produced by ``make_classification``.
    """
    generator_settings = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=2,
        class_sep=1.5,
        flip_y=0,
        random_state=random_state,
    )
    samples, labels = make_classification(**generator_settings)
    return samples, labels
|
Reference in New Issue
Block a user