mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-18 00:46:02 +00:00
Compare commits
9 Commits
add_subspa
...
0.9rc5
Author | SHA1 | Date | |
---|---|---|---|
f5706c3159
|
|||
be552fdd6c
|
|||
5e3a8e3ec5
|
|||
554ec03c32
|
|||
4b7e4a3fb0
|
|||
76723993fd
|
|||
ecd0b86f4d
|
|||
3e52a4746c
|
|||
|
a20e45e8e7 |
@@ -10,5 +10,4 @@ exclude_lines =
|
||||
if __name__ == .__main__.:
|
||||
ignore_errors = True
|
||||
omit =
|
||||
stree/tests/*
|
||||
stree/__init__.py
|
2
.gitignore
vendored
2
.gitignore
vendored
@@ -131,3 +131,5 @@ dmypy.json
|
||||
.idea
|
||||
.vscode
|
||||
.pre-commit-config.yaml
|
||||
|
||||
**.csv
|
File diff suppressed because one or more lines are too long
@@ -4,7 +4,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Test AdaBoost with different configurations"
|
||||
"# Test Stree with AdaBoost and Bagging with different configurations"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -34,11 +34,8 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from sklearn.svm import LinearSVC, SVC\n",
|
||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||
"from sklearn.datasets import load_iris\n",
|
||||
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from stree import Stree"
|
||||
]
|
||||
},
|
||||
@@ -57,12 +54,14 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -117,18 +116,20 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## STree alone on the whole dataset and linear kernel"
|
||||
"## STree alone with 100.000 samples and linear kernel"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
|
||||
"text": "Score Train: 0.9985784146480154\nScore Test: 0.9981093273185617\nTook 73.27 seconds\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -144,7 +145,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Different kernels with different configuations"
|
||||
"## Adaboost"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -161,18 +162,20 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
|
||||
"text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||
" now = time.time()\n",
|
||||
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf.fit(Xtrain, ytrain)\n",
|
||||
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||
" score_test = clf.score(Xtest, ytest)\n",
|
||||
@@ -183,24 +186,37 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
|
||||
"## Bagging"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 10\n",
|
||||
"C = 7\n",
|
||||
"max_depth = 3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
|
||||
"text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for kernel in ['linear', 'rbf', 'poly']:\n",
|
||||
" now = time.time()\n",
|
||||
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
|
||||
" clf.fit(Xtrain, ytrain)\n",
|
||||
" score_train = clf.score(Xtrain, ytrain)\n",
|
||||
" score_test = clf.score(Xtest, ytest)\n",
|
||||
@@ -223,7 +239,7 @@
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||
"name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||
}
|
||||
},
|
File diff suppressed because one or more lines are too long
@@ -66,7 +66,8 @@
|
||||
"id": "z9Q-YUfBDZEq",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
|
||||
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"random_state=1\n",
|
||||
@@ -112,7 +113,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -137,25 +138,25 @@
|
||||
" 'learning_rate': [.5, 1],\n",
|
||||
" 'base_estimator__tol': [.1, 1e-02],\n",
|
||||
" 'base_estimator__max_depth': [3, 5],\n",
|
||||
" 'base_estimator__C': [1, 3],\n",
|
||||
" 'base_estimator__C': [7, 55],\n",
|
||||
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
|
||||
"}"
|
||||
],
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
|
||||
"text/plain": "{'C': 1.0,\n 'criterion': 'gini',\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_features': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'split_criteria': 'max_samples',\n 'splitter': 'random',\n 'tol': 0.0001}"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 14
|
||||
"execution_count": 6
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -168,28 +169,29 @@
|
||||
"id": "CrcB8o6EDZE5",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"random_state=2020\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||
"grid.fit(Xtrain, ytrain)"
|
||||
],
|
||||
"execution_count": 11,
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
|
||||
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.0s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.4s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 3.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.3s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.6s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 8.1s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 9.4s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 10.1s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 11.1s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 12.3s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 13.6s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.9s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.2s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 17.6s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 19.1s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 25.9s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 30.4s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 36.7s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 38.1s\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 39.6s\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 41.9s\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 44.9s\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 48.2s\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 49.2s finished\n"
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
|
||||
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n 'base_estimator__C': [7, 55],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 11
|
||||
"execution_count": 7
|
||||
}
|
||||
]
|
||||
},
|
||||
@@ -199,19 +201,20 @@
|
||||
"id": "ZjX88NoYDZE8",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
|
||||
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"print(\"Best estimator: \", grid.best_estimator_)\n",
|
||||
"print(\"Best hyperparameters: \", grid.best_params_)\n",
|
||||
"print(\"Best accuracy: \", grid.best_score_)"
|
||||
],
|
||||
"execution_count": 16,
|
||||
"execution_count": 8,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
|
||||
"text": "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9559440559440558\n"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
2
setup.py
2
setup.py
@@ -1,6 +1,6 @@
|
||||
import setuptools
|
||||
|
||||
__version__ = "0.9rc4"
|
||||
__version__ = "0.9rc5"
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
|
||||
|
||||
|
132
stree/Strees.py
132
stree/Strees.py
@@ -10,6 +10,7 @@ import os
|
||||
import numbers
|
||||
import random
|
||||
import warnings
|
||||
from math import log
|
||||
from itertools import combinations
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
@@ -39,6 +40,7 @@ class Snode:
|
||||
features: np.array,
|
||||
impurity: float,
|
||||
title: str,
|
||||
weight: np.ndarray = None,
|
||||
):
|
||||
self._clf = clf
|
||||
self._title = title
|
||||
@@ -50,7 +52,9 @@ class Snode:
|
||||
self._up = None
|
||||
self._class = None
|
||||
self._feature = None
|
||||
self._sample_weight = None
|
||||
self._sample_weight = (
|
||||
weight if os.environ.get("TESTING", "NS") != "NS" else None
|
||||
)
|
||||
self._features = features
|
||||
self._impurity = impurity
|
||||
|
||||
@@ -163,10 +167,10 @@ class Splitter:
|
||||
f"criterion must be gini or entropy got({criterion})"
|
||||
)
|
||||
|
||||
if criteria not in ["min_distance", "max_samples"]:
|
||||
if criteria not in ["min_distance", "max_samples", "max_distance"]:
|
||||
raise ValueError(
|
||||
f"split_criteria has to be min_distance or \
|
||||
max_samples got ({criteria})"
|
||||
"split_criteria has to be min_distance "
|
||||
f"max_distance or max_samples got ({criteria})"
|
||||
)
|
||||
|
||||
if splitter_type not in ["random", "best"]:
|
||||
@@ -186,24 +190,47 @@ class Splitter:
|
||||
|
||||
@staticmethod
|
||||
def _entropy(y: np.array) -> float:
|
||||
_, count = np.unique(y, return_counts=True)
|
||||
proportion = count / np.sum(count)
|
||||
return -np.sum(proportion * np.log2(proportion))
|
||||
n_labels = len(y)
|
||||
if n_labels <= 1:
|
||||
return 0
|
||||
counts = np.bincount(y)
|
||||
proportions = counts / n_labels
|
||||
n_classes = np.count_nonzero(proportions)
|
||||
if n_classes <= 1:
|
||||
return 0
|
||||
entropy = 0.0
|
||||
# Compute standard entropy.
|
||||
for prop in proportions:
|
||||
if prop != 0.0:
|
||||
entropy -= prop * log(prop, n_classes)
|
||||
return entropy
|
||||
|
||||
def information_gain(
|
||||
self, labels_up: np.array, labels_dn: np.array
|
||||
self, labels: np.array, labels_up: np.array, labels_dn: np.array
|
||||
) -> float:
|
||||
card_up = labels_up.shape[0] if labels_up is not None else 0
|
||||
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||
imp_prev = self.criterion_function(labels)
|
||||
card_up = card_dn = imp_up = imp_dn = 0
|
||||
if labels_up is not None:
|
||||
card_up = labels_up.shape[0]
|
||||
imp_up = self.criterion_function(labels_up)
|
||||
if labels_dn is not None:
|
||||
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||
imp_dn = self.criterion_function(labels_dn)
|
||||
samples = card_up + card_dn
|
||||
up = card_up / samples * self.criterion_function(labels_up)
|
||||
dn = card_dn / samples * self.criterion_function(labels_dn)
|
||||
return up + dn
|
||||
if samples == 0:
|
||||
return 0.0
|
||||
else:
|
||||
result = (
|
||||
imp_prev
|
||||
- (card_up / samples) * imp_up
|
||||
- (card_dn / samples) * imp_dn
|
||||
)
|
||||
return result
|
||||
|
||||
def _select_best_set(
|
||||
self, dataset: np.array, labels: np.array, features_sets: list
|
||||
) -> list:
|
||||
min_impurity = 1
|
||||
max_gain = 0
|
||||
selected = None
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
for feature_set in features_sets:
|
||||
@@ -213,11 +240,11 @@ class Splitter:
|
||||
)
|
||||
self.partition(dataset, node)
|
||||
y1, y2 = self.part(labels)
|
||||
impurity = self.information_gain(y1, y2)
|
||||
if impurity < min_impurity:
|
||||
min_impurity = impurity
|
||||
gain = self.information_gain(labels, y1, y2)
|
||||
if gain > max_gain:
|
||||
max_gain = gain
|
||||
selected = feature_set
|
||||
return selected
|
||||
return selected if selected is not None else feature_set
|
||||
|
||||
def _get_subspaces_set(
|
||||
self, dataset: np.array, labels: np.array, max_features: int
|
||||
@@ -226,8 +253,12 @@ class Splitter:
|
||||
features_sets = list(combinations(features, max_features))
|
||||
if len(features_sets) > 1:
|
||||
if self._splitter_type == "random":
|
||||
return features_sets[random.randint(0, len(features_sets) - 1)]
|
||||
index = random.randint(0, len(features_sets) - 1)
|
||||
return features_sets[index]
|
||||
else:
|
||||
# get only 3 sets at most
|
||||
if len(features_sets) > 3:
|
||||
features_sets = random.sample(features_sets, 3)
|
||||
return self._select_best_set(dataset, labels, features_sets)
|
||||
else:
|
||||
return features_sets[0]
|
||||
@@ -242,21 +273,56 @@ class Splitter:
|
||||
|
||||
@staticmethod
|
||||
def _min_distance(data: np.array, _) -> np.array:
|
||||
# chooses the lowest distance of every sample
|
||||
indices = np.argmin(np.abs(data), axis=1)
|
||||
return np.array(
|
||||
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
|
||||
)
|
||||
"""Assign class to min distances
|
||||
|
||||
return a vector of classes so partition can separate class 0 from
|
||||
the rest of classes, ie. class 0 goes to one splitted node and the
|
||||
rest of classes go to the other
|
||||
:param data: distances to hyper plane of every class
|
||||
:type data: np.array (m, n_classes)
|
||||
:param _: enable call compat with other measures
|
||||
:type _: None
|
||||
:return: vector with the class assigned to each sample
|
||||
:rtype: np.array shape (m,)
|
||||
"""
|
||||
return np.argmin(data, axis=1)
|
||||
|
||||
@staticmethod
|
||||
def _max_distance(data: np.array, _) -> np.array:
|
||||
"""Assign class to max distances
|
||||
|
||||
return a vector of classes so partition can separate class 0 from
|
||||
the rest of classes, ie. class 0 goes to one splitted node and the
|
||||
rest of classes go to the other
|
||||
:param data: distances to hyper plane of every class
|
||||
:type data: np.array (m, n_classes)
|
||||
:param _: enable call compat with other measures
|
||||
:type _: None
|
||||
:return: vector with the class assigned to each sample values
|
||||
(can be 0, 1, ...)
|
||||
:rtype: np.array shape (m,)
|
||||
"""
|
||||
return np.argmax(data, axis=1)
|
||||
|
||||
@staticmethod
|
||||
def _max_samples(data: np.array, y: np.array) -> np.array:
|
||||
"""return distances of the class with more samples
|
||||
|
||||
:param data: distances to hyper plane of every class
|
||||
:type data: np.array (m, n_classes)
|
||||
:param y: vector of labels (classes)
|
||||
:type y: np.array (m,)
|
||||
:return: vector with distances to hyperplane (can be positive or neg.)
|
||||
:rtype: np.array shape (m,)
|
||||
"""
|
||||
# select the class with max number of samples
|
||||
_, samples = np.unique(y, return_counts=True)
|
||||
selected = np.argmax(samples)
|
||||
return data[:, selected]
|
||||
|
||||
def partition(self, samples: np.array, node: Snode):
|
||||
"""Set the criteria to split arrays
|
||||
"""Set the criteria to split arrays. Compute the indices of the samples
|
||||
that should go to one side of the tree (down)
|
||||
|
||||
"""
|
||||
data = self._distances(node, samples)
|
||||
@@ -379,7 +445,9 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
|
||||
check_classification_targets(y)
|
||||
X, y = check_X_y(X, y)
|
||||
sample_weight = _check_sample_weight(sample_weight, X)
|
||||
sample_weight = _check_sample_weight(
|
||||
sample_weight, X, dtype=np.float64
|
||||
)
|
||||
check_classification_targets(y)
|
||||
# Initialize computed parameters
|
||||
self.splitter_ = Splitter(
|
||||
@@ -439,13 +507,22 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
features=X.shape[1],
|
||||
impurity=0.0,
|
||||
title=title + ", <pure>",
|
||||
weight=sample_weight,
|
||||
)
|
||||
# Train the model
|
||||
clf = self._build_clf()
|
||||
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||
# solve WARNING: class label 0 specified in weight is not found
|
||||
# in bagging
|
||||
if any(sample_weight == 0):
|
||||
indices = sample_weight == 0
|
||||
y_next = y[~indices]
|
||||
# touch weights if removing any class
|
||||
if np.unique(y_next).shape[0] != self.n_classes_:
|
||||
sample_weight += 1e-5
|
||||
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||
impurity = self.splitter_.impurity(y)
|
||||
node = Snode(clf, X, y, features, impurity, title)
|
||||
node = Snode(clf, X, y, features, impurity, title, sample_weight)
|
||||
self.depth_ = max(depth, self.depth_)
|
||||
self.splitter_.partition(X, node)
|
||||
X_U, X_D = self.splitter_.part(X)
|
||||
@@ -460,6 +537,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
features=X.shape[1],
|
||||
impurity=impurity,
|
||||
title=title + ", <cgaf>",
|
||||
weight=sample_weight,
|
||||
)
|
||||
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
|
||||
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
|
||||
|
@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
|
||||
max_card = max(card)
|
||||
min_card = min(card)
|
||||
if len(classes) > 1:
|
||||
try:
|
||||
belief = max_card / (max_card + min_card)
|
||||
except ZeroDivisionError:
|
||||
belief = 0.0
|
||||
belief = max_card / (max_card + min_card)
|
||||
else:
|
||||
belief = 1
|
||||
self.assertEqual(belief, node._belief)
|
||||
|
@@ -1,11 +1,11 @@
|
||||
import os
|
||||
import unittest
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
from sklearn.svm import SVC
|
||||
from sklearn.datasets import load_wine, load_iris
|
||||
from stree import Splitter
|
||||
from .utils import load_dataset
|
||||
|
||||
|
||||
class Splitter_test(unittest.TestCase):
|
||||
@@ -15,7 +15,7 @@ class Splitter_test(unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def build(
|
||||
clf=LinearSVC(),
|
||||
clf=SVC,
|
||||
min_samples_split=0,
|
||||
splitter_type="random",
|
||||
criterion="gini",
|
||||
@@ -23,7 +23,7 @@ class Splitter_test(unittest.TestCase):
|
||||
random_state=None,
|
||||
):
|
||||
return Splitter(
|
||||
clf=clf,
|
||||
clf=clf(random_state=random_state, kernel="rbf"),
|
||||
min_samples_split=min_samples_split,
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
@@ -43,10 +43,14 @@ class Splitter_test(unittest.TestCase):
|
||||
with self.assertRaises(ValueError):
|
||||
self.build(criteria="duck")
|
||||
with self.assertRaises(ValueError):
|
||||
self.build(clf=None)
|
||||
_ = Splitter(clf=None)
|
||||
for splitter_type in ["best", "random"]:
|
||||
for criterion in ["gini", "entropy"]:
|
||||
for criteria in ["min_distance", "max_samples"]:
|
||||
for criteria in [
|
||||
"min_distance",
|
||||
"max_samples",
|
||||
"max_distance",
|
||||
]:
|
||||
tcl = self.build(
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
@@ -57,30 +61,74 @@ class Splitter_test(unittest.TestCase):
|
||||
self.assertEqual(criteria, tcl._criteria)
|
||||
|
||||
def test_gini(self):
|
||||
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
|
||||
expected = 0.48
|
||||
self.assertEqual(expected, Splitter._gini(y))
|
||||
tcl = self.build(criterion="gini")
|
||||
self.assertEqual(expected, tcl.criterion_function(y))
|
||||
expected_values = [
|
||||
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
|
||||
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
|
||||
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
|
||||
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
|
||||
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
|
||||
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
|
||||
([0], 0),
|
||||
([1, 1, 1, 1], 0),
|
||||
]
|
||||
for labels, expected in expected_values:
|
||||
self.assertAlmostEqual(expected, Splitter._gini(labels))
|
||||
tcl = self.build(criterion="gini")
|
||||
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||
|
||||
def test_entropy(self):
|
||||
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
|
||||
expected = 0.9709505944546686
|
||||
self.assertAlmostEqual(expected, Splitter._entropy(y))
|
||||
tcl = self.build(criterion="entropy")
|
||||
self.assertEqual(expected, tcl.criterion_function(y))
|
||||
expected_values = [
|
||||
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
|
||||
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
|
||||
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
|
||||
([0, 0, 1, 1, 1, 1, 0, 0], 1),
|
||||
([0, 0, 1, 1, 2, 2, 3, 3], 1),
|
||||
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
|
||||
([1], 0),
|
||||
([0, 0, 0, 0], 0),
|
||||
]
|
||||
for labels, expected in expected_values:
|
||||
self.assertAlmostEqual(expected, Splitter._entropy(labels))
|
||||
tcl = self.build(criterion="entropy")
|
||||
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
|
||||
|
||||
def test_information_gain(self):
|
||||
yu = np.array([0, 1, 1, 1, 1, 1])
|
||||
yd = np.array([0, 0, 0, 1])
|
||||
values_expected = [
|
||||
("gini", 0.31666666666666665),
|
||||
("entropy", 0.7145247027726656),
|
||||
expected_values = [
|
||||
(
|
||||
[0, 1, 1, 1, 1, 1],
|
||||
[0, 0, 0, 1],
|
||||
0.16333333333333333,
|
||||
0.25642589168200297,
|
||||
),
|
||||
(
|
||||
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
|
||||
[5, 3, 2, 1, 1],
|
||||
0.007381776239907684,
|
||||
-0.03328610916207225,
|
||||
),
|
||||
([], [], 0.0, 0.0),
|
||||
([1], [], 0.0, 0.0),
|
||||
([], [1], 0.0, 0.0),
|
||||
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
|
||||
([], [1, 1, 1, 2], 0.0, 0.0),
|
||||
(None, [1, 2, 3], 0.0, 0.0),
|
||||
([1, 2, 3], None, 0.0, 0.0),
|
||||
]
|
||||
for criterion, expected in values_expected:
|
||||
tcl = self.build(criterion=criterion)
|
||||
computed = tcl.information_gain(yu, yd)
|
||||
self.assertAlmostEqual(expected, computed)
|
||||
for yu, yd, expected_gini, expected_entropy in expected_values:
|
||||
yu = np.array(yu, dtype=np.int32) if yu is not None else None
|
||||
yd = np.array(yd, dtype=np.int32) if yd is not None else None
|
||||
if yu is not None and yd is not None:
|
||||
complete = np.append(yu, yd)
|
||||
elif yd is not None:
|
||||
complete = yd
|
||||
else:
|
||||
complete = yu
|
||||
tcl = self.build(criterion="gini")
|
||||
computed = tcl.information_gain(complete, yu, yd)
|
||||
self.assertAlmostEqual(expected_gini, computed)
|
||||
tcl = self.build(criterion="entropy")
|
||||
computed = tcl.information_gain(complete, yu, yd)
|
||||
self.assertAlmostEqual(expected_entropy, computed)
|
||||
|
||||
def test_max_samples(self):
|
||||
tcl = self.build(criteria="max_samples")
|
||||
@@ -108,34 +156,73 @@ class Splitter_test(unittest.TestCase):
|
||||
[0.1, 0.2, 0.3],
|
||||
]
|
||||
)
|
||||
expected = np.array([-0.1, 0.01, 0.5, 0.1])
|
||||
expected = np.array([2, 2, 1, 0])
|
||||
computed = tcl._min_distance(data, None)
|
||||
self.assertEqual((4,), computed.shape)
|
||||
self.assertListEqual(expected.tolist(), computed.tolist())
|
||||
|
||||
def test_max_distance(self):
|
||||
tcl = self.build(criteria="max_distance")
|
||||
data = np.array(
|
||||
[
|
||||
[-0.1, 0.2, -0.3],
|
||||
[0.7, 0.01, -0.1],
|
||||
[0.7, -0.9, 0.5],
|
||||
[0.1, 0.2, 0.3],
|
||||
]
|
||||
)
|
||||
expected = np.array([1, 0, 0, 2])
|
||||
computed = tcl._max_distance(data, None)
|
||||
self.assertEqual((4,), computed.shape)
|
||||
self.assertListEqual(expected.tolist(), computed.tolist())
|
||||
|
||||
def test_best_splitter_few_sets(self):
|
||||
X, y = load_iris(return_X_y=True)
|
||||
X = np.delete(X, 3, 1)
|
||||
tcl = self.build(splitter_type="best", random_state=self._random_state)
|
||||
dataset, computed = tcl.get_subspace(X, y, max_features=2)
|
||||
self.assertListEqual([0, 2], list(computed))
|
||||
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
|
||||
|
||||
def test_splitter_parameter(self):
|
||||
expected_values = [
|
||||
[1, 7, 9],
|
||||
[1, 7, 9],
|
||||
[1, 7, 9],
|
||||
[1, 7, 9],
|
||||
[0, 5, 6],
|
||||
[0, 5, 6],
|
||||
[0, 5, 6],
|
||||
[0, 5, 6],
|
||||
[2, 3, 5, 7], # best entropy min_distance
|
||||
[0, 2, 4, 5], # best entropy max_samples
|
||||
[0, 2, 8, 12], # best entropy max_distance
|
||||
[1, 2, 5, 12], # best gini min_distance
|
||||
[0, 3, 4, 10], # best gini max_samples
|
||||
[1, 2, 9, 12], # best gini max_distance
|
||||
[3, 9, 11, 12], # random entropy min_distance
|
||||
[1, 5, 6, 9], # random entropy max_samples
|
||||
[1, 2, 4, 8], # random entropy max_distance
|
||||
[2, 6, 7, 12], # random gini min_distance
|
||||
[3, 9, 10, 11], # random gini max_samples
|
||||
[2, 5, 8, 12], # random gini max_distance
|
||||
]
|
||||
X, y = load_dataset(self._random_state, n_features=12)
|
||||
X, y = load_wine(return_X_y=True)
|
||||
rn = 0
|
||||
for splitter_type in ["best", "random"]:
|
||||
for criterion in ["gini", "entropy"]:
|
||||
for criteria in ["min_distance", "max_samples"]:
|
||||
for criterion in ["entropy", "gini"]:
|
||||
for criteria in [
|
||||
"min_distance",
|
||||
"max_samples",
|
||||
"max_distance",
|
||||
]:
|
||||
tcl = self.build(
|
||||
splitter_type=splitter_type,
|
||||
criterion=criterion,
|
||||
criteria=criteria,
|
||||
random_state=self._random_state,
|
||||
)
|
||||
expected = expected_values.pop(0)
|
||||
dataset, computed = tcl.get_subspace(X, y, max_features=3)
|
||||
random.seed(rn)
|
||||
rn += 1
|
||||
dataset, computed = tcl.get_subspace(X, y, max_features=4)
|
||||
# print(
|
||||
# "{}, # {:7s}{:8s}{:15s}".format(
|
||||
# list(computed), splitter_type, criterion,
|
||||
# criteria,
|
||||
# )
|
||||
# )
|
||||
self.assertListEqual(expected, list(computed))
|
||||
self.assertListEqual(
|
||||
X[:, computed].tolist(), dataset.tolist()
|
||||
|
@@ -1,8 +1,10 @@
|
||||
import os
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_iris, load_wine
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
from stree import Stree, Snode
|
||||
from .utils import load_dataset
|
||||
@@ -39,10 +41,7 @@ class Stree_test(unittest.TestCase):
|
||||
_, count_u = np.unique(y_up, return_counts=True)
|
||||
#
|
||||
for i in unique_y:
|
||||
try:
|
||||
number_down = count_d[i]
|
||||
except IndexError:
|
||||
number_down = 0
|
||||
number_down = count_d[i]
|
||||
try:
|
||||
number_up = count_u[i]
|
||||
except IndexError:
|
||||
@@ -59,33 +58,12 @@ class Stree_test(unittest.TestCase):
|
||||
def test_build_tree(self):
    """Check if the tree is built the same way as predictions of models

    A tree is fitted with every kernel in self._kernels and handed to
    self._check_tree. Warnings are silenced inside a catch_warnings
    context so the "ignore" filter is removed again when the test ends
    instead of leaking into the tests that run afterwards.
    """
    import warnings

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        for kernel in self._kernels:
            clf = Stree(kernel=kernel, random_state=self._random_state)
            clf.fit(*load_dataset(self._random_state))
            self._check_tree(clf.tree_)
|
||||
|
||||
@staticmethod
def _find_out(
    px: np.ndarray, x_original: np.ndarray, y_original
) -> list:
    """Find the original values of y for a given array of samples

    Every sample of px is compared with all the rows of x_original and the
    class of each row that is equal in all of its features is collected.
    A sample present several times in x_original contributes one class per
    occurrence and a sample that is not present contributes none.

    Arguments:
        px {np.ndarray} -- array of samples to search for
        x_original {np.ndarray} -- original dataset
        y_original {sequence} -- original classes, indexed by row number

    Returns:
        list -- classes of the given samples
    """
    res = []
    for needle in px:
        # compare the sample against every row of the dataset at once
        matches = (x_original == needle).all(axis=1)
        res.extend(y_original[row] for row in np.flatnonzero(matches))
    return res
|
||||
|
||||
def test_single_prediction(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
@@ -102,22 +80,6 @@ class Stree_test(unittest.TestCase):
|
||||
yp = clf.fit(X, y).predict(X[:num, :])
|
||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||
|
||||
def test_score(self):
    """Check score() for every kernel on the dataset used for training

    The value returned by score() has to be the mean of the correct
    predictions and has to match the accuracy stored for the kernel.
    """
    samples, labels = load_dataset(self._random_state)
    expected_accuracies = (
        0.9506666666666667,
        0.9606666666666667,
        0.9433333333333334,
    )
    for kernel, expected in zip(self._kernels, expected_accuracies):
        model = Stree(random_state=self._random_state, kernel=kernel)
        model.fit(samples, labels)
        reported = model.score(samples, labels)
        predicted = model.predict(samples)
        hit_ratio = np.mean(predicted == labels)
        self.assertEqual(reported, hit_ratio)
        self.assertAlmostEqual(expected, reported)
|
||||
|
||||
def test_single_vs_multiple_prediction(self):
|
||||
"""Check if predicting sample by sample gives the same result as
|
||||
predicting all samples at once
|
||||
@@ -164,9 +126,6 @@ class Stree_test(unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def test_is_a_sklearn_classifier():
|
||||
import warnings
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
@@ -239,6 +198,9 @@ class Stree_test(unittest.TestCase):
|
||||
"min_distance linear": 0.9533333333333334,
|
||||
"min_distance rbf": 0.836,
|
||||
"min_distance poly": 0.9473333333333334,
|
||||
"max_distance linear": 0.9533333333333334,
|
||||
"max_distance rbf": 0.836,
|
||||
"max_distance poly": 0.9473333333333334,
|
||||
},
|
||||
"Iris": {
|
||||
"max_samples linear": 0.98,
|
||||
@@ -247,11 +209,14 @@ class Stree_test(unittest.TestCase):
|
||||
"min_distance linear": 0.98,
|
||||
"min_distance rbf": 1.0,
|
||||
"min_distance poly": 1.0,
|
||||
"max_distance linear": 0.98,
|
||||
"max_distance rbf": 1.0,
|
||||
"max_distance poly": 1.0,
|
||||
},
|
||||
}
|
||||
for name, dataset in datasets.items():
|
||||
px, py = dataset
|
||||
for criteria in ["max_samples", "min_distance"]:
|
||||
for criteria in ["max_samples", "min_distance", "max_distance"]:
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(
|
||||
C=1e4,
|
||||
@@ -322,13 +287,130 @@ class Stree_test(unittest.TestCase):
|
||||
with self.assertRaises(ValueError):
|
||||
clf.predict(X[:, :3])
|
||||
|
||||
# Tests of score
|
||||
|
||||
def test_score_binary(self):
    """Check score() for every kernel with the default dataset

    score() must agree with the ratio of correct predictions computed by
    hand and with the accuracy recorded for each kernel.
    """
    X, y = load_dataset(self._random_state)
    recorded = [
        0.9506666666666667,
        0.9606666666666667,
        0.9433333333333334,
    ]
    for kernel, accuracy_expected in zip(self._kernels, recorded):
        classifier = Stree(kernel=kernel, random_state=self._random_state)
        classifier.fit(X, y)
        accuracy_score = classifier.score(X, y)
        predictions = classifier.predict(X)
        # accuracy worked out from the predictions themselves
        accuracy_by_hand = np.mean(predictions == y)
        self.assertEqual(accuracy_score, accuracy_by_hand)
        self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_score_max_features(self):
    """Check the accuracy obtained when max_features is set to 2"""
    samples, labels = load_dataset(self._random_state)
    model = Stree(max_features=2, random_state=self._random_state)
    model.fit(samples, labels)
    accuracy = model.score(samples, labels)
    self.assertAlmostEqual(0.9426666666666667, accuracy)
|
||||
|
||||
def test_score_multi_class(self):
    """Check the accuracy of the classifier with multi-class datasets

    Every combination of dataset, kernel and split criteria is fitted and
    scored on its training data. The accuracies table is laid out in the
    same nesting order the combinations are generated (dataset, then
    kernel, then criteria), so both are walked in parallel.

    Each combination runs as a subTest so a failure reports the dataset,
    kernel and criteria involved, and warnings are silenced only while
    this test runs instead of for the rest of the test session.
    """
    accuracies = [
        0.8258427,  # Wine      linear min_distance
        0.6741573,  # Wine      linear max_distance
        0.8314607,  # Wine      linear max_samples
        0.6629213,  # Wine      rbf    min_distance
        1.0000000,  # Wine      rbf    max_distance
        0.4044944,  # Wine      rbf    max_samples
        0.9157303,  # Wine      poly   min_distance
        1.0000000,  # Wine      poly   max_distance
        0.7640449,  # Wine      poly   max_samples
        0.9933333,  # Iris      linear min_distance
        0.9666667,  # Iris      linear max_distance
        0.9666667,  # Iris      linear max_samples
        0.9800000,  # Iris      rbf    min_distance
        0.9800000,  # Iris      rbf    max_distance
        0.9800000,  # Iris      rbf    max_samples
        1.0000000,  # Iris      poly   min_distance
        1.0000000,  # Iris      poly   max_distance
        1.0000000,  # Iris      poly   max_samples
        0.8993333,  # Synthetic linear min_distance
        0.6533333,  # Synthetic linear max_distance
        0.9313333,  # Synthetic linear max_samples
        0.8320000,  # Synthetic rbf    min_distance
        0.6660000,  # Synthetic rbf    max_distance
        0.8320000,  # Synthetic rbf    max_samples
        0.6066667,  # Synthetic poly   min_distance
        0.6840000,  # Synthetic poly   max_distance
        0.6340000,  # Synthetic poly   max_samples
    ]
    criteria_names = ["min_distance", "max_distance", "max_samples"]
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        datasets = [
            ("Wine", load_wine(return_X_y=True)),
            ("Iris", load_iris(return_X_y=True)),
            (
                "Synthetic",
                load_dataset(self._random_state, n_classes=3, n_features=5),
            ),
        ]
        combinations = [
            (dataset_name, dataset, kernel, criteria)
            for dataset_name, dataset in datasets
            for kernel in self._kernels
            for criteria in criteria_names
        ]
        # zip would silently truncate if the table and the combinations
        # ever got out of step
        self.assertEqual(len(accuracies), len(combinations))
        for accuracy_expected, combination in zip(accuracies, combinations):
            dataset_name, (X, y), kernel, criteria = combination
            with self.subTest(
                dataset=dataset_name, kernel=kernel, criteria=criteria
            ):
                clf = Stree(
                    C=17,
                    random_state=self._random_state,
                    kernel=kernel,
                    split_criteria=criteria,
                    degree=5,
                    gamma="auto",
                )
                clf.fit(X, y)
                accuracy_score = clf.score(X, y)
                yp = clf.predict(X)
                accuracy_computed = np.mean(yp == y)
                self.assertEqual(accuracy_score, accuracy_computed)
                self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_bogus_splitter_parameter(self):
    """Check that fitting with an unknown splitter raises ValueError"""
    model = Stree(splitter="duck")
    with self.assertRaises(ValueError):
        X, y = load_dataset()
        model.fit(X, y)
|
||||
|
||||
def test_weights_removing_class(self):
    """Check how train() treats zero weights that wipe out a class

    This patch solves an stderr message from sklearn svm lib
    "WARNING: class label x specified in weight is not found"
    """
    samples = np.array(
        [
            [0.1, 0.1],
            [0.1, 0.2],
            [0.2, 0.1],
            [5, 6],
            [8, 9],
            [6, 7],
            [0.2, 0.2],
        ]
    )
    labels = np.array([0, 0, 0, 1, 1, 1, 0])
    epsilon = 1e-5
    # every sample of class 1 has a zero weight
    weights = np.array([1, 1, 1, 0, 0, 0, 1], dtype="float64")
    # taken before train() is called, as the test expects weights to be
    # modified in place
    weights_epsilon = (weights + epsilon).tolist()
    # here class 1 keeps one sample with a weight that is not zero
    weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
    untouched = weights_no_zero.copy()
    model = Stree()
    model.fit(samples, labels)
    node = model.train(samples, labels, weights, 1, "test")
    # if a class is lost with zero weights the patch adds epsilon
    self.assertListEqual(weights.tolist(), weights_epsilon)
    self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
    # zero weights are ok when they don't erase a class
    model.train(samples, labels, weights_no_zero, 1, "test")
    self.assertListEqual(weights_no_zero.tolist(), untouched.tolist())
|
||||
|
Reference in New Issue
Block a user