Compare commits

...

9 Commits

SHA1 (Date): Message

f5706c3159 (2020-06-28 10:44:29 +02:00): Update version and notebooks
be552fdd6c (2020-06-28 02:45:08 +02:00): Add test for getting 3 feature_sets in Splitter; Add ensemble notebook
5e3a8e3ec5 (2020-06-27 23:34:15 +02:00): Change adaboost notebook
554ec03c32 (2020-06-27 18:29:40 +02:00): Get only 3 sets for best split; Fix flaky test in Splitter_test
4b7e4a3fb0 (2020-06-26 11:22:45 +02:00): better solution to the sklearn bagging problem; Add better tests; enhance .coveragerc
76723993fd (2020-06-25 13:07:50 +02:00): Solve Warning class label not found when bagging
ecd0b86f4d (2020-06-17 00:13:52 +02:00): Solve the mistake of min and max distance
    The split criteria min_distance and max_distance now return classes, while
    max_samples returns the positive and negative distances to the hyperplane
    of the class with the most samples in the node (see the sketch below).
3e52a4746c (2020-06-16 13:56:02 +02:00): Fix entroy and information_gain functions
a20e45e8e7 (2020-06-15 11:30:53 +02:00): Merge pull request #10 from Doctorado-ML/add_subspaces, #2 Add subspaces (Ricardo Montañana Gómez)
11 changed files with 518 additions and 225 deletions
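The behaviour described in commit ecd0b86f4d shows up again in the Splitter hunks further down. As a quick orientation, here is a minimal sketch of the three criteria, using the same toy distance matrix as the updated Splitter tests; the labels y and the variable names are illustrative, not part of the diff.

import numpy as np

# distances of every sample to the hyperplane of each class, shape (m, n_classes)
distances = np.array(
    [
        [-0.1, 0.2, -0.3],
        [0.7, 0.01, -0.1],
        [0.7, -0.9, 0.5],
        [0.1, 0.2, 0.3],
    ]
)
y = np.array([0, 1, 1, 2])  # illustrative labels of the samples in the node

# min_distance / max_distance assign a class per sample, used to split the node
print(np.argmin(distances, axis=1))  # [2 2 1 0], as in the new test_min_distance
print(np.argmax(distances, axis=1))  # [1 0 0 2], as in the new test_max_distance

# max_samples returns signed distances to the hyperplane of the most populated
# class of the node (here class 1), so values can be positive or negative
_, counts = np.unique(y, return_counts=True)
print(distances[:, np.argmax(counts)])  # [ 0.2   0.01 -0.9   0.2 ]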


@@ -10,5 +10,4 @@ exclude_lines =
     if __name__ == .__main__.:
 ignore_errors = True
 omit =
-    stree/tests/*
     stree/__init__.py

.gitignore (vendored), 2 lines changed

@@ -131,3 +131,5 @@ dmypy.json
 .idea
 .vscode
 .pre-commit-config.yaml
+**.csv

File diff suppressed because one or more lines are too long


@@ -4,7 +4,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Test AdaBoost with different configurations"
+"# Test Stree with AdaBoost and Bagging with different configurations"
 ]
 },
 {
@@ -34,11 +34,8 @@
 "outputs": [],
 "source": [
 "import time\n",
-"from sklearn.ensemble import AdaBoostClassifier\n",
-"from sklearn.tree import DecisionTreeClassifier\n",
-"from sklearn.svm import LinearSVC, SVC\n",
-"from sklearn.model_selection import GridSearchCV, train_test_split\n",
-"from sklearn.datasets import load_iris\n",
+"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
+"from sklearn.model_selection import train_test_split\n",
 "from stree import Stree"
 ]
 },
@@ -57,12 +54,14 @@
 {
 "cell_type": "code",
 "execution_count": 4,
-"metadata": {},
+"metadata": {
+"tags": []
+},
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
+"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
 }
 ],
 "source": [
@@ -117,18 +116,20 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## STree alone on the whole dataset and linear kernel"
+"## STree alone with 100.000 samples and linear kernel"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 5,
-"metadata": {},
+"metadata": {
+"tags": []
+},
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
+"text": "Score Train: 0.9985784146480154\nScore Test: 0.9981093273185617\nTook 73.27 seconds\n"
 }
 ],
 "source": [
@@ -144,7 +145,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Different kernels with different configuations"
+"## Adaboost"
 ]
 },
 {
@@ -161,18 +162,20 @@
 {
 "cell_type": "code",
 "execution_count": 7,
-"metadata": {},
+"metadata": {
+"tags": []
+},
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
+"text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
 }
 ],
 "source": [
 "for kernel in ['linear', 'rbf', 'poly']:\n",
 " now = time.time()\n",
-" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
+" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
 " clf.fit(Xtrain, ytrain)\n",
 " score_train = clf.score(Xtrain, ytrain)\n",
 " score_test = clf.score(Xtest, ytest)\n",
@@ -183,24 +186,37 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
+"## Bagging"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 8,
 "metadata": {},
+"outputs": [],
+"source": [
+"n_estimators = 10\n",
+"C = 7\n",
+"max_depth = 3"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 9,
+"metadata": {
+"tags": []
+},
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
+"text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
 }
 ],
 "source": [
 "for kernel in ['linear', 'rbf', 'poly']:\n",
 " now = time.time()\n",
-" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
+" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
 " clf.fit(Xtrain, ytrain)\n",
 " score_train = clf.score(Xtrain, ytrain)\n",
 " score_test = clf.score(Xtest, ytest)\n",
@@ -223,7 +239,7 @@
 },
 "orig_nbformat": 2,
 "kernelspec": {
-"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
+"name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
 "display_name": "Python 3.7.6 64-bit ('general': venv)"
 }
 },
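Condensed, the ensemble cells above do the following: Stree is plugged in as the base estimator of an AdaBoostClassifier (pinned to algorithm="SAMME", which only needs hard predictions and sample_weight support from the base estimator) and of a BaggingClassifier. This is a hedged sketch on a synthetic toy dataset; the notebook itself runs on the credit card fraud data loaded earlier, with the hyperparameters shown in the cells.

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from stree import Stree

# toy stand-in for the fraud dataset used in the notebook
X, y = make_classification(n_samples=1000, random_state=2020)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=2020)

base = Stree(C=7, kernel="linear", max_depth=3, random_state=2020)

ensembles = {
    "AdaBoost": AdaBoostClassifier(
        base_estimator=base, algorithm="SAMME", n_estimators=10, random_state=2020
    ),
    "Bagging": BaggingClassifier(
        base_estimator=base, n_estimators=10, random_state=2020
    ),
}
for name, clf in ensembles.items():
    clf.fit(Xtrain, ytrain)
    print(name, clf.score(Xtrain, ytrain), clf.score(Xtest, ytest))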

File diff suppressed because one or more lines are too long


@@ -66,7 +66,8 @@
 "id": "z9Q-YUfBDZEq",
 "colab_type": "code",
 "colab": {},
-"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
+"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
+"tags": []
 },
 "source": [
 "random_state=1\n",
@@ -112,7 +113,7 @@
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
+"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
 }
 ]
 },
@@ -137,25 +138,25 @@
 " 'learning_rate': [.5, 1],\n",
 " 'base_estimator__tol': [.1, 1e-02],\n",
 " 'base_estimator__max_depth': [3, 5],\n",
-" 'base_estimator__C': [1, 3],\n",
+" 'base_estimator__C': [7, 55],\n",
 " 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
 "}"
 ],
-"execution_count": 9,
+"execution_count": 5,
 "outputs": []
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 6,
 "metadata": {},
 "outputs": [
 {
 "output_type": "execute_result",
 "data": {
-"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
+"text/plain": "{'C': 1.0,\n 'criterion': 'gini',\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_features': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'split_criteria': 'max_samples',\n 'splitter': 'random',\n 'tol': 0.0001}"
 },
 "metadata": {},
-"execution_count": 14
+"execution_count": 6
 }
 ],
 "source": [
@@ -168,28 +169,29 @@
 "id": "CrcB8o6EDZE5",
 "colab_type": "code",
 "colab": {},
-"outputId": "7703413a-d563-4289-a13b-532f38f82762"
+"outputId": "7703413a-d563-4289-a13b-532f38f82762",
+"tags": []
 },
 "source": [
 "random_state=2020\n",
-"clf = AdaBoostClassifier(random_state=random_state)\n",
+"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
 "grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
 "grid.fit(Xtrain, ytrain)"
 ],
-"execution_count": 11,
+"execution_count": 7,
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
+"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.0s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.4s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 3.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.3s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.6s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 8.1s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 9.4s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 10.1s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 11.1s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 12.3s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 13.6s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.9s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.2s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 17.6s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 19.1s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 25.9s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 30.4s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 36.7s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 38.1s\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 39.6s\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 41.9s\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 44.9s\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 48.2s\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 49.2s finished\n"
 },
 {
 "output_type": "execute_result",
 "data": {
-"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
+"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n 'base_estimator__C': [7, 55],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
 },
 "metadata": {},
-"execution_count": 11
+"execution_count": 7
 }
 ]
 },
@@ -199,19 +201,20 @@
 "id": "ZjX88NoYDZE8",
 "colab_type": "code",
 "colab": {},
-"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
+"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
+"tags": []
 },
 "source": [
 "print(\"Best estimator: \", grid.best_estimator_)\n",
 "print(\"Best hyperparameters: \", grid.best_params_)\n",
 "print(\"Best accuracy: \", grid.best_score_)"
 ],
-"execution_count": 16,
+"execution_count": 8,
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
+"text": "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9559440559440558\n"
 }
 ]
 }
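The grid search in this notebook reduces to the sketch below: AdaBoost (SAMME) wrapping Stree, with the "base_estimator__" prefix routing grid values to the inner estimator. The grid mirrors the one visible in the diff; Xtrain and ytrain are the splits prepared earlier in the notebook and are assumed here.

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from stree import Stree

parameters = {
    "base_estimator": [Stree()],
    "n_estimators": [10, 25],
    "learning_rate": [0.5, 1],
    "base_estimator__tol": [0.1, 1e-2],
    "base_estimator__max_depth": [3, 5],
    "base_estimator__C": [7, 55],
    "base_estimator__kernel": ["linear", "poly", "rbf"],
}  # 2 * 2 * 2 * 2 * 2 * 3 = 96 candidates, as in the output above
clf = AdaBoostClassifier(random_state=2020, algorithm="SAMME")
grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)
# grid.fit(Xtrain, ytrain)
# print(grid.best_params_, grid.best_score_)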


@@ -1,6 +1,6 @@
 import setuptools
-__version__ = "0.9rc4"
+__version__ = "0.9rc5"
 __author__ = "Ricardo Montañana Gómez"


@@ -10,6 +10,7 @@ import os
 import numbers
 import random
 import warnings
+from math import log
 from itertools import combinations
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -39,6 +40,7 @@ class Snode:
         features: np.array,
         impurity: float,
         title: str,
+        weight: np.ndarray = None,
     ):
         self._clf = clf
         self._title = title
@@ -50,7 +52,9 @@ class Snode:
         self._up = None
         self._class = None
         self._feature = None
-        self._sample_weight = None
+        self._sample_weight = (
+            weight if os.environ.get("TESTING", "NS") != "NS" else None
+        )
         self._features = features
         self._impurity = impurity
@@ -163,10 +167,10 @@ class Splitter:
                 f"criterion must be gini or entropy got({criterion})"
             )
-        if criteria not in ["min_distance", "max_samples"]:
+        if criteria not in ["min_distance", "max_samples", "max_distance"]:
             raise ValueError(
-                f"split_criteria has to be min_distance or \
-                    max_samples got ({criteria})"
+                "split_criteria has to be min_distance "
+                f"max_distance or max_samples got ({criteria})"
             )
         if splitter_type not in ["random", "best"]:
@@ -186,24 +190,47 @@ class Splitter:
     @staticmethod
     def _entropy(y: np.array) -> float:
-        _, count = np.unique(y, return_counts=True)
-        proportion = count / np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
+        n_labels = len(y)
+        if n_labels <= 1:
+            return 0
+        counts = np.bincount(y)
+        proportions = counts / n_labels
+        n_classes = np.count_nonzero(proportions)
+        if n_classes <= 1:
+            return 0
+        entropy = 0.0
+        # Compute standard entropy.
+        for prop in proportions:
+            if prop != 0.0:
+                entropy -= prop * log(prop, n_classes)
+        return entropy
     def information_gain(
-        self, labels_up: np.array, labels_dn: np.array
+        self, labels: np.array, labels_up: np.array, labels_dn: np.array
     ) -> float:
-        card_up = labels_up.shape[0] if labels_up is not None else 0
-        card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+        imp_prev = self.criterion_function(labels)
+        card_up = card_dn = imp_up = imp_dn = 0
+        if labels_up is not None:
+            card_up = labels_up.shape[0]
+            imp_up = self.criterion_function(labels_up)
+        if labels_dn is not None:
+            card_dn = labels_dn.shape[0] if labels_dn is not None else 0
+            imp_dn = self.criterion_function(labels_dn)
         samples = card_up + card_dn
-        up = card_up / samples * self.criterion_function(labels_up)
-        dn = card_dn / samples * self.criterion_function(labels_dn)
-        return up + dn
+        if samples == 0:
+            return 0.0
+        else:
+            result = (
+                imp_prev
+                - (card_up / samples) * imp_up
+                - (card_dn / samples) * imp_dn
+            )
+            return result
     def _select_best_set(
         self, dataset: np.array, labels: np.array, features_sets: list
     ) -> list:
-        min_impurity = 1
+        max_gain = 0
         selected = None
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         for feature_set in features_sets:
@@ -213,11 +240,11 @@ class Splitter:
             )
             self.partition(dataset, node)
             y1, y2 = self.part(labels)
-            impurity = self.information_gain(y1, y2)
-            if impurity < min_impurity:
-                min_impurity = impurity
+            gain = self.information_gain(labels, y1, y2)
+            if gain > max_gain:
+                max_gain = gain
                 selected = feature_set
-        return selected
+        return selected if selected is not None else feature_set
     def _get_subspaces_set(
         self, dataset: np.array, labels: np.array, max_features: int
@@ -226,8 +253,12 @@ class Splitter:
         features_sets = list(combinations(features, max_features))
         if len(features_sets) > 1:
             if self._splitter_type == "random":
-                return features_sets[random.randint(0, len(features_sets) - 1)]
+                index = random.randint(0, len(features_sets) - 1)
+                return features_sets[index]
             else:
+                # get only 3 sets at most
+                if len(features_sets) > 3:
+                    features_sets = random.sample(features_sets, 3)
                 return self._select_best_set(dataset, labels, features_sets)
         else:
             return features_sets[0]
@@ -242,21 +273,56 @@ class Splitter:
     @staticmethod
     def _min_distance(data: np.array, _) -> np.array:
-        # chooses the lowest distance of every sample
-        indices = np.argmin(np.abs(data), axis=1)
-        return np.array(
-            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
-        )
+        """Assign class to min distances
+        return a vector of classes so partition can separate class 0 from
+        the rest of classes, ie. class 0 goes to one splitted node and the
+        rest of classes go to the other
+        :param data: distances to hyper plane of every class
+        :type data: np.array (m, n_classes)
+        :param _: enable call compat with other measures
+        :type _: None
+        :return: vector with the class assigned to each sample
+        :rtype: np.array shape (m,)
+        """
+        return np.argmin(data, axis=1)
+    @staticmethod
+    def _max_distance(data: np.array, _) -> np.array:
+        """Assign class to max distances
+        return a vector of classes so partition can separate class 0 from
+        the rest of classes, ie. class 0 goes to one splitted node and the
+        rest of classes go to the other
+        :param data: distances to hyper plane of every class
+        :type data: np.array (m, n_classes)
+        :param _: enable call compat with other measures
+        :type _: None
+        :return: vector with the class assigned to each sample values
+        (can be 0, 1, ...)
+        :rtype: np.array shape (m,)
+        """
+        return np.argmax(data, axis=1)
     @staticmethod
     def _max_samples(data: np.array, y: np.array) -> np.array:
+        """return distances of the class with more samples
+        :param data: distances to hyper plane of every class
+        :type data: np.array (m, n_classes)
+        :param y: vector of labels (classes)
+        :type y: np.array (m,)
+        :return: vector with distances to hyperplane (can be positive or neg.)
+        :rtype: np.array shape (m,)
+        """
         # select the class with max number of samples
         _, samples = np.unique(y, return_counts=True)
         selected = np.argmax(samples)
         return data[:, selected]
     def partition(self, samples: np.array, node: Snode):
-        """Set the criteria to split arrays
+        """Set the criteria to split arrays. Compute the indices of the samples
+        that should go to one side of the tree (down)
         """
         data = self._distances(node, samples)
@@ -379,7 +445,9 @@ class Stree(BaseEstimator, ClassifierMixin):
         check_classification_targets(y)
         X, y = check_X_y(X, y)
-        sample_weight = _check_sample_weight(sample_weight, X)
+        sample_weight = _check_sample_weight(
+            sample_weight, X, dtype=np.float64
+        )
         check_classification_targets(y)
         # Initialize computed parameters
         self.splitter_ = Splitter(
@@ -439,13 +507,22 @@ class Stree(BaseEstimator, ClassifierMixin):
                 features=X.shape[1],
                 impurity=0.0,
                 title=title + ", <pure>",
+                weight=sample_weight,
             )
         # Train the model
         clf = self._build_clf()
         Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
+        # solve WARNING: class label 0 specified in weight is not found
+        # in bagging
+        if any(sample_weight == 0):
+            indices = sample_weight == 0
+            y_next = y[~indices]
+            # touch weights if removing any class
+            if np.unique(y_next).shape[0] != self.n_classes_:
+                sample_weight += 1e-5
         clf.fit(Xs, y, sample_weight=sample_weight)
         impurity = self.splitter_.impurity(y)
-        node = Snode(clf, X, y, features, impurity, title)
+        node = Snode(clf, X, y, features, impurity, title, sample_weight)
         self.depth_ = max(depth, self.depth_)
         self.splitter_.partition(X, node)
         X_U, X_D = self.splitter_.part(X)
@@ -460,6 +537,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                 features=X.shape[1],
                 impurity=impurity,
                 title=title + ", <cgaf>",
+                weight=sample_weight,
             )
         node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
         node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
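Two of the changes above are easier to follow with a worked example. _entropy now normalizes with a logarithm in base n_classes, so a uniform multi-class label set scores exactly 1, and information_gain now receives the parent labels and returns the parent impurity minus the cardinality-weighted impurities of the two partitions (higher is better, hence the switch from min_impurity to max_gain in _select_best_set). The sketch below reproduces the first case of the updated test_information_gain; the plain gini helper is an illustrative stand-in for Splitter._gini, which is not part of this diff.

import numpy as np

def gini(y):
    # impurity of a label vector: 1 - sum of squared class proportions
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / counts.sum()
    return 1.0 - np.sum(proportions ** 2)

def information_gain(labels, labels_up, labels_dn):
    # parent impurity minus the weighted impurities of the two partitions
    card_up, card_dn = len(labels_up), len(labels_dn)
    samples = card_up + card_dn
    return (
        gini(labels)
        - card_up / samples * gini(labels_up)
        - card_dn / samples * gini(labels_dn)
    )

up = np.array([0, 1, 1, 1, 1, 1])
dn = np.array([0, 0, 0, 1])
print(information_gain(np.append(up, dn), up, dn))  # ~0.1633, as in the test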


@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
             max_card = max(card)
             min_card = min(card)
             if len(classes) > 1:
-                try:
-                    belief = max_card / (max_card + min_card)
-                except ZeroDivisionError:
-                    belief = 0.0
+                belief = max_card / (max_card + min_card)
             else:
                 belief = 1
             self.assertEqual(belief, node._belief)


@@ -1,11 +1,11 @@
 import os
 import unittest
+import random
 import numpy as np
-from sklearn.svm import LinearSVC
+from sklearn.svm import SVC
+from sklearn.datasets import load_wine, load_iris
 from stree import Splitter
-from .utils import load_dataset
 class Splitter_test(unittest.TestCase):
@@ -15,7 +15,7 @@ class Splitter_test(unittest.TestCase):
     @staticmethod
     def build(
-        clf=LinearSVC(),
+        clf=SVC,
         min_samples_split=0,
         splitter_type="random",
         criterion="gini",
@@ -23,7 +23,7 @@ class Splitter_test(unittest.TestCase):
         random_state=None,
     ):
         return Splitter(
-            clf=clf,
+            clf=clf(random_state=random_state, kernel="rbf"),
            min_samples_split=min_samples_split,
            splitter_type=splitter_type,
            criterion=criterion,
@@ -43,10 +43,14 @@ class Splitter_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             self.build(criteria="duck")
         with self.assertRaises(ValueError):
-            self.build(clf=None)
+            _ = Splitter(clf=None)
         for splitter_type in ["best", "random"]:
             for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
@@ -57,30 +61,74 @@ class Splitter_test(unittest.TestCase):
                     self.assertEqual(criteria, tcl._criteria)
     def test_gini(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.48
-        self.assertEqual(expected, Splitter._gini(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
+            ([0], 0),
+            ([1, 1, 1, 1], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._gini(labels))
             tcl = self.build(criterion="gini")
-        self.assertEqual(expected, tcl.criterion_function(y))
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))
     def test_entropy(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.9709505944546686
-        self.assertAlmostEqual(expected, Splitter._entropy(y))
+        expected_values = [
+            ([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
+            ([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
+            ([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
+            ([0, 0, 1, 1, 1, 1, 0, 0], 1),
+            ([0, 0, 1, 1, 2, 2, 3, 3], 1),
+            ([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
+            ([1], 0),
+            ([0, 0, 0, 0], 0),
+        ]
+        for labels, expected in expected_values:
+            self.assertAlmostEqual(expected, Splitter._entropy(labels))
             tcl = self.build(criterion="entropy")
-        self.assertEqual(expected, tcl.criterion_function(y))
+            self.assertAlmostEqual(expected, tcl.criterion_function(labels))
     def test_information_gain(self):
-        yu = np.array([0, 1, 1, 1, 1, 1])
-        yd = np.array([0, 0, 0, 1])
-        values_expected = [
-            ("gini", 0.31666666666666665),
-            ("entropy", 0.7145247027726656),
+        expected_values = [
+            (
+                [0, 1, 1, 1, 1, 1],
+                [0, 0, 0, 1],
+                0.16333333333333333,
+                0.25642589168200297,
+            ),
+            (
+                [0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
+                [5, 3, 2, 1, 1],
+                0.007381776239907684,
+                -0.03328610916207225,
+            ),
+            ([], [], 0.0, 0.0),
+            ([1], [], 0.0, 0.0),
+            ([], [1], 0.0, 0.0),
+            ([0, 0, 0, 0], [0, 0], 0.0, 0.0),
+            ([], [1, 1, 1, 2], 0.0, 0.0),
+            (None, [1, 2, 3], 0.0, 0.0),
+            ([1, 2, 3], None, 0.0, 0.0),
         ]
-        for criterion, expected in values_expected:
-            tcl = self.build(criterion=criterion)
-            computed = tcl.information_gain(yu, yd)
-            self.assertAlmostEqual(expected, computed)
+        for yu, yd, expected_gini, expected_entropy in expected_values:
+            yu = np.array(yu, dtype=np.int32) if yu is not None else None
+            yd = np.array(yd, dtype=np.int32) if yd is not None else None
+            if yu is not None and yd is not None:
+                complete = np.append(yu, yd)
+            elif yd is not None:
+                complete = yd
+            else:
+                complete = yu
+            tcl = self.build(criterion="gini")
+            computed = tcl.information_gain(complete, yu, yd)
+            self.assertAlmostEqual(expected_gini, computed)
+            tcl = self.build(criterion="entropy")
+            computed = tcl.information_gain(complete, yu, yd)
+            self.assertAlmostEqual(expected_entropy, computed)
     def test_max_samples(self):
         tcl = self.build(criteria="max_samples")
@@ -108,34 +156,73 @@ class Splitter_test(unittest.TestCase):
                 [0.1, 0.2, 0.3],
             ]
         )
-        expected = np.array([-0.1, 0.01, 0.5, 0.1])
+        expected = np.array([2, 2, 1, 0])
         computed = tcl._min_distance(data, None)
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())
+    def test_max_distance(self):
+        tcl = self.build(criteria="max_distance")
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([1, 0, 0, 2])
+        computed = tcl._max_distance(data, None)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+    def test_best_splitter_few_sets(self):
+        X, y = load_iris(return_X_y=True)
+        X = np.delete(X, 3, 1)
+        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        dataset, computed = tcl.get_subspace(X, y, max_features=2)
+        self.assertListEqual([0, 2], list(computed))
+        self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
     def test_splitter_parameter(self):
         expected_values = [
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [1, 7, 9],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
-            [0, 5, 6],
+            [2, 3, 5, 7],  # best entropy min_distance
+            [0, 2, 4, 5],  # best entropy max_samples
+            [0, 2, 8, 12],  # best entropy max_distance
+            [1, 2, 5, 12],  # best gini min_distance
+            [0, 3, 4, 10],  # best gini max_samples
+            [1, 2, 9, 12],  # best gini max_distance
+            [3, 9, 11, 12],  # random entropy min_distance
+            [1, 5, 6, 9],  # random entropy max_samples
+            [1, 2, 4, 8],  # random entropy max_distance
+            [2, 6, 7, 12],  # random gini min_distance
+            [3, 9, 10, 11],  # random gini max_samples
+            [2, 5, 8, 12],  # random gini max_distance
         ]
-        X, y = load_dataset(self._random_state, n_features=12)
+        X, y = load_wine(return_X_y=True)
+        rn = 0
         for splitter_type in ["best", "random"]:
-            for criterion in ["gini", "entropy"]:
-                for criteria in ["min_distance", "max_samples"]:
+            for criterion in ["entropy", "gini"]:
+                for criteria in [
+                    "min_distance",
+                    "max_samples",
+                    "max_distance",
+                ]:
                     tcl = self.build(
                         splitter_type=splitter_type,
                         criterion=criterion,
                         criteria=criteria,
+                        random_state=self._random_state,
                     )
                     expected = expected_values.pop(0)
-                    dataset, computed = tcl.get_subspace(X, y, max_features=3)
+                    random.seed(rn)
+                    rn += 1
+                    dataset, computed = tcl.get_subspace(X, y, max_features=4)
+                    # print(
+                    #     "{}, # {:7s}{:8s}{:15s}".format(
+                    #         list(computed), splitter_type, criterion,
+                    #         criteria,
+                    #     )
+                    # )
                     self.assertListEqual(expected, list(computed))
                     self.assertListEqual(
                         X[:, computed].tolist(), dataset.tolist()


@@ -1,8 +1,10 @@
 import os
 import unittest
+import warnings
 import numpy as np
-from sklearn.datasets import load_iris
+from sklearn.datasets import load_iris, load_wine
+from sklearn.exceptions import ConvergenceWarning
 from stree import Stree, Snode
 from .utils import load_dataset
@@ -39,10 +41,7 @@ class Stree_test(unittest.TestCase):
         _, count_u = np.unique(y_up, return_counts=True)
         #
         for i in unique_y:
-            try:
-                number_down = count_d[i]
-            except IndexError:
-                number_down = 0
+            number_down = count_d[i]
             try:
                 number_up = count_u[i]
             except IndexError:
@@ -59,33 +58,12 @@ class Stree_test(unittest.TestCase):
     def test_build_tree(self):
         """Check if the tree is built the same way as predictions of models
         """
-        import warnings
         warnings.filterwarnings("ignore")
         for kernel in self._kernels:
             clf = Stree(kernel=kernel, random_state=self._random_state)
             clf.fit(*load_dataset(self._random_state))
             self._check_tree(clf.tree_)
-    @staticmethod
-    def _find_out(px: np.array, x_original: np.array, y_original) -> list:
-        """Find the original values of y for a given array of samples
-        Arguments:
-            px {np.array} -- array of samples to search for
-            x_original {np.array} -- original dataset
-            y_original {[type]} -- original classes
-        Returns:
-            np.array -- classes of the given samples
-        """
-        res = []
-        for needle in px:
-            for row in range(x_original.shape[0]):
-                if all(x_original[row, :] == needle):
-                    res.append(y_original[row])
-        return res
     def test_single_prediction(self):
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
@@ -102,22 +80,6 @@ class Stree_test(unittest.TestCase):
             yp = clf.fit(X, y).predict(X[:num, :])
             self.assertListEqual(y[:num].tolist(), yp.tolist())
-    def test_score(self):
-        X, y = load_dataset(self._random_state)
-        accuracies = [
-            0.9506666666666667,
-            0.9606666666666667,
-            0.9433333333333334,
-        ]
-        for kernel, accuracy_expected in zip(self._kernels, accuracies):
-            clf = Stree(random_state=self._random_state, kernel=kernel,)
-            clf.fit(X, y)
-            accuracy_score = clf.score(X, y)
-            yp = clf.predict(X)
-            accuracy_computed = np.mean(yp == y)
-            self.assertEqual(accuracy_score, accuracy_computed)
-            self.assertAlmostEqual(accuracy_expected, accuracy_score)
     def test_single_vs_multiple_prediction(self):
         """Check if predicting sample by sample gives the same result as
         predicting all samples at once
@@ -164,9 +126,6 @@ class Stree_test(unittest.TestCase):
     @staticmethod
     def test_is_a_sklearn_classifier():
-        import warnings
-        from sklearn.exceptions import ConvergenceWarning
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         warnings.filterwarnings("ignore", category=RuntimeWarning)
         from sklearn.utils.estimator_checks import check_estimator
@@ -239,6 +198,9 @@
                 "min_distance linear": 0.9533333333333334,
                 "min_distance rbf": 0.836,
                 "min_distance poly": 0.9473333333333334,
+                "max_distance linear": 0.9533333333333334,
+                "max_distance rbf": 0.836,
+                "max_distance poly": 0.9473333333333334,
             },
             "Iris": {
                 "max_samples linear": 0.98,
@@ -247,11 +209,14 @@
                 "min_distance linear": 0.98,
                 "min_distance rbf": 1.0,
                 "min_distance poly": 1.0,
+                "max_distance linear": 0.98,
+                "max_distance rbf": 1.0,
+                "max_distance poly": 1.0,
             },
         }
         for name, dataset in datasets.items():
             px, py = dataset
-            for criteria in ["max_samples", "min_distance"]:
+            for criteria in ["max_samples", "min_distance", "max_distance"]:
                 for kernel in self._kernels:
                     clf = Stree(
                         C=1e4,
@@ -322,13 +287,130 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             clf.predict(X[:, :3])
+    # Tests of score
+    def test_score_binary(self):
+        X, y = load_dataset(self._random_state)
+        accuracies = [
+            0.9506666666666667,
+            0.9606666666666667,
+            0.9433333333333334,
+        ]
+        for kernel, accuracy_expected in zip(self._kernels, accuracies):
+            clf = Stree(random_state=self._random_state, kernel=kernel,)
+            clf.fit(X, y)
+            accuracy_score = clf.score(X, y)
+            yp = clf.predict(X)
+            accuracy_computed = np.mean(yp == y)
+            self.assertEqual(accuracy_score, accuracy_computed)
+            self.assertAlmostEqual(accuracy_expected, accuracy_score)
     def test_score_max_features(self):
         X, y = load_dataset(self._random_state)
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
         self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
+    def test_score_multi_class(self):
+        warnings.filterwarnings("ignore")
+        accuracies = [
+            0.8258427,  # Wine linear min_distance
+            0.6741573,  # Wine linear max_distance
+            0.8314607,  # Wine linear max_samples
+            0.6629213,  # Wine rbf min_distance
+            1.0000000,  # Wine rbf max_distance
+            0.4044944,  # Wine rbf max_samples
+            0.9157303,  # Wine poly min_distance
+            1.0000000,  # Wine poly max_distance
+            0.7640449,  # Wine poly max_samples
+            0.9933333,  # Iris linear min_distance
+            0.9666667,  # Iris linear max_distance
+            0.9666667,  # Iris linear max_samples
+            0.9800000,  # Iris rbf min_distance
+            0.9800000,  # Iris rbf max_distance
+            0.9800000,  # Iris rbf max_samples
+            1.0000000,  # Iris poly min_distance
+            1.0000000,  # Iris poly max_distance
+            1.0000000,  # Iris poly max_samples
+            0.8993333,  # Synthetic linear min_distance
+            0.6533333,  # Synthetic linear max_distance
+            0.9313333,  # Synthetic linear max_samples
+            0.8320000,  # Synthetic rbf min_distance
+            0.6660000,  # Synthetic rbf max_distance
+            0.8320000,  # Synthetic rbf max_samples
+            0.6066667,  # Synthetic poly min_distance
+            0.6840000,  # Synthetic poly max_distance
+            0.6340000,  # Synthetic poly max_samples
+        ]
+        datasets = [
+            ("Wine", load_wine(return_X_y=True)),
+            ("Iris", load_iris(return_X_y=True)),
+            (
+                "Synthetic",
+                load_dataset(self._random_state, n_classes=3, n_features=5),
+            ),
+        ]
+        for dataset_name, dataset in datasets:
+            X, y = dataset
+            for kernel in self._kernels:
+                for criteria in [
+                    "min_distance",
+                    "max_distance",
+                    "max_samples",
+                ]:
+                    clf = Stree(
+                        C=17,
+                        random_state=self._random_state,
+                        kernel=kernel,
+                        split_criteria=criteria,
+                        degree=5,
+                        gamma="auto",
+                    )
+                    clf.fit(X, y)
+                    accuracy_score = clf.score(X, y)
+                    yp = clf.predict(X)
+                    accuracy_computed = np.mean(yp == y)
+                    # print(
+                    #     "{:.7f}, # {:7} {:5} {}".format(
+                    #         accuracy_score, dataset_name, kernel, criteria
+                    #     )
+                    # )
+                    accuracy_expected = accuracies.pop(0)
+                    self.assertEqual(accuracy_score, accuracy_computed)
+                    self.assertAlmostEqual(accuracy_expected, accuracy_score)
     def test_bogus_splitter_parameter(self):
         clf = Stree(splitter="duck")
         with self.assertRaises(ValueError):
             clf.fit(*load_dataset())
+    def test_weights_removing_class(self):
+        # This patch solves an stderr message from sklearn svm lib
+        # "WARNING: class label x specified in weight is not found"
+        X = np.array(
+            [
+                [0.1, 0.1],
+                [0.1, 0.2],
+                [0.2, 0.1],
+                [5, 6],
+                [8, 9],
+                [6, 7],
+                [0.2, 0.2],
+            ]
+        )
+        y = np.array([0, 0, 0, 1, 1, 1, 0])
+        epsilon = 1e-5
+        weights = [1, 1, 1, 0, 0, 0, 1]
+        weights = np.array(weights, dtype="float64")
+        weights_epsilon = [x + epsilon for x in weights]
+        weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
+        original = weights_no_zero.copy()
+        clf = Stree()
+        clf.fit(X, y)
+        node = clf.train(X, y, weights, 1, "test",)
+        # if a class is lost with zero weights the patch adds epsilon
+        self.assertListEqual(weights.tolist(), weights_epsilon)
+        self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
+        # zero weights are ok when they don't erase a class
+        _ = clf.train(X, y, weights_no_zero, 1, "test")
+        self.assertListEqual(weights_no_zero.tolist(), original.tolist())
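The last test exercises the patch introduced in commit 76723993fd and visible in the Stree.train hunk: when zero sample weights handed in by bagging would remove a whole class, every weight is nudged by 1e-5 so the underlying SVC still sees all class labels and libsvm stops emitting the warning. A standalone sketch of just that check, reusing the toy labels and weights from the test above:

import numpy as np

y = np.array([0, 0, 0, 1, 1, 1, 0])
sample_weight = np.array([1, 1, 1, 0, 0, 0, 1], dtype="float64")
n_classes_ = np.unique(y).shape[0]

if any(sample_weight == 0):
    y_next = y[sample_weight != 0]
    # only touch the weights if dropping the zero-weight samples loses a class
    if np.unique(y_next).shape[0] != n_classes_:
        # class 1 would vanish here, so every weight gets epsilon added
        sample_weight += 1e-5

print(sample_weight)  # all weights increased by 1e-5, no class label is lost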