mirror of https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 08:26:00 +00:00

Compare commits: 0.9rc5...mypy-stati (2 commits)

Commits:
- d1e30a3372
- fa001f97a4
File diff suppressed because one or more lines are too long
@@ -66,8 +66,7 @@
 "id": "z9Q-YUfBDZEq",
 "colab_type": "code",
 "colab": {},
-"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
-"tags": []
+"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
 },
 "source": [
 "random_state=1\n",
@@ -113,7 +112,7 @@
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
+"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
 }
 ]
 },
@@ -138,25 +137,25 @@
 " 'learning_rate': [.5, 1],\n",
 " 'base_estimator__tol': [.1, 1e-02],\n",
 " 'base_estimator__max_depth': [3, 5],\n",
-" 'base_estimator__C': [7, 55],\n",
+" 'base_estimator__C': [1, 3],\n",
 " 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
 "}"
 ],
-"execution_count": 5,
+"execution_count": 9,
 "outputs": []
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 14,
 "metadata": {},
 "outputs": [
 {
 "output_type": "execute_result",
 "data": {
-"text/plain": "{'C': 1.0,\n 'criterion': 'gini',\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_features': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'split_criteria': 'max_samples',\n 'splitter': 'random',\n 'tol': 0.0001}"
+"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
 },
 "metadata": {},
-"execution_count": 6
+"execution_count": 14
 }
 ],
 "source": [
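
For orientation, a sketch of the search object this notebook tunes, reconstructed from the fragments visible in this diff (the `parameters` name and `Stree` come from the notebook; the `stree` import path and the `base_estimator`/`n_estimators` entries are inferred from the GridSearchCV output further down, so treat them as assumptions):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from stree import Stree  # assumed import path

parameters = {
    "base_estimator": [Stree()],
    "n_estimators": [10, 25],
    "learning_rate": [0.5, 1],
    "base_estimator__tol": [0.1, 1e-2],   # routed to Stree.tol
    "base_estimator__max_depth": [3, 5],  # routed to Stree.max_depth
    "base_estimator__C": [1, 3],          # routed to Stree.C
    "base_estimator__kernel": ["linear", "poly", "rbf"],
}

Every `base_estimator__*` key is forwarded by scikit-learn through the AdaBoost wrapper to the inner Stree.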
@@ -169,29 +168,28 @@
 "id": "CrcB8o6EDZE5",
 "colab_type": "code",
 "colab": {},
-"outputId": "7703413a-d563-4289-a13b-532f38f82762",
-"tags": []
+"outputId": "7703413a-d563-4289-a13b-532f38f82762"
 },
 "source": [
 "random_state=2020\n",
-"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
+"clf = AdaBoostClassifier(random_state=random_state)\n",
 "grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
 "grid.fit(Xtrain, ytrain)"
 ],
-"execution_count": 7,
+"execution_count": 11,
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.0s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.4s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 3.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.3s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.6s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 8.1s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 9.4s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 10.1s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 11.1s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 12.3s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 13.6s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.9s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.2s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 17.6s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 19.1s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 25.9s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 30.4s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 36.7s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 38.1s\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 39.6s\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 41.9s\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 44.9s\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 48.2s\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 49.2s finished\n"
+"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
 },
 {
 "output_type": "execute_result",
 "data": {
-"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n 'base_estimator__C': [7, 55],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
+"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
 },
 "metadata": {},
-"execution_count": 7
+"execution_count": 11
 }
 ]
 },
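
Both logs report the same search size: the grid spans 2 × 2 × 2 × 2 × 2 × 3 = 96 candidates (n_estimators, learning_rate, tol, max_depth, C, kernel), and 5-fold cross-validation gives 96 × 5 = 480 fits; only the elapsed times differ between the two runs.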
@@ -201,20 +199,19 @@
 "id": "ZjX88NoYDZE8",
 "colab_type": "code",
 "colab": {},
-"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
-"tags": []
+"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
 },
 "source": [
 "print(\"Best estimator: \", grid.best_estimator_)\n",
 "print(\"Best hyperparameters: \", grid.best_params_)\n",
 "print(\"Best accuracy: \", grid.best_score_)"
 ],
-"execution_count": 8,
+"execution_count": 16,
 "outputs": [
 {
 "output_type": "stream",
 "name": "stdout",
-"text": "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9559440559440558\n"
+"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
 }
 ]
 }

setup.py (2 changed lines)

@@ -1,6 +1,6 @@
 import setuptools
 
-__version__ = "0.9rc5"
+__version__ = "0.9rc4"
 __author__ = "Ricardo Montañana Gómez"
 
 

stree/Strees.py (255 changed lines)
@@ -6,25 +6,24 @@ __version__ = "0.9"
 Build an oblique tree classifier based on SVM Trees
 """
 
+from __future__ import annotations
 import os
-import numbers
 import random
 import warnings
+from typing import Optional, List, Union, Tuple
 from math import log
 from itertools import combinations
-import numpy as np
-from sklearn.base import BaseEstimator, ClassifierMixin
-from sklearn.svm import SVC, LinearSVC
-from sklearn.utils import check_consistent_length
-from sklearn.utils.multiclass import check_classification_targets
-from sklearn.exceptions import ConvergenceWarning
-from sklearn.utils.validation import (
-    check_X_y,
-    check_array,
+import numpy as np  # type: ignore
+from sklearn.base import BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.svm import SVC, LinearSVC  # type: ignore
+from sklearn.utils.multiclass import (  # type: ignore
+    check_classification_targets,
+)
+from sklearn.exceptions import ConvergenceWarning  # type: ignore
+from sklearn.utils.validation import (  # type: ignore
     check_is_fitted,
     _check_sample_weight,
 )
-from sklearn.metrics._classification import _weighted_sum, _check_targets
 
 
 class Snode:
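
The trailing `# type: ignore` comments are the standard workaround for third-party packages that, at the time, shipped without type stubs: they keep mypy from reporting missing-stub errors on the numpy and scikit-learn imports while the rest of the module stays type-checked.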
@@ -34,7 +33,7 @@ class Snode:
 
     def __init__(
         self,
-        clf: SVC,
+        clf: Union[SVC, LinearSVC],
         X: np.ndarray,
         y: np.ndarray,
         features: np.array,
@@ -42,24 +41,25 @@ class Snode:
         title: str,
         weight: np.ndarray = None,
     ):
-        self._clf = clf
-        self._title = title
-        self._belief = 0.0
+        self._clf: Union[SVC, LinearSVC] = clf
+        self._title: str = title
+        self._belief: float = 0.0
         # Only store dataset in Testing
-        self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
-        self._y = y
-        self._down = None
-        self._up = None
+        self._X: Optional[np.array] = X if os.environ.get(
+            "TESTING", "NS"
+        ) != "NS" else None
+        self._y: np.array = y
+        self._down: Optional[Snode] = None
+        self._up: Optional[Snode] = None
         self._class = None
         self._feature = None
-        self._sample_weight = (
+        self._sample_weight: Optional[np.array] = (
             weight if os.environ.get("TESTING", "NS") != "NS" else None
         )
-        self._features = features
-        self._impurity = impurity
+        self._features: Tuple[int, ...] = features
+        self._impurity: float = impurity
 
     @classmethod
-    def copy(cls, node: "Snode") -> "Snode":
+    def copy(cls, node: Snode) -> Snode:
         return cls(
             node._clf,
             node._X,
@@ -69,22 +69,22 @@ class Snode:
             node._title,
         )
 
-    def set_down(self, son):
+    def set_down(self, son: Snode) -> None:
         self._down = son
 
-    def set_up(self, son):
+    def set_up(self, son: Snode) -> None:
         self._up = son
 
     def is_leaf(self) -> bool:
         return self._up is None and self._down is None
 
-    def get_down(self) -> "Snode":
+    def get_down(self) -> Optional[Snode]:
         return self._down
 
-    def get_up(self) -> "Snode":
+    def get_up(self) -> Optional[Snode]:
         return self._up
 
-    def make_predictor(self):
+    def make_predictor(self) -> None:
         """Compute the class of the predictor and its belief based on the
         subdataset of the node only if it is a leaf
         """
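
The unquoted `Snode` annotations above are enabled by the `from __future__ import annotations` added earlier in this diff (PEP 563): annotations stop being evaluated at class-definition time, so a class can reference itself directly. A minimal illustration, not project code:

from __future__ import annotations


class Node:
    def set_down(self, son: Node) -> None:  # no "Node" string quoting needed
        self._down = son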
@@ -123,11 +123,11 @@ class Siterator:
     """Stree preorder iterator
     """
 
-    def __init__(self, tree: Snode):
-        self._stack = []
+    def __init__(self, tree: Optional[Snode]):
+        self._stack: List[Snode] = []
         self._push(tree)
 
-    def _push(self, node: Snode):
+    def _push(self, node: Optional[Snode]) -> None:
         if node is not None:
             self._stack.append(node)
 
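
A usage sketch for the iterator (the dataset and the `stree` import path are illustrative assumptions; `Stree.__iter__`, shown later in this diff, wraps the fitted tree in a `Siterator`):

from sklearn.datasets import make_classification
from stree import Stree  # assumed import path

X, y = make_classification(random_state=0)
clf = Stree(random_state=0).fit(X, y)
for node in clf:   # __iter__ returns Siterator(self.tree_)
    print(node)    # visits every Snode once, root first (preorder)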
@@ -143,21 +143,21 @@ class Siterator:
 class Splitter:
     def __init__(
         self,
-        clf: SVC = None,
-        criterion: str = None,
-        splitter_type: str = None,
-        criteria: str = None,
-        min_samples_split: int = None,
-        random_state=None,
+        clf: Union[SVC, LinearSVC] = None,
+        criterion: str = "",
+        splitter_type: str = "",
+        criteria: str = "",
+        min_samples_split: int = 0,
+        random_state: Optional[int] = None,
     ):
-        self._clf = clf
-        self._random_state = random_state
+        self._clf: Union[SVC, LinearSVC] = clf
+        self._random_state: Optional[int] = random_state
         if random_state is not None:
             random.seed(random_state)
-        self._criterion = criterion
-        self._min_samples_split = min_samples_split
-        self._criteria = criteria
-        self._splitter_type = splitter_type
+        self._criterion: str = criterion
+        self._min_samples_split: int = min_samples_split
+        self._criteria: str = criteria
+        self._splitter_type: str = splitter_type
 
         if clf is None:
             raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
@@ -186,7 +186,7 @@ class Splitter:
     @staticmethod
     def _gini(y: np.array) -> float:
         _, count = np.unique(y, return_counts=True)
-        return 1 - np.sum(np.square(count / np.sum(count)))
+        return float(1 - np.sum(np.square(count / np.sum(count))))
 
     @staticmethod
     def _entropy(y: np.array) -> float:
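
A worked check of the expression being wrapped (the `float()` cast only converts the `np.float64` result so the `-> float` annotation is satisfied under mypy):

import numpy as np

y = np.array([0, 0, 1, 1, 1, 1])
_, count = np.unique(y, return_counts=True)  # count == [2, 4]
gini = float(1 - np.sum(np.square(count / np.sum(count))))
# 1 - ((2/6)**2 + (4/6)**2) = 1 - 5/9 = 4/9 ≈ 0.4444
print(gini)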
@@ -220,7 +220,7 @@ class Splitter:
         if samples == 0:
             return 0.0
         else:
-            result = (
+            result = float(
                 imp_prev
                 - (card_up / samples) * imp_up
                 - (card_dn / samples) * imp_dn
@@ -228,10 +228,13 @@ class Splitter:
         return result
 
     def _select_best_set(
-        self, dataset: np.array, labels: np.array, features_sets: list
-    ) -> list:
-        max_gain = 0
-        selected = None
+        self,
+        dataset: np.array,
+        labels: np.array,
+        features_sets: List[Tuple[int, ...]],
+    ) -> Tuple[int, ...]:
+        max_gain: float = 0.0
+        selected: Union[Tuple[int, ...], None] = None
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
         for feature_set in features_sets:
             self._clf.fit(dataset[:, feature_set], labels)
@@ -265,14 +268,14 @@ class Splitter:
 
     def get_subspace(
         self, dataset: np.array, labels: np.array, max_features: int
-    ) -> list:
+    ) -> Tuple[np.array, np.array]:
         """Return the best subspace to make a split
         """
         indices = self._get_subspaces_set(dataset, labels, max_features)
         return dataset[:, indices], indices
 
     @staticmethod
-    def _min_distance(data: np.array, _) -> np.array:
+    def _min_distance(data: np.array, _: np.array) -> np.array:
         """Assign class to min distances
 
         return a vector of classes so partition can separate class 0 from
@@ -288,7 +291,7 @@ class Splitter:
         return np.argmin(data, axis=1)
 
     @staticmethod
-    def _max_distance(data: np.array, _) -> np.array:
+    def _max_distance(data: np.array, _: np.array) -> np.array:
         """Assign class to max distances
 
         return a vector of classes so partition can separate class 0 from
@@ -320,7 +323,7 @@ class Splitter:
         selected = np.argmax(samples)
         return data[:, selected]
 
-    def partition(self, samples: np.array, node: Snode):
+    def partition(self, samples: np.array, node: Snode) -> None:
         """Set the criteria to split arrays. Compute the indices of the samples
         that should go to one side of the tree (down)
 
@@ -348,7 +351,7 @@ class Splitter:
         """
         return node._clf.decision_function(data[:, node._features])
 
-    def part(self, origin: np.array) -> list:
+    def part(self, origin: np.array) -> Tuple[np.array, np.array]:
         """Split an array in two based on indices (down) and its complement
 
         :param origin: dataset to split
@@ -359,13 +362,13 @@ class Splitter:
         :rtype: list
         """
         up = ~self._down
-        return [
+        return (
             origin[up] if any(up) else None,
             origin[self._down] if any(self._down) else None,
-        ]
+        )
 
 
-class Stree(BaseEstimator, ClassifierMixin):
+class Stree(BaseEstimator, ClassifierMixin):  # type: ignore
     """Estimator that is based on binary trees of svm nodes
     can deal with sample_weights in predict, used in boosting sklearn methods
     inheriting from BaseEstimator implements get_params and set_params methods
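
Returning a tuple rather than a list is what makes the new `Tuple[np.array, np.array]` annotation on `part` hold, and it matches the two-value unpacking (`x_u, x_d = self.splitter_.part(xp)`) used later in `predict`.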
@@ -378,42 +381,34 @@ class Stree(BaseEstimator, ClassifierMixin):
         C: float = 1.0,
         kernel: str = "linear",
         max_iter: int = 1000,
-        random_state: int = None,
-        max_depth: int = None,
+        random_state: Optional[int] = None,
+        max_depth: Optional[int] = None,
         tol: float = 1e-4,
         degree: int = 3,
-        gamma="scale",
+        gamma: Union[float, str] = "scale",
         split_criteria: str = "max_samples",
         criterion: str = "gini",
         min_samples_split: int = 0,
-        max_features=None,
+        max_features: Optional[Union[str, int, float]] = None,
         splitter: str = "random",
     ):
         self.max_iter = max_iter
-        self.C = C
-        self.kernel = kernel
-        self.random_state = random_state
-        self.max_depth = max_depth
-        self.tol = tol
-        self.gamma = gamma
-        self.degree = degree
-        self.min_samples_split = min_samples_split
-        self.split_criteria = split_criteria
-        self.max_features = max_features
-        self.criterion = criterion
-        self.splitter = splitter
-
-    def _more_tags(self) -> dict:
-        """Required by sklearn to supply features of the classifier
-
-        :return: the tag required
-        :rtype: dict
-        """
-        return {"requires_y": True}
+        self.C: float = C
+        self.kernel: str = kernel
+        self.random_state: Optional[int] = random_state
+        self.max_depth: Optional[int] = max_depth
+        self.tol: float = tol
+        self.gamma: Union[float, str] = gamma
+        self.degree: int = degree
+        self.min_samples_split: int = min_samples_split
+        self.split_criteria: str = split_criteria
+        self.max_features: Union[str, int, float, None] = max_features
+        self.criterion: str = criterion
+        self.splitter: str = splitter
 
     def fit(
         self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
-    ) -> "Stree":
+    ) -> Stree:
         """Build the tree based on the dataset of samples and its labels
 
         :param X: dataset of samples to make predictions
@@ -442,13 +437,11 @@ class Stree(BaseEstimator, ClassifierMixin):
                 f"Maximum depth has to be greater than 1... got (max_depth=\
                     {self.max_depth})"
             )
 
-        check_classification_targets(y)
-        X, y = check_X_y(X, y)
+        X, y = self._validate_data(X, y)
         sample_weight = _check_sample_weight(
             sample_weight, X, dtype=np.float64
         )
+        check_classification_targets(y)
         # Initialize computed parameters
         self.splitter_ = Splitter(
             clf=self._build_clf(),
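
The switch from `check_X_y` to `self._validate_data` relies on the estimator API added in scikit-learn 0.23: besides validating `X` and `y`, it records `n_features_in_` on the estimator, which is why the manual feature-count bookkeeping disappears in the next hunk.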
@@ -464,8 +457,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.n_classes_ = self.classes_.shape[0]
         self.n_iter_ = self.max_iter
         self.depth_ = 0
-        self.n_features_ = X.shape[1]
-        self.n_features_in_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
         self.tree_ = self.train(X, y, sample_weight, 1, "root")
         self._build_predictor()
@@ -478,7 +469,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         sample_weight: np.ndarray,
         depth: int,
         title: str,
-    ) -> Snode:
+    ) -> Optional[Snode]:
         """Recursive function to split the original dataset into predictor
         nodes (leaves)
 
@@ -539,15 +530,25 @@ class Stree(BaseEstimator, ClassifierMixin):
                 title=title + ", <cgaf>",
                 weight=sample_weight,
             )
-        node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
-        node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
+        node.set_up(
+            self.train(  # type: ignore
+                X_U, y_u, sw_u, depth + 1, title + " - Up"
+            )
+        )
+        node.set_down(
+            self.train(  # type: ignore
+                X_D, y_d, sw_d, depth + 1, title + " - Down"
+            )
+        )
         return node
 
-    def _build_predictor(self):
+    def _build_predictor(self) -> None:
         """Process the leaves to make them predictors
         """
 
-        def run_tree(node: Snode):
+        def run_tree(node: Optional[Snode]) -> None:
+            if node is None:
+                raise ValueError("Can't build predictors on None")
             if node.is_leaf():
                 node.make_predictor()
                 return
@@ -556,7 +557,7 @@ class Stree(BaseEstimator, ClassifierMixin):
 
         run_tree(self.tree_)
 
-    def _build_clf(self):
+    def _build_clf(self) -> Union[LinearSVC, SVC]:
         """ Build the correct classifier for the node
         """
         return (
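
The new return annotation names the two classifiers the method chooses between. A condensed sketch of that choice (an assumption based on the annotation; the real method forwards more of the estimator's hyperparameters):

from sklearn.svm import SVC, LinearSVC

def build_clf(kernel: str, C: float, tol: float, max_iter: int):
    # linear kernel -> the faster liblinear implementation,
    # anything else  -> kernelized SVC
    if kernel == "linear":
        return LinearSVC(C=C, tol=tol, max_iter=max_iter)
    return SVC(kernel=kernel, C=C, tol=tol, max_iter=max_iter)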
@@ -605,30 +606,30 @@ class Stree(BaseEstimator, ClassifierMixin):
         """
 
         def predict_class(
-            xp: np.array, indices: np.array, node: Snode
+            xp: np.array, indices: np.array, node: Optional[Snode]
         ) -> np.array:
             if xp is None:
                 return [], []
-            if node.is_leaf():
+            if node.is_leaf():  # type: ignore
                 # set a class for every sample in dataset
-                prediction = np.full((xp.shape[0], 1), node._class)
+                prediction = np.full(
+                    (xp.shape[0], 1), node._class  # type: ignore
+                )
                 return prediction, indices
-            self.splitter_.partition(xp, node)
+            self.splitter_.partition(xp, node)  # type: ignore
             x_u, x_d = self.splitter_.part(xp)
             i_u, i_d = self.splitter_.part(indices)
-            prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
-            prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
+            prx_u, prin_u = predict_class(
+                x_u, i_u, node.get_up()  # type: ignore
+            )
+            prx_d, prin_d = predict_class(
+                x_d, i_d, node.get_down()  # type: ignore
+            )
             return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
 
         # sklearn check
-        check_is_fitted(self, ["tree_"])
+        check_is_fitted(self, "n_features_in_")
         # Input validation
-        X = check_array(X)
-        if X.shape[1] != self.n_features_:
-            raise ValueError(
-                f"Expected {self.n_features_} features but got "
-                f"({X.shape[1]})"
-            )
+        X = self._validate_data(X, reset=False)
         # setup prediction & make it happen
         indices = np.arange(X.shape[0])
         result = (
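
With this change the hand-rolled feature-count check becomes unnecessary: `check_is_fitted(self, "n_features_in_")` and `self._validate_data(X, reset=False)` raise equivalent errors, since with `reset=False` scikit-learn itself compares `X.shape[1]` against the stored `n_features_in_`.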
@@ -638,32 +639,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         )
         return self.classes_[result]
 
-    def score(
-        self, X: np.array, y: np.array, sample_weight: np.array = None
-    ) -> float:
-        """Compute accuracy of the prediction
-
-        :param X: dataset of samples to make predictions
-        :type X: np.array
-        :param y_true: samples labels
-        :type y_true: np.array
-        :param sample_weight: weights of the samples. Rescale C per sample.
-        Hi' weights force the classifier to put more emphasis on these points
-        :type sample_weight: np.array optional
-        :return: accuracy of the prediction
-        :rtype: float
-        """
-        # sklearn check
-        check_is_fitted(self)
-        check_classification_targets(y)
-        X, y = check_X_y(X, y)
-        y_pred = self.predict(X).reshape(y.shape)
-        # Compute accuracy for each possible representation
-        _, y_true, y_pred = _check_targets(y, y_pred)
-        check_consistent_length(y_true, y_pred, sample_weight)
-        score = y_true == y_pred
-        return _weighted_sum(score, sample_weight, normalize=True)
-
     def __iter__(self) -> Siterator:
         """Create an iterator to be able to visit the nodes of the tree in
         preorder, can make a list with all the nodes in preorder
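
Removing `score` is safe because `Stree` inherits `ClassifierMixin.score`, which computes the same sample-weighted accuracy through `accuracy_score`; it also removes the need for scikit-learn's private `_weighted_sum` and `_check_targets` helpers, whose imports were dropped at the top of the file.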
@@ -691,11 +666,11 @@ class Stree(BaseEstimator, ClassifierMixin):
     def _initialize_max_features(self) -> int:
         if isinstance(self.max_features, str):
             if self.max_features == "auto":
-                max_features = max(1, int(np.sqrt(self.n_features_)))
+                max_features = max(1, int(np.sqrt(self.n_features_in_)))
             elif self.max_features == "sqrt":
-                max_features = max(1, int(np.sqrt(self.n_features_)))
+                max_features = max(1, int(np.sqrt(self.n_features_in_)))
             elif self.max_features == "log2":
-                max_features = max(1, int(np.log2(self.n_features_)))
+                max_features = max(1, int(np.log2(self.n_features_in_)))
             else:
                 raise ValueError(
                     "Invalid value for max_features. "
@@ -703,13 +678,13 @@ class Stree(BaseEstimator, ClassifierMixin):
                     "'sqrt' or 'log2'."
                 )
         elif self.max_features is None:
-            max_features = self.n_features_
-        elif isinstance(self.max_features, numbers.Integral):
+            max_features = self.n_features_in_
+        elif isinstance(self.max_features, int):
             max_features = self.max_features
         else:  # float
             if self.max_features > 0.0:
                 max_features = max(
-                    1, int(self.max_features * self.n_features_)
+                    1, int(self.max_features * self.n_features_in_)
                 )
             else:
                 raise ValueError(
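
A quick worked run of the resolution rules above, using `n_features_in_ = 16` as in the `(None, 16)` expectation pinned by the test hunk below:

import numpy as np

n_features_in_ = 16
print(max(1, int(np.sqrt(n_features_in_))))  # "auto" / "sqrt" -> 4
print(max(1, int(np.log2(n_features_in_))))  # "log2"          -> 4
print(n_features_in_)                        # None            -> 16
print(max(1, int(0.5 * n_features_in_)))     # 0.5 (float)     -> 8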

@@ -1,3 +1,4 @@
+# type: ignore
 import os
 import unittest
 

@@ -1,3 +1,4 @@
+# type: ignore
 import os
 import unittest
 import random

@@ -1,3 +1,4 @@
+# type: ignore
 import os
 import unittest
 import warnings
@@ -239,7 +240,7 @@ class Stree_test(unittest.TestCase):
             (None, 16),
         ]
         clf = Stree()
-        clf.n_features_ = n_features
+        clf.n_features_in_ = n_features
         for max_features, expected in expected_values:
             clf.set_params(**dict(max_features=max_features))
             computed = clf._initialize_max_features()
@@ -414,3 +415,24 @@ class Stree_test(unittest.TestCase):
         # zero weights are ok when they don't erase a class
         _ = clf.train(X, y, weights_no_zero, 1, "test")
         self.assertListEqual(weights_no_zero.tolist(), original.tolist())
+
+    def test_build_predictor(self):
+        X, y = load_dataset(self._random_state)
+        clf = Stree(random_state=self._random_state)
+        with self.assertRaises(ValueError):
+            clf.tree_ = None
+            clf._build_predictor()
+        clf.fit(X, y)
+        node = clf.tree_.get_down().get_down()
+        expected_impurity = 0.04686951386893923
+        expected_class = 1
+        expected_belief = 0.9759887005649718
+        self.assertAlmostEqual(expected_impurity, node._impurity)
+        self.assertAlmostEqual(expected_belief, node._belief)
+        self.assertEqual(expected_class, node._class)
+        node._belief = 0.0
+        node._class = None
+        clf._build_predictor()
+        node = clf.tree_.get_down().get_down()
+        self.assertAlmostEqual(expected_belief, node._belief)
+        self.assertEqual(expected_class, node._class)

@@ -1,3 +1,4 @@
+# type: ignore
 from .Stree_test import Stree_test
 from .Snode_test import Snode_test
 from .Splitter_test import Splitter_test

@@ -1,3 +1,4 @@
+# type: ignore
 from sklearn.datasets import make_classification
 
 