Compare commits


1 Commit

Author SHA1 Message Date
f5706c3159 Update version and notebooks 2020-06-28 10:44:29 +02:00
10 changed files with 230 additions and 281 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -66,7 +66,8 @@
"id": "z9Q-YUfBDZEq",
"colab_type": "code",
"colab": {},
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
"tags": []
},
"source": [
"random_state=1\n",
@@ -112,7 +113,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
}
]
},
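The new cell output above comes from rebalancing the highly skewed creditcard dataset (492 fraud vs. 284,315 valid samples) into a small working subset before the grid search. A minimal sketch of that kind of undersampling; the notebook's loading cell is suppressed in this diff, so file path, variable names and random_state here are assumptions:

```python
import pandas as pd

# Assumption: the Kaggle creditcard.csv file with a "Class" column
# (1 = fraud, 0 = valid); path and random_state are illustrative.
df = pd.read_csv("creditcard.csv")
fraud = df[df["Class"] == 1]                      # keep all 492 fraud rows
valid = df[df["Class"] == 0].sample(n=1000, random_state=2020)
subset = pd.concat([fraud, valid]).sample(frac=1, random_state=2020)
X = subset.drop(columns="Class").to_numpy()
y = subset["Class"].to_numpy()
print(f"Fraud: {y.mean():.3%} {int(y.sum())}")    # ~32.976% 492, as above
```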
@@ -137,25 +138,25 @@
" 'learning_rate': [.5, 1],\n",
" 'base_estimator__tol': [.1, 1e-02],\n",
" 'base_estimator__max_depth': [3, 5],\n",
" 'base_estimator__C': [1, 3],\n",
" 'base_estimator__C': [7, 55],\n",
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
"}"
],
"execution_count": 9,
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
"text/plain": "{'C': 1.0,\n 'criterion': 'gini',\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_features': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'split_criteria': 'max_samples',\n 'splitter': 'random',\n 'tol': 0.0001}"
},
"metadata": {},
"execution_count": 14
"execution_count": 6
}
],
"source": [
@@ -168,28 +169,29 @@
"id": "CrcB8o6EDZE5",
"colab_type": "code",
"colab": {},
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
"outputId": "7703413a-d563-4289-a13b-532f38f82762",
"tags": []
},
"source": [
"random_state=2020\n",
"clf = AdaBoostClassifier(random_state=random_state)\n",
"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
"grid.fit(Xtrain, ytrain)"
],
"execution_count": 11,
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.0s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.4s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 3.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.3s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.6s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 8.1s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 9.4s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 10.1s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 11.1s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 12.3s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 13.6s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.9s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.2s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 17.6s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 19.1s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 25.9s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 30.4s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 36.7s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 38.1s\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 39.6s\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 41.9s\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 44.9s\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 48.2s\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 49.2s finished\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n 'base_estimator__C': [7, 55],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
},
"metadata": {},
"execution_count": 11
"execution_count": 7
}
]
},
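The newly added `algorithm="SAMME"` is not cosmetic: the default SAMME.R variant in scikit-learn of this era requires the base estimator to implement `predict_proba`, which Stree does not provide at this point, whereas discrete SAMME only needs `predict`. A minimal sketch, with synthetic data standing in for the notebook's resampled subset:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from stree import Stree   # import path is an assumption

X, y = make_classification(n_samples=200, random_state=2020)

# Discrete SAMME only calls predict on the base estimator, so it works
# with Stree even though Stree lacks predict_proba.
clf = AdaBoostClassifier(
    base_estimator=Stree(max_depth=3),
    algorithm="SAMME",
    random_state=2020,
).fit(X, y)
print(clf.score(X, y))
```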
@@ -199,19 +201,20 @@
"id": "ZjX88NoYDZE8",
"colab_type": "code",
"colab": {},
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
"tags": []
},
"source": [
"print(\"Best estimator: \", grid.best_estimator_)\n",
"print(\"Best hyperparameters: \", grid.best_params_)\n",
"print(\"Best accuracy: \", grid.best_score_)"
],
"execution_count": 16,
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
"text": "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9559440559440558\n"
}
]
}

View File

@@ -1,6 +1,6 @@
import setuptools
__version__ = "0.9rc4"
__version__ = "0.9rc5"
__author__ = "Ricardo Montañana Gómez"

View File

@@ -6,24 +6,25 @@ __version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
"""
from __future__ import annotations
import os
import numbers
import random
import warnings
from typing import Optional, List, Union, Tuple
from math import log
from itertools import combinations
import numpy as np # type: ignore
from sklearn.base import BaseEstimator, ClassifierMixin # type: ignore
from sklearn.svm import SVC, LinearSVC # type: ignore
from sklearn.utils.multiclass import ( # type: ignore
check_classification_targets,
)
from sklearn.exceptions import ConvergenceWarning # type: ignore
from sklearn.utils.validation import ( # type: ignore
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import check_classification_targets
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.validation import (
check_X_y,
check_array,
check_is_fitted,
_check_sample_weight,
)
from sklearn.metrics._classification import _weighted_sum, _check_targets
class Snode:
@@ -33,7 +34,7 @@ class Snode:
def __init__(
self,
clf: Union[SVC, LinearSVC],
clf: SVC,
X: np.ndarray,
y: np.ndarray,
features: np.array,
@@ -41,25 +42,24 @@ class Snode:
title: str,
weight: np.ndarray = None,
):
self._clf: Union[SVC, LinearSVC] = clf
self._title: str = title
self._belief: float = 0.0
self._clf = clf
self._title = title
self._belief = 0.0
# Only store dataset in Testing
self._X: Optional[np.array] = X if os.environ.get(
"TESTING", "NS"
) != "NS" else None
self._y: np.array = y
self._down: Optional[Snode] = None
self._up: Optional[Snode] = None
self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
self._y = y
self._down = None
self._up = None
self._class = None
self._sample_weight: Optional[np.array] = (
self._feature = None
self._sample_weight = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features: Tuple[int, ...] = features
self._impurity: float = impurity
self._features = features
self._impurity = impurity
@classmethod
def copy(cls, node: Snode) -> Snode:
def copy(cls, node: "Snode") -> "Snode":
return cls(
node._clf,
node._X,
@@ -69,22 +69,22 @@ class Snode:
node._title,
)
def set_down(self, son: Snode) -> None:
def set_down(self, son):
self._down = son
def set_up(self, son: Snode) -> None:
def set_up(self, son):
self._up = son
def is_leaf(self) -> bool:
return self._up is None and self._down is None
def get_down(self) -> Optional[Snode]:
def get_down(self) -> "Snode":
return self._down
def get_up(self) -> Optional[Snode]:
def get_up(self) -> "Snode":
return self._up
def make_predictor(self) -> None:
def make_predictor(self):
"""Compute the class of the predictor and its belief based on the
subdataset of the node only if it is a leaf
"""
@@ -123,11 +123,11 @@ class Siterator:
"""Stree preorder iterator
"""
def __init__(self, tree: Optional[Snode]):
self._stack: List[Snode] = []
def __init__(self, tree: Snode):
self._stack = []
self._push(tree)
def _push(self, node: Optional[Snode]) -> None:
def _push(self, node: Snode):
if node is not None:
self._stack.append(node)
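Siterator is what backs `Stree.__iter__` further down in this diff, so a fitted tree can be walked in preorder directly. A short usage sketch (import path assumed):

```python
from sklearn.datasets import make_classification
from stree import Stree   # import path is an assumption

X, y = make_classification(n_samples=100, random_state=0)
clf = Stree(random_state=0).fit(X, y)

# The stack-based _push above yields every Snode of the tree in preorder.
for node in clf:
    print(node)   # relies on Snode's string representation
```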
@@ -143,21 +143,21 @@ class Siterator:
class Splitter:
def __init__(
self,
clf: Union[SVC, LinearSVC] = None,
criterion: str = "",
splitter_type: str = "",
criteria: str = "",
min_samples_split: int = 0,
random_state: Optional[int] = None,
clf: SVC = None,
criterion: str = None,
splitter_type: str = None,
criteria: str = None,
min_samples_split: int = None,
random_state=None,
):
self._clf: Union[SVC, LinearSVC] = clf
self._random_state: Optional[int] = random_state
self._clf = clf
self._random_state = random_state
if random_state is not None:
random.seed(random_state)
self._criterion: str = criterion
self._min_samples_split: int = min_samples_split
self._criteria: str = criteria
self._splitter_type: str = splitter_type
self._criterion = criterion
self._min_samples_split = min_samples_split
self._criteria = criteria
self._splitter_type = splitter_type
if clf is None:
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
@@ -186,7 +186,7 @@ class Splitter:
@staticmethod
def _gini(y: np.array) -> float:
_, count = np.unique(y, return_counts=True)
return float(1 - np.sum(np.square(count / np.sum(count))))
return 1 - np.sum(np.square(count / np.sum(count)))
@staticmethod
def _entropy(y: np.array) -> float:
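`_gini` above is the standard Gini impurity; `_entropy` is cut off by the hunk boundary, so the version below is an assumption based on the usual formula:

```python
import numpy as np

def gini(y: np.ndarray) -> float:
    # 1 - sum(p_k^2) over the class proportions, as in Splitter._gini
    _, count = np.unique(y, return_counts=True)
    p = count / count.sum()
    return float(1.0 - np.sum(p ** 2))

def entropy(y: np.ndarray) -> float:
    # -sum(p_k * log2(p_k)); assumed form, the diff truncates _entropy
    _, count = np.unique(y, return_counts=True)
    p = count / count.sum()
    return float(-np.sum(p * np.log2(p)))

y = np.array([0, 0, 0, 1])
print(gini(y))      # 0.375
print(entropy(y))   # ~0.811
```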
@@ -220,7 +220,7 @@ class Splitter:
if samples == 0:
return 0.0
else:
result = float(
result = (
imp_prev
- (card_up / samples) * imp_up
- (card_dn / samples) * imp_dn
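The expression above (its closing parenthesis lands in the next hunk) is the usual size-weighted impurity decrease; spelled out with the same names:

```python
def information_gain(imp_prev, imp_up, imp_dn, card_up, card_dn):
    # parent impurity minus the size-weighted impurities of both partitions
    samples = card_up + card_dn
    if samples == 0:
        return 0.0
    return (
        imp_prev
        - (card_up / samples) * imp_up
        - (card_dn / samples) * imp_dn
    )

# A pure split of a balanced parent recovers the full parent impurity.
print(information_gain(0.5, 0.0, 0.0, 10, 10))   # 0.5
```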
@@ -228,13 +228,10 @@ class Splitter:
return result
def _select_best_set(
self,
dataset: np.array,
labels: np.array,
features_sets: List[Tuple[int, ...]],
) -> Tuple[int, ...]:
max_gain: float = 0.0
selected: Union[Tuple[int, ...], None] = None
self, dataset: np.array, labels: np.array, features_sets: list
) -> list:
max_gain = 0
selected = None
warnings.filterwarnings("ignore", category=ConvergenceWarning)
for feature_set in features_sets:
self._clf.fit(dataset[:, feature_set], labels)
@@ -268,14 +265,14 @@ class Splitter:
def get_subspace(
self, dataset: np.array, labels: np.array, max_features: int
) -> Tuple[np.array, np.array]:
) -> list:
"""Return the best subspace to make a split
"""
indices = self._get_subspaces_set(dataset, labels, max_features)
return dataset[:, indices], indices
@staticmethod
def _min_distance(data: np.array, _: np.array) -> np.array:
def _min_distance(data: np.array, _) -> np.array:
"""Assign class to min distances
return a vector of classes so partition can separate class 0 from
@@ -291,7 +288,7 @@ class Splitter:
return np.argmin(data, axis=1)
@staticmethod
def _max_distance(data: np.array, _: np.array) -> np.array:
def _max_distance(data: np.array, _) -> np.array:
"""Assign class to max distances
return a vector of classes so partition can separate class 0 from
@@ -323,7 +320,7 @@ class Splitter:
selected = np.argmax(samples)
return data[:, selected]
def partition(self, samples: np.array, node: Snode) -> None:
def partition(self, samples: np.array, node: Snode):
"""Set the criteria to split arrays. Compute the indices of the samples
that should go to one side of the tree (down)
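`_min_distance` and `_max_distance` reduce the matrix that `decision_function` returns for a multiclass node (one column per class) to a per-sample column index, which `partition` then uses to send each sample down one side of the tree. A toy illustration with invented margins:

```python
import numpy as np

# Toy (n_samples, n_classes) decision_function output; values are invented.
distances = np.array([
    [0.3, -1.2, 2.0],
    [-0.5, 0.7, 0.1],
])

print(np.argmin(distances, axis=1))   # [1 0]  <- _min_distance
print(np.argmax(distances, axis=1))   # [2 1]  <- _max_distance
```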
@@ -351,7 +348,7 @@ class Splitter:
"""
return node._clf.decision_function(data[:, node._features])
def part(self, origin: np.array) -> Tuple[np.array, np.array]:
def part(self, origin: np.array) -> list:
"""Split an array in two based on indices (down) and its complement
:param origin: dataset to split
@@ -362,13 +359,13 @@ class Splitter:
:rtype: list
"""
up = ~self._down
return (
return [
origin[up] if any(up) else None,
origin[self._down] if any(self._down) else None,
)
]
class Stree(BaseEstimator, ClassifierMixin): # type: ignore
class Stree(BaseEstimator, ClassifierMixin):
"""Estimator that is based on binary trees of svm nodes
can deal with sample_weights in predict, used in boosting sklearn methods
inheriting from BaseEstimator implements get_params and set_params methods
@@ -381,34 +378,42 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
C: float = 1.0,
kernel: str = "linear",
max_iter: int = 1000,
random_state: Optional[int] = None,
max_depth: Optional[int] = None,
random_state: int = None,
max_depth: int = None,
tol: float = 1e-4,
degree: int = 3,
gamma: Union[float, str] = "scale",
gamma="scale",
split_criteria: str = "max_samples",
criterion: str = "gini",
min_samples_split: int = 0,
max_features: Optional[Union[str, int, float]] = None,
max_features=None,
splitter: str = "random",
):
self.max_iter = max_iter
self.C: float = C
self.kernel: str = kernel
self.random_state: Optional[int] = random_state
self.max_depth: Optional[int] = max_depth
self.tol: float = tol
self.gamma: Union[float, str] = gamma
self.degree: int = degree
self.min_samples_split: int = min_samples_split
self.split_criteria: str = split_criteria
self.max_features: Union[str, int, float, None] = max_features
self.criterion: str = criterion
self.splitter: str = splitter
self.C = C
self.kernel = kernel
self.random_state = random_state
self.max_depth = max_depth
self.tol = tol
self.gamma = gamma
self.degree = degree
self.min_samples_split = min_samples_split
self.split_criteria = split_criteria
self.max_features = max_features
self.criterion = criterion
self.splitter = splitter
def _more_tags(self) -> dict:
"""Required by sklearn to supply features of the classifier
:return: the tag required
:rtype: dict
"""
return {"requires_y": True}
def fit(
self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
) -> Stree:
) -> "Stree":
"""Build the tree based on the dataset of samples and its labels
:param X: dataset of samples to make predictions
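Dropping the attribute annotations does not change the contract being honored in `__init__` above: scikit-learn expects the constructor to store every argument verbatim under its own name, because `get_params`/`set_params` (inherited from BaseEstimator, as the class docstring notes) and `clone` discover parameters by introspecting the signature. A quick check (import path assumed):

```python
from sklearn.base import clone
from stree import Stree   # import path is an assumption

clf = Stree(C=7, kernel="rbf", max_depth=5)
print(clf.get_params()["C"])   # 7, read straight from self.C
twin = clone(clf)              # rebuilds an unfitted Stree from get_params()
print(twin.C, twin.kernel)     # 7 rbf
```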
@@ -437,11 +442,13 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
f"Maximum depth has to be greater than 1... got (max_depth=\
{self.max_depth})"
)
check_classification_targets(y)
X, y = self._validate_data(X, y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
check_classification_targets(y)
# Initialize computed parameters
self.splitter_ = Splitter(
clf=self._build_clf(),
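The reordered calls above are the standard scikit-learn opening of `fit`. Isolated into a sketch (not the project's literal code; note `_check_sample_weight` is a private sklearn helper):

```python
import numpy as np
from sklearn.utils.validation import check_X_y, _check_sample_weight
from sklearn.utils.multiclass import check_classification_targets

def validate_fit_inputs(X, y, sample_weight=None):
    # check_X_y coerces X to 2-D numeric and y to 1-D of matching length;
    # _check_sample_weight fills in uniform weights when none are given;
    # check_classification_targets rejects continuous targets.
    X, y = check_X_y(X, y)
    sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float64)
    check_classification_targets(y)
    return X, y, sample_weight

X, y, sw = validate_fit_inputs([[0.0], [1.0]], [0, 1])
print(sw)   # [1. 1.]
```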
@@ -457,6 +464,8 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
self.n_classes_ = self.classes_.shape[0]
self.n_iter_ = self.max_iter
self.depth_ = 0
self.n_features_ = X.shape[1]
self.n_features_in_ = X.shape[1]
self.max_features_ = self._initialize_max_features()
self.tree_ = self.train(X, y, sample_weight, 1, "root")
self._build_predictor()
@@ -469,7 +478,7 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
sample_weight: np.ndarray,
depth: int,
title: str,
) -> Optional[Snode]:
) -> Snode:
"""Recursive function to split the original dataset into predictor
nodes (leaves)
@@ -530,25 +539,15 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(
self.train( # type: ignore
X_U, y_u, sw_u, depth + 1, title + " - Up"
)
)
node.set_down(
self.train( # type: ignore
X_D, y_d, sw_d, depth + 1, title + " - Down"
)
)
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
return node
def _build_predictor(self) -> None:
def _build_predictor(self):
"""Process the leaves to make them predictors
"""
def run_tree(node: Optional[Snode]) -> None:
if node is None:
raise ValueError("Can't build predictors on None")
def run_tree(node: Snode):
if node.is_leaf():
node.make_predictor()
return
@@ -557,7 +556,7 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
run_tree(self.tree_)
def _build_clf(self) -> Union[LinearSVC, SVC]:
def _build_clf(self):
""" Build the correct classifier for the node
"""
return (
@@ -606,30 +605,30 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
"""
def predict_class(
xp: np.array, indices: np.array, node: Optional[Snode]
xp: np.array, indices: np.array, node: Snode
) -> np.array:
if xp is None:
return [], []
if node.is_leaf(): # type: ignore
if node.is_leaf():
# set a class for every sample in dataset
prediction = np.full(
(xp.shape[0], 1), node._class # type: ignore
)
prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices
self.splitter_.partition(xp, node) # type: ignore
self.splitter_.partition(xp, node)
x_u, x_d = self.splitter_.part(xp)
i_u, i_d = self.splitter_.part(indices)
prx_u, prin_u = predict_class(
x_u, i_u, node.get_up() # type: ignore
)
prx_d, prin_d = predict_class(
x_d, i_d, node.get_down() # type: ignore
)
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
check_is_fitted(self, "n_features_in_")
# sklearn check
check_is_fitted(self, ["tree_"])
# Input validation
X = self._validate_data(X, reset=False)
X = check_array(X)
if X.shape[1] != self.n_features_:
raise ValueError(
f"Expected {self.n_features_} features but got "
f"({X.shape[1]})"
)
# setup prediction & make it happen
indices = np.arange(X.shape[0])
result = (
@@ -639,6 +638,32 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
)
return self.classes_[result]
def score(
self, X: np.array, y: np.array, sample_weight: np.array = None
) -> float:
"""Compute accuracy of the prediction
:param X: dataset of samples to make predictions
:type X: np.array
:param y: true labels of the samples
:type y: np.array
:param sample_weight: weights of the samples. Rescale C per sample.
High weights force the classifier to put more emphasis on these points
:type sample_weight: np.array optional
:return: accuracy of the prediction
:rtype: float
"""
# sklearn check
check_is_fitted(self)
check_classification_targets(y)
X, y = check_X_y(X, y)
y_pred = self.predict(X).reshape(y.shape)
# Compute accuracy for each possible representation
_, y_true, y_pred = _check_targets(y, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
score = y_true == y_pred
return _weighted_sum(score, sample_weight, normalize=True)
def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in
preorder, can make a list with all the nodes in preorder
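`_weighted_sum` and `_check_targets`, used by the new `score` above, are private sklearn internals, so the same accuracy is available from the public API; a sketch of an equivalent score (hypothetical free function, not the project's code):

```python
from sklearn.metrics import accuracy_score

def score(clf, X, y, sample_weight=None):
    # accuracy_score computes the same weighted mean of (y_true == y_pred)
    # that the _weighted_sum call above produces with normalize=True.
    return accuracy_score(y, clf.predict(X), sample_weight=sample_weight)
```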
@@ -666,11 +691,11 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
def _initialize_max_features(self) -> int:
if isinstance(self.max_features, str):
if self.max_features == "auto":
max_features = max(1, int(np.sqrt(self.n_features_in_)))
max_features = max(1, int(np.sqrt(self.n_features_)))
elif self.max_features == "sqrt":
max_features = max(1, int(np.sqrt(self.n_features_in_)))
max_features = max(1, int(np.sqrt(self.n_features_)))
elif self.max_features == "log2":
max_features = max(1, int(np.log2(self.n_features_in_)))
max_features = max(1, int(np.log2(self.n_features_)))
else:
raise ValueError(
"Invalid value for max_features. "
@@ -678,13 +703,13 @@ class Stree(BaseEstimator, ClassifierMixin): # type: ignore
"'sqrt' or 'log2'."
)
elif self.max_features is None:
max_features = self.n_features_in_
elif isinstance(self.max_features, int):
max_features = self.n_features_
elif isinstance(self.max_features, numbers.Integral):
max_features = self.max_features
else: # float
if self.max_features > 0.0:
max_features = max(
1, int(self.max_features * self.n_features_in_)
1, int(self.max_features * self.n_features_)
)
else:
raise ValueError(
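The branching above resolves `max_features` the way scikit-learn's tree ensembles do, and it is what the updated test further down asserts (e.g. `(None, 16)` on a 16-feature dataset). A standalone sketch of the same resolution:

```python
import math

def resolve_max_features(max_features, n_features):
    # mirrors Stree._initialize_max_features for the cases shown above
    if isinstance(max_features, str):
        if max_features in ("auto", "sqrt"):
            return max(1, int(math.sqrt(n_features)))
        if max_features == "log2":
            return max(1, int(math.log2(n_features)))
        raise ValueError("max_features must be 'auto', 'sqrt' or 'log2'")
    if max_features is None:
        return n_features
    if isinstance(max_features, int):
        return max_features
    if max_features > 0.0:           # float: fraction of the features
        return max(1, int(max_features * n_features))
    raise ValueError("max_features must be > 0.0")

for mf in (None, "sqrt", "log2", 0.5, 4):
    print(mf, resolve_max_features(mf, 16))   # 16 4 4 8 4
```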

View File

@@ -1,4 +1,3 @@
# type: ignore
import os
import unittest

View File

@@ -1,4 +1,3 @@
# type: ignore
import os
import unittest
import random

View File

@@ -1,4 +1,3 @@
# type: ignore
import os
import unittest
import warnings
@@ -240,7 +239,7 @@ class Stree_test(unittest.TestCase):
(None, 16),
]
clf = Stree()
clf.n_features_in_ = n_features
clf.n_features_ = n_features
for max_features, expected in expected_values:
clf.set_params(**dict(max_features=max_features))
computed = clf._initialize_max_features()
@@ -415,24 +414,3 @@ class Stree_test(unittest.TestCase):
# zero weights are ok when they don't erase a class
_ = clf.train(X, y, weights_no_zero, 1, "test")
self.assertListEqual(weights_no_zero.tolist(), original.tolist())
def test_build_predictor(self):
X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state)
with self.assertRaises(ValueError):
clf.tree_ = None
clf._build_predictor()
clf.fit(X, y)
node = clf.tree_.get_down().get_down()
expected_impurity = 0.04686951386893923
expected_class = 1
expected_belief = 0.9759887005649718
self.assertAlmostEqual(expected_impurity, node._impurity)
self.assertAlmostEqual(expected_belief, node._belief)
self.assertEqual(expected_class, node._class)
node._belief = 0.0
node._class = None
clf._build_predictor()
node = clf.tree_.get_down().get_down()
self.assertAlmostEqual(expected_belief, node._belief)
self.assertEqual(expected_class, node._class)

View File

@@ -1,4 +1,3 @@
# type: ignore
from .Stree_test import Stree_test
from .Snode_test import Snode_test
from .Splitter_test import Splitter_test

View File

@@ -1,4 +1,3 @@
# type: ignore
from sklearn.datasets import make_classification