Make training parallel

add pre-commit config
This commit is contained in:
Ricardo Montañana Gómez 2020-07-21 13:11:59 +02:00
parent b17582e93a
commit 8f7cbc9091
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
4 changed files with 85 additions and 58 deletions

View File

@ -3,8 +3,20 @@ repos:
rev: stable
hooks:
- id: black
language_version: python3.7
language_version: python3.8
- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
hooks:
- id: flake8
- id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: '' # Use the sha / tag you want to point at
hooks:
- id: mypy
args: [--strict, --ignore-missing-imports]
- repo: local
hooks:
- id: unittest
name: unittest
entry: python -m unittest discover
language: system
pass_filenames: false

View File

@ -10,23 +10,22 @@ import random
import sys
from typing import Union, Optional, Tuple, List
from itertools import combinations
import numpy as np # type: ignore
from sklearn.utils.multiclass import ( # type: ignore
check_classification_targets,
)
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
from sklearn.ensemble import BaseEnsemble # type: ignore
from sklearn.utils.validation import ( # type: ignore
import numpy as np
from sklearn.utils.multiclass import check_classification_targets
from sklearn.base import clone, BaseEstimator, ClassifierMixin
from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import (
check_is_fitted,
_check_sample_weight,
)
from stree import Stree # type: ignore
from joblib import Parallel, delayed
from stree import Stree
class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
def __init__(
self,
n_jobs: int = 1,
base_estimator: BaseEstimator = None,
random_state: int = 0,
max_features: Optional[Union[str, int, float]] = None,
@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
super().__init__(
base_estimator=base_estimator, n_estimators=n_estimators,
)
self.n_jobs = n_jobs
self.n_estimators = n_estimators
self.random_state = random_state
self.max_features = max_features
@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
return np.random.mtrand._rand
return np.random.RandomState(self.random_state)
@staticmethod
def _initialize_sample_weight(
sample_weight: np.array, n_samples: int
) -> np.array:
if sample_weight is None:
return np.ones((n_samples,), dtype=np.float64)
return sample_weight.copy()
def _validate_estimator(self) -> None:
"""Check the estimator and set the base_estimator_ attribute."""
super()._validate_estimator(
@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
)
check_classification_targets(y)
X, y = self._validate_data(X, y)
# _check_sample_weight returns an array of ones when sample_weight is None
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
@ -90,35 +83,60 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
self.n_classes_: int = self.classes_.shape[0]
self.estimators_: List[BaseEstimator] = []
self.subspaces_: List[Tuple[int, ...]] = []
self._train(X, y, sample_weight)
result = self._train(X, y, sample_weight)
self.estimators_, self.subspaces_ = tuple( # type: ignore
zip(*result)
)
return self
def _train(
self, X: np.array, y: np.array, sample_weight: np.array
) -> None:
random_box = self._initialize_random()
random_seed = self.random_state
@staticmethod
def _parallel_build_tree(
    base_estimator_: Stree,
    X: np.array,
    y: np.array,
    weights: np.array,
    random_box: np.random.mtrand.RandomState,
    random_seed: int,
    boot_samples: int,
    max_features: int,
) -> Tuple[BaseEstimator, Tuple[int, ...]]:
    """Fit one estimator on a bootstrap sample and a feature subset.

    A clone of base_estimator_ gets random_state=random_seed, is fitted on
    boot_samples rows drawn with random_box and on the columns chosen by
    Odte._get_random_subspace, and is returned together with those columns.
    """
    tree = clone(base_estimator_)
    tree.set_params(random_state=random_seed)
    total_rows = X.shape[0]
    # bootstrap: boot_samples row indices drawn from [0, total_rows)
    chosen = random_box.randint(0, total_rows, boot_samples)
    # scale every sample weight by the number of times its row was drawn
    draw_counts = np.bincount(chosen, minlength=total_rows)
    scaled_weights = weights * draw_counts
    # random subspace
    subspace = Odte._get_random_subspace(X, y, max_features)
    # train the classifier on the selected rows and columns
    sampled_rows = X[chosen, :]
    tree.fit(sampled_rows[:, subspace], y[chosen], scaled_weights[chosen])
    return (tree, subspace)
def _train(
self, X: np.array, y: np.array, weights: np.array
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
random_box = self._initialize_random()
n_samples = X.shape[0]
weights = self._initialize_sample_weight(sample_weight, n_samples)
boot_samples = self._get_bootstrap_n_samples(n_samples)
for _ in range(self.n_estimators):
# Build clf
clf = clone(self.base_estimator_)
clf.random_state = random_seed
random_seed += 1
self.estimators_.append(clf)
# bootstrap
indices = random_box.randint(0, n_samples, boot_samples)
# update weights with the chosen samples
weights_update = np.bincount(indices, minlength=n_samples)
features = self._get_random_subspace(X, y)
self.subspaces_.append(features)
current_weights = weights * weights_update
# train the classifier
bootstrap = X[indices, :]
clf.fit(
bootstrap[:, features], y[indices], current_weights[indices]
clf = clone(self.base_estimator_)
return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore
delayed(Odte._parallel_build_tree)(
clf,
X,
y,
weights,
random_box,
random_seed,
boot_samples,
self.max_features_,
)
for random_seed in range(
self.random_state, self.random_state + self.n_estimators
)
)
def _get_bootstrap_n_samples(self, n_samples: int) -> int:
if self.max_samples is None:
@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
)
return max_features
@staticmethod
def _get_random_subspace(
self, dataset: np.array, labels: np.array
dataset: np.array, labels: np.array, max_features: int
) -> Tuple[int, ...]:
features = range(dataset.shape[1])
features_sets = list(combinations(features, self.max_features_))
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1:
index = random.randint(0, len(features_sets) - 1)
return features_sets[index]

View File

@ -1,5 +1,6 @@
# type: ignore
import unittest
import numpy as np
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
computed = tclf._get_bootstrap_n_samples(1500)
self.assertEqual(expected, computed)
def test_initialize_sample_weight(self):
    """None must yield a vector of ones; given weights must come back equal."""
    size = 5
    unit_weights = np.ones(size,)
    random_weights = np.random.rand(size,)
    cases = [(None, unit_weights), (random_weights, random_weights)]
    for given, wanted in cases:
        model = Odte()
        result = model._initialize_sample_weight(given, size)
        self.assertListEqual(wanted.tolist(), result.tolist())
def test_initialize_max_feature(self):
expected_values = [
[0, 5, 6, 15],
@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
random_state=self._random_state, max_features=max_features
)
tclf.fit(X, y)
computed = tclf._get_random_subspace(X, y)
computed = tclf._get_random_subspace(X, y, tclf.max_features_)
expected = expected_values.pop(0)
self.assertListEqual(expected, list(computed))
@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
tclf.fit(*load_dataset(self._random_state))
def test_simple_predict(self):
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
expected = [0, 1, 1, 1]
tclf = Odte(random_state=self._random_state, n_estimators=10,)
tclf = Odte(
random_state=self._random_state, n_estimators=10, n_jobs=-1
)
tclf.set_params(
**dict(
base_estimator__kernel="rbf",
@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):
@staticmethod
def test_is_a_sklearn_classifier():
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator

View File

@ -1,3 +1,4 @@
# type: ignore
from .Odte_tests import Odte_test
__all__ = ["Odte_test"]