Make training parallel

add pre-commit config
This commit is contained in:
Ricardo Montañana Gómez 2020-07-21 13:11:59 +02:00
parent b17582e93a
commit 8f7cbc9091
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
4 changed files with 85 additions and 58 deletions

View File

@ -3,8 +3,20 @@ repos:
rev: stable rev: stable
hooks: hooks:
- id: black - id: black
language_version: python3.7 language_version: python3.8
- repo: https://gitlab.com/pycqa/flake8 - repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9 rev: 3.7.9
hooks: hooks:
- id: flake8 - id: flake8
- repo: https://github.com/pre-commit/mirrors-mypy
rev: '' # Use the sha / tag you want to point at
hooks:
- id: mypy
args: [--strict, --ignore-missing-imports]
- repo: local
hooks:
- id: unittest
name: unittest
entry: python -m unittest discover
language: system
pass_filenames: false

View File

@ -10,23 +10,22 @@ import random
import sys import sys
from typing import Union, Optional, Tuple, List from typing import Union, Optional, Tuple, List
from itertools import combinations from itertools import combinations
import numpy as np # type: ignore import numpy as np
from sklearn.utils.multiclass import ( # type: ignore from sklearn.utils.multiclass import check_classification_targets
check_classification_targets, from sklearn.base import clone, BaseEstimator, ClassifierMixin
) from sklearn.ensemble import BaseEnsemble
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore from sklearn.utils.validation import (
from sklearn.ensemble import BaseEnsemble # type: ignore
from sklearn.utils.validation import ( # type: ignore
check_is_fitted, check_is_fitted,
_check_sample_weight, _check_sample_weight,
) )
from joblib import Parallel, delayed
from stree import Stree # type: ignore from stree import Stree
class Odte(BaseEnsemble, ClassifierMixin): # type: ignore class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
def __init__( def __init__(
self, self,
n_jobs: int = 1,
base_estimator: BaseEstimator = None, base_estimator: BaseEstimator = None,
random_state: int = 0, random_state: int = 0,
max_features: Optional[Union[str, int, float]] = None, max_features: Optional[Union[str, int, float]] = None,
@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
super().__init__( super().__init__(
base_estimator=base_estimator, n_estimators=n_estimators, base_estimator=base_estimator, n_estimators=n_estimators,
) )
self.n_jobs = n_jobs
self.n_estimators = n_estimators self.n_estimators = n_estimators
self.random_state = random_state self.random_state = random_state
self.max_features = max_features self.max_features = max_features
@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
return np.random.mtrand._rand return np.random.mtrand._rand
return np.random.RandomState(self.random_state) return np.random.RandomState(self.random_state)
@staticmethod
def _initialize_sample_weight(
sample_weight: np.array, n_samples: int
) -> np.array:
if sample_weight is None:
return np.ones((n_samples,), dtype=np.float64)
return sample_weight.copy()
def _validate_estimator(self) -> None: def _validate_estimator(self) -> None:
"""Check the estimator and set the base_estimator_ attribute.""" """Check the estimator and set the base_estimator_ attribute."""
super()._validate_estimator( super()._validate_estimator(
@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
) )
check_classification_targets(y) check_classification_targets(y)
X, y = self._validate_data(X, y) X, y = self._validate_data(X, y)
# _check_sample_weight returns an array of ones when sample_weight is None
sample_weight = _check_sample_weight( sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64 sample_weight, X, dtype=np.float64
) )
@ -90,34 +83,59 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
self.n_classes_: int = self.classes_.shape[0] self.n_classes_: int = self.classes_.shape[0]
self.estimators_: List[BaseEstimator] = [] self.estimators_: List[BaseEstimator] = []
self.subspaces_: List[Tuple[int, ...]] = [] self.subspaces_: List[Tuple[int, ...]] = []
self._train(X, y, sample_weight) result = self._train(X, y, sample_weight)
self.estimators_, self.subspaces_ = tuple( # type: ignore
zip(*result)
)
return self return self
def _train( @staticmethod
self, X: np.array, y: np.array, sample_weight: np.array def _parallel_build_tree(
) -> None: base_estimator_: Stree,
random_box = self._initialize_random() X: np.array,
random_seed = self.random_state y: np.array,
weights: np.array,
random_box: np.random.mtrand.RandomState,
random_seed: int,
boot_samples: int,
max_features: int,
) -> Tuple[BaseEstimator, Tuple[int, ...]]:
clf = clone(base_estimator_)
clf.set_params(random_state=random_seed)
n_samples = X.shape[0] n_samples = X.shape[0]
weights = self._initialize_sample_weight(sample_weight, n_samples)
boot_samples = self._get_bootstrap_n_samples(n_samples)
for _ in range(self.n_estimators):
# Build clf
clf = clone(self.base_estimator_)
clf.random_state = random_seed
random_seed += 1
self.estimators_.append(clf)
# bootstrap # bootstrap
indices = random_box.randint(0, n_samples, boot_samples) indices = random_box.randint(0, n_samples, boot_samples)
# update weights with the chosen samples # update weights with the chosen samples
weights_update = np.bincount(indices, minlength=n_samples) weights_update = np.bincount(indices, minlength=n_samples)
features = self._get_random_subspace(X, y)
self.subspaces_.append(features)
current_weights = weights * weights_update current_weights = weights * weights_update
# random subspace
features = Odte._get_random_subspace(X, y, max_features)
# train the classifier # train the classifier
bootstrap = X[indices, :] bootstrap = X[indices, :]
clf.fit( clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
bootstrap[:, features], y[indices], current_weights[indices] return (clf, features)
def _train(
self, X: np.array, y: np.array, weights: np.array
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
random_box = self._initialize_random()
n_samples = X.shape[0]
boot_samples = self._get_bootstrap_n_samples(n_samples)
clf = clone(self.base_estimator_)
return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore
delayed(Odte._parallel_build_tree)(
clf,
X,
y,
weights,
random_box,
random_seed,
boot_samples,
self.max_features_,
)
for random_seed in range(
self.random_state, self.random_state + self.n_estimators
)
) )
def _get_bootstrap_n_samples(self, n_samples: int) -> int: def _get_bootstrap_n_samples(self, n_samples: int) -> int:
@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
) )
return max_features return max_features
@staticmethod
def _get_random_subspace( def _get_random_subspace(
self, dataset: np.array, labels: np.array dataset: np.array, labels: np.array, max_features: int
) -> Tuple[int, ...]: ) -> Tuple[int, ...]:
features = range(dataset.shape[1]) features = range(dataset.shape[1])
features_sets = list(combinations(features, self.max_features_)) features_sets = list(combinations(features, max_features))
if len(features_sets) > 1: if len(features_sets) > 1:
index = random.randint(0, len(features_sets) - 1) index = random.randint(0, len(features_sets) - 1)
return features_sets[index] return features_sets[index]

View File

@ -1,5 +1,6 @@
# type: ignore
import unittest import unittest
import numpy as np import os
import warnings import warnings
from sklearn.exceptions import ConvergenceWarning from sklearn.exceptions import ConvergenceWarning
@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
computed = tclf._get_bootstrap_n_samples(1500) computed = tclf._get_bootstrap_n_samples(1500)
self.assertEqual(expected, computed) self.assertEqual(expected, computed)
def test_initialize_sample_weight(self):
m = 5
ones = np.ones(m,)
weights = np.random.rand(m,)
expected_values = [(None, ones), (weights, weights)]
for value, expected in expected_values:
tclf = Odte()
computed = tclf._initialize_sample_weight(value, m)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_initialize_max_feature(self): def test_initialize_max_feature(self):
expected_values = [ expected_values = [
[0, 5, 6, 15], [0, 5, 6, 15],
@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
random_state=self._random_state, max_features=max_features random_state=self._random_state, max_features=max_features
) )
tclf.fit(X, y) tclf.fit(X, y)
computed = tclf._get_random_subspace(X, y) computed = tclf._get_random_subspace(X, y, tclf.max_features_)
expected = expected_values.pop(0) expected = expected_values.pop(0)
self.assertListEqual(expected, list(computed)) self.assertListEqual(expected, list(computed))
@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
tclf.fit(*load_dataset(self._random_state)) tclf.fit(*load_dataset(self._random_state))
def test_simple_predict(self): def test_simple_predict(self):
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore", category=ConvergenceWarning) warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) warnings.filterwarnings("ignore", category=RuntimeWarning)
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2] X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
expected = [0, 1, 1, 1] expected = [0, 1, 1, 1]
tclf = Odte(random_state=self._random_state, n_estimators=10,) tclf = Odte(
random_state=self._random_state, n_estimators=10, n_jobs=-1
)
tclf.set_params( tclf.set_params(
**dict( **dict(
base_estimator__kernel="rbf", base_estimator__kernel="rbf",
@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):
@staticmethod @staticmethod
def test_is_a_sklearn_classifier(): def test_is_a_sklearn_classifier():
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings("ignore", category=ConvergenceWarning) warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_estimator

View File

@ -1,3 +1,4 @@
# type: ignore
from .Odte_tests import Odte_test from .Odte_tests import Odte_test
__all__ = ["Odte_test"] __all__ = ["Odte_test"]