mirror of
https://github.com/Doctorado-ML/Odte.git
synced 2025-07-11 08:12:06 +00:00
Make training parallel
add pre-commit config
This commit is contained in:
parent
b17582e93a
commit
8f7cbc9091
@ -3,8 +3,20 @@ repos:
|
|||||||
rev: stable
|
rev: stable
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
language_version: python3.7
|
language_version: python3.8
|
||||||
- repo: https://gitlab.com/pycqa/flake8
|
- repo: https://gitlab.com/pycqa/flake8
|
||||||
rev: 3.7.9
|
rev: 3.7.9
|
||||||
hooks:
|
hooks:
|
||||||
- id: flake8
|
- id: flake8
|
||||||
|
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||||
|
rev: '' # Use the sha / tag you want to point at
|
||||||
|
hooks:
|
||||||
|
- id: mypy
|
||||||
|
args: [--strict, --ignore-missing-imports]
|
||||||
|
- repo: local
|
||||||
|
hooks:
|
||||||
|
- id: unittest
|
||||||
|
name: unittest
|
||||||
|
entry: python -m unittest discover
|
||||||
|
language: system
|
||||||
|
pass_filenames: false
|
93
odte/Odte.py
93
odte/Odte.py
@ -10,23 +10,22 @@ import random
|
|||||||
import sys
|
import sys
|
||||||
from typing import Union, Optional, Tuple, List
|
from typing import Union, Optional, Tuple, List
|
||||||
from itertools import combinations
|
from itertools import combinations
|
||||||
import numpy as np # type: ignore
|
import numpy as np
|
||||||
from sklearn.utils.multiclass import ( # type: ignore
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
check_classification_targets,
|
from sklearn.base import clone, BaseEstimator, ClassifierMixin
|
||||||
)
|
from sklearn.ensemble import BaseEnsemble
|
||||||
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
|
from sklearn.utils.validation import (
|
||||||
from sklearn.ensemble import BaseEnsemble # type: ignore
|
|
||||||
from sklearn.utils.validation import ( # type: ignore
|
|
||||||
check_is_fitted,
|
check_is_fitted,
|
||||||
_check_sample_weight,
|
_check_sample_weight,
|
||||||
)
|
)
|
||||||
|
from joblib import Parallel, delayed
|
||||||
from stree import Stree # type: ignore
|
from stree import Stree
|
||||||
|
|
||||||
|
|
||||||
class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
n_jobs: int = 1,
|
||||||
base_estimator: BaseEstimator = None,
|
base_estimator: BaseEstimator = None,
|
||||||
random_state: int = 0,
|
random_state: int = 0,
|
||||||
max_features: Optional[Union[str, int, float]] = None,
|
max_features: Optional[Union[str, int, float]] = None,
|
||||||
@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
|||||||
super().__init__(
|
super().__init__(
|
||||||
base_estimator=base_estimator, n_estimators=n_estimators,
|
base_estimator=base_estimator, n_estimators=n_estimators,
|
||||||
)
|
)
|
||||||
|
self.n_jobs = n_jobs
|
||||||
self.n_estimators = n_estimators
|
self.n_estimators = n_estimators
|
||||||
self.random_state = random_state
|
self.random_state = random_state
|
||||||
self.max_features = max_features
|
self.max_features = max_features
|
||||||
@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
|||||||
return np.random.mtrand._rand
|
return np.random.mtrand._rand
|
||||||
return np.random.RandomState(self.random_state)
|
return np.random.RandomState(self.random_state)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _initialize_sample_weight(
|
|
||||||
sample_weight: np.array, n_samples: int
|
|
||||||
) -> np.array:
|
|
||||||
if sample_weight is None:
|
|
||||||
return np.ones((n_samples,), dtype=np.float64)
|
|
||||||
return sample_weight.copy()
|
|
||||||
|
|
||||||
def _validate_estimator(self) -> None:
|
def _validate_estimator(self) -> None:
|
||||||
"""Check the estimator and set the base_estimator_ attribute."""
|
"""Check the estimator and set the base_estimator_ attribute."""
|
||||||
super()._validate_estimator(
|
super()._validate_estimator(
|
||||||
@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
|||||||
)
|
)
|
||||||
check_classification_targets(y)
|
check_classification_targets(y)
|
||||||
X, y = self._validate_data(X, y)
|
X, y = self._validate_data(X, y)
|
||||||
|
# if weights is None return np.ones
|
||||||
sample_weight = _check_sample_weight(
|
sample_weight = _check_sample_weight(
|
||||||
sample_weight, X, dtype=np.float64
|
sample_weight, X, dtype=np.float64
|
||||||
)
|
)
|
||||||
@ -90,34 +83,59 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
|||||||
self.n_classes_: int = self.classes_.shape[0]
|
self.n_classes_: int = self.classes_.shape[0]
|
||||||
self.estimators_: List[BaseEstimator] = []
|
self.estimators_: List[BaseEstimator] = []
|
||||||
self.subspaces_: List[Tuple[int, ...]] = []
|
self.subspaces_: List[Tuple[int, ...]] = []
|
||||||
self._train(X, y, sample_weight)
|
result = self._train(X, y, sample_weight)
|
||||||
|
self.estimators_, self.subspaces_ = tuple( # type: ignore
|
||||||
|
zip(*result)
|
||||||
|
)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _train(
|
@staticmethod
|
||||||
self, X: np.array, y: np.array, sample_weight: np.array
|
def _parallel_build_tree(
|
||||||
) -> None:
|
base_estimator_: Stree,
|
||||||
random_box = self._initialize_random()
|
X: np.array,
|
||||||
random_seed = self.random_state
|
y: np.array,
|
||||||
|
weights: np.array,
|
||||||
|
random_box: np.random.mtrand.RandomState,
|
||||||
|
random_seed: int,
|
||||||
|
boot_samples: int,
|
||||||
|
max_features: int,
|
||||||
|
) -> Tuple[BaseEstimator, Tuple[int, ...]]:
|
||||||
|
clf = clone(base_estimator_)
|
||||||
|
clf.set_params(random_state=random_seed)
|
||||||
n_samples = X.shape[0]
|
n_samples = X.shape[0]
|
||||||
weights = self._initialize_sample_weight(sample_weight, n_samples)
|
|
||||||
boot_samples = self._get_bootstrap_n_samples(n_samples)
|
|
||||||
for _ in range(self.n_estimators):
|
|
||||||
# Build clf
|
|
||||||
clf = clone(self.base_estimator_)
|
|
||||||
clf.random_state = random_seed
|
|
||||||
random_seed += 1
|
|
||||||
self.estimators_.append(clf)
|
|
||||||
# bootstrap
|
# bootstrap
|
||||||
indices = random_box.randint(0, n_samples, boot_samples)
|
indices = random_box.randint(0, n_samples, boot_samples)
|
||||||
# update weights with the chosen samples
|
# update weights with the chosen samples
|
||||||
weights_update = np.bincount(indices, minlength=n_samples)
|
weights_update = np.bincount(indices, minlength=n_samples)
|
||||||
features = self._get_random_subspace(X, y)
|
|
||||||
self.subspaces_.append(features)
|
|
||||||
current_weights = weights * weights_update
|
current_weights = weights * weights_update
|
||||||
|
# random subspace
|
||||||
|
features = Odte._get_random_subspace(X, y, max_features)
|
||||||
# train the classifier
|
# train the classifier
|
||||||
bootstrap = X[indices, :]
|
bootstrap = X[indices, :]
|
||||||
clf.fit(
|
clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
|
||||||
bootstrap[:, features], y[indices], current_weights[indices]
|
return (clf, features)
|
||||||
|
|
||||||
|
def _train(
|
||||||
|
self, X: np.array, y: np.array, weights: np.array
|
||||||
|
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
|
||||||
|
random_box = self._initialize_random()
|
||||||
|
n_samples = X.shape[0]
|
||||||
|
boot_samples = self._get_bootstrap_n_samples(n_samples)
|
||||||
|
clf = clone(self.base_estimator_)
|
||||||
|
return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore
|
||||||
|
delayed(Odte._parallel_build_tree)(
|
||||||
|
clf,
|
||||||
|
X,
|
||||||
|
y,
|
||||||
|
weights,
|
||||||
|
random_box,
|
||||||
|
random_seed,
|
||||||
|
boot_samples,
|
||||||
|
self.max_features_,
|
||||||
|
)
|
||||||
|
for random_seed in range(
|
||||||
|
self.random_state, self.random_state + self.n_estimators
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_bootstrap_n_samples(self, n_samples: int) -> int:
|
def _get_bootstrap_n_samples(self, n_samples: int) -> int:
|
||||||
@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin): # type: ignore
|
|||||||
)
|
)
|
||||||
return max_features
|
return max_features
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def _get_random_subspace(
|
def _get_random_subspace(
|
||||||
self, dataset: np.array, labels: np.array
|
dataset: np.array, labels: np.array, max_features: int
|
||||||
) -> Tuple[int, ...]:
|
) -> Tuple[int, ...]:
|
||||||
features = range(dataset.shape[1])
|
features = range(dataset.shape[1])
|
||||||
features_sets = list(combinations(features, self.max_features_))
|
features_sets = list(combinations(features, max_features))
|
||||||
if len(features_sets) > 1:
|
if len(features_sets) > 1:
|
||||||
index = random.randint(0, len(features_sets) - 1)
|
index = random.randint(0, len(features_sets) - 1)
|
||||||
return features_sets[index]
|
return features_sets[index]
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
|
# type: ignore
|
||||||
import unittest
|
import unittest
|
||||||
import numpy as np
|
import os
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
|
|||||||
computed = tclf._get_bootstrap_n_samples(1500)
|
computed = tclf._get_bootstrap_n_samples(1500)
|
||||||
self.assertEqual(expected, computed)
|
self.assertEqual(expected, computed)
|
||||||
|
|
||||||
def test_initialize_sample_weight(self):
|
|
||||||
m = 5
|
|
||||||
ones = np.ones(m,)
|
|
||||||
weights = np.random.rand(m,)
|
|
||||||
expected_values = [(None, ones), (weights, weights)]
|
|
||||||
for value, expected in expected_values:
|
|
||||||
tclf = Odte()
|
|
||||||
computed = tclf._initialize_sample_weight(value, m)
|
|
||||||
self.assertListEqual(expected.tolist(), computed.tolist())
|
|
||||||
|
|
||||||
def test_initialize_max_feature(self):
|
def test_initialize_max_feature(self):
|
||||||
expected_values = [
|
expected_values = [
|
||||||
[0, 5, 6, 15],
|
[0, 5, 6, 15],
|
||||||
@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
|
|||||||
random_state=self._random_state, max_features=max_features
|
random_state=self._random_state, max_features=max_features
|
||||||
)
|
)
|
||||||
tclf.fit(X, y)
|
tclf.fit(X, y)
|
||||||
computed = tclf._get_random_subspace(X, y)
|
computed = tclf._get_random_subspace(X, y, tclf.max_features_)
|
||||||
expected = expected_values.pop(0)
|
expected = expected_values.pop(0)
|
||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
|
|
||||||
@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
|
|||||||
tclf.fit(*load_dataset(self._random_state))
|
tclf.fit(*load_dataset(self._random_state))
|
||||||
|
|
||||||
def test_simple_predict(self):
|
def test_simple_predict(self):
|
||||||
|
os.environ["PYTHONWARNINGS"] = "ignore"
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
|
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
|
||||||
expected = [0, 1, 1, 1]
|
expected = [0, 1, 1, 1]
|
||||||
tclf = Odte(random_state=self._random_state, n_estimators=10,)
|
tclf = Odte(
|
||||||
|
random_state=self._random_state, n_estimators=10, n_jobs=-1
|
||||||
|
)
|
||||||
tclf.set_params(
|
tclf.set_params(
|
||||||
**dict(
|
**dict(
|
||||||
base_estimator__kernel="rbf",
|
base_estimator__kernel="rbf",
|
||||||
@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def test_is_a_sklearn_classifier():
|
def test_is_a_sklearn_classifier():
|
||||||
|
os.environ["PYTHONWARNINGS"] = "ignore"
|
||||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||||
from sklearn.utils.estimator_checks import check_estimator
|
from sklearn.utils.estimator_checks import check_estimator
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
# type: ignore
|
||||||
from .Odte_tests import Odte_test
|
from .Odte_tests import Odte_test
|
||||||
|
|
||||||
__all__ = ["Odte_test"]
|
__all__ = ["Odte_test"]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user