Mirror of https://github.com/Doctorado-ML/Odte.git

Make training parallel
add pre-commit config

commit 8f7cbc9091 (parent b17582e93a)
.pre-commit-config.yaml

@@ -3,8 +3,20 @@ repos:
     rev: stable
     hooks:
       - id: black
-        language_version: python3.7
+        language_version: python3.8
   - repo: https://gitlab.com/pycqa/flake8
     rev: 3.7.9
     hooks:
       - id: flake8
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: ''  # Use the sha / tag you want to point at
+    hooks:
+      - id: mypy
+        args: [--strict, --ignore-missing-imports]
+  - repo: local
+    hooks:
+      - id: unittest
+        name: unittest
+        entry: python -m unittest discover
+        language: system
+        pass_filenames: false
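The new local repo entry above runs the whole test suite as a pre-commit hook via python -m unittest discover. As a rough illustration only (not part of the commit), the same run can be reproduced from Python; a failing suite exits non-zero, which is what makes the hook block the commit:

# Illustrative sketch: programmatic equivalent of the hook entry
# `python -m unittest discover`, executed from the repository root.
import sys
import unittest

suite = unittest.defaultTestLoader.discover(start_dir=".")
result = unittest.TextTestRunner(verbosity=1).run(suite)
sys.exit(0 if result.wasSuccessful() else 1)  # non-zero exit fails the hook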
odte/Odte.py (105 changed lines)
@@ -10,23 +10,22 @@ import random
 import sys
 from typing import Union, Optional, Tuple, List
 from itertools import combinations
-import numpy as np  # type: ignore
-from sklearn.utils.multiclass import (  # type: ignore
-    check_classification_targets,
-)
-from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
-from sklearn.ensemble import BaseEnsemble  # type: ignore
-from sklearn.utils.validation import (  # type: ignore
+import numpy as np
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.base import clone, BaseEstimator, ClassifierMixin
+from sklearn.ensemble import BaseEnsemble
+from sklearn.utils.validation import (
     check_is_fitted,
     _check_sample_weight,
 )

-from stree import Stree  # type: ignore
+from joblib import Parallel, delayed
+from stree import Stree


 class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
     def __init__(
         self,
+        n_jobs: int = 1,
         base_estimator: BaseEstimator = None,
         random_state: int = 0,
         max_features: Optional[Union[str, int, float]] = None,
@@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
         super().__init__(
             base_estimator=base_estimator, n_estimators=n_estimators,
         )
+        self.n_jobs = n_jobs
         self.n_estimators = n_estimators
         self.random_state = random_state
         self.max_features = max_features
@@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
             return np.random.mtrand._rand
         return np.random.RandomState(self.random_state)

-    @staticmethod
-    def _initialize_sample_weight(
-        sample_weight: np.array, n_samples: int
-    ) -> np.array:
-        if sample_weight is None:
-            return np.ones((n_samples,), dtype=np.float64)
-        return sample_weight.copy()
-
     def _validate_estimator(self) -> None:
         """Check the estimator and set the base_estimator_ attribute."""
         super()._validate_estimator(
@@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
         )
         check_classification_targets(y)
         X, y = self._validate_data(X, y)
+        # if weights is None return np.ones
        sample_weight = _check_sample_weight(
             sample_weight, X, dtype=np.float64
         )
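The helper _initialize_sample_weight removed a few hunks above duplicated what sklearn's _check_sample_weight, used right here in fit(), already does. A minimal sketch of that behaviour, outside the diff and assuming only numpy and sklearn:

# Minimal sketch (not part of the commit): _check_sample_weight returns a
# vector of ones when sample_weight is None and validates it otherwise,
# which is why Odte's own _initialize_sample_weight helper became redundant.
import numpy as np
from sklearn.utils.validation import _check_sample_weight  # private sklearn helper

X = np.zeros((5, 2))
print(_check_sample_weight(None, X, dtype=np.float64))            # [1. 1. 1. 1. 1.]
print(_check_sample_weight(np.arange(5.0), X, dtype=np.float64))  # [0. 1. 2. 3. 4.]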
@@ -90,35 +83,60 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
         self.n_classes_: int = self.classes_.shape[0]
         self.estimators_: List[BaseEstimator] = []
         self.subspaces_: List[Tuple[int, ...]] = []
-        self._train(X, y, sample_weight)
+        result = self._train(X, y, sample_weight)
+        self.estimators_, self.subspaces_ = tuple(  # type: ignore
+            zip(*result)
+        )
         return self

-    def _train(
-        self, X: np.array, y: np.array, sample_weight: np.array
-    ) -> None:
-        random_box = self._initialize_random()
-        random_seed = self.random_state
+    @staticmethod
+    def _parallel_build_tree(
+        base_estimator_: Stree,
+        X: np.array,
+        y: np.array,
+        weights: np.array,
+        random_box: np.random.mtrand.RandomState,
+        random_seed: int,
+        boot_samples: int,
+        max_features: int,
+    ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
+        clf = clone(base_estimator_)
+        clf.set_params(random_state=random_seed)
+        n_samples = X.shape[0]
+        # bootstrap
+        indices = random_box.randint(0, n_samples, boot_samples)
+        # update weights with the chosen samples
+        weights_update = np.bincount(indices, minlength=n_samples)
+        current_weights = weights * weights_update
+        # random subspace
+        features = Odte._get_random_subspace(X, y, max_features)
+        # train the classifier
+        bootstrap = X[indices, :]
+        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
+        return (clf, features)
+
+    def _train(
+        self, X: np.array, y: np.array, weights: np.array
+    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
+        random_box = self._initialize_random()
         n_samples = X.shape[0]
-        weights = self._initialize_sample_weight(sample_weight, n_samples)
         boot_samples = self._get_bootstrap_n_samples(n_samples)
-        for _ in range(self.n_estimators):
-            # Build clf
-            clf = clone(self.base_estimator_)
-            clf.random_state = random_seed
-            random_seed += 1
-            self.estimators_.append(clf)
-            # bootstrap
-            indices = random_box.randint(0, n_samples, boot_samples)
-            # update weights with the chosen samples
-            weights_update = np.bincount(indices, minlength=n_samples)
-            features = self._get_random_subspace(X, y)
-            self.subspaces_.append(features)
-            current_weights = weights * weights_update
-            # train the classifier
-            bootstrap = X[indices, :]
-            clf.fit(
-                bootstrap[:, features], y[indices], current_weights[indices]
-            )
+        clf = clone(self.base_estimator_)
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
+            delayed(Odte._parallel_build_tree)(
+                clf,
+                X,
+                y,
+                weights,
+                random_box,
+                random_seed,
+                boot_samples,
+                self.max_features_,
+            )
+            for random_seed in range(
+                self.random_state, self.random_state + self.n_estimators
+            )
+        )

     def _get_bootstrap_n_samples(self, n_samples: int) -> int:
         if self.max_samples is None:
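The hunk above is the heart of the commit: the sequential loop over n_estimators becomes one joblib task per estimator, each handed its own random_seed, and fit() later unzips the returned (estimator, subspace) pairs. A stripped-down sketch of that fan-out pattern, with a hypothetical build_one worker standing in for _parallel_build_tree:

# Illustrative sketch of the fan-out used by the new Odte._train: one delayed
# call per estimator, each with its own seed, run on joblib's thread backend.
import numpy as np
from joblib import Parallel, delayed


def build_one(seed: int, X: np.ndarray, weights: np.ndarray) -> tuple:
    # Stand-in for Odte._parallel_build_tree: draw a bootstrap sample and
    # weight every row by how many times the bootstrap picked it.
    rng = np.random.RandomState(seed)
    n_samples = X.shape[0]
    indices = rng.randint(0, n_samples, n_samples)
    current_weights = weights * np.bincount(indices, minlength=n_samples)
    return seed, current_weights


X = np.arange(20, dtype=float).reshape(10, 2)
weights = np.ones(X.shape[0])
n_estimators, random_state = 5, 0
results = Parallel(n_jobs=-1, prefer="threads")(
    delayed(build_one)(seed, X, weights)
    for seed in range(random_state, random_state + n_estimators)
)
# Parallel preserves submission order, so zip(*results) recovers per-field
# tuples, mirroring how fit() unpacks estimators_ and subspaces_ from _train().
seeds, per_tree_weights = zip(*results)
print(seeds)

The thread backend keeps the training data shared between workers instead of copying it to subprocesses; how much the trees actually overlap in time depends on how much of the underlying fit releases the GIL.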
@@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
             )
         return max_features

+    @staticmethod
     def _get_random_subspace(
-        self, dataset: np.array, labels: np.array
+        dataset: np.array, labels: np.array, max_features: int
     ) -> Tuple[int, ...]:
         features = range(dataset.shape[1])
-        features_sets = list(combinations(features, self.max_features_))
+        features_sets = list(combinations(features, max_features))
         if len(features_sets) > 1:
             index = random.randint(0, len(features_sets) - 1)
             return features_sets[index]
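_get_random_subspace is now a staticmethod that takes max_features explicitly, so the parallel workers can call it without an Odte instance. An illustration of the selection it performs (not from the source):

# Illustration only: choose one feature subset of size max_features uniformly
# from all possible combinations, as _get_random_subspace does.
import random
from itertools import combinations

n_features, max_features = 5, 3
features_sets = list(combinations(range(n_features), max_features))
subset = features_sets[random.randint(0, len(features_sets) - 1)]
print(subset)  # e.g. (0, 2, 4)

Materialising every combination costs C(n_features, max_features) tuples, so this stays cheap only while that count is small.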
Odte_tests.py

@@ -1,5 +1,6 @@
+# type: ignore
 import unittest
 import numpy as np
 import os

 import warnings
 from sklearn.exceptions import ConvergenceWarning
@@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
         computed = tclf._get_bootstrap_n_samples(1500)
         self.assertEqual(expected, computed)

-    def test_initialize_sample_weight(self):
-        m = 5
-        ones = np.ones(m,)
-        weights = np.random.rand(m,)
-        expected_values = [(None, ones), (weights, weights)]
-        for value, expected in expected_values:
-            tclf = Odte()
-            computed = tclf._initialize_sample_weight(value, m)
-            self.assertListEqual(expected.tolist(), computed.tolist())
-
     def test_initialize_max_feature(self):
         expected_values = [
             [0, 5, 6, 15],
@@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
             random_state=self._random_state, max_features=max_features
         )
         tclf.fit(X, y)
-        computed = tclf._get_random_subspace(X, y)
+        computed = tclf._get_random_subspace(X, y, tclf.max_features_)
         expected = expected_values.pop(0)
         self.assertListEqual(expected, list(computed))

@@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
         tclf.fit(*load_dataset(self._random_state))

     def test_simple_predict(self):
         os.environ["PYTHONWARNINGS"] = "ignore"
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
         expected = [0, 1, 1, 1]
-        tclf = Odte(random_state=self._random_state, n_estimators=10,)
+        tclf = Odte(
+            random_state=self._random_state, n_estimators=10, n_jobs=-1
+        )
         tclf.set_params(
             **dict(
                 base_estimator__kernel="rbf",
@@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):

     @staticmethod
     def test_is_a_sklearn_classifier():
         os.environ["PYTHONWARNINGS"] = "ignore"
         warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
         from sklearn.utils.estimator_checks import check_estimator
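test_simple_predict above exercises the new n_jobs parameter end to end. A hedged usage sketch along the same lines, reusing the toy data from the test and assuming the package re-exports the class as "from odte import Odte":

# Usage sketch based on test_simple_predict: fit the ensemble with all
# cores (n_jobs=-1) and predict on the same toy data.
from odte import Odte

X = [[1, 2], [5, 6], [9, 10], [16, 17]]
y = [0, 1, 1, 2]
clf = Odte(random_state=0, n_estimators=10, n_jobs=-1)
clf.fit(X, y)
print(clf.predict(X))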
__init__.py (tests package)

@@ -1,3 +1,4 @@
+# type: ignore
 from .Odte_tests import Odte_test

 __all__ = ["Odte_test"]