Make training parallel

add pre-commit config
2025-07-11 16:22:00 +00:00 · 2020-07-21 13:11:59 +02:00 · 2020-07-21 13:11:59 +02:00 · 8f7cbc9091
commit 8f7cbc9091
parent b17582e93a
4 changed files with 85 additions and 58 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,8 +3,20 @@ repos:
    rev: stable
    hooks:
    - id: black
-      language_version: python3.7
+      language_version: python3.8
 -   repo: https://gitlab.com/pycqa/flake8
    rev: 3.7.9
    hooks:
    - id: flake8
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: ''  # Use the sha / tag you want to point at
    hooks:
    -   id: mypy
        args: [--strict, --ignore-missing-imports]
 -   repo: local
    hooks:
      - id: unittest
        name: unittest
        entry: python -m unittest discover 
        language: system
        pass_filenames: false
--- a/odte/Odte.py
+++ b/odte/Odte.py
@ -10,23 +10,22 @@ import random
 import sys
 from typing import Union, Optional, Tuple, List
 from itertools import combinations
-import numpy as np  # type: ignore
+import numpy as np
-from sklearn.utils.multiclass import (  # type: ignore
+from sklearn.utils.multiclass import check_classification_targets
-    check_classification_targets,
+from sklearn.base import clone, BaseEstimator, ClassifierMixin
-)
+from sklearn.ensemble import BaseEnsemble
-from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.utils.validation import (
 from sklearn.ensemble import BaseEnsemble  # type: ignore
 from sklearn.utils.validation import (  # type: ignore
    check_is_fitted,
    _check_sample_weight,
 )
-
+from joblib import Parallel, delayed
-from stree import Stree  # type: ignore
+from stree import Stree
 class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
    def __init__(
        self,
        n_jobs: int = 1,
        base_estimator: BaseEstimator = None,
        random_state: int = 0,
        max_features: Optional[Union[str, int, float]] = None,
@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
        super().__init__(
            base_estimator=base_estimator, n_estimators=n_estimators,
        )
        self.n_jobs = n_jobs
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.max_features = max_features
@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
            return np.random.mtrand._rand
        return np.random.RandomState(self.random_state)
    @staticmethod
    def _initialize_sample_weight(
        sample_weight: np.array, n_samples: int
    ) -> np.array:
        """Return the per-sample weights to work with.

        :param sample_weight: weights supplied by the caller, or None
        :param n_samples: number of samples, used only when no weights
            were supplied
        :return: a copy of ``sample_weight`` when it is given, otherwise a
            float64 vector of ones with ``n_samples`` entries
        """
        if sample_weight is not None:
            # hand back a copy so the caller's array is never modified
            return sample_weight.copy()
        return np.ones((n_samples,), dtype=np.float64)
    def _validate_estimator(self) -> None:
        """Check the estimator and set the base_estimator_ attribute."""
        super()._validate_estimator(
@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
            )
        check_classification_targets(y)
        X, y = self._validate_data(X, y)
        # if sample_weight is None, _check_sample_weight returns np.ones
        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=np.float64
        )
@ -90,34 +83,59 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
        self.n_classes_: int = self.classes_.shape[0]
        self.estimators_: List[BaseEstimator] = []
        self.subspaces_: List[Tuple[int, ...]] = []
-        self._train(X, y, sample_weight)
+        result = self._train(X, y, sample_weight)
        self.estimators_, self.subspaces_ = tuple(  # type: ignore
            zip(*result)
        )
        return self
-    def _train(
+    @staticmethod
-        self, X: np.array, y: np.array, sample_weight: np.array
+    def _parallel_build_tree(
-    ) -> None:
+        base_estimator_: Stree,
-        random_box = self._initialize_random()
+        X: np.array,
-        random_seed = self.random_state
+        y: np.array,
        weights: np.array,
        random_box: np.random.mtrand.RandomState,
        random_seed: int,
        boot_samples: int,
        max_features: int,
    ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
        clf = clone(base_estimator_)
        clf.set_params(random_state=random_seed)
        n_samples = X.shape[0]
        weights = self._initialize_sample_weight(sample_weight, n_samples)
        boot_samples = self._get_bootstrap_n_samples(n_samples)
        for _ in range(self.n_estimators):
            # Build clf
            clf = clone(self.base_estimator_)
            clf.random_state = random_seed
            random_seed += 1
            self.estimators_.append(clf)
        # bootstrap
        indices = random_box.randint(0, n_samples, boot_samples)
        # update weights with the chosen samples
        weights_update = np.bincount(indices, minlength=n_samples)
            features = self._get_random_subspace(X, y)
            self.subspaces_.append(features)
        current_weights = weights * weights_update
        # random subspace
        features = Odte._get_random_subspace(X, y, max_features)
        # train the classifier
        bootstrap = X[indices, :]
-            clf.fit(
+        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
-                bootstrap[:, features], y[indices], current_weights[indices]
+        return (clf, features)
    def _train(
        self, X: np.array, y: np.array, weights: np.array
    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
        """Build all the estimators of the ensemble through joblib.

        One task is scheduled per estimator. Task ``i`` is given the seed
        ``self.random_state + i``; every task receives the same prototype
        classifier, the same random generator, the same bootstrap size and
        the same sample weights.

        :param X: training samples
        :param y: training labels
        :param weights: per-sample weights handed to every task
        :return: whatever ``Parallel`` yields for the scheduled tasks, i.e.
            one ``_parallel_build_tree`` result per estimator
        """
        generator = self._initialize_random()
        bootstrap_size = self._get_bootstrap_n_samples(X.shape[0])
        prototype = clone(self.base_estimator_)
        # NOTE(review): the same generator object is given to every task and
        # threads are preferred, so with n_jobs != 1 the order in which the
        # tasks draw from it presumably depends on scheduling - confirm that
        # results stay reproducible.
        runner = Parallel(n_jobs=self.n_jobs, prefer="threads")
        build_tree = delayed(Odte._parallel_build_tree)
        # NOTE(review): range() needs self.random_state to be an int here.
        first_seed = self.random_state
        seeds = range(first_seed, self.random_state + self.n_estimators)
        return runner(  # type: ignore
            build_tree(
                prototype,
                X,
                y,
                weights,
                generator,
                seed,
                bootstrap_size,
                self.max_features_,
            )
            for seed in seeds
        )
    def _get_bootstrap_n_samples(self, n_samples: int) -> int:
@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
                )
        return max_features
    @staticmethod
    def _get_random_subspace(
-        self, dataset: np.array, labels: np.array
+        dataset: np.array, labels: np.array, max_features: int
    ) -> Tuple[int, ...]:
        features = range(dataset.shape[1])
-        features_sets = list(combinations(features, self.max_features_))
+        features_sets = list(combinations(features, max_features))
        if len(features_sets) > 1:
            index = random.randint(0, len(features_sets) - 1)
            return features_sets[index]
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@ -1,5 +1,6 @@
 # type: ignore
 import unittest
-import numpy as np
+import os
 import warnings
 from sklearn.exceptions import ConvergenceWarning
@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
            computed = tclf._get_bootstrap_n_samples(1500)
            self.assertEqual(expected, computed)
    def test_initialize_sample_weight(self):
        """Missing weights must become ones; given weights must be kept."""
        n_samples = 5
        unit_weights = np.ones(n_samples,)
        random_weights = np.random.rand(n_samples,)
        # each case is (weights passed in, weights expected back)
        cases = (
            (None, unit_weights),
            (random_weights, random_weights),
        )
        for given, wanted in cases:
            obtained = Odte()._initialize_sample_weight(given, n_samples)
            self.assertListEqual(wanted.tolist(), obtained.tolist())
    def test_initialize_max_feature(self):
        expected_values = [
            [0, 5, 6, 15],
@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
                random_state=self._random_state, max_features=max_features
            )
            tclf.fit(X, y)
-            computed = tclf._get_random_subspace(X, y)
+            computed = tclf._get_random_subspace(X, y, tclf.max_features_)
            expected = expected_values.pop(0)
            self.assertListEqual(expected, list(computed))
@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
                tclf.fit(*load_dataset(self._random_state))
    def test_simple_predict(self):
        os.environ["PYTHONWARNINGS"] = "ignore"
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
        expected = [0, 1, 1, 1]
-        tclf = Odte(random_state=self._random_state, n_estimators=10,)
+        tclf = Odte(
            random_state=self._random_state, n_estimators=10, n_jobs=-1
        )
        tclf.set_params(
            **dict(
                base_estimator__kernel="rbf",
@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):
    @staticmethod
    def test_is_a_sklearn_classifier():
        os.environ["PYTHONWARNINGS"] = "ignore"
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        from sklearn.utils.estimator_checks import check_estimator
--- a/odte/tests/init.py
+++ b/odte/tests/init.py
@ -1,3 +1,4 @@
 # type: ignore
 from .Odte_tests import Odte_test
 __all__ = ["Odte_test"]