Make training parallel

add pre-commit config
2025-07-11 16:22:00 +00:00 · 2020-07-21 13:11:59 +02:00 · 2020-07-21 13:11:59 +02:00 · 8f7cbc9091
commit 8f7cbc9091
parent b17582e93a
4 changed files with 85 additions and 58 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,8 +3,20 @@ repos:
    rev: stable
    hooks:
    - id: black
-      language_version: python3.7
+      language_version: python3.8
 -   repo: https://gitlab.com/pycqa/flake8
    rev: 3.7.9
    hooks:
-    - id: flake8
+    - id: flake8
+-   repo: https://github.com/pre-commit/mirrors-mypy
+    rev: ''  # Use the sha / tag you want to point at
+    hooks:
+    -   id: mypy
+        args: [--strict, --ignore-missing-imports]
+-   repo: local
+    hooks:
+      - id: unittest
+        name: unittest
+        entry: python -m unittest discover 
+        language: system
+        pass_filenames: false
--- a/odte/Odte.py
+++ b/odte/Odte.py
@ -10,23 +10,22 @@ import random
 import sys
 from typing import Union, Optional, Tuple, List
 from itertools import combinations
-import numpy as np  # type: ignore
-from sklearn.utils.multiclass import (  # type: ignore
-    check_classification_targets,
-)
-from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
-from sklearn.ensemble import BaseEnsemble  # type: ignore
-from sklearn.utils.validation import (  # type: ignore
+import numpy as np
+from sklearn.utils.multiclass import check_classification_targets
+from sklearn.base import clone, BaseEstimator, ClassifierMixin
+from sklearn.ensemble import BaseEnsemble
+from sklearn.utils.validation import (
    check_is_fitted,
    _check_sample_weight,
 )
-
-from stree import Stree  # type: ignore
+from joblib import Parallel, delayed
+from stree import Stree


 class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
    def __init__(
        self,
+        n_jobs: int = 1,
        base_estimator: BaseEstimator = None,
        random_state: int = 0,
        max_features: Optional[Union[str, int, float]] = None,
@ -41,6 +40,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
        super().__init__(
            base_estimator=base_estimator, n_estimators=n_estimators,
        )
+        self.n_jobs = n_jobs
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.max_features = max_features
@ -52,14 +52,6 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
            return np.random.mtrand._rand
        return np.random.RandomState(self.random_state)

-    @staticmethod
-    def _initialize_sample_weight(
-        sample_weight: np.array, n_samples: int
-    ) -> np.array:
-        if sample_weight is None:
-            return np.ones((n_samples,), dtype=np.float64)
-        return sample_weight.copy()
-
    def _validate_estimator(self) -> None:
        """Check the estimator and set the base_estimator_ attribute."""
        super()._validate_estimator(
@ -77,6 +69,7 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
            )
        check_classification_targets(y)
        X, y = self._validate_data(X, y)
+        # a None sample_weight is replaced by an array of ones
        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=np.float64
        )
@ -90,35 +83,60 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
        self.n_classes_: int = self.classes_.shape[0]
        self.estimators_: List[BaseEstimator] = []
        self.subspaces_: List[Tuple[int, ...]] = []
-        self._train(X, y, sample_weight)
+        result = self._train(X, y, sample_weight)
+        self.estimators_, self.subspaces_ = tuple(  # type: ignore
+            zip(*result)
+        )
        return self

-    def _train(
-        self, X: np.array, y: np.array, sample_weight: np.array
-    ) -> None:
-        random_box = self._initialize_random()
-        random_seed = self.random_state
+    @staticmethod
+    def _parallel_build_tree(
+        base_estimator_: Stree,
+        X: np.array,
+        y: np.array,
+        weights: np.array,
+        random_box: np.random.mtrand.RandomState,
+        random_seed: int,
+        boot_samples: int,
+        max_features: int,
+    ) -> Tuple[BaseEstimator, Tuple[int, ...]]:
+        clf = clone(base_estimator_)
+        clf.set_params(random_state=random_seed)
+        n_samples = X.shape[0]
+        # bootstrap
+        indices = random_box.randint(0, n_samples, boot_samples)
+        # update weights with the chosen samples
+        weights_update = np.bincount(indices, minlength=n_samples)
+        current_weights = weights * weights_update
+        # random subspace
+        features = Odte._get_random_subspace(X, y, max_features)
+        # train the classifier
+        bootstrap = X[indices, :]
+        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
+        return (clf, features)
+
+    def _train(
+        self, X: np.array, y: np.array, weights: np.array
+    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
+        random_box = self._initialize_random()
        n_samples = X.shape[0]
-        weights = self._initialize_sample_weight(sample_weight, n_samples)
        boot_samples = self._get_bootstrap_n_samples(n_samples)
-        for _ in range(self.n_estimators):
-            # Build clf
-            clf = clone(self.base_estimator_)
-            clf.random_state = random_seed
-            random_seed += 1
-            self.estimators_.append(clf)
-            # bootstrap
-            indices = random_box.randint(0, n_samples, boot_samples)
-            # update weights with the chosen samples
-            weights_update = np.bincount(indices, minlength=n_samples)
-            features = self._get_random_subspace(X, y)
-            self.subspaces_.append(features)
-            current_weights = weights * weights_update
-            # train the classifier
-            bootstrap = X[indices, :]
-            clf.fit(
-                bootstrap[:, features], y[indices], current_weights[indices]
+        clf = clone(self.base_estimator_)
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
+            delayed(Odte._parallel_build_tree)(
+                clf,
+                X,
+                y,
+                weights,
+                random_box,
+                random_seed,
+                boot_samples,
+                self.max_features_,
            )
+            for random_seed in range(
+                self.random_state, self.random_state + self.n_estimators
+            )
+        )

    def _get_bootstrap_n_samples(self, n_samples: int) -> int:
        if self.max_samples is None:
@ -171,11 +189,12 @@ class Odte(BaseEnsemble, ClassifierMixin):  # type: ignore
                )
        return max_features

+    @staticmethod
    def _get_random_subspace(
-        self, dataset: np.array, labels: np.array
+        dataset: np.array, labels: np.array, max_features: int
    ) -> Tuple[int, ...]:
        features = range(dataset.shape[1])
-        features_sets = list(combinations(features, self.max_features_))
+        features_sets = list(combinations(features, max_features))
        if len(features_sets) > 1:
            index = random.randint(0, len(features_sets) - 1)
            return features_sets[index]
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@ -1,5 +1,6 @@
+# type: ignore
 import unittest
-import numpy as np
+import os

 import warnings
 from sklearn.exceptions import ConvergenceWarning
@ -27,16 +28,6 @@ class Odte_test(unittest.TestCase):
            computed = tclf._get_bootstrap_n_samples(1500)
            self.assertEqual(expected, computed)

-    def test_initialize_sample_weight(self):
-        m = 5
-        ones = np.ones(m,)
-        weights = np.random.rand(m,)
-        expected_values = [(None, ones), (weights, weights)]
-        for value, expected in expected_values:
-            tclf = Odte()
-            computed = tclf._initialize_sample_weight(value, m)
-            self.assertListEqual(expected.tolist(), computed.tolist())
-
    def test_initialize_max_feature(self):
        expected_values = [
            [0, 5, 6, 15],
@ -55,7 +46,7 @@ class Odte_test(unittest.TestCase):
                random_state=self._random_state, max_features=max_features
            )
            tclf.fit(X, y)
-            computed = tclf._get_random_subspace(X, y)
+            computed = tclf._get_random_subspace(X, y, tclf.max_features_)
            expected = expected_values.pop(0)
            self.assertListEqual(expected, list(computed))

@ -88,11 +79,14 @@ class Odte_test(unittest.TestCase):
                tclf.fit(*load_dataset(self._random_state))

    def test_simple_predict(self):
+        os.environ["PYTHONWARNINGS"] = "ignore"
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
        expected = [0, 1, 1, 1]
-        tclf = Odte(random_state=self._random_state, n_estimators=10,)
+        tclf = Odte(
+            random_state=self._random_state, n_estimators=10, n_jobs=-1
+        )
        tclf.set_params(
            **dict(
                base_estimator__kernel="rbf",
@ -147,6 +141,7 @@ class Odte_test(unittest.TestCase):

    @staticmethod
    def test_is_a_sklearn_classifier():
+        os.environ["PYTHONWARNINGS"] = "ignore"
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        from sklearn.utils.estimator_checks import check_estimator
--- a/odte/tests/init.py
+++ b/odte/tests/init.py
@ -1,3 +1,4 @@
+# type: ignore
 from .Odte_tests import Odte_test

 __all__ = ["Odte_test"]