Merge pull request #6 from Doctorado-ML/parallel_init

Parallel init error
2025-07-11 00:02:30 +00:00 · 2022-03-02 13:12:13 +01:00 · 2022-03-02 13:12:13 +01:00 · 98cadc7eeb
commit 98cadc7eeb
parent aff96bb97d dda3517090
3 changed files with 67 additions and 69 deletions
--- a/odte/Odte.py
+++ b/odte/Odte.py
@ -7,7 +7,6 @@ as well
 """
 from __future__ import annotations
 import random
-import sys
 import json
 from math import factorial
 from typing import Union, Optional, Tuple, List, Set
@ -16,6 +15,7 @@ from sklearn.utils.multiclass import (  # type: ignore
    check_classification_targets,
 )
 from sklearn.base import clone, BaseEstimator, ClassifierMixin  # type: ignore
+from sklearn.utils import check_random_state
 from sklearn.ensemble import BaseEnsemble  # type: ignore
 from sklearn.utils.validation import (  # type: ignore
    check_is_fitted,
@ -54,12 +54,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
    def version() -> str:
        return __version__

-    def _initialize_random(self) -> np.random.mtrand.RandomState:
-        if self.random_state is None:
-            self.random_state = random.randint(0, sys.maxsize)
-            return np.random.mtrand._rand
-        return np.random.RandomState(self.random_state)
-
    def _validate_estimator(self) -> None:
        """Check the estimator and set the base_estimator_ attribute."""
        super()._validate_estimator(
@ -109,13 +103,34 @@ class Odte(BaseEnsemble, ClassifierMixin):
        self.leaves_ = tleaves / self.n_estimators
        self.nodes_ = tnodes / self.n_estimators

+    def _train(
+        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
+    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
+        """Fit ``n_estimators`` copies of the base estimator in parallel.
+
+        Every job is handed its own integer seed (consecutive values
+        starting at the base seed); the random generator used for the
+        bootstrap is built from that seed inside the job, so no generator
+        object is shared between jobs.
+
+        Parameters
+        ----------
+        X : np.ndarray
+            Training samples; ``X.shape[0]`` is taken as the sample count.
+        y : np.ndarray
+            Target values, forwarded unchanged to every job.
+        weights : np.ndarray
+            Sample weights, forwarded unchanged to every job.
+
+        Returns
+        -------
+        The collected results of ``Odte._parallel_build_tree``, one
+        ``(fitted classifier, features)`` pair per estimator.
+        """
+        n_samples = X.shape[0]
+        boot_samples = self._get_bootstrap_n_samples(n_samples)
+        estimator = clone(self.base_estimator_)
+        base_seed = self.random_state
+        if base_seed is None:
+            # range() below needs an integer. Draw a base seed without
+            # touching self.random_state, bounded so that every derived
+            # seed stays within the 32-bit range numpy's RandomState
+            # accepts.
+            base_seed = random.randrange(2**32 - self.n_estimators)
+        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
+            delayed(Odte._parallel_build_tree)(
+                estimator,
+                X,
+                y,
+                weights,
+                random_seed,
+                boot_samples,
+                self.max_features_,
+                self.be_hyperparams,
+            )
+            for random_seed in range(base_seed, base_seed + self.n_estimators)
+        )
+
    @staticmethod
    def _parallel_build_tree(
-        base_estimator_: Stree,
+        base_estimator_: BaseEstimator,
        X: np.ndarray,
        y: np.ndarray,
        weights: np.ndarray,
-        random_box: np.random.mtrand.RandomState,
        random_seed: int,
        boot_samples: int,
        max_features: int,
@ -127,6 +142,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
        clf.set_params(**hyperparams_)
        n_samples = X.shape[0]
        # bootstrap
+        random_box = check_random_state(random_seed)
        indices = random_box.randint(0, n_samples, boot_samples)
        # update weights with the chosen samples
        weights_update = np.bincount(indices, minlength=n_samples)
@ -138,30 +154,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
        clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
        return (clf, features)

-    def _train(
-        self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
-    ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
-        random_box = self._initialize_random()
-        n_samples = X.shape[0]
-        boot_samples = self._get_bootstrap_n_samples(n_samples)
-        clf = clone(self.base_estimator_)
-        return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
-            delayed(Odte._parallel_build_tree)(
-                clf,
-                X,
-                y,
-                weights,
-                random_box,
-                random_seed,
-                boot_samples,
-                self.max_features_,
-                self.be_hyperparams,
-            )
-            for random_seed in range(
-                self.random_state, self.random_state + self.n_estimators
-            )
-        )
-
    def _get_bootstrap_n_samples(self, n_samples: int) -> int:
        if self.max_samples is None:
            return n_samples
--- a/odte/_version.py
+++ b/odte/_version.py
@ -1 +1 @@
-__version__ = "0.3.1"
+__version__ = "0.3.2"
--- a/odte/tests/Odte_tests.py
+++ b/odte/tests/Odte_tests.py
@ -54,20 +54,6 @@ class Odte_test(unittest.TestCase):
            self.assertListEqual(expected, list(computed))
            # print(f"{list(computed)},")

-    def test_initialize_random(self):
-        expected = [37, 235, 908]
-        tclf = Odte(random_state=self._random_state)
-        box = tclf._initialize_random()
-        computed = box.randint(0, 1000, 3)
-        self.assertListEqual(expected, computed.tolist())
-        # test None
-        tclf = Odte(random_state=None)
-        box = tclf._initialize_random()
-        computed = box.randint(101, 1000, 3)
-        for value in computed.tolist():
-            self.assertGreaterEqual(value, 101)
-            self.assertLessEqual(value, 1000)
-
    def test_bogus_max_features(self):
        values = ["duck", -0.1, 0.0]
        for max_features in values:
@ -124,7 +110,7 @@ class Odte_test(unittest.TestCase):

    def test_score(self):
        X, y = load_dataset(self._random_state)
-        expected = 0.9513333333333334
+        expected = 0.9533333333333334
        tclf = Odte(
            random_state=self._random_state,
            max_features=None,
@ -136,19 +122,18 @@ class Odte_test(unittest.TestCase):
    def test_score_splitter_max_features(self):
        X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
        results = [
-            0.948,
-            0.924,
-            0.926,
-            0.94,
-            0.932,
-            0.936,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
-            0.962,
+            0.958,  # best auto
+            0.942,  # random auto
+            0.932,  # trandom auto
+            0.95,  # mutual auto
+            0.944,  # iwss auto
+            0.946,  # cfs auto
+            0.97,  # best None
+            0.97,  # random None
+            0.97,  # trandom None
+            0.97,  # mutual None
+            0.97,  # iwss None
+            0.97,  # cfs None
        ]
        random.seed(self._random_state)
        for max_features in ["auto", None]:
@ -207,16 +192,25 @@ class Odte_test(unittest.TestCase):
        tclf = Odte(
            base_estimator=Stree(),
            random_state=self._random_state,
-            n_estimators=3,
+            n_estimators=5,
+            n_jobs=1,
+        )
+        tclf_p = Odte(
+            base_estimator=Stree(),
+            random_state=self._random_state,
+            n_estimators=5,
+            n_jobs=-1,
        )
        X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
        tclf.fit(X, y)
-        self.assertAlmostEqual(6.0, tclf.depth_)
-        self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
-        self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
-        nodes, leaves = tclf.nodes_leaves()
-        self.assertAlmostEqual(9.333333333333334, leaves)
-        self.assertAlmostEqual(17.666666666666668, nodes)
+        tclf_p.fit(X, y)
+        for clf in [tclf, tclf_p]:
+            self.assertAlmostEqual(5.8, clf.depth_)
+            self.assertAlmostEqual(9.4, clf.leaves_)
+            self.assertAlmostEqual(17.8, clf.nodes_)
+            nodes, leaves = clf.nodes_leaves()
+            self.assertAlmostEqual(9.4, leaves)
+            self.assertAlmostEqual(17.8, nodes)

    def test_nodes_leaves_SVC(self):
        tclf = Odte(
@ -257,3 +251,15 @@ class Odte_test(unittest.TestCase):
    def test_version(self):
        tclf = Odte()
        self.assertEqual(__version__, tclf.version())
+
+    def test_parallel_score(self):
+        # Two ensembles built with the same seed and number of estimators;
+        # the only difference is n_jobs (-1 vs 1).
+        tclf_p = Odte(
+            n_jobs=-1, random_state=self._random_state, n_estimators=30
+        )
+        tclf_s = Odte(
+            n_jobs=1, random_state=self._random_state, n_estimators=30
+        )
+        X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
+        tclf_p.fit(X, y)
+        tclf_s.fit(X, y)
+        # Both fits must reach the same score on the training data.
+        self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y))