From cd7c7f3938291991c23f90936570021eb2339452 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 22 Feb 2022 20:40:35 +0100 Subject: [PATCH] First try to fix initialization issue --- odte/Odte.py | 73 +++++++++++++++++++++------------------- odte/_version.py | 2 +- odte/tests/Odte_tests.py | 12 +++++++ 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/odte/Odte.py b/odte/Odte.py index 036be0a..4b2809f 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -26,6 +26,35 @@ from stree import Stree # type: ignore from ._version import __version__ +def _parallel_build_tree( + base_estimator_: Stree, + X: np.ndarray, + y: np.ndarray, + weights: np.ndarray, + random_box: np.random.mtrand.RandomState, + random_seed: int, + boot_samples: int, + max_features: int, + hyperparams: str, +) -> Tuple[BaseEstimator, Tuple[int, ...]]: + clf = base_estimator_ + hyperparams_ = json.loads(hyperparams) + hyperparams_.update(dict(random_state=random_seed)) + clf.set_params(**hyperparams_) + n_samples = X.shape[0] + # bootstrap + indices = random_box.randint(0, n_samples, boot_samples) + # update weights with the chosen samples + weights_update = np.bincount(indices, minlength=n_samples) + current_weights = weights * weights_update + # random subspace + features = Odte._get_random_subspace(X, y, max_features) + # train the classifier + bootstrap = X[indices, :] + clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) + return (clf, features) + + class Odte(BaseEnsemble, ClassifierMixin): def __init__( self, @@ -109,45 +138,18 @@ class Odte(BaseEnsemble, ClassifierMixin): self.leaves_ = tleaves / self.n_estimators self.nodes_ = tnodes / self.n_estimators - @staticmethod - def _parallel_build_tree( - base_estimator_: Stree, - X: np.ndarray, - y: np.ndarray, - weights: np.ndarray, - random_box: np.random.mtrand.RandomState, - random_seed: int, - boot_samples: int, - max_features: int, - hyperparams: str, - ) -> Tuple[BaseEstimator, Tuple[int, ...]]: - clf = clone(base_estimator_) - hyperparams_ = json.loads(hyperparams) - hyperparams_.update(dict(random_state=random_seed)) - clf.set_params(**hyperparams_) - n_samples = X.shape[0] - # bootstrap - indices = random_box.randint(0, n_samples, boot_samples) - # update weights with the chosen samples - weights_update = np.bincount(indices, minlength=n_samples) - current_weights = weights * weights_update - # random subspace - features = Odte._get_random_subspace(X, y, max_features) - # train the classifier - bootstrap = X[indices, :] - clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) - return (clf, features) - def _train( self, X: np.ndarray, y: np.ndarray, weights: np.ndarray ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: random_box = self._initialize_random() n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) - clf = clone(self.base_estimator_) + estimator = [] + for i in range(self.n_estimators): + estimator.append(clone(self.base_estimator_)) return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore - delayed(Odte._parallel_build_tree)( - clf, + delayed(_parallel_build_tree)( + estimator[i], X, y, weights, @@ -157,8 +159,11 @@ class Odte(BaseEnsemble, ClassifierMixin): self.max_features_, self.be_hyperparams, ) - for random_seed in range( - self.random_state, self.random_state + self.n_estimators + for random_seed, i in zip( + range( + self.random_state, self.random_state + self.n_estimators + ), + range(self.n_estimators), ) ) diff --git a/odte/_version.py b/odte/_version.py index 260c070..f9aa3e1 100644 --- a/odte/_version.py +++ b/odte/_version.py @@ -1 +1 @@ -__version__ = "0.3.1" +__version__ = "0.3.2" diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index f19f063..21abf07 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -257,3 +257,15 @@ class Odte_test(unittest.TestCase): def test_version(self): tclf = Odte() self.assertEqual(__version__, tclf.version()) + + def test_parallel_score(self): + tclf_p = Odte( + n_jobs=-1, random_state=self._random_state, n_estimators=30 + ) + tclf_s = Odte( + n_jobs=1, random_state=self._random_state, n_estimators=30 + ) + X, y = load_dataset(self._random_state, n_features=56, n_samples=1500) + tclf_p.fit(X, y) + tclf_s.fit(X, y) + self.assertAlmostEqual(tclf_p.score(X, y), tclf_s.score(X, y))