From 3766886190396cf6916fa8ba690ba26f79136a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 23 Feb 2022 12:02:59 +0100 Subject: [PATCH] Fix np.random initialization --- odte/Odte.py | 19 +++-------- odte/tests/Odte_tests.py | 68 +++++++++++++++++++++------------------- 2 files changed, 40 insertions(+), 47 deletions(-) diff --git a/odte/Odte.py b/odte/Odte.py index 4b2809f..875da85 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -16,6 +16,7 @@ from sklearn.utils.multiclass import ( # type: ignore check_classification_targets, ) from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore +from sklearn.utils import check_random_state from sklearn.ensemble import BaseEnsemble # type: ignore from sklearn.utils.validation import ( # type: ignore check_is_fitted, @@ -31,7 +32,6 @@ def _parallel_build_tree( X: np.ndarray, y: np.ndarray, weights: np.ndarray, - random_box: np.random.mtrand.RandomState, random_seed: int, boot_samples: int, max_features: int, @@ -43,6 +43,7 @@ def _parallel_build_tree( clf.set_params(**hyperparams_) n_samples = X.shape[0] # bootstrap + random_box = check_random_state(random_seed) indices = random_box.randint(0, n_samples, boot_samples) # update weights with the chosen samples weights_update = np.bincount(indices, minlength=n_samples) @@ -83,12 +84,6 @@ class Odte(BaseEnsemble, ClassifierMixin): def version() -> str: return __version__ - def _initialize_random(self) -> np.random.mtrand.RandomState: - if self.random_state is None: - self.random_state = random.randint(0, sys.maxsize) - return np.random.mtrand._rand - return np.random.RandomState(self.random_state) - def _validate_estimator(self) -> None: """Check the estimator and set the base_estimator_ attribute.""" super()._validate_estimator( @@ -141,7 +136,7 @@ class Odte(BaseEnsemble, ClassifierMixin): def _train( self, X: np.ndarray, y: np.ndarray, weights: np.ndarray ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: - random_box = self._initialize_random() + # np.random.RandomState(seed) n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) estimator = [] @@ -153,17 +148,13 @@ class Odte(BaseEnsemble, ClassifierMixin): X, y, weights, - random_box, random_seed, boot_samples, self.max_features_, self.be_hyperparams, ) - for random_seed, i in zip( - range( - self.random_state, self.random_state + self.n_estimators - ), - range(self.n_estimators), + for i, random_seed in enumerate( + range(self.random_state, self.random_state + self.n_estimators) ) ) diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index 21abf07..9945016 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -54,20 +54,6 @@ class Odte_test(unittest.TestCase): self.assertListEqual(expected, list(computed)) # print(f"{list(computed)},") - def test_initialize_random(self): - expected = [37, 235, 908] - tclf = Odte(random_state=self._random_state) - box = tclf._initialize_random() - computed = box.randint(0, 1000, 3) - self.assertListEqual(expected, computed.tolist()) - # test None - tclf = Odte(random_state=None) - box = tclf._initialize_random() - computed = box.randint(101, 1000, 3) - for value in computed.tolist(): - self.assertGreaterEqual(value, 101) - self.assertLessEqual(value, 1000) - def test_bogus_max_features(self): values = ["duck", -0.1, 0.0] for max_features in values: @@ -124,7 +110,7 @@ class Odte_test(unittest.TestCase): def test_score(self): X, y = load_dataset(self._random_state) - expected = 0.9513333333333334 + expected = 0.9533333333333334 tclf = Odte( random_state=self._random_state, max_features=None, @@ -136,19 +122,18 @@ class Odte_test(unittest.TestCase): def test_score_splitter_max_features(self): X, y = load_dataset(self._random_state, n_features=16, n_samples=500) results = [ - 0.948, - 0.924, - 0.926, - 0.94, - 0.932, - 0.936, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, - 0.962, + 0.958, # best auto + 0.942, # random auto + 0.932, # trandom auto + 0.95, # mutual auto + 0.944, # iwss auto + 0.946, # cfs auto + 0.97, # best None + 0.97, # random None + 0.97, # trandom None + 0.97, # mutual None + 0.97, # iwss None + 0.97, # cfs None ] random.seed(self._random_state) for max_features in ["auto", None]: @@ -208,15 +193,32 @@ class Odte_test(unittest.TestCase): base_estimator=Stree(), random_state=self._random_state, n_estimators=3, + n_jobs=1, ) X, y = load_dataset(self._random_state, n_features=16, n_samples=500) tclf.fit(X, y) - self.assertAlmostEqual(6.0, tclf.depth_) - self.assertAlmostEqual(9.333333333333334, tclf.leaves_) - self.assertAlmostEqual(17.666666666666668, tclf.nodes_) + self.assertAlmostEqual(6.333333333333333, tclf.depth_) + self.assertAlmostEqual(10.0, tclf.leaves_) + self.assertAlmostEqual(19.0, tclf.nodes_) nodes, leaves = tclf.nodes_leaves() - self.assertAlmostEqual(9.333333333333334, leaves) - self.assertAlmostEqual(17.666666666666668, nodes) + self.assertAlmostEqual(10.0, leaves) + self.assertAlmostEqual(19, nodes) + + def test_nodes_leaves_depth_parallel(self): + tclf = Odte( + base_estimator=Stree(), + random_state=self._random_state, + n_estimators=3, + n_jobs=-1, + ) + X, y = load_dataset(self._random_state, n_features=16, n_samples=500) + tclf.fit(X, y) + self.assertAlmostEqual(6.333333333333333, tclf.depth_) + self.assertAlmostEqual(10.0, tclf.leaves_) + self.assertAlmostEqual(19.0, tclf.nodes_) + nodes, leaves = tclf.nodes_leaves() + self.assertAlmostEqual(10.0, leaves) + self.assertAlmostEqual(19, nodes) def test_nodes_leaves_SVC(self): tclf = Odte(