Merge pull request #6 from Doctorado-ML/parallel_init

Parallel init error
This commit is contained in:
Ricardo Montañana Gómez 2022-03-02 13:12:13 +01:00 committed by GitHub
commit 98cadc7eeb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 67 additions and 69 deletions

View File

@ -7,7 +7,6 @@ as well
""" """
from __future__ import annotations from __future__ import annotations
import random import random
import sys
import json import json
from math import factorial from math import factorial
from typing import Union, Optional, Tuple, List, Set from typing import Union, Optional, Tuple, List, Set
@ -16,6 +15,7 @@ from sklearn.utils.multiclass import ( # type: ignore
check_classification_targets, check_classification_targets,
) )
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
from sklearn.utils import check_random_state
from sklearn.ensemble import BaseEnsemble # type: ignore from sklearn.ensemble import BaseEnsemble # type: ignore
from sklearn.utils.validation import ( # type: ignore from sklearn.utils.validation import ( # type: ignore
check_is_fitted, check_is_fitted,
@ -54,12 +54,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
def version() -> str: def version() -> str:
return __version__ return __version__
def _initialize_random(self) -> np.random.mtrand.RandomState:
    """Return the RandomState box to draw bootstrap indices from.

    With an explicit ``random_state`` a dedicated, reproducible
    ``RandomState`` is returned.  With ``random_state=None`` a fresh
    seed is drawn, stored on the estimator (so later inspection shows
    the seed actually used), and numpy's global singleton is returned.
    """
    if self.random_state is not None:
        return np.random.RandomState(self.random_state)
    # No seed supplied: pick one, remember it, use numpy's global state.
    self.random_state = random.randint(0, sys.maxsize)
    return np.random.mtrand._rand
def _validate_estimator(self) -> None: def _validate_estimator(self) -> None:
"""Check the estimator and set the base_estimator_ attribute.""" """Check the estimator and set the base_estimator_ attribute."""
super()._validate_estimator( super()._validate_estimator(
@ -109,13 +103,34 @@ class Odte(BaseEnsemble, ClassifierMixin):
self.leaves_ = tleaves / self.n_estimators self.leaves_ = tleaves / self.n_estimators
self.nodes_ = tnodes / self.n_estimators self.nodes_ = tnodes / self.n_estimators
def _train(
    self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
    """Fit the ensemble trees in parallel.

    One tree is built for each seed in
    ``[random_state, random_state + n_estimators)``, so the result is
    reproducible for a fixed ``random_state`` and independent of
    ``n_jobs`` (each worker derives its own RandomState from its seed).

    Parameters
    ----------
    X : np.ndarray
        Training samples.
    y : np.ndarray
        Target values.
    weights : np.ndarray
        Sample weights forwarded to every tree.

    Returns
    -------
    Tuple[List[BaseEstimator], List[Tuple[int, ...]]]
        The fitted estimators and the feature subset used by each.
    """
    if self.random_state is None:
        # Fix: with the default random_state=None the range() below
        # raised TypeError.  Draw a concrete seed once and keep it in
        # a range every derived per-tree seed also accepts (< 2**32).
        self.random_state = random.randint(0, 2**31 - 1)
    n_samples = X.shape[0]
    boot_samples = self._get_bootstrap_n_samples(n_samples)
    estimator = clone(self.base_estimator_)
    return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
        delayed(Odte._parallel_build_tree)(
            estimator,
            X,
            y,
            weights,
            random_seed,
            boot_samples,
            self.max_features_,
            self.be_hyperparams,
        )
        for random_seed in range(
            self.random_state, self.random_state + self.n_estimators
        )
    )
@staticmethod @staticmethod
def _parallel_build_tree( def _parallel_build_tree(
base_estimator_: Stree, base_estimator_: BaseEstimator,
X: np.ndarray, X: np.ndarray,
y: np.ndarray, y: np.ndarray,
weights: np.ndarray, weights: np.ndarray,
random_box: np.random.mtrand.RandomState,
random_seed: int, random_seed: int,
boot_samples: int, boot_samples: int,
max_features: int, max_features: int,
@ -127,6 +142,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
clf.set_params(**hyperparams_) clf.set_params(**hyperparams_)
n_samples = X.shape[0] n_samples = X.shape[0]
# bootstrap # bootstrap
random_box = check_random_state(random_seed)
indices = random_box.randint(0, n_samples, boot_samples) indices = random_box.randint(0, n_samples, boot_samples)
# update weights with the chosen samples # update weights with the chosen samples
weights_update = np.bincount(indices, minlength=n_samples) weights_update = np.bincount(indices, minlength=n_samples)
@ -138,30 +154,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
clf.fit(bootstrap[:, features], y[indices], current_weights[indices]) clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
return (clf, features) return (clf, features)
def _train(
    self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
    """Build the ensemble in parallel, sharing one RandomState box.

    Each worker receives the same ``random_box`` plus its own seed in
    ``[random_state, random_state + n_estimators)`` and returns the
    fitted tree together with the feature subset it was trained on.
    """
    random_box = self._initialize_random()
    samples = self._get_bootstrap_n_samples(X.shape[0])
    template = clone(self.base_estimator_)
    seeds = range(self.random_state, self.random_state + self.n_estimators)
    return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
        delayed(Odte._parallel_build_tree)(
            template,
            X,
            y,
            weights,
            random_box,
            seed,
            samples,
            self.max_features_,
            self.be_hyperparams,
        )
        for seed in seeds
    )
def _get_bootstrap_n_samples(self, n_samples: int) -> int: def _get_bootstrap_n_samples(self, n_samples: int) -> int:
if self.max_samples is None: if self.max_samples is None:
return n_samples return n_samples

View File

@ -1 +1 @@
__version__ = "0.3.1" __version__ = "0.3.2"

View File

@ -54,20 +54,6 @@ class Odte_test(unittest.TestCase):
self.assertListEqual(expected, list(computed)) self.assertListEqual(expected, list(computed))
# print(f"{list(computed)},") # print(f"{list(computed)},")
def test_initialize_random(self):
    """A fixed seed yields a reproducible box; None yields a fresh one."""
    expected = [37, 235, 908]
    box = Odte(random_state=self._random_state)._initialize_random()
    self.assertListEqual(expected, box.randint(0, 1000, 3).tolist())
    # With random_state=None the seed is drawn at random, so only the
    # requested value range can be checked.
    box = Odte(random_state=None)._initialize_random()
    for value in box.randint(101, 1000, 3).tolist():
        self.assertGreaterEqual(value, 101)
        self.assertLessEqual(value, 1000)
def test_bogus_max_features(self): def test_bogus_max_features(self):
values = ["duck", -0.1, 0.0] values = ["duck", -0.1, 0.0]
for max_features in values: for max_features in values:
@ -124,7 +110,7 @@ class Odte_test(unittest.TestCase):
def test_score(self): def test_score(self):
X, y = load_dataset(self._random_state) X, y = load_dataset(self._random_state)
expected = 0.9513333333333334 expected = 0.9533333333333334
tclf = Odte( tclf = Odte(
random_state=self._random_state, random_state=self._random_state,
max_features=None, max_features=None,
@ -136,19 +122,18 @@ class Odte_test(unittest.TestCase):
def test_score_splitter_max_features(self): def test_score_splitter_max_features(self):
X, y = load_dataset(self._random_state, n_features=16, n_samples=500) X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
results = [ results = [
0.948, 0.958, # best auto
0.924, 0.942, # random auto
0.926, 0.932, # trandom auto
0.94, 0.95, # mutual auto
0.932, 0.944, # iwss auto
0.936, 0.946, # cfs auto
0.962, 0.97, # best None
0.962, 0.97, # random None
0.962, 0.97, # trandom None
0.962, 0.97, # mutual None
0.962, 0.97, # iwss None
0.962, 0.97, # cfs None
0.962,
] ]
random.seed(self._random_state) random.seed(self._random_state)
for max_features in ["auto", None]: for max_features in ["auto", None]:
@ -207,16 +192,25 @@ class Odte_test(unittest.TestCase):
tclf = Odte( tclf = Odte(
base_estimator=Stree(), base_estimator=Stree(),
random_state=self._random_state, random_state=self._random_state,
n_estimators=3, n_estimators=5,
n_jobs=1,
)
tclf_p = Odte(
base_estimator=Stree(),
random_state=self._random_state,
n_estimators=5,
n_jobs=-1,
) )
X, y = load_dataset(self._random_state, n_features=16, n_samples=500) X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
tclf.fit(X, y) tclf.fit(X, y)
self.assertAlmostEqual(6.0, tclf.depth_) tclf_p.fit(X, y)
self.assertAlmostEqual(9.333333333333334, tclf.leaves_) for clf in [tclf, tclf_p]:
self.assertAlmostEqual(17.666666666666668, tclf.nodes_) self.assertAlmostEqual(5.8, clf.depth_)
nodes, leaves = tclf.nodes_leaves() self.assertAlmostEqual(9.4, clf.leaves_)
self.assertAlmostEqual(9.333333333333334, leaves) self.assertAlmostEqual(17.8, clf.nodes_)
self.assertAlmostEqual(17.666666666666668, nodes) nodes, leaves = clf.nodes_leaves()
self.assertAlmostEqual(9.4, leaves)
self.assertAlmostEqual(17.8, nodes)
def test_nodes_leaves_SVC(self): def test_nodes_leaves_SVC(self):
tclf = Odte( tclf = Odte(
@ -257,3 +251,15 @@ class Odte_test(unittest.TestCase):
def test_version(self): def test_version(self):
tclf = Odte() tclf = Odte()
self.assertEqual(__version__, tclf.version()) self.assertEqual(__version__, tclf.version())
def test_parallel_score(self):
    """Parallel and sequential fits of identical models score the same."""
    parallel = Odte(
        n_jobs=-1, random_state=self._random_state, n_estimators=30
    )
    sequential = Odte(
        n_jobs=1, random_state=self._random_state, n_estimators=30
    )
    X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
    parallel.fit(X, y)
    sequential.fit(X, y)
    self.assertAlmostEqual(parallel.score(X, y), sequential.score(X, y))