mirror of
https://github.com/Doctorado-ML/Odte.git
synced 2025-07-11 00:02:30 +00:00
Merge pull request #6 from Doctorado-ML/parallel_init
Parallel init error
This commit is contained in:
commit
98cadc7eeb
58
odte/Odte.py
58
odte/Odte.py
@ -7,7 +7,6 @@ as well
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import random
|
import random
|
||||||
import sys
|
|
||||||
import json
|
import json
|
||||||
from math import factorial
|
from math import factorial
|
||||||
from typing import Union, Optional, Tuple, List, Set
|
from typing import Union, Optional, Tuple, List, Set
|
||||||
@ -16,6 +15,7 @@ from sklearn.utils.multiclass import ( # type: ignore
|
|||||||
check_classification_targets,
|
check_classification_targets,
|
||||||
)
|
)
|
||||||
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
|
from sklearn.base import clone, BaseEstimator, ClassifierMixin # type: ignore
|
||||||
|
from sklearn.utils import check_random_state
|
||||||
from sklearn.ensemble import BaseEnsemble # type: ignore
|
from sklearn.ensemble import BaseEnsemble # type: ignore
|
||||||
from sklearn.utils.validation import ( # type: ignore
|
from sklearn.utils.validation import ( # type: ignore
|
||||||
check_is_fitted,
|
check_is_fitted,
|
||||||
@ -54,12 +54,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
|||||||
def version() -> str:
|
def version() -> str:
|
||||||
return __version__
|
return __version__
|
||||||
|
|
||||||
def _initialize_random(self) -> np.random.mtrand.RandomState:
|
|
||||||
if self.random_state is None:
|
|
||||||
self.random_state = random.randint(0, sys.maxsize)
|
|
||||||
return np.random.mtrand._rand
|
|
||||||
return np.random.RandomState(self.random_state)
|
|
||||||
|
|
||||||
def _validate_estimator(self) -> None:
|
def _validate_estimator(self) -> None:
|
||||||
"""Check the estimator and set the base_estimator_ attribute."""
|
"""Check the estimator and set the base_estimator_ attribute."""
|
||||||
super()._validate_estimator(
|
super()._validate_estimator(
|
||||||
@ -109,13 +103,34 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
|||||||
self.leaves_ = tleaves / self.n_estimators
|
self.leaves_ = tleaves / self.n_estimators
|
||||||
self.nodes_ = tnodes / self.n_estimators
|
self.nodes_ = tnodes / self.n_estimators
|
||||||
|
|
||||||
|
def _train(
    self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
    """Build the ensemble members, one joblib job per estimator.

    Every job calls ``Odte._parallel_build_tree`` with its own integer
    seed (consecutive values starting at the base seed), so the random
    draws made from that seed do not depend on how the jobs are scheduled.

    Parameters
    ----------
    X : np.ndarray
        Training samples; ``X.shape[0]`` is used as the number of samples.
    y : np.ndarray
        Targets, forwarded unchanged to every job.
    weights : np.ndarray
        Sample weights, forwarded unchanged to every job.

    Returns
    -------
    The collection produced by joblib: one ``_parallel_build_tree`` result
    per estimator, in seed order.
    NOTE(review): the annotation describes a pair of lists, while joblib
    yields one result per job -- confirm against the caller.
    """
    n_samples = X.shape[0]
    boot_samples = self._get_bootstrap_n_samples(n_samples)
    estimator = clone(self.base_estimator_)
    # random_state=None cannot take part in the integer arithmetic below,
    # so draw a starting seed for this fit instead of raising TypeError.
    # The bound keeps every derived seed inside the 32-bit range accepted
    # by NumPy's RandomState. The constructor parameter is left untouched.
    base_seed = self.random_state
    if base_seed is None:
        base_seed = random.randint(0, 2**31 - 1)
    seeds = range(base_seed, base_seed + self.n_estimators)
    return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
        delayed(Odte._parallel_build_tree)(
            estimator,
            X,
            y,
            weights,
            random_seed,
            boot_samples,
            self.max_features_,
            self.be_hyperparams,
        )
        for random_seed in seeds
    )
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parallel_build_tree(
|
def _parallel_build_tree(
|
||||||
base_estimator_: Stree,
|
base_estimator_: BaseEstimator,
|
||||||
X: np.ndarray,
|
X: np.ndarray,
|
||||||
y: np.ndarray,
|
y: np.ndarray,
|
||||||
weights: np.ndarray,
|
weights: np.ndarray,
|
||||||
random_box: np.random.mtrand.RandomState,
|
|
||||||
random_seed: int,
|
random_seed: int,
|
||||||
boot_samples: int,
|
boot_samples: int,
|
||||||
max_features: int,
|
max_features: int,
|
||||||
@ -127,6 +142,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
|||||||
clf.set_params(**hyperparams_)
|
clf.set_params(**hyperparams_)
|
||||||
n_samples = X.shape[0]
|
n_samples = X.shape[0]
|
||||||
# bootstrap
|
# bootstrap
|
||||||
|
random_box = check_random_state(random_seed)
|
||||||
indices = random_box.randint(0, n_samples, boot_samples)
|
indices = random_box.randint(0, n_samples, boot_samples)
|
||||||
# update weights with the chosen samples
|
# update weights with the chosen samples
|
||||||
weights_update = np.bincount(indices, minlength=n_samples)
|
weights_update = np.bincount(indices, minlength=n_samples)
|
||||||
@ -138,30 +154,6 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
|||||||
clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
|
clf.fit(bootstrap[:, features], y[indices], current_weights[indices])
|
||||||
return (clf, features)
|
return (clf, features)
|
||||||
|
|
||||||
def _train(
|
|
||||||
self, X: np.ndarray, y: np.ndarray, weights: np.ndarray
|
|
||||||
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
|
|
||||||
random_box = self._initialize_random()
|
|
||||||
n_samples = X.shape[0]
|
|
||||||
boot_samples = self._get_bootstrap_n_samples(n_samples)
|
|
||||||
clf = clone(self.base_estimator_)
|
|
||||||
return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore
|
|
||||||
delayed(Odte._parallel_build_tree)(
|
|
||||||
clf,
|
|
||||||
X,
|
|
||||||
y,
|
|
||||||
weights,
|
|
||||||
random_box,
|
|
||||||
random_seed,
|
|
||||||
boot_samples,
|
|
||||||
self.max_features_,
|
|
||||||
self.be_hyperparams,
|
|
||||||
)
|
|
||||||
for random_seed in range(
|
|
||||||
self.random_state, self.random_state + self.n_estimators
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_bootstrap_n_samples(self, n_samples: int) -> int:
|
def _get_bootstrap_n_samples(self, n_samples: int) -> int:
|
||||||
if self.max_samples is None:
|
if self.max_samples is None:
|
||||||
return n_samples
|
return n_samples
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.3.1"
|
__version__ = "0.3.2"
|
||||||
|
@ -54,20 +54,6 @@ class Odte_test(unittest.TestCase):
|
|||||||
self.assertListEqual(expected, list(computed))
|
self.assertListEqual(expected, list(computed))
|
||||||
# print(f"{list(computed)},")
|
# print(f"{list(computed)},")
|
||||||
|
|
||||||
def test_initialize_random(self):
|
|
||||||
expected = [37, 235, 908]
|
|
||||||
tclf = Odte(random_state=self._random_state)
|
|
||||||
box = tclf._initialize_random()
|
|
||||||
computed = box.randint(0, 1000, 3)
|
|
||||||
self.assertListEqual(expected, computed.tolist())
|
|
||||||
# test None
|
|
||||||
tclf = Odte(random_state=None)
|
|
||||||
box = tclf._initialize_random()
|
|
||||||
computed = box.randint(101, 1000, 3)
|
|
||||||
for value in computed.tolist():
|
|
||||||
self.assertGreaterEqual(value, 101)
|
|
||||||
self.assertLessEqual(value, 1000)
|
|
||||||
|
|
||||||
def test_bogus_max_features(self):
|
def test_bogus_max_features(self):
|
||||||
values = ["duck", -0.1, 0.0]
|
values = ["duck", -0.1, 0.0]
|
||||||
for max_features in values:
|
for max_features in values:
|
||||||
@ -124,7 +110,7 @@ class Odte_test(unittest.TestCase):
|
|||||||
|
|
||||||
def test_score(self):
|
def test_score(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
expected = 0.9513333333333334
|
expected = 0.9533333333333334
|
||||||
tclf = Odte(
|
tclf = Odte(
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
max_features=None,
|
max_features=None,
|
||||||
@ -136,19 +122,18 @@ class Odte_test(unittest.TestCase):
|
|||||||
def test_score_splitter_max_features(self):
|
def test_score_splitter_max_features(self):
|
||||||
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
|
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
|
||||||
results = [
|
results = [
|
||||||
0.948,
|
0.958, # best auto
|
||||||
0.924,
|
0.942, # random auto
|
||||||
0.926,
|
0.932, # trandom auto
|
||||||
0.94,
|
0.95, # mutual auto
|
||||||
0.932,
|
0.944, # iwss auto
|
||||||
0.936,
|
0.946, # cfs auto
|
||||||
0.962,
|
0.97, # best None
|
||||||
0.962,
|
0.97, # random None
|
||||||
0.962,
|
0.97, # trandom None
|
||||||
0.962,
|
0.97, # mutual None
|
||||||
0.962,
|
0.97, # iwss None
|
||||||
0.962,
|
0.97, # cfs None
|
||||||
0.962,
|
|
||||||
]
|
]
|
||||||
random.seed(self._random_state)
|
random.seed(self._random_state)
|
||||||
for max_features in ["auto", None]:
|
for max_features in ["auto", None]:
|
||||||
@ -207,16 +192,25 @@ class Odte_test(unittest.TestCase):
|
|||||||
tclf = Odte(
|
tclf = Odte(
|
||||||
base_estimator=Stree(),
|
base_estimator=Stree(),
|
||||||
random_state=self._random_state,
|
random_state=self._random_state,
|
||||||
n_estimators=3,
|
n_estimators=5,
|
||||||
|
n_jobs=1,
|
||||||
|
)
|
||||||
|
tclf_p = Odte(
|
||||||
|
base_estimator=Stree(),
|
||||||
|
random_state=self._random_state,
|
||||||
|
n_estimators=5,
|
||||||
|
n_jobs=-1,
|
||||||
)
|
)
|
||||||
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
|
X, y = load_dataset(self._random_state, n_features=16, n_samples=500)
|
||||||
tclf.fit(X, y)
|
tclf.fit(X, y)
|
||||||
self.assertAlmostEqual(6.0, tclf.depth_)
|
tclf_p.fit(X, y)
|
||||||
self.assertAlmostEqual(9.333333333333334, tclf.leaves_)
|
for clf in [tclf, tclf_p]:
|
||||||
self.assertAlmostEqual(17.666666666666668, tclf.nodes_)
|
self.assertAlmostEqual(5.8, clf.depth_)
|
||||||
nodes, leaves = tclf.nodes_leaves()
|
self.assertAlmostEqual(9.4, clf.leaves_)
|
||||||
self.assertAlmostEqual(9.333333333333334, leaves)
|
self.assertAlmostEqual(17.8, clf.nodes_)
|
||||||
self.assertAlmostEqual(17.666666666666668, nodes)
|
nodes, leaves = clf.nodes_leaves()
|
||||||
|
self.assertAlmostEqual(9.4, leaves)
|
||||||
|
self.assertAlmostEqual(17.8, nodes)
|
||||||
|
|
||||||
def test_nodes_leaves_SVC(self):
|
def test_nodes_leaves_SVC(self):
|
||||||
tclf = Odte(
|
tclf = Odte(
|
||||||
@ -257,3 +251,15 @@ class Odte_test(unittest.TestCase):
|
|||||||
def test_version(self):
|
def test_version(self):
|
||||||
tclf = Odte()
|
tclf = Odte()
|
||||||
self.assertEqual(__version__, tclf.version())
|
self.assertEqual(__version__, tclf.version())
|
||||||
|
|
||||||
|
def test_parallel_score(self):
    """Fitting with n_jobs=-1 and n_jobs=1 must give the same score."""
    # Both ensembles share the seed and size; only n_jobs differs.
    common = {"random_state": self._random_state, "n_estimators": 30}
    multi_job = Odte(n_jobs=-1, **common)
    single_job = Odte(n_jobs=1, **common)
    X, y = load_dataset(self._random_state, n_features=56, n_samples=1500)
    for model in (multi_job, single_job):
        model.fit(X, y)
    self.assertAlmostEqual(multi_job.score(X, y), single_job.score(X, y))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user