Update to scikit-learn 1.2

Ricardo Montañana Gómez 2023-01-14 21:38:11 +01:00
parent 7300bd66db
commit cabf926eb1
Signed by untrusted user who does not match committer: rmontanana
GPG Key ID: 46064262FD9A7ADE
3 changed files with 194 additions and 194 deletions
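
The driving change: scikit-learn 1.2 renames the base_estimator argument of its ensemble meta-estimators (AdaBoostClassifier, BaggingClassifier and the BaseEnsemble base class) to estimator; the old name is still accepted in 1.2 but deprecated and slated for removal in 1.4. Odte, its demo notebook and its tests switch to the new spelling below. A minimal sketch of the rename using stock scikit-learn classes (not part of this repository):

    # scikit-learn 1.2 spelling of an ensemble over a user-supplied estimator.
    # Before 1.2 the same call used base_estimator=..., which in 1.2 still works
    # but emits a deprecation warning and forwards to `estimator`.
    from sklearn.ensemble import BaggingClassifier
    from sklearn.tree import DecisionTreeClassifier

    clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)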

@@ -1,174 +1,174 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datetime, time\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split, cross_validate\n",
"from sklearn import tree\n",
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
"from stree import Stree\n",
"from odte import Odte\n",
"\n",
"random_state = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_wine\n",
"X, y = load_wine(return_X_y=True)\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 20\n",
"clf = {}\n",
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n",
"clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n",
"clf[\"odte\"] = Odte(n_jobs=-1, estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n",
"clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
"clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"*\"*30,\"Results for wine\", \"*\"*30)\n",
"for clf_type, item in clf.items():\n",
" print(f\"Training {clf_type}...\")\n",
" now = time.time()\n",
" item.fit(Xtrain, ytrain)\n",
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"X, y = load_iris(return_X_y=True)\n",
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 10\n",
"clf = {}\n",
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n",
"clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n",
"clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
"clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"print(\"*\"*30,\"Results for iris\", \"*\"*30)\n",
"for clf_type, item in clf.items():\n",
" print(f\"Training {clf_type}...\")\n",
" now = time.time()\n",
" item.fit(Xtrain, ytrain)\n",
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
"print(cross)\n",
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
"print(cross)\n",
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from sklearn.utils.estimator_checks import check_estimator\n",
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Odte(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
}
],
"metadata": {
"interpreter": {
"hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939"
},
"kernelspec": {
"display_name": "Python 3.9.2 64-bit ('general': venv)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 2
},
"nbformat": 4,
"nbformat_minor": 2
}
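
The notebook's final cell runs scikit-learn's estimator-compliance checks one at a time (check_estimator with generate_only=True, still available in 1.2), which makes it easy to see which check breaks after the upgrade. A sketch of the same pattern with a stock classifier, so it runs without odte installed:

    # Iterate the API-compliance checks instead of failing on the first one.
    # check_estimator(..., generate_only=True) yields (estimator, check) pairs.
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils.estimator_checks import check_estimator

    for i, (est, check) in enumerate(
        check_estimator(DecisionTreeClassifier(), generate_only=True), start=1
    ):
        print(i, check)
        check(est)  # raises if the estimator violates the checked contract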

@@ -31,7 +31,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
self,
# n_jobs = -1 to use all available cores
n_jobs: int = -1,
-base_estimator: BaseEstimator = None,
+estimator: BaseEstimator = None,
random_state: int = 0,
max_features: Optional[Union[str, int, float]] = None,
max_samples: Optional[Union[int, float]] = None,
@@ -39,10 +39,10 @@ class Odte(BaseEnsemble, ClassifierMixin):
be_hyperparams: str = "{}",
):
super().__init__(
-base_estimator=base_estimator,
+estimator=estimator,
n_estimators=n_estimators,
)
-self.base_estimator = base_estimator
+self.estimator = estimator
self.n_jobs = n_jobs
self.n_estimators = n_estimators
self.random_state = random_state
@@ -55,7 +55,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
return __version__
def _validate_estimator(self) -> None:
-"""Check the estimator and set the base_estimator_ attribute."""
+"""Check the estimator and set the estimator_ attribute."""
super()._validate_estimator(
default=Stree(random_state=self.random_state)
)
@@ -79,7 +79,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
# Initialize computed parameters
# Build the estimator
self.max_features_ = self._initialize_max_features()
-# build base_estimator_
+# build estimator_
self._validate_estimator()
self.classes_, y = np.unique(y, return_inverse=True)
self.n_classes_: int = self.classes_.shape[0]
@@ -108,7 +108,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
n_samples = X.shape[0]
boot_samples = self._get_bootstrap_n_samples(n_samples)
-estimator = clone(self.base_estimator_)
+estimator = clone(self.estimator_)
return Parallel(n_jobs=self.n_jobs, prefer="threads")(  # type: ignore
delayed(Odte._parallel_build_tree)(
estimator,
@@ -127,7 +127,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
@staticmethod
def _parallel_build_tree(
-base_estimator_: BaseEstimator,
+estimator_: BaseEstimator,
X: np.ndarray,
y: np.ndarray,
weights: np.ndarray,
@@ -136,7 +136,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
max_features: int,
hyperparams: str,
) -> Tuple[BaseEstimator, Tuple[int, ...]]:
-clf = clone(base_estimator_)
+clf = clone(estimator_)
hyperparams_ = json.loads(hyperparams)
hyperparams_.update(dict(random_state=random_seed))
clf.set_params(**hyperparams_)
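
Inside Odte the rename mirrors scikit-learn's BaseEnsemble: in 1.2 the validated template estimator is exposed as estimator_ instead of base_estimator_, and each tree is a clone of that template. A stripped-down sketch of the wiring with a hypothetical MiniEnsemble (illustration only, not part of the package):

    # Minimal BaseEnsemble subclass using the 1.2 names: the constructor stores
    # `estimator`, _validate_estimator() fills in `estimator_`, and the members
    # are clones of that template.
    from sklearn.base import BaseEstimator, ClassifierMixin, clone
    from sklearn.ensemble import BaseEnsemble
    from sklearn.tree import DecisionTreeClassifier

    class MiniEnsemble(BaseEnsemble, ClassifierMixin):
        def __init__(self, estimator: BaseEstimator = None, n_estimators: int = 10):
            super().__init__(estimator=estimator, n_estimators=n_estimators)

        def fit(self, X, y):
            # sets self.estimator_ (falls back to the default when estimator is None)
            self._validate_estimator(default=DecisionTreeClassifier())
            self.estimators_ = [
                clone(self.estimator_).fit(X, y) for _ in range(self.n_estimators)
            ]
            return self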

@@ -76,15 +76,15 @@ class Odte_test(unittest.TestCase):
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
expected = [0, 1, 1, 2]
tclf = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
n_estimators=10,
n_jobs=-1,
)
tclf.set_params(
**dict(
-base_estimator__kernel="rbf",
-base_estimator__random_state=self._random_state,
+estimator__kernel="rbf",
+estimator__random_state=self._random_state,
)
)
computed = tclf.fit(X, y).predict(X)
@@ -96,14 +96,14 @@ class Odte_test(unittest.TestCase):
X, y = load_dataset(self._random_state)
expected = y
tclf = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
max_features=1.0,
max_samples=0.1,
)
tclf.set_params(
**dict(
-base_estimator__kernel="linear",
+estimator__kernel="linear",
)
)
computed = tclf.fit(X, y).predict(X)
@@ -146,16 +146,16 @@ class Odte_test(unittest.TestCase):
"cfs",
]:
tclf = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
n_estimators=3,
n_jobs=1,
)
tclf.set_params(
**dict(
-base_estimator__max_features=max_features,
-base_estimator__splitter=splitter,
-base_estimator__random_state=self._random_state,
+estimator__max_features=max_features,
+estimator__splitter=splitter,
+estimator__random_state=self._random_state,
)
)
expected = results.pop(0)
@@ -182,7 +182,7 @@ class Odte_test(unittest.TestCase):
def test_nodes_leaves_not_fitted(self):
tclf = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
n_estimators=3,
)
@@ -191,13 +191,13 @@ class Odte_test(unittest.TestCase):
def test_nodes_leaves_depth(self):
tclf = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
n_estimators=5,
n_jobs=1,
)
tclf_p = Odte(
-base_estimator=Stree(),
+estimator=Stree(),
random_state=self._random_state,
n_estimators=5,
n_jobs=-1,
@@ -215,7 +215,7 @@ class Odte_test(unittest.TestCase):
def test_nodes_leaves_SVC(self):
tclf = Odte(
-base_estimator=SVC(),
+estimator=SVC(),
random_state=self._random_state,
n_estimators=3,
)
@@ -227,7 +227,7 @@ class Odte_test(unittest.TestCase):
self.assertAlmostEqual(0.0, leaves)
self.assertAlmostEqual(0.0, nodes)
-def test_base_estimator_hyperparams(self):
+def test_estimator_hyperparams(self):
data = [
(Stree(), {"max_features": 7, "max_depth": 2}),
(SVC(), {"kernel": "linear", "cache_size": 100}),
@@ -235,7 +235,7 @@ class Odte_test(unittest.TestCase):
for clf, hyperparams in data:
hyperparams_ = json.dumps(hyperparams)
tclf = Odte(
-base_estimator=clf,
+estimator=clf,
random_state=self._random_state,
n_estimators=3,
be_hyperparams=hyperparams_,
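
The test changes follow from scikit-learn's nested-parameter convention: set_params routes keys of the form <component>__<param> to the component held by that constructor argument, so renaming base_estimator to estimator also renames every nested key (estimator__kernel, estimator__splitter, ...). A minimal sketch with stock estimators rather than the Odte fixtures:

    # Double-underscore routing: parameters of the inner estimator are addressed
    # through the name of the constructor argument that holds it.
    from sklearn.ensemble import BaggingClassifier
    from sklearn.svm import SVC

    clf = BaggingClassifier(estimator=SVC())
    clf.set_params(estimator__kernel="rbf", estimator__C=10)
    print(clf.get_params()["estimator__kernel"])  # "rbf"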