From cabf926eb14560ee7dde6ab55dd54c4e3107978b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sat, 14 Jan 2023 21:38:11 +0100 Subject: [PATCH] Update to scikit-learn 1.2 --- notebooks/wine_iris.ipynb | 342 +++++++++++++++++++------------------- odte/Odte.py | 16 +- odte/tests/Odte_tests.py | 30 ++-- 3 files changed, 194 insertions(+), 194 deletions(-) diff --git a/notebooks/wine_iris.ipynb b/notebooks/wine_iris.ipynb index f868ee7..c5d8883 100644 --- a/notebooks/wine_iris.ipynb +++ b/notebooks/wine_iris.ipynb @@ -1,174 +1,174 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import datetime, time\n", - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split, cross_validate\n", - "from sklearn import tree\n", - "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", - "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n", - "from stree import Stree\n", - "from odte import Odte\n", - "\n", - "random_state = 1" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime, time\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, cross_validate\n", + "from sklearn import tree\n", + "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n", + "from stree import Stree\n", + "from odte import Odte\n", + "\n", + "random_state = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "X, y = load_wine(return_X_y=True)\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 20\n", + "clf = {}\n", + "clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n", + "clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n", + "clf[\"odte\"] = Odte(n_jobs=-1, estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n", + "clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", + "clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"*\"*30,\"Results for wine\", \"*\"*30)\n", + "for clf_type, item in clf.items():\n", + " print(f\"Training {clf_type}...\")\n", + " now = time.time()\n", + " item.fit(Xtrain, ytrain)\n", + " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "X, y = load_iris(return_X_y=True)\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 10\n", + "clf = {}\n", + "clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n", + "clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n", + "clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", + "clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"*\"*30,\"Results for iris\", \"*\"*30)\n", + "for clf_type, item in clf.items():\n", + " print(f\"Training {clf_type}...\")\n", + " now = time.time()\n", + " item.fit(Xtrain, ytrain)\n", + " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", + "print(cross)\n", + "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", + "print(cross)\n", + "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sklearn.utils.estimator_checks import check_estimator\n", + "# Make checks one by one\n", + "c = 0\n", + "checks = check_estimator(Odte(), generate_only=True)\n", + "for check in checks:\n", + " c += 1\n", + " print(c, check[1])\n", + " check[1](check[0])" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939" + }, + "kernelspec": { + "display_name": "Python 3.9.2 64-bit ('general': venv)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 2 }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_wine\n", - "X, y = load_wine(return_X_y=True)\n", - "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_estimators = 20\n", - "clf = {}\n", - "clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n", - "clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n", - "clf[\"odte\"] = Odte(n_jobs=-1, base_estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n", - "clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", - "clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"*\"*30,\"Results for wine\", \"*\"*30)\n", - "for clf_type, item in clf.items():\n", - " print(f\"Training {clf_type}...\")\n", - " now = time.time()\n", - " item.fit(Xtrain, ytrain)\n", - " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import load_iris\n", - "X, y = load_iris(return_X_y=True)\n", - "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_estimators = 10\n", - "clf = {}\n", - "clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n", - "clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n", - "clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", - "clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"*\"*30,\"Results for iris\", \"*\"*30)\n", - "for clf_type, item in clf.items():\n", - " print(f\"Training {clf_type}...\")\n", - " now = time.time()\n", - " item.fit(Xtrain, ytrain)\n", - " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", - "print(cross)\n", - "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", - "print(cross)\n", - "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from sklearn.utils.estimator_checks import check_estimator\n", - "# Make checks one by one\n", - "c = 0\n", - "checks = check_estimator(Odte(), generate_only=True)\n", - "for check in checks:\n", - " c += 1\n", - " print(c, check[1])\n", - " check[1](check[0])" - ] - } - ], - "metadata": { - "interpreter": { - "hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939" - }, - "kernelspec": { - "display_name": "Python 3.9.2 64-bit ('general': venv)", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 2 - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } diff --git a/odte/Odte.py b/odte/Odte.py index 825b6c3..c94afbb 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -31,7 +31,7 @@ class Odte(BaseEnsemble, ClassifierMixin): self, # n_jobs = -1 to use all available cores n_jobs: int = -1, - base_estimator: BaseEstimator = None, + estimator: BaseEstimator = None, random_state: int = 0, max_features: Optional[Union[str, int, float]] = None, max_samples: Optional[Union[int, float]] = None, @@ -39,10 +39,10 @@ class Odte(BaseEnsemble, ClassifierMixin): be_hyperparams: str = "{}", ): super().__init__( - base_estimator=base_estimator, + estimator=estimator, n_estimators=n_estimators, ) - self.base_estimator = base_estimator + self.estimator = estimator self.n_jobs = n_jobs self.n_estimators = n_estimators self.random_state = random_state @@ -55,7 +55,7 @@ class Odte(BaseEnsemble, ClassifierMixin): return __version__ def _validate_estimator(self) -> None: - """Check the estimator and set the base_estimator_ attribute.""" + """Check the estimator and set the estimator_ attribute.""" super()._validate_estimator( default=Stree(random_state=self.random_state) ) @@ -79,7 +79,7 @@ class Odte(BaseEnsemble, ClassifierMixin): # Initialize computed parameters # Build the estimator self.max_features_ = self._initialize_max_features() - # build base_estimator_ + # build estimator_ self._validate_estimator() self.classes_, y = np.unique(y, return_inverse=True) self.n_classes_: int = self.classes_.shape[0] @@ -108,7 +108,7 @@ class Odte(BaseEnsemble, ClassifierMixin): ) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]: n_samples = X.shape[0] boot_samples = self._get_bootstrap_n_samples(n_samples) - estimator = clone(self.base_estimator_) + estimator = clone(self.estimator_) return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore delayed(Odte._parallel_build_tree)( estimator, @@ -127,7 +127,7 @@ class Odte(BaseEnsemble, ClassifierMixin): @staticmethod def _parallel_build_tree( - base_estimator_: BaseEstimator, + estimator_: BaseEstimator, X: np.ndarray, y: np.ndarray, weights: np.ndarray, @@ -136,7 +136,7 @@ class Odte(BaseEnsemble, ClassifierMixin): max_features: int, hyperparams: str, ) -> Tuple[BaseEstimator, Tuple[int, ...]]: - clf = clone(base_estimator_) + clf = clone(estimator_) hyperparams_ = json.loads(hyperparams) hyperparams_.update(dict(random_state=random_seed)) clf.set_params(**hyperparams_) diff --git a/odte/tests/Odte_tests.py b/odte/tests/Odte_tests.py index 3974b45..498e9f2 100644 --- a/odte/tests/Odte_tests.py +++ b/odte/tests/Odte_tests.py @@ -76,15 +76,15 @@ class Odte_test(unittest.TestCase): X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2] expected = [0, 1, 1, 2] tclf = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, n_estimators=10, n_jobs=-1, ) tclf.set_params( **dict( - base_estimator__kernel="rbf", - base_estimator__random_state=self._random_state, + estimator__kernel="rbf", + estimator__random_state=self._random_state, ) ) computed = tclf.fit(X, y).predict(X) @@ -96,14 +96,14 @@ class Odte_test(unittest.TestCase): X, y = load_dataset(self._random_state) expected = y tclf = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, max_features=1.0, max_samples=0.1, ) tclf.set_params( **dict( - base_estimator__kernel="linear", + estimator__kernel="linear", ) ) computed = tclf.fit(X, y).predict(X) @@ -146,16 +146,16 @@ class Odte_test(unittest.TestCase): "cfs", ]: tclf = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, n_estimators=3, n_jobs=1, ) tclf.set_params( **dict( - base_estimator__max_features=max_features, - base_estimator__splitter=splitter, - base_estimator__random_state=self._random_state, + estimator__max_features=max_features, + estimator__splitter=splitter, + estimator__random_state=self._random_state, ) ) expected = results.pop(0) @@ -182,7 +182,7 @@ class Odte_test(unittest.TestCase): def test_nodes_leaves_not_fitted(self): tclf = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, n_estimators=3, ) @@ -191,13 +191,13 @@ class Odte_test(unittest.TestCase): def test_nodes_leaves_depth(self): tclf = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, n_estimators=5, n_jobs=1, ) tclf_p = Odte( - base_estimator=Stree(), + estimator=Stree(), random_state=self._random_state, n_estimators=5, n_jobs=-1, @@ -215,7 +215,7 @@ class Odte_test(unittest.TestCase): def test_nodes_leaves_SVC(self): tclf = Odte( - base_estimator=SVC(), + estimator=SVC(), random_state=self._random_state, n_estimators=3, ) @@ -227,7 +227,7 @@ class Odte_test(unittest.TestCase): self.assertAlmostEqual(0.0, leaves) self.assertAlmostEqual(0.0, nodes) - def test_base_estimator_hyperparams(self): + def test_estimator_hyperparams(self): data = [ (Stree(), {"max_features": 7, "max_depth": 2}), (SVC(), {"kernel": "linear", "cache_size": 100}), @@ -235,7 +235,7 @@ class Odte_test(unittest.TestCase): for clf, hyperparams in data: hyperparams_ = json.dumps(hyperparams) tclf = Odte( - base_estimator=clf, + estimator=clf, random_state=self._random_state, n_estimators=3, be_hyperparams=hyperparams_,