mirror of
https://github.com/Doctorado-ML/Odte.git
synced 2025-07-11 00:02:30 +00:00
Update to scikit-learn 1.2
This commit is contained in:
parent
7300bd66db
commit
cabf926eb1
@ -1,174 +1,174 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import datetime, time\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_validate\n",
|
||||
"from sklearn import tree\n",
|
||||
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
|
||||
"from stree import Stree\n",
|
||||
"from odte import Odte\n",
|
||||
"\n",
|
||||
"random_state = 1"
|
||||
]
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import datetime, time\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_validate\n",
|
||||
"from sklearn import tree\n",
|
||||
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
|
||||
"from stree import Stree\n",
|
||||
"from odte import Odte\n",
|
||||
"\n",
|
||||
"random_state = 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_wine\n",
|
||||
"X, y = load_wine(return_X_y=True)\n",
|
||||
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 20\n",
|
||||
"clf = {}\n",
|
||||
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n",
|
||||
"clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n",
|
||||
"clf[\"odte\"] = Odte(n_jobs=-1, estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n",
|
||||
"clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"*\"*30,\"Results for wine\", \"*\"*30)\n",
|
||||
"for clf_type, item in clf.items():\n",
|
||||
" print(f\"Training {clf_type}...\")\n",
|
||||
" now = time.time()\n",
|
||||
" item.fit(Xtrain, ytrain)\n",
|
||||
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_iris\n",
|
||||
"X, y = load_iris(return_X_y=True)\n",
|
||||
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 10\n",
|
||||
"clf = {}\n",
|
||||
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n",
|
||||
"clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n",
|
||||
"clf[\"adaboost\"] = AdaBoostClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"clf[\"bagging\"] = BaggingClassifier(estimator=clf[\"stree\"], n_estimators=n_estimators)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"*\"*30,\"Results for iris\", \"*\"*30)\n",
|
||||
"for clf_type, item in clf.items():\n",
|
||||
" print(f\"Training {clf_type}...\")\n",
|
||||
" now = time.time()\n",
|
||||
" item.fit(Xtrain, ytrain)\n",
|
||||
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
|
||||
"print(cross)\n",
|
||||
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
|
||||
"print(cross)\n",
|
||||
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||
"# Make checks one by one\n",
|
||||
"c = 0\n",
|
||||
"checks = check_estimator(Odte(), generate_only=True)\n",
|
||||
"for check in checks:\n",
|
||||
" c += 1\n",
|
||||
" print(c, check[1])\n",
|
||||
" check[1](check[0])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.2 64-bit ('general': venv)",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
},
|
||||
"orig_nbformat": 2
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_wine\n",
|
||||
"X, y = load_wine(return_X_y=True)\n",
|
||||
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 20\n",
|
||||
"clf = {}\n",
|
||||
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n",
|
||||
"clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n",
|
||||
"clf[\"odte\"] = Odte(n_jobs=-1, base_estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n",
|
||||
"clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"*\"*30,\"Results for wine\", \"*\"*30)\n",
|
||||
"for clf_type, item in clf.items():\n",
|
||||
" print(f\"Training {clf_type}...\")\n",
|
||||
" now = time.time()\n",
|
||||
" item.fit(Xtrain, ytrain)\n",
|
||||
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_iris\n",
|
||||
"X, y = load_iris(return_X_y=True)\n",
|
||||
"Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"n_estimators = 10\n",
|
||||
"clf = {}\n",
|
||||
"clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n",
|
||||
"clf[\"odte\"] = Odte(n_jobs=-1, random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n",
|
||||
"clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"*\"*30,\"Results for iris\", \"*\"*30)\n",
|
||||
"for clf_type, item in clf.items():\n",
|
||||
" print(f\"Training {clf_type}...\")\n",
|
||||
" now = time.time()\n",
|
||||
" item.fit(Xtrain, ytrain)\n",
|
||||
" print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
|
||||
"print(cross)\n",
|
||||
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n",
|
||||
"print(cross)\n",
|
||||
"print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||
"# Make checks one by one\n",
|
||||
"c = 0\n",
|
||||
"checks = check_estimator(Odte(), generate_only=True)\n",
|
||||
"for check in checks:\n",
|
||||
" c += 1\n",
|
||||
" print(c, check[1])\n",
|
||||
" check[1](check[0])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "da86226729227d0e8962a5ec29ea906307507ca2c30ceaaf651c09a617630939"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3.9.2 64-bit ('general': venv)",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.7"
|
||||
},
|
||||
"orig_nbformat": 2
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
|
16
odte/Odte.py
16
odte/Odte.py
@ -31,7 +31,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
self,
|
||||
# n_jobs = -1 to use all available cores
|
||||
n_jobs: int = -1,
|
||||
base_estimator: BaseEstimator = None,
|
||||
estimator: BaseEstimator = None,
|
||||
random_state: int = 0,
|
||||
max_features: Optional[Union[str, int, float]] = None,
|
||||
max_samples: Optional[Union[int, float]] = None,
|
||||
@ -39,10 +39,10 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
be_hyperparams: str = "{}",
|
||||
):
|
||||
super().__init__(
|
||||
base_estimator=base_estimator,
|
||||
estimator=estimator,
|
||||
n_estimators=n_estimators,
|
||||
)
|
||||
self.base_estimator = base_estimator
|
||||
self.estimator = estimator
|
||||
self.n_jobs = n_jobs
|
||||
self.n_estimators = n_estimators
|
||||
self.random_state = random_state
|
||||
@ -55,7 +55,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
return __version__
|
||||
|
||||
def _validate_estimator(self) -> None:
|
||||
"""Check the estimator and set the base_estimator_ attribute."""
|
||||
"""Check the estimator and set the estimator_ attribute."""
|
||||
super()._validate_estimator(
|
||||
default=Stree(random_state=self.random_state)
|
||||
)
|
||||
@ -79,7 +79,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
# Initialize computed parameters
|
||||
# Build the estimator
|
||||
self.max_features_ = self._initialize_max_features()
|
||||
# build base_estimator_
|
||||
# build estimator_
|
||||
self._validate_estimator()
|
||||
self.classes_, y = np.unique(y, return_inverse=True)
|
||||
self.n_classes_: int = self.classes_.shape[0]
|
||||
@ -108,7 +108,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
) -> Tuple[List[BaseEstimator], List[Tuple[int, ...]]]:
|
||||
n_samples = X.shape[0]
|
||||
boot_samples = self._get_bootstrap_n_samples(n_samples)
|
||||
estimator = clone(self.base_estimator_)
|
||||
estimator = clone(self.estimator_)
|
||||
return Parallel(n_jobs=self.n_jobs, prefer="threads")( # type: ignore
|
||||
delayed(Odte._parallel_build_tree)(
|
||||
estimator,
|
||||
@ -127,7 +127,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
|
||||
@staticmethod
|
||||
def _parallel_build_tree(
|
||||
base_estimator_: BaseEstimator,
|
||||
estimator_: BaseEstimator,
|
||||
X: np.ndarray,
|
||||
y: np.ndarray,
|
||||
weights: np.ndarray,
|
||||
@ -136,7 +136,7 @@ class Odte(BaseEnsemble, ClassifierMixin):
|
||||
max_features: int,
|
||||
hyperparams: str,
|
||||
) -> Tuple[BaseEstimator, Tuple[int, ...]]:
|
||||
clf = clone(base_estimator_)
|
||||
clf = clone(estimator_)
|
||||
hyperparams_ = json.loads(hyperparams)
|
||||
hyperparams_.update(dict(random_state=random_seed))
|
||||
clf.set_params(**hyperparams_)
|
||||
|
@ -76,15 +76,15 @@ class Odte_test(unittest.TestCase):
|
||||
X, y = [[1, 2], [5, 6], [9, 10], [16, 17]], [0, 1, 1, 2]
|
||||
expected = [0, 1, 1, 2]
|
||||
tclf = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=10,
|
||||
n_jobs=-1,
|
||||
)
|
||||
tclf.set_params(
|
||||
**dict(
|
||||
base_estimator__kernel="rbf",
|
||||
base_estimator__random_state=self._random_state,
|
||||
estimator__kernel="rbf",
|
||||
estimator__random_state=self._random_state,
|
||||
)
|
||||
)
|
||||
computed = tclf.fit(X, y).predict(X)
|
||||
@ -96,14 +96,14 @@ class Odte_test(unittest.TestCase):
|
||||
X, y = load_dataset(self._random_state)
|
||||
expected = y
|
||||
tclf = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
max_features=1.0,
|
||||
max_samples=0.1,
|
||||
)
|
||||
tclf.set_params(
|
||||
**dict(
|
||||
base_estimator__kernel="linear",
|
||||
estimator__kernel="linear",
|
||||
)
|
||||
)
|
||||
computed = tclf.fit(X, y).predict(X)
|
||||
@ -146,16 +146,16 @@ class Odte_test(unittest.TestCase):
|
||||
"cfs",
|
||||
]:
|
||||
tclf = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=3,
|
||||
n_jobs=1,
|
||||
)
|
||||
tclf.set_params(
|
||||
**dict(
|
||||
base_estimator__max_features=max_features,
|
||||
base_estimator__splitter=splitter,
|
||||
base_estimator__random_state=self._random_state,
|
||||
estimator__max_features=max_features,
|
||||
estimator__splitter=splitter,
|
||||
estimator__random_state=self._random_state,
|
||||
)
|
||||
)
|
||||
expected = results.pop(0)
|
||||
@ -182,7 +182,7 @@ class Odte_test(unittest.TestCase):
|
||||
|
||||
def test_nodes_leaves_not_fitted(self):
|
||||
tclf = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=3,
|
||||
)
|
||||
@ -191,13 +191,13 @@ class Odte_test(unittest.TestCase):
|
||||
|
||||
def test_nodes_leaves_depth(self):
|
||||
tclf = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=5,
|
||||
n_jobs=1,
|
||||
)
|
||||
tclf_p = Odte(
|
||||
base_estimator=Stree(),
|
||||
estimator=Stree(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=5,
|
||||
n_jobs=-1,
|
||||
@ -215,7 +215,7 @@ class Odte_test(unittest.TestCase):
|
||||
|
||||
def test_nodes_leaves_SVC(self):
|
||||
tclf = Odte(
|
||||
base_estimator=SVC(),
|
||||
estimator=SVC(),
|
||||
random_state=self._random_state,
|
||||
n_estimators=3,
|
||||
)
|
||||
@ -227,7 +227,7 @@ class Odte_test(unittest.TestCase):
|
||||
self.assertAlmostEqual(0.0, leaves)
|
||||
self.assertAlmostEqual(0.0, nodes)
|
||||
|
||||
def test_base_estimator_hyperparams(self):
|
||||
def test_estimator_hyperparams(self):
|
||||
data = [
|
||||
(Stree(), {"max_features": 7, "max_depth": 2}),
|
||||
(SVC(), {"kernel": "linear", "cache_size": 100}),
|
||||
@ -235,7 +235,7 @@ class Odte_test(unittest.TestCase):
|
||||
for clf, hyperparams in data:
|
||||
hyperparams_ = json.dumps(hyperparams)
|
||||
tclf = Odte(
|
||||
base_estimator=clf,
|
||||
estimator=clf,
|
||||
random_state=self._random_state,
|
||||
n_estimators=3,
|
||||
be_hyperparams=hyperparams_,
|
||||
|
Loading…
x
Reference in New Issue
Block a user