diff --git a/notebooks/wine_iris.ipynb b/notebooks/wine_iris.ipynb new file mode 100644 index 0000000..ec1d342 --- /dev/null +++ b/notebooks/wine_iris.ipynb @@ -0,0 +1,177 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime, time\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split, cross_validate\n", + "from sklearn import tree\n", + "from sklearn.metrics import classification_report, confusion_matrix, f1_score\n", + "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n", + "from stree import Stree\n", + "from odte import Odte\n", + "\n", + "random_state = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_wine\n", + "X, y = load_wine(return_X_y=True)\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 20\n", + "clf = {}\n", + "clf[\"stree\"] = Stree(random_state=random_state, max_depth=5)\n", + "clf[\"stree\"].set_params(**dict(splitter=\"best\", kernel=\"linear\", max_features=\"auto\"))\n", + "clf[\"odte\"] = Odte(base_estimator=clf[\"stree\"], random_state=random_state, n_estimators=n_estimators, max_features=.8)\n", + "clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", + "clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "****************************** Results for wine ******************************\nTraining stree...\nScore: 94.444 in 0.17 seconds\nTraining odte...\nScore: 97.222 in 2.70 seconds\nTraining adaboost...\nScore: 94.444 in 0.60 seconds\nTraining bagging...\nScore: 100.000 in 2.55 seconds\n" + } + ], + "source": [ + "print(\"*\"*30,\"Results for wine\", \"*\"*30)\n", + "for clf_type, item in clf.items():\n", + " print(f\"Training {clf_type}...\")\n", + " now = time.time()\n", + " item.fit(Xtrain, ytrain)\n", + " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "X, y = load_iris(return_X_y=True)\n", + "Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=.2, random_state=random_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "n_estimators = 10\n", + "clf = {}\n", + "clf[\"stree\"] = Stree(random_state=random_state, max_depth=3)\n", + "clf[\"odte\"] = Odte(random_state=random_state, n_estimators=n_estimators, max_features=1.0)\n", + "clf[\"adaboost\"] = AdaBoostClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n", + "clf[\"bagging\"] = BaggingClassifier(base_estimator=clf[\"stree\"], n_estimators=n_estimators)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "****************************** Results for iris ******************************\nTraining stree...\nScore: 100.000 in 0.02 seconds\nTraining odte...\nScore: 93.333 in 0.12 seconds\nTraining adaboost...\nScore: 83.333 in 0.01 seconds\nTraining bagging...\nScore: 100.000 in 0.11 seconds\n" + } + ], + "source": [ + "print(\"*\"*30,\"Results for iris\", \"*\"*30)\n", + "for clf_type, item in clf.items():\n", + " print(f\"Training {clf_type}...\")\n", + " now = time.time()\n", + " item.fit(Xtrain, ytrain)\n", + " print(f\"Score: {item.score(Xtest, ytest) * 100:.3f} in {time.time()-now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "{'fit_time': array([0.15752316, 0.18354201, 0.14742589, 0.13827896, 0.14534211]), 'score_time': array([0.00940681, 0.01064587, 0.01085019, 0.00925183, 0.00878191]), 'test_score': array([0.8 , 0.93333333, 0.93333333, 0.93333333, 0.96666667]), 'train_score': array([0.875 , 0.95 , 0.98333333, 0.98333333, 0.95833333])}\n91.333 +- 0.058\n" + } + ], + "source": [ + "cross = cross_validate(estimator=clf[\"odte\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", + "print(cross)\n", + "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "{'fit_time': array([0.01752877, 0.03304005, 0.03542018, 0.03398919, 0.03945518]), 'score_time': array([0.00135112, 0.00164104, 0.00159597, 0.0018959 , 0.00189495]), 'test_score': array([1. , 0.93333333, 0.93333333, 0.93333333, 0.96666667]), 'train_score': array([0.93333333, 0.96666667, 0.96666667, 0.96666667, 0.95 ])}\n95.333 +- 0.027\n" + } + ], + "source": [ + "cross = cross_validate(estimator=clf[\"adaboost\"], X=X, y=y, n_jobs=-1, return_train_score=True)\n", + "print(cross)\n", + "print(f\"{np.mean(cross['test_score'])*100:.3f} +- {np.std(cross['test_score']):.3f}\")" + ] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", + "display_name": "Python 3.7.6 64-bit ('general': venv)" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/odte/Odte.py b/odte/Odte.py index 02b6fb7..538aeec 100644 --- a/odte/Odte.py +++ b/odte/Odte.py @@ -192,10 +192,12 @@ class Odte(BaseEnsemble, ClassifierMixin): # Input validation X = check_array(X) if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1}." - "".format(self.n_features_, X.shape[1])) + raise ValueError( + "Number of features of the model must " + "match the input. Model n_features is {0} and " + "input n_features is {1}." + "".format(self.n_features_, X.shape[1]) + ) for tree, features in zip(self.estimators_, self.subspaces_): n_samples = X.shape[0] result = np.zeros((n_samples, self.n_classes_))