From 724a4855fb490f2e37c7e43ebba7c7e39f025521 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sat, 30 May 2020 11:09:59 +0200
Subject: [PATCH] Adapt some notebooks

---
 README.md                   |   2 +-
 notebooks/adaboost.ipynb    | 190 ++++++++++++++++++++++++++++++++++++
 notebooks/gridsearch.ipynb  | 152 ++++++++++++++++-------------
 notebooks/test_graphs.ipynb |   4 +-
 stree/Strees.py             |   9 +-
 5 files changed, 279 insertions(+), 78 deletions(-)
 create mode 100644 notebooks/adaboost.ipynb

diff --git a/README.md b/README.md
index c104d30..540658c 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 # Stree

-Oblique Tree classifier based on SVM nodes
+Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated into pipelines, grid searches, etc.

 ![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)

diff --git a/notebooks/adaboost.ipynb b/notebooks/adaboost.ipynb
new file mode 100644
index 0000000..cfed940
--- /dev/null
+++ b/notebooks/adaboost.ipynb
@@ -0,0 +1,190 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from sklearn.ensemble import AdaBoostClassifier\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
+    "from sklearn.datasets import load_iris\n",
+    "from stree import Stree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "if not os.path.isfile('data/creditcard.csv'):\n",
+    "    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
+    "    !tar xzf creditcard.tgz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
+    }
+   ],
+   "source": [
+    "random_state=1\n",
+    "\n",
+    "def load_creditcard(n_examples=0):\n",
+    "    import pandas as pd\n",
+    "    import numpy as np\n",
+    "    import random\n",
+    "    df = pd.read_csv('data/creditcard.csv')\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
+    "    y = df.Class\n",
+    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
+    "    if n_examples > 0:\n",
+    "        # Take first n_examples samples\n",
+    "        X = X[:n_examples, :]\n",
+    "        y = y[:n_examples]\n",
+    "    else:\n",
+    "        # Take all the positive samples with a number of random negatives\n",
+    "        if n_examples < 0:\n",
+    "            Xt = X[(y == 1).ravel()]\n",
+    "            yt = y[(y == 1).ravel()]\n",
+    "            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
+    "            X = np.append(Xt, X[indices], axis=0)\n",
+    "            y = np.append(yt, y[indices], axis=0)\n",
+    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
+    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
+    "    return 
Xtrain, Xtest, ytrain, ytest\n", + "\n", + "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n", + "# data = load_creditcard(5000) # Take the first 5000 samples\n", + "# data = load_creditcard(0) # Take all the samples\n", + "\n", + "Xtrain = data[0]\n", + "Xtest = data[1]\n", + "ytrain = data[2]\n", + "ytest = data[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Score Train: 0.986857825567503\nScore Test: 0.9805013927576601\nTook 0.12 seconds\n" + } + ], + "source": [ + "now = time.time()\n", + "clf = Stree(max_depth=3, random_state=random_state)\n", + "clf.fit(Xtrain, ytrain)\n", + "print(\"Score Train: \", clf.score(Xtrain, ytrain))\n", + "print(\"Score Test: \", clf.score(Xtest, ytest))\n", + "print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Score Train: 0.997610513739546\nScore Test: 0.9721448467966574\nTook 7.80 seconds\n" + } + ], + "source": [ + "now = time.time()\n", + "clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n", + "clf2.fit(Xtrain, ytrain)\n", + "print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n", + "print(\"Score Test: \", clf2.score(Xtest, ytest))\n", + "print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Score Train: 0.9796893667861409\nScore Test: 0.9554317548746518\nTook 0.48 seconds\n" + } + ], + "source": [ + "now = time.time()\n", + "clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n", + "clf3.fit(Xtrain, ytrain)\n", + "print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n", + "print(\"Score Test: \", clf3.score(Xtest, ytest))\n", + "print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Score Train: 1.0\nScore Test: 0.9721448467966574\nTook 0.86 seconds\n" + } + ], + "source": [ + "now = time.time()\n", + "clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n", + "clf4.fit(Xtrain, ytrain)\n", + "print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n", + "print(\"Score Test: \", clf4.score(Xtest, ytest))\n", + "print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + }, + "orig_nbformat": 2, + "kernelspec": { + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", + "display_name": "Python 3.7.6 64-bit ('general': venv)" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/notebooks/gridsearch.ipynb b/notebooks/gridsearch.ipynb index adc4978..0d5ae8e 100644 --- a/notebooks/gridsearch.ipynb +++ 
b/notebooks/gridsearch.ipynb @@ -20,8 +20,10 @@ "metadata": {}, "outputs": [], "source": [ - "#X, y = load_iris(return_X_y=True)\n", - "#y[y==2] = 0" + "import os\n", + "if not os.path.isfile('data/creditcard.csv'):\n", + " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", + " !tar xzf creditcard.tgz" ] }, { @@ -32,7 +34,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n" + "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.555% 198\nValid: 83.445% 998\n" } ], "source": [ @@ -79,88 +81,54 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "root\nroot - Down - Leaf class=1.0 belief=0.976000 counts=(array([0., 1.]), array([ 3, 122]))\nroot - Up - Leaf class=0.0 belief=0.977528 counts=(array([0., 1.]), array([696, 16]))\n\n" - } - ], + "outputs": [], "source": [ - "c = Stree(max_depth=2)\n", - "c.fit(Xtrain, ytrain)\n", - "print(c)" + "parameters = {\n", + " 'base_estimator': [Stree()],\n", + " 'n_estimators': [50, 100, 150],\n", + " 'learning_rate': [.5, 1],\n", + " 'base_estimator__tol': [.1, 1e-02],\n", + " 'base_estimator__max_depth': [5, 7],\n", + " 'base_estimator__C': [1, 3]\n", + "}\n", + "#'max_depth': [3, 5]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "({'C': 1.0,\n 'class_weight': None,\n 'dual': True,\n 'fit_intercept': True,\n 'intercept_scaling': 1,\n 'loss': 'squared_hinge',\n 'max_iter': 1000,\n 'multi_class': 'ovr',\n 'penalty': 'l2',\n 'random_state': None,\n 'tol': 0.0001,\n 'verbose': 0},\n {'ccp_alpha': 0.0,\n 'class_weight': None,\n 'criterion': 'gini',\n 'max_depth': None,\n 'max_features': None,\n 'max_leaf_nodes': None,\n 'min_impurity_decrease': 0.0,\n 'min_impurity_split': None,\n 'min_samples_leaf': 1,\n 'min_samples_split': 2,\n 'min_weight_fraction_leaf': 0.0,\n 'presort': 'deprecated',\n 'random_state': None,\n 'splitter': 'best'})" + }, + "metadata": {}, + "execution_count": 5 + } + ], "source": [ - "#'base_estimator': [DecisionTreeClassifier(max_depth=1), Stree(max_depth=2), Stree(max_depth=3)],\n", - "parameters = {\n", - " 'base_estimator': [LinearSVC(), Stree(max_depth=2), Stree(max_depth=3)],\n", - " 'n_estimators': [20, 50, 100, 150],\n", - " 'learning_rate': [.5, 1, 1.5] \n", - "}" + "LinearSVC().get_params(), DecisionTreeClassifier().get_params()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [ - "#parameters = {\n", - "# 'base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=5), Stree(), Stree(C=.1), Stree(C=.01), Stree(C=3)],\n", - "# 'n_estimators': [20, 50, 100, 150],\n", - "# 'learning_rate': [.5, 1, 1.5] \n", - "#}" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", - "text": "(X: numpy.ndarray, y: numpy.ndarray, sample_weight: = None) -> 'Stree'\n" - } - ], - "source": [ - "from inspect import signature\n", - "print(signature(c.fit))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.utils.validation import _check_sample_weight" - ] - }, - { - "cell_type": "code", - 
"execution_count": 9, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.1671s.) Setting batch_size=2.\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0413s.) Setting batch_size=4.\n[Parallel(n_jobs=-1)]: Done 50 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too slow (7.7880s.) Setting batch_size=1.\n[Parallel(n_jobs=-1)]: Done 74 tasks | elapsed: 9.2s\n[Parallel(n_jobs=-1)]: Done 121 tasks | elapsed: 48.9s\n[Parallel(n_jobs=-1)]: Done 140 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 161 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.6min finished\n" + "text": "Fitting 5 folds for each of 48 candidates, totalling 240 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 4.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 7.1s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 10.4s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 17.9s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 22.8s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 27.4s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 33.0s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 39.3s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 48.6s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 57.5s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 1.1min\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 1.5min\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 1.7min\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 1.9min\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 2.2min\n[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 2.6min finished\n" }, { "output_type": "execute_result", "data": { - "text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [LinearSVC(), Stree(max_depth=2),\n Stree(max_depth=3)],\n 'learning_rate': [0.5, 1, 1.5],\n 'n_estimators': [20, 50, 100, 150]},\n return_train_score=True, verbose=10)" + "text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=7, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__max_depth': [5, 7],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1],\n 'n_estimators': [50, 100, 150]},\n return_train_score=True, verbose=10)" }, "metadata": {}, - "execution_count": 9 + "execution_count": 6 } ], "source": [ @@ -172,25 +140,75 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "(AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=7, tol=0.1),\n learning_rate=0.5, random_state=2020),\n 0.9808810949529512)" + }, + "metadata": {}, + "execution_count": 7 + } + ], + "source": [ + "grid.best_estimator_, grid.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + 
"outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "[{'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 150},\n 
{'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 1,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 5,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 
'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.1,\n 'learning_rate': 1,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 0.5,\n 'n_estimators': 150},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 50},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 100},\n {'base_estimator': Stree(C=1, max_depth=7, tol=0.1),\n 'base_estimator__C': 3,\n 'base_estimator__max_depth': 7,\n 'base_estimator__tol': 0.01,\n 'learning_rate': 1,\n 'n_estimators': 150}]" + }, + "metadata": {}, + "execution_count": 8 + } + ], + "source": [ + "grid.cv_results_[\"params\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [ { "output_type": "stream", "name": "stdout", - "text": "AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,\n n_estimators=150, random_state=2020)\n" + "text": "base_estimator base_estimator__C base_estimator__max_depth \\\n0 1 5 \n1 1 5 \n2 1 5 \n3 1 5 \n4 1 5 \n5 1 5 \n6 1 5 \n7 1 5 \n8 1 5 \n9 1 5 \n10 1 5 \n11 1 5 \n12 1 7 \n13 1 7 \n14 1 7 \n15 1 7 \n16 1 7 \n17 1 7 \n18 1 7 \n19 1 7 \n20 1 7 \n21 1 7 \n22 1 7 \n23 1 7 \n24 3 5 \n25 3 5 \n26 3 5 \n27 3 5 \n28 3 5 \n29 3 5 \n30 3 5 \n31 3 5 \n32 3 5 \n33 3 5 \n34 3 5 \n35 3 5 \n36 3 7 \n37 3 7 \n38 3 7 \n39 3 7 \n40 3 7 \n41 3 7 \n42 3 7 \n43 3 7 \n44 3 7 \n45 3 7 \n46 3 7 \n47 3 7 \n\n base_estimator__tol learning_rate n_estimators Accuracy \n0 0.10 0.5 50 0.979691 \n1 0.10 0.5 100 0.978493 \n2 0.10 0.5 150 0.978493 \n3 0.10 1.0 50 0.978486 \n4 0.10 1.0 100 0.978493 \n5 0.10 1.0 150 0.979691 \n6 0.01 0.5 50 0.979691 \n7 0.01 0.5 100 0.978493 \n8 0.01 0.5 150 0.978493 \n9 0.01 1.0 50 0.978486 \n10 0.01 1.0 100 0.978493 \n11 0.01 1.0 150 0.979691 \n12 0.10 0.5 50 0.980881 \n13 0.10 0.5 100 0.980881 \n14 0.10 0.5 150 0.978486 \n15 0.10 1.0 50 
0.979691 \n16 0.10 1.0 100 0.976098 \n17 0.10 1.0 150 0.976098 \n18 0.01 0.5 50 0.980881 \n19 0.01 0.5 100 0.980881 \n20 0.01 0.5 150 0.978486 \n21 0.01 1.0 50 0.979691 \n22 0.01 1.0 100 0.976098 \n23 0.01 1.0 150 0.976098 \n24 0.10 0.5 50 0.979691 \n25 0.10 0.5 100 0.979683 \n26 0.10 0.5 150 0.977303 \n27 0.10 1.0 50 0.978493 \n28 0.10 1.0 100 0.977295 \n29 0.10 1.0 150 0.977295 \n30 0.01 0.5 50 0.979691 \n31 0.01 0.5 100 0.979683 \n32 0.01 0.5 150 0.977303 \n33 0.01 1.0 50 0.978493 \n34 0.01 1.0 100 0.977295 \n35 0.01 1.0 150 0.977295 \n36 0.10 0.5 50 0.980881 \n37 0.10 0.5 100 0.977303 \n38 0.10 0.5 150 0.978500 \n39 0.10 1.0 50 0.977303 \n40 0.10 1.0 100 0.978493 \n41 0.10 1.0 150 0.978493 \n42 0.01 0.5 50 0.980881 \n43 0.01 0.5 100 0.977303 \n44 0.01 0.5 150 0.978500 \n45 0.01 1.0 50 0.977303 \n46 0.01 1.0 100 0.978493 \n47 0.01 1.0 150 0.978493 \n" } ], "source": [ - "print(grid.best_estimator_)" + "import pandas as pd\n", + "res = pd.concat([pd.DataFrame(grid.cv_results_[\"params\"]),pd.DataFrame(grid.cv_results_[\"mean_test_score\"], columns=[\"Accuracy\"])], axis=1)\n", + "\n", + "#print(res.sort_values(['Accuracy'], ascending=False))\n", + "print(res)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 10, "metadata": {}, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": "{'cv': None,\n 'error_score': nan,\n 'estimator__algorithm': 'SAMME.R',\n 'estimator__base_estimator': None,\n 'estimator__learning_rate': 1.0,\n 'estimator__n_estimators': 50,\n 'estimator__random_state': 2020,\n 'estimator': AdaBoostClassifier(random_state=2020),\n 'iid': 'deprecated',\n 'n_jobs': -1,\n 'param_grid': {'base_estimator': [Stree(C=1, max_depth=7, tol=0.1)],\n 'n_estimators': [50, 100, 150],\n 'learning_rate': [0.5, 1],\n 'base_estimator__tol': [0.1, 0.01],\n 'base_estimator__max_depth': [5, 7],\n 'base_estimator__C': [1, 3]},\n 'pre_dispatch': '2*n_jobs',\n 'refit': True,\n 'return_train_score': True,\n 'scoring': None,\n 'verbose': 10}" + }, + "metadata": {}, + "execution_count": 10 + } + ], "source": [ - "AdaBoostClassifier(base_estimator=Stree(max_depth=3), learning_rate=0.5,\n", - " n_estimators=20, random_state=2020)" + "grid.get_params()" ] } ], diff --git a/notebooks/test_graphs.ipynb b/notebooks/test_graphs.ipynb index a288634..5c07aec 100644 --- a/notebooks/test_graphs.ipynb +++ b/notebooks/test_graphs.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -24,7 +24,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
     ]
    }
diff --git a/stree/Strees.py b/stree/Strees.py
index 072dd62..5ed2e03 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -152,11 +152,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         # doesn't work with multiclass as each sample has to do inner product with its own coefficients
         # computes the position of every sample w.r.t. the hyperplane
         res = self._linear_function(data, node)
-        # data_up, data_down = self._split_array(data, down)
-        # indices_up, indices_down = self._split_array(indices, down)
-        # res_up, res_down = self._split_array(res, down)
-        # weight_up, weight_down = self._split_array(weights, down)
-        #return [data_up, indices_up, data_down, indices_down, weight_up, weight_down, res_up, res_down]
         return res

     def _split_criteria(self, data: np.array) -> np.array:
@@ -176,7 +171,6 @@
         sample_weight = _check_sample_weight(sample_weight, X)
         check_classification_targets(y)
         # Initialize computed parameters
-        #self.random_state = check_random_state(self.random_state)
         self.classes_ = np.unique(y)
         self.n_iter_ = self.max_iter
         self.depth_ = 0
@@ -316,8 +310,7 @@
         # sklearn check
         check_is_fitted(self)
         yp = self.predict(X).reshape(y.shape)
-        right = (yp == y).astype(int)
-        return np.sum(right) / len(y)
+        return np.mean(yp == y)

     def __iter__(self) -> Siterator:
         try:
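
Note: the `base_estimator__*` keys used in the gridsearch notebook above rely on sklearn's standard nested-parameter routing, which lets one search tune the AdaBoost ensemble and the inner Stree at the same time. A minimal, self-contained sketch of the same pattern follows; the toy dataset and the reduced grid are illustrative only, not the run above, and it assumes the stree package is installed (AdaBoost's default SAMME.R algorithm needs predict_proba on the base estimator, which the runs above imply Stree provides).

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.model_selection import GridSearchCV
    from stree import Stree

    # Toy binary problem; the creditcard data is only needed for the real run.
    X, y = make_classification(n_samples=500, n_features=10, random_state=2020)

    # Keys prefixed with base_estimator__ are forwarded to the inner Stree,
    # so ensemble and tree hyperparameters are explored in a single search.
    param_grid = {
        "n_estimators": [50, 100],
        "learning_rate": [0.5, 1],
        "base_estimator__max_depth": [5, 7],
        "base_estimator__C": [1, 3],
    }
    clf = AdaBoostClassifier(base_estimator=Stree(), random_state=2020)
    grid = GridSearchCV(clf, param_grid, n_jobs=-1, return_train_score=True)
    grid.fit(X, y)
    print(grid.best_estimator_, grid.best_score_)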