From 5b791bc5bfd4503d5710f56611b10ddd504d1967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sun, 15 Jan 2023 01:21:32 +0100 Subject: [PATCH] New_version_sklearn (#56) * test: :test_tube: Update max_iter as int in test_multiclass_dataset * refactor: :memo: Rename base_estimator to estimator as the former is deprecated in notebook * refactor: :pushpin: Convert max_iter to int as needed in sklearn 1.2 * chore: :bookmark: Update version info to 1.3.1 --- notebooks/gridsearch.ipynb | 502 ++++++++++++++++++------------------- stree/Strees.py | 2 +- stree/_version.py | 2 +- stree/tests/Stree_test.py | 2 +- 4 files changed, 254 insertions(+), 254 deletions(-) diff --git a/notebooks/gridsearch.ipynb b/notebooks/gridsearch.ipynb index af3c741..ebd11b9 100644 --- a/notebooks/gridsearch.ipynb +++ b/notebooks/gridsearch.ipynb @@ -1,253 +1,253 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test Gridsearch\n", - "with different kernels and different configurations" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test Gridsearch\n", + "with different kernels and different configurations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "Uncomment the next cell if STree is not already installed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Google Colab setup\n", + "#\n", + "#!pip install git+https://github.com/doctorado-ml/stree\n", + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "zIHKVxthDZEa" + }, + "outputs": [], + "source": [ + "import random\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.ensemble import AdaBoostClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.model_selection 
import GridSearchCV, train_test_split\n", + "from stree import Stree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IEmq50QgDZEi" + }, + "outputs": [], + "source": [ + "if not os.path.isfile('data/creditcard.csv'):\n", + " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", + " !tar xzf creditcard.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "z9Q-YUfBDZEq", + "outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b", + "tags": [] + }, + "outputs": [], + "source": [ + "random_state=1\n", + "\n", + "def load_creditcard(n_examples=0):\n", + " df = pd.read_csv('data/creditcard.csv')\n", + " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", + " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", + " y = df.Class\n", + " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", + " if n_examples > 0:\n", + " # Take first n_examples samples\n", + " X = X[:n_examples, :]\n", + " y = y[:n_examples, :]\n", + " else:\n", + " # Take all the positive samples with a number of random negatives\n", + " if n_examples < 0:\n", + " Xt = X[(y == 1).ravel()]\n", + " yt = y[(y == 1).ravel()]\n", + " indices = random.sample(range(X.shape[0]), -1 * n_examples)\n", + " X = np.append(Xt, X[indices], axis=0)\n", + " y = np.append(yt, y[indices], axis=0)\n", + " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", + " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", + " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n", + " Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", 
+ " return Xtrain, Xtest, ytrain, ytest\n", + "\n", + "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n", + "# data = load_creditcard(5000) # Take the first 5000 samples\n", + "# data = load_creditcard(0) # Take all the samples\n", + "\n", + "Xtrain = data[0]\n", + "Xtest = data[1]\n", + "ytrain = data[2]\n", + "ytest = data[3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HmX3kR4PDZEw" + }, + "outputs": [], + "source": [ + "parameters = [{\n", + " 'base_estimator': [Stree(random_state=random_state)],\n", + " 'n_estimators': [10, 25],\n", + " 'learning_rate': [.5, 1],\n", + " 'estimator__split_criteria': ['max_samples', 'impurity'],\n", + " 'estimator__tol': [.1, 1e-02],\n", + " 'estimator__max_depth': [3, 5, 7],\n", + " 'estimator__C': [1, 7, 55],\n", + " 'estimator__kernel': ['linear']\n", + "},\n", + "{\n", + " 'base_estimator': [Stree(random_state=random_state)],\n", + " 'n_estimators': [10, 25],\n", + " 'learning_rate': [.5, 1],\n", + " 'estimator__split_criteria': ['max_samples', 'impurity'],\n", + " 'estimator__tol': [.1, 1e-02],\n", + " 'estimator__max_depth': [3, 5, 7],\n", + " 'estimator__C': [1, 7, 55],\n", + " 'estimator__degree': [3, 5, 7],\n", + " 'estimator__kernel': ['poly']\n", + "},\n", + "{\n", + " 'base_estimator': [Stree(random_state=random_state)],\n", + " 'n_estimators': [10, 25],\n", + " 'learning_rate': [.5, 1],\n", + " 'estimator__split_criteria': ['max_samples', 'impurity'],\n", + " 'estimator__tol': [.1, 1e-02],\n", + " 'estimator__max_depth': [3, 5, 7],\n", + " 'estimator__C': [1, 7, 55],\n", + " 'estimator__gamma': [.1, 1, 10],\n", + " 'estimator__kernel': ['rbf']\n", + "}]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Stree().get_params()" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CrcB8o6EDZE5", + "outputId": "7703413a-d563-4289-a13b-532f38f82762", + "tags": [] + }, + "outputs": [], + "source": [ + "clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n", + "grid = GridSearchCV(clf, parameters, verbose=5, n_jobs=-1, return_train_score=True)\n", + "grid.fit(Xtrain, ytrain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ZjX88NoYDZE8", + "outputId": "285163c8-fa33-4915-8ae7-61c4f7844344", + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Best estimator: \", grid.best_estimator_)\n", + "print(\"Best hyperparameters: \", grid.best_params_)\n", + "print(\"Best accuracy: \", grid.best_score_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n", + " base_estimator=Stree(C=55, max_depth=7, random_state=1,\n", + " split_criteria='max_samples', tol=0.1),\n", + " learning_rate=0.5, n_estimators=25, random_state=1)\n", + "Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=7, random_state=1, split_criteria='max_samples', tol=0.1), 'estimator__C': 55, 'estimator__kernel': 'linear', 'estimator__max_depth': 7, 'estimator__split_criteria': 'max_samples', 'estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 25}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best accuracy: 0.9511777695988222" + ] + } + ], + "metadata": { + "colab": { + "name": "gridsearch.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.2-final" + 
} }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup\n", - "Uncomment the next cell if STree is not already installed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#\n", - "# Google Colab setup\n", - "#\n", - "#!pip install git+https://github.com/doctorado-ml/stree\n", - "!pip install pandas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "zIHKVxthDZEa" - }, - "outputs": [], - "source": [ - "import random\n", - "import os\n", - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.ensemble import AdaBoostClassifier\n", - "from sklearn.svm import LinearSVC\n", - "from sklearn.model_selection import GridSearchCV, train_test_split\n", - "from stree import Stree" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "IEmq50QgDZEi" - }, - "outputs": [], - "source": [ - "if not os.path.isfile('data/creditcard.csv'):\n", - " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", - " !tar xzf creditcard.tgz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "z9Q-YUfBDZEq", - "outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b", - "tags": [] - }, - "outputs": [], - "source": [ - "random_state=1\n", - "\n", - "def load_creditcard(n_examples=0):\n", - " df = pd.read_csv('data/creditcard.csv')\n", - " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", - " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", - " y = df.Class\n", - " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", - " if n_examples > 0:\n", - " # Take first n_examples 
samples\n", - " X = X[:n_examples, :]\n", - " y = y[:n_examples, :]\n", - " else:\n", - " # Take all the positive samples with a number of random negatives\n", - " if n_examples < 0:\n", - " Xt = X[(y == 1).ravel()]\n", - " yt = y[(y == 1).ravel()]\n", - " indices = random.sample(range(X.shape[0]), -1 * n_examples)\n", - " X = np.append(Xt, X[indices], axis=0)\n", - " y = np.append(yt, y[indices], axis=0)\n", - " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", - " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", - " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n", - " Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", - " return Xtrain, Xtest, ytrain, ytest\n", - "\n", - "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n", - "# data = load_creditcard(5000) # Take the first 5000 samples\n", - "# data = load_creditcard(0) # Take all the samples\n", - "\n", - "Xtrain = data[0]\n", - "Xtest = data[1]\n", - "ytrain = data[2]\n", - "ytest = data[3]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tests" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "HmX3kR4PDZEw" - }, - "outputs": [], - "source": [ - "parameters = [{\n", - " 'base_estimator': [Stree(random_state=random_state)],\n", - " 'n_estimators': [10, 25],\n", - " 'learning_rate': [.5, 1],\n", - " 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n", - " 'base_estimator__tol': [.1, 1e-02],\n", - " 'base_estimator__max_depth': [3, 5, 7],\n", - " 'base_estimator__C': [1, 7, 55],\n", - " 'base_estimator__kernel': ['linear']\n", - "},\n", - "{\n", - " 'base_estimator': [Stree(random_state=random_state)],\n", - " 'n_estimators': [10, 25],\n", - " 'learning_rate': [.5, 1],\n", - " 
'base_estimator__split_criteria': ['max_samples', 'impurity'],\n", - " 'base_estimator__tol': [.1, 1e-02],\n", - " 'base_estimator__max_depth': [3, 5, 7],\n", - " 'base_estimator__C': [1, 7, 55],\n", - " 'base_estimator__degree': [3, 5, 7],\n", - " 'base_estimator__kernel': ['poly']\n", - "},\n", - "{\n", - " 'base_estimator': [Stree(random_state=random_state)],\n", - " 'n_estimators': [10, 25],\n", - " 'learning_rate': [.5, 1],\n", - " 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n", - " 'base_estimator__tol': [.1, 1e-02],\n", - " 'base_estimator__max_depth': [3, 5, 7],\n", - " 'base_estimator__C': [1, 7, 55],\n", - " 'base_estimator__gamma': [.1, 1, 10],\n", - " 'base_estimator__kernel': ['rbf']\n", - "}]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Stree().get_params()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "CrcB8o6EDZE5", - "outputId": "7703413a-d563-4289-a13b-532f38f82762", - "tags": [] - }, - "outputs": [], - "source": [ - "clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n", - "grid = GridSearchCV(clf, parameters, verbose=5, n_jobs=-1, return_train_score=True)\n", - "grid.fit(Xtrain, ytrain)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "ZjX88NoYDZE8", - "outputId": "285163c8-fa33-4915-8ae7-61c4f7844344", - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"Best estimator: \", grid.best_estimator_)\n", - "print(\"Best hyperparameters: \", grid.best_params_)\n", - "print(\"Best accuracy: \", grid.best_score_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n", - " base_estimator=Stree(C=55, max_depth=7, random_state=1,\n", - " split_criteria='max_samples', tol=0.1),\n", - " learning_rate=0.5, 
n_estimators=25, random_state=1)\n", - "Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=7, random_state=1, split_criteria='max_samples', tol=0.1), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 7, 'base_estimator__split_criteria': 'max_samples', 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 25}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Best accuracy: 0.9511777695988222" - ] - } - ], - "metadata": { - "colab": { - "name": "gridsearch.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2-final" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/stree/Strees.py b/stree/Strees.py index 23e0290..b18a0d1 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -139,7 +139,7 @@ class Stree(BaseEstimator, ClassifierMixin): self, C: float = 1.0, kernel: str = "linear", - max_iter: int = 1e5, + max_iter: int = int(1e5), random_state: int = None, max_depth: int = None, tol: float = 1e-4, diff --git a/stree/_version.py b/stree/_version.py index 67bc602..9c73af2 100644 --- a/stree/_version.py +++ b/stree/_version.py @@ -1 +1 @@ -__version__ = "1.3.0" +__version__ = "1.3.1" diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0f13eaa..67f618e 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -306,7 +306,7 @@ class Stree_test(unittest.TestCase): for criteria in ["max_samples", "impurity"]: for kernel in self._kernels: clf = Stree( - max_iter=1e4, + max_iter=int(1e4), multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",