From 7e932de072147382311cbf9d2fbec53627cf3e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 9 Jun 2020 01:46:38 +0200 Subject: [PATCH] #3 Add sample_weights to score, update notebooks Update readme to use new names of notebooks --- README.md | 12 +- notebooks/{test.ipynb => benchmark.ipynb} | 0 notebooks/features.ipynb | 370 ++++++++++++++++++++++ notebooks/test2.ipynb | 337 -------------------- stree/Strees.py | 38 ++- 5 files changed, 406 insertions(+), 351 deletions(-) rename notebooks/{test.ipynb => benchmark.ipynb} (100%) create mode 100644 notebooks/features.ipynb delete mode 100644 notebooks/test2.ipynb diff --git a/README.md b/README.md index 0ae995d..95e94bd 100644 --- a/README.md +++ b/README.md @@ -18,21 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree ### Jupyter notebooks -##### Slow launch but better integration +* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark -* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook +* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark -##### Fast launch but have to run first commented out cell for setup - -* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook - -* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook +* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features * [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost * [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch -* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook +* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics ### Command line diff --git a/notebooks/test.ipynb b/notebooks/benchmark.ipynb similarity index 100% rename from notebooks/test.ipynb rename to notebooks/benchmark.ipynb diff --git a/notebooks/features.ipynb b/notebooks/features.ipynb new file mode 100644 index 0000000..3f277f9 --- /dev/null +++ b/notebooks/features.ipynb @@ -0,0 +1,370 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test sample_weight, kernels, C, sklearn estimator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "Uncomment the next cell if STree is not already installed" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Google Colab setup\n", + "#\n", + "#!pip install git+https://github.com/doctorado-ml/stree" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.svm import SVC\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.utils.estimator_checks import check_estimator\n", + "from sklearn.datasets import make_classification, load_iris, load_wine\n", + "from sklearn.model_selection import train_test_split\n", + "from stree import Stree\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "if not os.path.isfile('data/creditcard.csv'):\n", + " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", + " !tar xzf creditcard.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n" + } + ], + "source": [ + "random_state=1\n", + "\n", + "def load_creditcard(n_examples=0):\n", + " import pandas as pd\n", + " import numpy as np\n", + " import random\n", + " df = pd.read_csv('data/creditcard.csv')\n", + " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", + " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", + " y = df.Class\n", + " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", + " if n_examples > 0:\n", + " # Take first n_examples samples\n", + " X = X[:n_examples, :]\n", + " y = y[:n_examples]\n", + " else:\n", + " # Take all the positive samples with a number of random negatives\n", + " if n_examples < 0:\n", + " Xt = X[(y == 1).ravel()]\n", + " yt = y[(y == 1).ravel()]\n", + " indices = random.sample(range(X.shape[0]), -1 * n_examples)\n", + " X = np.append(Xt, X[indices], axis=0)\n", + " y = np.append(yt, y[indices], axis=0)\n", + " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", + " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", + " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n", + " Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", + " return Xtrain, Xtest, ytrain, ytest\n", + "\n", + "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", + "# data = load_creditcard(5000) # Take the first 5000 samples\n", + "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n", + "\n", + "Xtrain = data[0]\n", + "Xtest = data[1]\n", + "ytrain = data[2]\n", + "ytest = data[3]\n", + "# Set weights inversely proportional to the class counts in the dataset\n", + "weights = np.ones(Xtrain.shape[0],) * 1.00244\n", + "weights[ytrain==1] = 1.99755\n", + "weights_test = np.ones(Xtest.shape[0],) * 1.00244\n", + "weights_test[ytest==1] = 1.99755 " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tests" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test sample_weight\n", + "Compute accuracy with sample weights. 
The weights are set inversely proportional to the number of samples of each class" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Accuracy of Train without weights 0.9722222222222222\nAccuracy of Train with weights 0.9875478927203065\nAccuracy of Tests without weights 0.9508928571428571\nAccuracy of Tests with weights 0.9486607142857143\n" + } + ], + "source": [ + "C = 23\n", + "print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n", + "print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n", + "print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n", + "print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test accuracy with different kernels\n", + "Compute accuracy on the train and test sets with the default hyperparameters for every kernel" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9712643678160919\tAccuracy_test: 0.953125\nTime: 0.08s\tKernel: rbf\tAccuracy_train: 0.9932950191570882\tAccuracy_test: 0.9620535714285714\nTime: 0.05s\tKernel: poly\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.9419642857142857\n" + } + ], + "source": [ + "random_state=1\n", + "for kernel in ['linear', 'rbf', 'poly']:\n", + " now = time.time()\n", + " clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n", + " accuracy_train = clf.score(Xtrain, ytrain)\n", + " accuracy_test = clf.score(Xtest, ytest)\n", + " time_spent = time.time() - now\n", + " print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test different values of C" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [ + "outputPrepend" + ] + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down, - Leaf class=1 belief= 0.977636 counts=(array([0, 1]), array([ 7, 306]))\nroot - Up, - Leaf class=0 belief= 0.945280 counts=(array([0, 1]), array([691, 40]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down, - Leaf class=1 belief= 0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, - Leaf class=0 belief= 0.945430 counts=(array([0, 1]), array([693, 40]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([4]))\nroot - Up, - Leaf class=0 belief= 0.951989 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.957004 counts=(array([0, 1]), array([690, 31]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9799\nClassifier's accuracy (test) : 0.9531\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n\n**************************************************\n0.5032 secs\n" + } + ], + "source": [ + "t = time.time()\n", + "for C in (.001, .01, 1, 5, 17):\n", + " clf = Stree(C=C, random_state=random_state)\n", + " clf.fit(Xtrain, ytrain)\n", + " print(f\"************** C={C} ****************************\")\n", + " print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n", + " print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n", + " print(clf)\n", + " print(f\"**************************************************\")\n", + "print(f\"{time.time() - t:.4f} secs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test iterator\n", + "Check different ways of using the iterator" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 
belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n" + } + ], + "source": [ + "#check iterator\n", + "for i in list(clf):\n", + " print(i)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n" + } + ], + "source": [ + "#check iterator again\n", + "for i in clf:\n", + " print(i)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test STree is a sklearn estimator" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" + } + ], + "source": [ + "# Make checks one by one\n", + "c = 0\n", + "checks = check_estimator(Stree(), generate_only=True)\n", + "for check in checks:\n", + " c += 1\n", + " print(c, check[1])\n", + " check[1](check[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Check if the classifier is a sklearn 
estimator\n", + "check_estimator(Stree())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compare to SVM" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "== Not Weighted ===\nSVC train score ..: 0.9530651340996169\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9620535714285714\nSTree test score .: 0.9642857142857143\n==== Weighted =====\nSVC train score ..: 0.960727969348659\nSTree train score : 0.960727969348659\nSVC test score ...: 0.953125\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9397723008352139\n*STree test score : 0.9431162390279932\n" + } + ], + "source": [ + "svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)\n", + "clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)\n", + "svc.fit(Xtrain, ytrain)\n", + "clf.fit(Xtrain, ytrain)\n", + "print(\"== Not Weighted ===\")\n", + "print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n", + "print(\"STree train score :\", clf.score(Xtrain, ytrain))\n", + "print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n", + "print(\"STree test score .:\", clf.score(Xtest, ytest))\n", + "svc.fit(Xtrain, ytrain, weights)\n", + "clf.fit(Xtrain, ytrain, weights)\n", + "print(\"==== Weighted =====\")\n", + "print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n", + "print(\"STree train score :\", clf.score(Xtrain, ytrain))\n", + "print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n", + "print(\"STree test score .:\", clf.score(Xtest, ytest))\n", + "print(\"*SVC test score ..:\", svc.score(Xtest, ytest, weights_test))\n", + "print(\"*STree test score :\", clf.score(Xtest, ytest, weights_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "root\nroot - Down, - Leaf class=1 belief= 0.978056 counts=(array([0, 1]), array([ 7, 312]))\nroot - Up, - Leaf class=0 belief= 0.953103 counts=(array([0, 1]), array([691, 34]))\n\n" + } + ], + "source": [ + "print(clf)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.7.6 64-bit ('general': venv)", + "language": "python", + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6-final" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/notebooks/test2.ipynb b/notebooks/test2.ipynb deleted file mode 100644 index 1a90773..0000000 --- a/notebooks/test2.ipynb +++ /dev/null @@ -1,337 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test smple_weight, kernels, C, sklearn estimator" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup\n", - "Uncomment the next cell if STree is not already installed" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#\n", - "# Google Colab setup\n", - "#\n", - "#!pip install git+https://github.com/doctorado-ml/stree" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "from sklearn.svm import 
SVC\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "from sklearn.datasets import make_classification, load_iris, load_wine\n", - "from sklearn.model_selection import train_test_split\n", - "from stree import Stree\n", - "import time" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "if not os.path.isfile('data/creditcard.csv'):\n", - " !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n", - " !tar xzf creditcard.tgz" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n" - } - ], - "source": [ - "random_state=1\n", - "\n", - "def load_creditcard(n_examples=0):\n", - " import pandas as pd\n", - " import numpy as np\n", - " import random\n", - " df = pd.read_csv('data/creditcard.csv')\n", - " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", - " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", - " y = df.Class\n", - " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", - " if n_examples > 0:\n", - " # Take first n_examples samples\n", - " X = X[:n_examples, :]\n", - " y = y[:n_examples, :]\n", - " else:\n", - " # Take all the positive samples with a number of random negatives\n", - " if n_examples < 0:\n", - " Xt = X[(y == 1).ravel()]\n", - " yt = y[(y == 1).ravel()]\n", - " indices = random.sample(range(X.shape[0]), -1 * n_examples)\n", - " X = np.append(Xt, X[indices], axis=0)\n", - " y = np.append(yt, y[indices], axis=0)\n", - " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", - " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", - " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n", - " Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", - " return Xtrain, Xtest, ytrain, ytest\n", - "\n", - "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", - "# data = load_creditcard(5000) # Take the first 5000 samples\n", - "data = load_creditcard(-1000) # Take all the samples\n", - "\n", - "Xtrain = data[0]\n", - "Xtest = data[1]\n", - "ytrain = data[2]\n", - "ytest = data[3]\n", - "# Set weights inverse to its count class in dataset\n", - "weights = np.ones(Xtrain.shape[0],) * 1.00244\n", - "weights[ytrain==1] = 1.99755 " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Tests" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test smple_weights\n", - "Compute accuracy with weights in samples. 
The weights are set based on the inverse of the number of samples of each class" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Accuracy of Train without weights 0.9770114942528736\nAccuracy of Train with weights 0.9818007662835249\nAccuracy of Tests without weights 0.953125\nAccuracy of Tests with weights 0.9419642857142857\n" - } - ], - "source": [ - "C = 23\n", - "print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n", - "print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n", - "print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n", - "print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test accuracy with different kernels\n", - "Compute accuracy on train and test set with default hyperparmeters of every kernel" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Time: 0.20s\tKernel: linear\tAccuracy_train: 0.9712643678160919\tAccuracy_test: 0.9575892857142857\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9932950191570882\tAccuracy_test: 0.9620535714285714\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9904214559386973\tAccuracy_test: 0.9508928571428571\n" - } - ], - "source": [ - "random_state=1\n", - "for kernel in ['linear', 'rbf', 'poly']:\n", - " now = time.time()\n", - " clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n", - " accuracy_train = clf.score(Xtrain, ytrain)\n", - " accuracy_test = clf.score(Xtest, ytest)\n", - " time_spent = time.time() - now\n", - " print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test diferent values of C" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "tags": [ - "outputPrepend" - ] - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 0.980583 counts=(array([0, 1]), array([ 6, 303]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up, - Leaf class=0 belief= 0.943836 counts=(array([0, 1]), array([689, 41]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.942935 counts=(array([0, 1]), array([694, 
42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([4]))\nroot - Up, - Leaf class=0 belief= 0.951989 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9674\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([312]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([4]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0 belief= 0.953039 counts=(array([0, 1]), array([690, 34]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9770\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([12]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n\n**************************************************\n0.9578 secs\n" - } - ], - "source": [ - "t = time.time()\n", - "for C in (.001, .01, 1, 5, 17):\n", - " clf = Stree(C=C, random_state=random_state)\n", - " clf.fit(Xtrain, ytrain)\n", - " print(f\"************** C={C} ****************************\")\n", - " print(f\"Classifier's accuracy 
(train): {clf.score(Xtrain, ytrain):.4f}\")\n", - " print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n", - " print(clf)\n", - " print(f\"**************************************************\")\n", - "print(f\"{time.time() - t:.4f} secs\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "[[0.88204928 0.11795072]\n [0.8640131 0.1359869 ]\n [0.94207521 0.05792479]\n [0.90219947 0.09780053]]\n" - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.svm import LinearSVC\n", - "from sklearn.calibration import CalibratedClassifierCV\n", - "scaler = StandardScaler()\n", - "cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n", - "cclf.fit(Xtrain, ytrain)\n", - "res = cclf.predict_proba(Xtest)\n", - "print(res[:4, :])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test iterator\n", - "Check different weays of using the iterator" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([12]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n" - } - ], - "source": [ - "#check iterator\n", - "for i in list(clf):\n", - " print(i)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), 
array([12]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n" - } - ], - "source": [ - "#check iterator again\n", - "for i in clf:\n", - " print(i)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Test STree is a sklearn estimator" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" - } - ], - "source": [ - "# Make checks one by one\n", - "c = 0\n", - "checks = check_estimator(Stree(), generate_only=True)\n", - "for check in checks:\n", - " c += 1\n", - " print(c, check[1])\n", - " 
check[1](check[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Check if the classifier is a sklearn estimator\n", - "from sklearn.utils.estimator_checks import check_estimator\n", - "check_estimator(Stree())" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.7.6 64-bit ('general': venv)", - "language": "python", - "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6-final" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file diff --git a/stree/Strees.py b/stree/Strees.py index d2a385c..0b02e39 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -11,6 +11,7 @@ import os import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC +from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_X_y, @@ -18,6 +19,8 @@ from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, ) +from sklearn.utils.sparsefuncs import count_nonzero +from sklearn.metrics._classification import _weighted_sum, _check_targets class Snode: @@ -201,6 +204,13 @@ class Stree(BaseEstimator, ClassifierMixin): ) -> "Stree": """Build the tree based on the dataset of samples and its labels + :param X: dataset of training samples + :type X: np.array + :param y: samples labels + :type y: np.array + :param sample_weight: weights of the samples. Rescale C per sample. + Higher weights force the classifier to put more emphasis on these points + :type sample_weight: np.array, optional :raises ValueError: if parameters C or max_depth are out of bounds :return: itself to be able to chain actions: fit().predict() ... :rtype: Stree @@ -284,7 +294,8 @@ class Stree(BaseEstimator, ClassifierMixin): :type X: np.ndarray :param y: samples labels :type y: np.ndarray - :param sample_weight: weight of samples (used in boosting) + :param sample_weight: weight of samples. Rescale C per sample. + Higher weights force the classifier to put more emphasis on these points. :type sample_weight: np.ndarray :param depth: actual depth in the tree :type depth: int @@ -435,20 +446,35 @@ class Stree(BaseEstimator, ClassifierMixin): result[:, 0] = 1 - result[:, 1] return self._reorder_results(result, indices) - def score(self, X: np.array, y: np.array) -> float: + def score( + self, X: np.array, y: np.array, sample_weight: np.array = None + ) -> float: """Compute accuracy of the prediction :param X: dataset of samples to make predictions :type X: np.array - :param y: samples labels - :type y: np.array + :param y: samples labels + :type y: np.array + :param sample_weight: weights of the samples. + Used to compute a weighted accuracy + :type sample_weight: np.array, optional :return: accuracy of the prediction :rtype: float """ # sklearn check check_is_fitted(self) - yp = self.predict(X).reshape(y.shape) - return np.mean(yp == y) + + y_pred = self.predict(X).reshape(y.shape) + # Compute accuracy for each possible representation + y_type, y_true, y_pred = _check_targets(y, y_pred) + check_consistent_length(y_true, y_pred, sample_weight) + if y_type.startswith("multilabel"): + differing_labels = count_nonzero(y_true - y_pred, axis=1) + score = differing_labels == 0 + else: + score = y_true == y_pred + + return _weighted_sum(score, sample_weight, normalize=True) def __iter__(self) -> Siterator: """Create an iterator to be able to visit the nodes of the tree in preorder,
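For context, a minimal usage sketch of the behaviour this patch enables; it is an illustration, not part of the diff: it assumes the patched stree package is importable, substitutes a synthetic imbalanced dataset for the creditcard data, and derives class-balanced weights with scikit-learn's compute_sample_weight instead of the hard-coded 1.00244/1.99755 constants used in the notebook.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from stree import Stree

# Synthetic imbalanced binary problem, a stand-in for the creditcard data
X, y = make_classification(n_samples=1500, n_features=28, weights=[0.9, 0.1], random_state=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, stratify=y, random_state=1)

# Class-balanced weights: n_samples / (n_classes * bincount(y))
w_train = compute_sample_weight("balanced", ytrain)
w_test = compute_sample_weight("balanced", ytest)

# fit() already accepted sample_weight; with this patch score() accepts it too
clf = Stree(C=7, random_state=1).fit(Xtrain, ytrain, sample_weight=w_train)
print("accuracy          :", clf.score(Xtest, ytest))
print("weighted accuracy :", clf.score(Xtest, ytest, sample_weight=w_test))

With the "balanced" option each sample gets the weight n_samples / (n_classes * count(its class)), which is the data-driven version of the roughly 1:2 weighting hard-coded in the notebook above.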