Adapt some notebooks

Refactor split_data adding sample_weight
First approach
2025-08-18 08:56:00 +00:00 · 2020-05-30 11:09:59 +02:00 · 2020-05-29 18:52:23 +02:00 · 2020-05-29 12:46:10 +02:00
12 changed files with 960 additions and 588 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 # Stree
-Oblique Tree classifier based on SVM nodes
+Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.
 ![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)
@@ -18,15 +18,15 @@ pip install git+https://github.com/doctorado-ml/stree
 ##### Slow launch but better integration
-* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/test.ipynb) Test notebook
+* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook
 ##### Fast launch but have to run first commented out cell for setup
-* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test.ipynb) Test notebook
+* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
-* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test2.ipynb) Another Test notebook
+* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
-* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/test_graphs.ipynb) Test Graphics notebook
+* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook
 ### Command line
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -1 +0,0 @@
 *
--- a/notebooks/adaboost.ipynb
+++ b/notebooks/adaboost.ipynb
@@ -0,0 +1,190 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
    "from sklearn.datasets import load_iris\n",
    "from stree import Stree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "if not os.path.isfile('data/creditcard.csv'):\n",
    "    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
    "    !tar xzf creditcard.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28)  y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
    }
   ],
   "source": [
    "random_state=1\n",
    "\n",
    "def load_creditcard(n_examples=0):\n",
    "    import pandas as pd\n",
    "    import numpy as np\n",
    "    import random\n",
    "    df = pd.read_csv('data/creditcard.csv')\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
    "    y = df.Class\n",
    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
    "    if n_examples > 0:\n",
    "        # Take first n_examples samples\n",
    "        X = X[:n_examples, :]\n",
    "        y = y[:n_examples, :]\n",
    "    else:\n",
    "        # Take all the positive samples with a number of random negatives\n",
    "        if n_examples < 0:\n",
    "            Xt = X[(y == 1).ravel()]\n",
    "            yt = y[(y == 1).ravel()]\n",
    "            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
    "            X = np.append(Xt, X[indices], axis=0)\n",
    "            y = np.append(yt, y[indices], axis=0)\n",
    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
    "    return Xtrain, Xtest, ytrain, ytest\n",
    "\n",
    "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
    "# data = load_creditcard(0) # Take all the samples\n",
    "\n",
    "Xtrain = data[0]\n",
    "Xtest = data[1]\n",
    "ytrain = data[2]\n",
    "ytest = data[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Score Train:  0.986857825567503\nScore Test:  0.9805013927576601\nTook 0.12 seconds\n"
    }
   ],
   "source": [
    "now = time.time()\n",
    "clf = Stree(max_depth=3, random_state=random_state)\n",
    "clf.fit(Xtrain, ytrain)\n",
    "print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
    "print(\"Score Test: \", clf.score(Xtest, ytest))\n",
    "print(f\"Took {time.time() - now:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Score Train:  0.997610513739546\nScore Test:  0.9721448467966574\nTook 7.80 seconds\n"
    }
   ],
   "source": [
    "now = time.time()\n",
    "clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
    "clf2.fit(Xtrain, ytrain)\n",
    "print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
    "print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
    "print(f\"Took {time.time() - now:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Score Train:  0.9796893667861409\nScore Test:  0.9554317548746518\nTook 0.48 seconds\n"
    }
   ],
   "source": [
    "now = time.time()\n",
    "clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
    "clf3.fit(Xtrain, ytrain)\n",
    "print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
    "print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
    "print(f\"Took {time.time() - now:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Score Train:  1.0\nScore Test:  0.9721448467966574\nTook 0.86 seconds\n"
    }
   ],
   "source": [
    "now = time.time()\n",
    "clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
    "clf4.fit(Xtrain, ytrain)\n",
    "print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
    "print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
    "print(f\"Took {time.time() - now:.2f} seconds\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
   "display_name": "Python 3.7.6 64-bit ('general': venv)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/notebooks/crcard_graphs.ipynb
+++ b/notebooks/crcard_graphs.ipynb
--- a/notebooks/gridsearch.ipynb
+++ b/notebooks/gridsearch.ipynb
--- a/notebooks/test.ipynb
+++ b/notebooks/test.ipynb
--- a/notebooks/test2.ipynb
+++ b/notebooks/test2.ipynb
@@ -0,0 +1,227 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# Google Colab setup\n",
    "#\n",
    "#!pip install git+https://github.com/doctorado-ml/stree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.datasets import make_classification, load_iris, load_wine\n",
    "from sklearn.model_selection import train_test_split\n",
    "from stree import Stree\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "if not os.path.isfile('data/creditcard.csv'):\n",
    "    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
    "    !tar xzf creditcard.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28)  y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
    }
   ],
   "source": [
    "random_state=1\n",
    "\n",
    "def load_creditcard(n_examples=0):\n",
    "    import pandas as pd\n",
    "    import numpy as np\n",
    "    import random\n",
    "    df = pd.read_csv('data/creditcard.csv')\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
    "    y = df.Class\n",
    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
    "    if n_examples > 0:\n",
    "        # Take first n_examples samples\n",
    "        X = X[:n_examples, :]\n",
    "        y = y[:n_examples, :]\n",
    "    else:\n",
    "        # Take all the positive samples with a number of random negatives\n",
    "        if n_examples < 0:\n",
    "            Xt = X[(y == 1).ravel()]\n",
    "            yt = y[(y == 1).ravel()]\n",
    "            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
    "            X = np.append(Xt, X[indices], axis=0)\n",
    "            y = np.append(yt, y[indices], axis=0)\n",
    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
    "    return Xtrain, Xtest, ytrain, ytest\n",
    "\n",
    "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
    "data = load_creditcard(-1000) # Take all the samples\n",
    "\n",
    "Xtrain = data[0]\n",
    "Xtest = data[1]\n",
    "ytrain = data[2]\n",
    "ytest = data[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9797\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1.0 belief=0.984127 counts=(array([0., 1.]), array([  2, 124]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up\nroot - Up - Down, <cgaf> - Leaf class=0.0 belief=0.750000 counts=(array([0., 1.]), array([3, 1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.980029 counts=(array([0., 1.]), array([687,  14]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([124]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.977560 counts=(array([0., 1.]), array([697,  16]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695,  11]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 
counts=(array([0., 1.]), array([695,  11]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9833\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683,   7]))\n\n**************************************************\n0.2235 secs\n"
    }
   ],
   "source": [
    "t = time.time()\n",
    "for C in (.001, .01, 1, 5, 17):\n",
    "    clf = Stree(C=C, random_state=random_state)\n",
    "    clf.fit(Xtrain, ytrain)\n",
    "    print(f\"************** C={C} ****************************\")\n",
    "    print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
    "    print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
    "    print(clf)\n",
    "    print(f\"**************************************************\")\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.calibration import CalibratedClassifierCV\n",
    "scaler = StandardScaler()\n",
    "cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
    "cclf.fit(Xtrain, ytrain)\n",
    "res = cclf.predict_proba(Xtest)\n",
    "#an array containing probabilities of belonging to the 1st class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683,   7]))\n"
    }
   ],
   "source": [
    "#check iterator\n",
    "for i in list(clf):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683,   7]))\n"
    }
   ],
   "source": [
    "#check iterator again\n",
    "for i in clf:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if the classifier is a sklearn estimator\n",
    "from sklearn.utils.estimator_checks import check_estimator\n",
    "check_estimator(Stree())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12aabb320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12aab0440>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12aab0320>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12aaaac20>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12aaaad40>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12aaaae60>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12aaaaf80>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12aaac0e0>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12aaac290>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12aaac200>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12aab0560>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12aab0200>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12aab0680>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12aabb200>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12aaaab00>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12aab08c0>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12aabb560>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12aab0f80>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12aab69e0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12aab09e0>, 'Stree')\n23 functools.partial(<function 
check_classifiers_train at 0x12aab60e0>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12aabf050>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12aaa0c20>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12aab6680>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12aab6560>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12aabbb90>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12aabf170>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12aaac7a0>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12aaac950>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12aaaca70>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12aaacb90>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12aaaccb0>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12aabbdd0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12aabbef0>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12aaac3b0>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12aaac680>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x12aabf320>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12aabf3b0>, 'Stree')\n"
    }
   ],
   "source": [
    "# Make checks one by one\n",
    "c = 0\n",
    "checks = check_estimator(Stree(), generate_only=True)\n",
    "for check in checks:\n",
    "    c += 1\n",
    "    print(c, check[1])\n",
    "    check[1](check[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.6 64-bit ('general': venv)",
   "language": "python",
   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/notebooks/test_graphs.ipynb
+++ b/notebooks/test_graphs.ipynb
@@ -0,0 +1,197 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# Google Colab setup\n",
    "#\n",
    "#!pip install git+https://github.com/doctorado-ml/stree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'stree'",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-12-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import random\n",
    "import numpy as np\n",
    "from sklearn.datasets import make_blobs\n",
    "from sklearn.svm import LinearSVC\n",
    "from stree import Stree, Stree_grapher"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_data(random_state):\n",
    "    random.seed(random_state)\n",
    "    X, y = make_blobs(centers=10, n_features=3, n_samples=500, random_state=random_state)\n",
    "    def make_binary(y):\n",
    "        for i in range(2, 10):\n",
    "            y[y==i] = random.randint(0, 1)\n",
    "        return y\n",
    "    y = make_binary(y)\n",
    "    #print(X.shape, np.unique(y), y[y==0].shape, y[y==1].shape)\n",
    "    return X, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NameError",
     "evalue": "name 'Stree_grapher' is not defined",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-4-b909470cb406>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mC\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m.01\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m200\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;31m#gr.save_all(save_folder='data/', save_prefix='7')\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'Stree_grapher' is not defined"
     ]
    }
   ],
   "source": [
    "X, y = build_data(10)\n",
    "gr = Stree_grapher(dict(C=.01, max_iter=200))\n",
    "gr.fit(X, y)\n",
    "#gr.save_all(save_folder='data/', save_prefix='7')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NameError",
     "evalue": "name 'gr' is not defined",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-5-efa3db892bfd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
     ]
    }
   ],
   "source": [
    "print(gr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NameError",
     "evalue": "name 'gr' is not defined",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-0e62f081c9aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Agg'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msave_folder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'data/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
     ]
    }
   ],
   "source": [
    "import matplotlib\n",
    "matplotlib.use('Agg')\n",
    "gr.save_all(save_folder='data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NameError",
     "evalue": "name 'gr' is not defined",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-7-b0484cfe9d26>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m#%matplotlib inline\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'widget'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tree_gr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_hyperplane\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
     ]
    }
   ],
   "source": [
    "#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
    "#%matplotlib inline\n",
    "%matplotlib widget\n",
    "gr._tree_gr.plot_hyperplane()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "error",
     "ename": "NameError",
     "evalue": "name 'gr' is not defined",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-8-4277c1aacbe2>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'matplotlib'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'inline'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m#%matplotlib widget\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot_all\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'gr' is not defined"
     ]
    }
   ],
   "source": [
    "#Uncomment one of the following lines to display graphics: static(inline), dynamic(widget)\n",
    "%matplotlib inline\n",
    "#%matplotlib widget\n",
    "gr.plot_all()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -13,7 +13,8 @@ import os
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import LinearSVC
-from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
+from sklearn.utils.multiclass import check_classification_targets
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, _check_sample_weight, check_random_state
 class Snode:
@@ -102,25 +103,29 @@ class Siterator:
 class Stree(BaseEstimator, ClassifierMixin):
    """
    """
    __folder = 'data/'
-    def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0, use_predictions: bool = False):
+    def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = None,
                max_depth: int=None, tol: float=1e-4, use_predictions: bool = False):
        self.max_iter = max_iter
        self.C = C
        self.random_state = random_state
        self.use_predictions = use_predictions
        self.max_depth = max_depth
        self.tol = tol
-    def get_params(self, deep=True):
+    def get_params(self, deep: bool=True) -> dict:
        """Get dict with hyperparameters and their values to comply with sklearn rules
        """
        return {
            'C': self.C,
            'random_state': self.random_state,
            'max_iter': self.max_iter,
-            'use_predictions': self.use_predictions
+            'use_predictions': self.use_predictions,
            'max_depth': self.max_depth,
            'tol': self.tol
        }
-    def set_params(self, **parameters):
+    def set_params(self, **parameters: dict):
        """Set hyperparameters as specified by sklearn, needed in grid searches
        """
        for parameter, value in parameters.items():
@@ -128,42 +133,49 @@ class Stree(BaseEstimator, ClassifierMixin):
        return self
    # Added binary_only tag as required by sklearn check_estimator
-    def _more_tags(self):
+    def _more_tags(self) -> dict:
        return {'binary_only': True}
    def _linear_function(self, data: np.array, node: Snode) -> np.array:
        coef = node._vector[0, :].reshape(-1, data.shape[1])
        return data.dot(coef.T) + node._interceptor[0]
-    def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
+    def _split_array(self, origin: np.array, down: np.array) -> list:
        up = ~down
        return origin[up[:, 0]] if any(up) else None, \
            origin[down[:, 0]] if any(down) else None
    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
        if self.use_predictions:
            yp = node._clf.predict(data)
            down = (yp == 1).reshape(-1, 1)
            res = np.expand_dims(node._clf.decision_function(data), 1)
        else:
            # doesn't work with multiclass as each sample has to do the inner product with its own coefficients
            # computes the position of every sample w.r.t. the hyperplane
            res = self._linear_function(data, node)
-            down = res > 0
+        return res
        up = ~down
        data_down = data[down[:, 0]] if any(down) else None
        indices_down = indices[down[:, 0]] if any(down) else None
        res_down = res[down[:, 0]] if any(down) else None
        data_up = data[up[:, 0]] if any(up) else None
        indices_up = indices[up[:, 0]] if any(up) else None
        res_up = res[up[:, 0]] if any(up) else None
        return [data_up, indices_up, data_down, indices_down, res_up, res_down]
-    def fit(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> 'Stree':
+    def _split_criteria(self, data: np.array) -> np.array:
-        from sklearn.utils.multiclass import check_classification_targets
+        return data > 0
    def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None) -> 'Stree':
        # Check parameters are Ok.
        if type(y).__name__ == 'np.ndarray':
            y = y.ravel()
        if self.C < 0:
            raise ValueError(f"Penalty term must be positive... got (C={self.C:f})")
        self.__max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth
        if self.__max_depth < 1:
            raise ValueError(f"Maximum depth has to be greater than 1... got (max_depth={self.max_depth})")
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(sample_weight, X)
        check_classification_targets(y)
        # Initialize computed parameters
        self.classes_ = np.unique(y)
        self.n_iter_ = self.max_iter
-        check_classification_targets(y)
+        self.depth_ = 0
        self.n_features_in_ = X.shape[1]
-        self.tree_ = self.train(X, y.ravel(), title)
+        self.tree_ = self.train(X, y, sample_weight, 1, 'root')
        self._build_predictor()
        return self
@@ -180,25 +192,32 @@ class Stree(BaseEstimator, ClassifierMixin):
        run_tree(self.tree_)
-    def train(self, X: np.ndarray, y: np.ndarray, title: str = 'root') -> Snode:
+    def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, depth: int, title: str) -> Snode:
-        if np.unique(y).shape[0] == 1:
+        
        if depth > self.__max_depth:
            return None
        if np.unique(y).shape[0] == 1 :
            # only 1 class => pure dataset
            return Snode(None, X, y, title + ', <pure>')
        # Train the model
-        clf = LinearSVC(max_iter=self.max_iter, C=self.C,
+        clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
-                        random_state=self.random_state)
+                        C=self.C)  #, sample_weight=sample_weight)
-        clf.fit(X, y)
+        clf.fit(X, y, sample_weight=sample_weight)
        tree = Snode(clf, X, y, title)
-        X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
+        self.depth_ = max(depth, self.depth_)
        down = self._split_criteria(self._distances(tree, X))
        X_U, X_D = self._split_array(X, down)
        y_u, y_d = self._split_array(y, down)
        sw_u, sw_d = self._split_array(sample_weight, down)
        if X_U is None or X_D is None:
            # didn't part anything
            return Snode(clf, X, y, title + ', <cgaf>')
-        tree.set_up(self.train(X_U, y_u, title + ' - Up'))
+        tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
-        tree.set_down(self.train(X_D, y_d, title + ' - Down'))
+        tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
        return tree
-    def _reorder_results(self, y: np.array, indices: np.array, proba=False) -> np.array:
+    def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
-        if proba:
+        if y.ndim > 1 and y.shape[1] > 1:
            # if predict_proba return np.array of floats
            y_ordered = np.zeros(y.shape, dtype=float)
        else:
@@ -217,10 +236,12 @@ class Stree(BaseEstimator, ClassifierMixin):
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                return prediction, indices
-            u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
+            down = self._split_criteria(self._distances(node, xp))
-            k, l = predict_class(d, i_d, node.get_down())
+            X_U, X_D = self._split_array(xp, down)
-            m, n = predict_class(u, i_u, node.get_up())
+            i_u, i_d = self._split_array(indices, down)
-            return np.append(k, m), np.append(l, n)
+            prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
            prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
            return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
        # sklearn check
        check_is_fitted(self, ['tree_'])
@@ -257,10 +278,15 @@ class Stree(BaseEstimator, ClassifierMixin):
                prediction = np.full((xp.shape[0], 1), node._class)
                prediction_proba = dist
                return np.append(prediction, prediction_proba, axis=1), indices
-            u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
+            distances = self._distances(node, xp)
-            k, l = predict_class(d, i_d, r_d, node.get_down())
+            down = self._split_criteria(distances)
-            m, n = predict_class(u, i_u, r_u, node.get_up())
+            
-            return np.append(k, m), np.append(l, n)
+            X_U, X_D = self._split_array(xp, down)
            i_u, i_d = self._split_array(indices, down)
            di_u, di_d = self._split_array(distances, down)
            prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
            prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
            return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
        # sklearn check
        check_is_fitted(self, ['tree_'])
@@ -273,9 +299,10 @@ class Stree(BaseEstimator, ClassifierMixin):
        result = result.reshape(X.shape[0], 2)
        # Turn distances to hyperplane into probabilities based on fitting distances
        # of samples to its hyperplane that classified them, to the sigmoid function
-        result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) # Probability of being 1
+        # Probability of being 1
        result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
        result[:, 0] = 1 - result[:, 1]  # Probability of being 0
-        return self._reorder_results(result, indices, proba=True)
+        return self._reorder_results(result, indices)
    def score(self, X: np.array, y: np.array) -> float:
        """Return accuracy
@@ -283,11 +310,14 @@ class Stree(BaseEstimator, ClassifierMixin):
        # sklearn check
        check_is_fitted(self)
        yp = self.predict(X).reshape(y.shape)
-        right = (yp == y).astype(int)
+        return np.mean(yp == y)
        return np.sum(right) / len(y)
-    def __iter__(self):
+    def __iter__(self) -> Siterator:
-        return Siterator(self.tree_)
+        try:
            tree = self.tree_
        except:
            tree = None
        return Siterator(tree)
    def __str__(self) -> str:
        output = ''
@@ -295,33 +325,3 @@ class Stree(BaseEstimator, ClassifierMixin):
            output += str(i) + '\n'
        return output
    def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
        """Save the dataset of the node in a csv file
        :param tree: node with data to save
        :type tree: Snode
        :param catalog: catalog file handler
        :type catalog: typing.TextIO
        :param number: sequential number for the generated file name
        :type number: int
        """
        data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
        name = f"{self.__folder}dataset{number}.csv"
        np.savetxt(name, data, delimiter=",")
        catalog.write(f"{name}, - {str(tree)}")
        if tree.is_leaf():
            return
        self._save_datasets(tree.get_down(), catalog, number + 1)
        self._save_datasets(tree.get_up(), catalog, number + 2)
    def get_catalog_name(self):
        return self.__folder + "catalog.txt"
    def save_sub_datasets(self):
        """Save every dataset stored in the tree to check with a manual classifier
        """
        if not os.path.isdir(self.__folder):
            os.mkdir(self.__folder)
        with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
            self._save_datasets(self.tree_, catalog, 1)
--- a/stree/tests/Strees_test.py
+++ b/stree/tests/Strees_test.py
@@ -107,18 +107,6 @@ class Stree_test(unittest.TestCase):
                    res.append(y_original[row])
        return res
    def test_subdatasets(self):
        """Check if the subdatasets files have the same labels as the original dataset
        """
        self._clf.save_sub_datasets()
        with open(self._clf.get_catalog_name()) as cat_file:
            catalog = csv.reader(cat_file, delimiter=',')
            for row in catalog:
                X, y = self._get_Xy()
                x_file, y_file = self._get_file_data(row[0])
                y_original = np.array(self._find_out(x_file, X, y), dtype=int)
                self.assertTrue(np.array_equal(y_file, y_original))
    def test_single_prediction(self):
        X, y = self._get_Xy()
        yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
@@ -135,10 +123,9 @@ class Stree_test(unittest.TestCase):
        X, y = self._get_Xy()
        accuracy_score = self._clf.score(X, y)
        yp = self._clf.predict(X)
-        right = (yp == y).astype(int)
+        accuracy_computed = np.mean(yp == y)
        accuracy_computed = sum(right) / len(y)
        self.assertEqual(accuracy_score, accuracy_computed)
-        self.assertGreater(accuracy_score, 0.8)
+        self.assertGreater(accuracy_score, 0.9)
    def test_single_predict_proba(self):
        """Check that element 28 has a prediction different from the current label
@@ -253,6 +240,30 @@ class Stree_test(unittest.TestCase):
        from sklearn.utils.estimator_checks import check_estimator
        check_estimator(Stree())
    def test_exception_if_C_is_negative(self):
        tclf = Stree(C=-1)
        with self.assertRaises(ValueError):
            tclf.fit(*self._get_Xy())
    def test_check_max_depth_is_positive_or_None(self):
        tcl = Stree()
        self.assertIsNone(tcl.max_depth)
        tcl = Stree(max_depth=1)
        self.assertGreaterEqual(1, tcl.max_depth)
        with self.assertRaises(ValueError):
            tcl = Stree(max_depth=-1)
            tcl.fit(*self._get_Xy())
    def test_check_max_depth(self):
        depth = 3
        tcl = Stree(random_state=self._random_state, max_depth=depth)        
        tcl.fit(*self._get_Xy())
        self.assertEqual(depth, tcl.depth_)
    def test_unfitted_tree_is_iterable(self):
        tcl = Stree()
        self.assertEqual(0, len(list(tcl)))
 class Snode_test(unittest.TestCase):
    def __init__(self, *args, **kwargs):
--- a/test2.ipynb
+++ b/test2.ipynb
@@ -1,227 +0,0 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#\n",
    "# Google Colab setup\n",
    "#\n",
    "#!pip install git+https://github.com/doctorado-ml/stree"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.datasets import make_classification, load_iris, load_wine\n",
    "from sklearn.model_selection import train_test_split\n",
    "from stree import Stree\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "if not os.path.isfile('data/creditcard.csv'):\n",
    "    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
    "    !tar xzf creditcard.tgz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28)  y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
    }
   ],
   "source": [
    "random_state=1\n",
    "\n",
    "def load_creditcard(n_examples=0):\n",
    "    import pandas as pd\n",
    "    import numpy as np\n",
    "    import random\n",
    "    df = pd.read_csv('data/creditcard.csv')\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
    "    y = df.Class\n",
    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
    "    if n_examples > 0:\n",
    "        # Take first n_examples samples\n",
    "        X = X[:n_examples, :]\n",
    "        y = y[:n_examples, :]\n",
    "    else:\n",
    "        # Take all the positive samples with a number of random negatives\n",
    "        if n_examples < 0:\n",
    "            Xt = X[(y == 1).ravel()]\n",
    "            yt = y[(y == 1).ravel()]\n",
    "            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
    "            X = np.append(Xt, X[indices], axis=0)\n",
    "            y = np.append(yt, y[indices], axis=0)\n",
    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
    "    return Xtrain, Xtest, ytrain, ytest\n",
    "\n",
    "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
    "data = load_creditcard(-1000) # Take all the samples\n",
    "\n",
    "Xtrain = data[0]\n",
    "Xtest = data[1]\n",
    "ytrain = data[2]\n",
    "ytest = data[3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "tags": [
     "outputPrepend"
    ]
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9579\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.987013 counts=(array([0, 1]), array([  4, 304]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.945652 counts=(array([0, 1]), array([696,  40]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9579\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.990196 counts=(array([0, 1]), array([  3, 303]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.944444 counts=(array([0, 1]), array([697,  41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9693\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([6]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief=0.955923 counts=(array([0, 1]), array([694,  32]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9713\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([314]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([6]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.958564 counts=(array([0, 1]), array([694,  30]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9780\nClassifier's accuracy (test) : 0.9420\nroot\nroot - Down\nroot - Down - Down, <pure> - 
Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682,  23]))\n\n**************************************************\n0.4537 secs\n"
    }
   ],
   "source": [
    "t = time.time()\n",
    "for C in (.001, .01, 1, 5, 17):\n",
    "    clf = Stree(C=C, random_state=random_state)\n",
    "    clf.fit(Xtrain, ytrain)\n",
    "    print(f\"************** C={C} ****************************\")\n",
    "    print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
    "    print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
    "    print(clf)\n",
    "    print(f\"**************************************************\")\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.calibration import CalibratedClassifierCV\n",
    "scaler = StandardScaler()\n",
    "cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
    "cclf.fit(Xtrain, ytrain)\n",
    "res = cclf.predict_proba(Xtest)\n",
    "#an array containing probabilities of belonging to the 1st class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682,  23]))\n"
    }
   ],
   "source": [
    "#check iterator\n",
    "for i in list(clf):\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([301]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([13]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([17]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.967376 counts=(array([0, 1]), array([682,  23]))\n"
    }
   ],
   "source": [
    "#check iterator again\n",
    "for i in clf:\n",
    "    print(i)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check if the classifier is a sklearn estimator\n",
    "from sklearn.utils.estimator_checks import check_estimator\n",
    "check_estimator(Stree())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12d18e0e0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12d185200>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12d1850e0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12d17eb00>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12d17ec20>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12d17ed40>, 'Stree')\n7 functools.partial(<function check_sample_weights_invariance at 0x12d17ee60>, 'Stree')\n8 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12d189200>, 'Stree', readonly_memmap=True)\n10 functools.partial(<function check_complex_data at 0x12d181050>, 'Stree')\n11 functools.partial(<function check_dtype_object at 0x12d17ef80>, 'Stree')\n12 functools.partial(<function check_estimators_empty_data_messages at 0x12d185320>, 'Stree')\n13 functools.partial(<function check_pipeline_consistency at 0x12d181f80>, 'Stree')\n14 functools.partial(<function check_estimators_nan_inf at 0x12d185440>, 'Stree')\n15 functools.partial(<function check_estimators_overwrite_params at 0x12d189f80>, 'Stree')\n16 functools.partial(<function check_estimator_sparse_data at 0x12d17e9e0>, 'Stree')\n17 functools.partial(<function check_estimators_pickle at 0x12d185680>, 'Stree')\n18 functools.partial(<function check_classifier_data_not_an_array at 0x12d18e320>, 'Stree')\n19 functools.partial(<function check_classifiers_one_label at 0x12d185d40>, 'Stree')\n20 functools.partial(<function check_classifiers_classes at 0x12d1897a0>, 'Stree')\n21 functools.partial(<function check_estimators_partial_fit_n_features at 0x12d1857a0>, 'Stree')\n22 functools.partial(<function check_classifiers_train at 0x12d185e60>, 'Stree')\n23 functools.partial(<function check_classifiers_train 
at 0x12d185e60>, 'Stree', readonly_memmap=True)\n24 functools.partial(<function check_classifiers_regression_target at 0x12d18ed40>, 'Stree')\n25 functools.partial(<function check_supervised_y_no_nan at 0x12d17cb00>, 'Stree')\n26 functools.partial(<function check_supervised_y_2d at 0x12d189440>, 'Stree')\n27 functools.partial(<function check_estimators_unfitted at 0x12d189320>, 'Stree')\n28 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12d18e8c0>, 'Stree')\n29 functools.partial(<function check_decision_proba_consistency at 0x12d18ee60>, 'Stree')\n30 functools.partial(<function check_fit2d_predict1d at 0x12d181560>, 'Stree')\n31 functools.partial(<function check_methods_subset_invariance at 0x12d181710>, 'Stree')\n32 functools.partial(<function check_fit2d_1sample at 0x12d181830>, 'Stree')\n33 functools.partial(<function check_fit2d_1feature at 0x12d181950>, 'Stree')\n34 functools.partial(<function check_fit1d at 0x12d181a70>, 'Stree')\n35 functools.partial(<function check_get_params_invariance at 0x12d18eb00>, 'Stree')\n36 functools.partial(<function check_set_params at 0x12d18ec20>, 'Stree')\n37 functools.partial(<function check_dict_unchanged at 0x12d181170>, 'Stree')\n38 functools.partial(<function check_dont_overwrite_parameters at 0x12d181440>, 'Stree')\n39 functools.partial(<function check_fit_idempotent at 0x12d192050>, 'Stree')\n"
    }
   ],
   "source": [
    "# Make checks one by one\n",
    "c = 0\n",
    "checks = check_estimator(Stree(), generate_only=True)\n",
    "for check in checks:\n",
    "    c += 1\n",
    "    print(c, check[1])\n",
    "    check[1](check[0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/test_graphs.ipynb
+++ b/test_graphs.ipynb
Author	SHA1	Message	Date
Ricardo Montañana	724a4855fb	Adapt some notebooks	2020-05-30 11:09:59 +02:00
Ricardo Montañana	a22ae81b54	Refactor split_data adding sample_weight	2020-05-29 18:52:23 +02:00
Ricardo Montañana	ed98054f0d	First approach Added max_depth, tol, weighted samples	2020-05-29 12:46:10 +02:00