Adapt some notebooks

2025-08-15 23:46:02 +00:00 · 2020-05-30 11:09:59 +02:00
parent a22ae81b54
commit 724a4855fb
5 changed files with 279 additions and 78 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 # Stree

-Oblique Tree classifier based on SVM nodes
+Oblique Tree classifier based on SVM nodes. The nodes are built and split with sklearn LinearSVC models. Stree is a sklearn estimator and can be integrated in pipelines, grid searches, etc.

 ![Stree](https://raw.github.com/doctorado-ml/stree/master/example.png)

--- a/notebooks/adaboost.ipynb
+++ b/notebooks/adaboost.ipynb
@@ -0,0 +1,190 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from sklearn.ensemble import AdaBoostClassifier\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
+    "from sklearn.datasets import load_iris\n",
+    "from stree import Stree"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "if not os.path.isfile('data/creditcard.csv'):\n",
+    "    !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
+    "    !tar xzf creditcard.tgz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28)  y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
+    }
+   ],
+   "source": [
+    "random_state=1\n",
+    "\n",
+    "def load_creditcard(n_examples=0):\n",
+    "    import pandas as pd\n",
+    "    import numpy as np\n",
+    "    import random\n",
+    "    df = pd.read_csv('data/creditcard.csv')\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
+    "    y = df.Class\n",
+    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
+    "    if n_examples > 0:\n",
+    "        # Take first n_examples samples\n",
+    "        X = X[:n_examples, :]\n",
+    "        y = y[:n_examples, :]\n",
+    "    else:\n",
+    "        # Take all the positive samples with a number of random negatives\n",
+    "        if n_examples < 0:\n",
+    "            Xt = X[(y == 1).ravel()]\n",
+    "            yt = y[(y == 1).ravel()]\n",
+    "            indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
+    "            X = np.append(Xt, X[indices], axis=0)\n",
+    "            y = np.append(yt, y[indices], axis=0)\n",
+    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
+    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
+    "    return Xtrain, Xtest, ytrain, ytest\n",
+    "\n",
+    "data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
+    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
+    "# data = load_creditcard(0) # Take all the samples\n",
+    "\n",
+    "Xtrain = data[0]\n",
+    "Xtest = data[1]\n",
+    "ytrain = data[2]\n",
+    "ytest = data[3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Score Train:  0.986857825567503\nScore Test:  0.9805013927576601\nTook 0.12 seconds\n"
+    }
+   ],
+   "source": [
+    "now = time.time()\n",
+    "clf = Stree(max_depth=3, random_state=random_state)\n",
+    "clf.fit(Xtrain, ytrain)\n",
+    "print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
+    "print(\"Score Test: \", clf.score(Xtest, ytest))\n",
+    "print(f\"Took {time.time() - now:.2f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Score Train:  0.997610513739546\nScore Test:  0.9721448467966574\nTook 7.80 seconds\n"
+    }
+   ],
+   "source": [
+    "now = time.time()\n",
+    "clf2 = AdaBoostClassifier(Stree(max_depth=3, random_state=random_state), n_estimators=100, random_state=random_state)\n",
+    "clf2.fit(Xtrain, ytrain)\n",
+    "print(\"Score Train: \", clf2.score(Xtrain, ytrain))\n",
+    "print(\"Score Test: \", clf2.score(Xtest, ytest))\n",
+    "print(f\"Took {time.time() - now:.2f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Score Train:  0.9796893667861409\nScore Test:  0.9554317548746518\nTook 0.48 seconds\n"
+    }
+   ],
+   "source": [
+    "now = time.time()\n",
+    "clf3 = AdaBoostClassifier(LinearSVC(random_state=random_state), n_estimators=100, random_state=random_state, algorithm='SAMME')\n",
+    "clf3.fit(Xtrain, ytrain)\n",
+    "print(\"Score Train: \", clf3.score(Xtrain, ytrain))\n",
+    "print(\"Score Test: \", clf3.score(Xtest, ytest))\n",
+    "print(f\"Took {time.time() - now:.2f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Score Train:  1.0\nScore Test:  0.9721448467966574\nTook 0.86 seconds\n"
+    }
+   ],
+   "source": [
+    "now = time.time()\n",
+    "clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=random_state), n_estimators=100, random_state=random_state)\n",
+    "clf4.fit(Xtrain, ytrain)\n",
+    "print(\"Score Train: \", clf4.score(Xtrain, ytrain))\n",
+    "print(\"Score Test: \", clf4.score(Xtest, ytest))\n",
+    "print(f\"Took {time.time() - now:.2f} seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6-final"
+  },
+  "orig_nbformat": 2,
+  "kernelspec": {
+   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
+   "display_name": "Python 3.7.6 64-bit ('general': venv)"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/notebooks/gridsearch.ipynb
+++ b/notebooks/gridsearch.ipynb
--- a/notebooks/test_graphs.ipynb
+++ b/notebooks/test_graphs.ipynb
@@ -14,7 +14,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
@@ -24,7 +24,7 @@
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-2-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-12-36af63297651>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmake_blobs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mLinearSVC\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mstree\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mStree\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mStree_grapher\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'stree'"
     ]
    }
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -152,11 +152,6 @@ class Stree(BaseEstimator, ClassifierMixin):
            # doesn't work with multiclass as each sample has to do inner product with its own coeficients
            # computes positition of every sample is w.r.t. the hyperplane
            res = self._linear_function(data, node)
-        # data_up, data_down = self._split_array(data, down)
-        # indices_up, indices_down = self._split_array(indices, down)
-        # res_up, res_down = self._split_array(res, down)
-        # weight_up, weight_down = self._split_array(weights, down)
-        #return [data_up, indices_up, data_down, indices_down, weight_up, weight_down, res_up, res_down]
        return res

    def _split_criteria(self, data: np.array) -> np.array:
@@ -176,7 +171,6 @@ class Stree(BaseEstimator, ClassifierMixin):
        sample_weight = _check_sample_weight(sample_weight, X)
        check_classification_targets(y)
        # Initialize computed parameters
-        #self.random_state = check_random_state(self.random_state)
        self.classes_ = np.unique(y)
        self.n_iter_ = self.max_iter
        self.depth_ = 0
@@ -316,8 +310,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        # sklearn check
        check_is_fitted(self)
        yp = self.predict(X).reshape(y.shape)
-        right = (yp == y).astype(int)
-        return np.sum(right) / len(y)
+        return np.mean(yp == y)

    def __iter__(self) -> Siterator:
        try: