Fix issues in notebooks

2025-08-15 15:36:00 +00:00 · 2020-05-17 18:36:28 +02:00
parent 9f30627e47
commit 91af875062
2 changed files with 57 additions and 101 deletions
--- a/test.ipynb
+++ b/test.ipynb
@@ -210,9 +210,9 @@
   "metadata": {},
   "source": [
    "```\n",
-    "************************************************************************************************************************************\n",
+    "******************************************************************************************************************\n",
    "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n",
-    "************************************************************************************************************************************\n",
+    "******************************************************************************************************************\n",
    "Model: Linear Tree       Time:  23.05 seconds\t f1: 0.7645\n",
    "Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
    "Model: Stree (SVM Tree)\t Time:  49.45 seconds\t f1: 0.8467\n",
@@ -221,17 +221,6 @@
    "Model: Neural Network\t Time:  25.47 seconds\t f1: 0.8328\n",
    "```"
   ]
-  },
-  {
-   "cell_type": "raw",
-   "metadata": {},
-   "source": [
-    "************************************************************************************************************************************\n",
-    "*The best f1 model is Random Forest, with a f1 score: 0.8791 in 1513.23 seconds with 0.7 samples in train dataset\n",
-    "************************************************************************************************************************************\n",
-    "Model: Linear Tree\t Time:  25.18 seconds\t f1: 0.7645\n",
-    "Model: Random Forest\t Time: 1513.23 seconds\t f1: 0.8791"
-   ]
  }
 ],
 "metadata": {
@@ -251,7 +240,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6-final"
+   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
@@ -305,4 +294,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
--- a/test2.ipynb
+++ b/test2.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -17,7 +17,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -29,28 +29,31 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28)  y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
-    }
-   ],
+   "outputs": [],
   "source": [
+    "import time\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from trees.Stree import Stree\n",
+    "\n",
+    "random_state=1\n",
+    "\n",
    "def load_creditcard(n_examples=0):\n",
+    "    import pandas as pd\n",
+    "    import numpy as np\n",
+    "    import random\n",
    "    df = pd.read_csv('data/creditcard.csv')\n",
-    "    print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
-    "    print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
-    "    y = np.expand_dims(df.Class.values, axis=1)\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
+    "    y = df.Class\n",
    "    X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
-    "    #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
-    "    #return Xtrain, Xtest, ytrain, ytest\n",
    "    if n_examples > 0:\n",
+    "        # Take the first n_examples samples\n",
    "        X = X[:n_examples, :]\n",
    "        y = y[:n_examples, :]\n",
    "    else:\n",
+    "        # Negative n_examples: take all the positive samples plus a number of random negatives\n",
    "        if n_examples < 0:\n",
    "            Xt = X[(y == 1).ravel()]\n",
    "            yt = y[(y == 1).ravel()]\n",
@@ -58,59 +61,44 @@
    "            X = np.append(Xt, X[indices], axis=0)\n",
    "            y = np.append(yt, y[indices], axis=0)\n",
    "    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
-    "    print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
-    "    print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
-    "    return X, y\n",
+    "    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
+    "    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
+    "    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
+    "    return Xtrain, Xtest, ytrain, ytest\n",
    "\n",
-    "random_state = 1\n",
+    "# data = load_creditcard(-5000) # Take all the fraud (Class == 1) samples + 5000 of the others\n",
+    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
+    "data = load_creditcard() # Take all the samples\n",
    "\n",
-    "\n",
-    "# Datasets\n",
-    "\n",
-    "#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
-    "#                    n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
-    "#                    class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n",
-    "\n",
-    "#X, y = load_wine(return_X_y=True)\n",
-    "#X, y = load_iris(return_X_y=True)\n",
-    "#y[y==2]=0\n",
-    "\n",
-    "X, y = load_creditcard()"
+    "Xtrain = data[0]\n",
+    "Xtest = data[1]\n",
+    "ytrain = data[2]\n",
+    "ytest = data[3]"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "root\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242,    103]))\nroot - Down - Up\nroot - Down - Up - Down\nroot - Down - Up - Down - Down, <cgaf> - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18,  3]))\nroot - Down - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nroot - Down - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up - Up - Up, <cgaf> - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nroot - Up - Down - Down - Down, <cgaf> - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23,  2]))\nroot - Up - Down - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Up, <cgaf> - Leaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n41.5053 secs\n"
-    }
-   ],
+   "outputs": [],
   "source": [
    "t = time.time()\n",
-    "clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n",
-    "clf.fit(X, y)\n",
+    "clf = Stree(C=.01, random_state=random_state)\n",
+    "clf.fit(Xtrain, ytrain)\n",
    "print(clf)\n",
+    "print()\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "Accuracy: 0.999512\n0.2389 secs\n"
-    }
-   ],
+   "outputs": [],
   "source": [
    "t = time.time()\n",
-    "clf.score(X, y)\n",
+    "print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
+    "print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
@@ -125,53 +113,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "0.9991397683343457\n13.6326 secs\n"
-    }
-   ],
+   "outputs": [],
   "source": [
    "t = time.time()\n",
    "clf2 = LinearSVC(C=.01, random_state=random_state)\n",
-    "clf2.fit(X, y)\n",
-    "print(clf2.score(X, y))\n",
+    "clf2.fit(Xtrain, ytrain)\n",
+    "print(clf2.score(Xtest, ytest))\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": "1.0\n18.8308 secs\n"
-    }
-   ],
+   "outputs": [],
   "source": [
    "t = time.time()\n",
    "clf3 = DecisionTreeClassifier(random_state=random_state)\n",
-    "clf3.fit(X, y)\n",
-    "print(clf3.score(X, y))\n",
+    "clf3.fit(Xtrain, ytrain)\n",
+    "print(clf3.score(Xtest, ytest))\n",
    "print(f\"{time.time() - t:.4f} secs\")"
   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "from sklearn.utils.estimator_checks import check_estimator\n",
-    "clf = Stree()\n",
-    "check_estimator(clf)"
-   ]
  }
 ],
 "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@@ -182,14 +154,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.7.6-final"
-  },
-  "orig_nbformat": 2,
-  "kernelspec": {
-   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
-   "display_name": "Python 3.7.6 64-bit ('general': venv)"
+   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}