diff --git a/test.ipynb b/test.ipynb index b43acf6..c94aaee 100644 --- a/test.ipynb +++ b/test.ipynb @@ -210,9 +210,9 @@ "metadata": {}, "source": [ "```\n", - "************************************************************************************************************************************\n", + "******************************************************************************************************************\n", "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n", - "************************************************************************************************************************************\n", + "******************************************************************************************************************\n", "Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n", "Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n", "Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n", @@ -221,17 +221,6 @@ "Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n", "```" ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "************************************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8791 in 1513.23 seconds with 0.7 samples in train dataset\n", - "************************************************************************************************************************************\n", - "Model: Linear Tree\t Time: 25.18 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 1513.23 seconds\t f1: 0.8791" - ] } ], "metadata": { @@ -251,7 +240,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6-final" + "version": "3.7.6" }, "toc": { "base_numbering": 1, @@ -305,4 +294,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/test2.ipynb b/test2.ipynb index 6e53d93..349bdab 100644 --- a/test2.ipynb +++ b/test2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -29,28 +29,31 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n" - } - ], + "outputs": [], "source": [ + "import time\n", + "from sklearn.model_selection import train_test_split\n", + "from trees.Stree import Stree\n", + "\n", + "random_state=1\n", + "\n", "def load_creditcard(n_examples=0):\n", + " import pandas as pd\n", + " import numpy as np\n", + " import random\n", " df = pd.read_csv('data/creditcard.csv')\n", - " print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", - " print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", - " y = np.expand_dims(df.Class.values, axis=1)\n", + " print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n", + " print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n", + " y = df.Class\n", " X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n", - " #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", - " #return Xtrain, Xtest, ytrain, ytest\n", " if n_examples > 0:\n", + " # Take first n_examples samples\n", " X = X[:n_examples, :]\n", " y = y[:n_examples, :]\n", " else:\n", + " # Take all the positive samples with a number of random negatives\n", " if n_examples < 0:\n", " Xt = X[(y == 1).ravel()]\n", " yt = y[(y == 1).ravel()]\n", @@ -58,59 +61,44 @@ " X = np.append(Xt, X[indices], axis=0)\n", " y = np.append(yt, y[indices], axis=0)\n", " print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n", - " print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", - " print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n", - " return X, y\n", + " print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n", + " print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n", + " Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n", + " return Xtrain, Xtest, ytrain, ytest\n", "\n", - "random_state = 1\n", + "# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n", + "# data = load_creditcard(5000) # Take the first 5000 samples\n", + "data = load_creditcard() # Take all the samples\n", "\n", - "\n", - "# Datasets\n", - "\n", - "#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n", - "# n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n", - "# class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n", - "\n", - "#X, y = load_wine(return_X_y=True)\n", - "#X, y = load_iris(return_X_y=True)\n", - "#y[y==2]=0\n", - "\n", - "X, y = load_creditcard()" + "Xtrain = data[0]\n", + "Xtest = data[1]\n", + "ytrain = data[2]\n", + "ytest = data[3]" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nroot - Down - Up - Down - Down, - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nroot - Down - Up - Down - Up, - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nroot - Down - Up - Up - Down, - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up - Up - Up, - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nroot - Up - Down - Down - Down, - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up - Down - Down - Up, - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Up, - Leaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n41.5053 secs\n" - } - ], + "outputs": [], "source": [ "t = time.time()\n", - "clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n", - "clf.fit(X, y)\n", + "clf = Stree(C=.01, random_state=random_state)\n", + "clf.fit(Xtrain, ytrain)\n", "print(clf)\n", + "print()\n", "print(f\"{time.time() - t:.4f} secs\")" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "Accuracy: 0.999512\n0.2389 secs\n" - } - ], + "outputs": [], "source": [ "t = time.time()\n", - "clf.score(X, y)\n", + "print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n", + "print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n", "print(f\"{time.time() - t:.4f} secs\")" ] }, @@ -125,53 +113,37 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "0.9991397683343457\n13.6326 secs\n" - } - ], + "outputs": [], "source": [ "t = time.time()\n", "clf2 = LinearSVC(C=.01, random_state=random_state)\n", - "clf2.fit(X, y)\n", - "print(clf2.score(X, y))\n", + "clf2.fit(Xtrain, ytrain)\n", + "print(clf2.score(Xtest, ytest))\n", "print(f\"{time.time() - t:.4f} secs\")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": "1.0\n18.8308 secs\n" - } - ], + "outputs": [], "source": [ "t = time.time()\n", "clf3 = DecisionTreeClassifier(random_state=random_state)\n", - "clf3.fit(X, y)\n", - "print(clf3.score(X, y))\n", + "clf3.fit(Xtrain, ytrain)\n", + "print(clf3.score(Xtest, ytest))\n", "print(f\"{time.time() - t:.4f} secs\")" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "from sklearn.utils.estimator_checks import check_estimator\n", - "clf = Stree()\n", - "check_estimator(clf)" - ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -182,14 +154,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39", - "display_name": "Python 3.7.6 64-bit ('general': venv)" + "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +}