mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 15:36:00 +00:00
Fix issues in notebooks
This commit is contained in:
19
test.ipynb
19
test.ipynb
@@ -210,9 +210,9 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"```\n",
|
||||
"************************************************************************************************************************************\n",
|
||||
"******************************************************************************************************************\n",
|
||||
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 218.966 seconds with 0.7 samples in train dataset\n",
|
||||
"************************************************************************************************************************************\n",
|
||||
"******************************************************************************************************************\n",
|
||||
"Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n",
|
||||
"Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
|
||||
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n",
|
||||
@@ -221,17 +221,6 @@
|
||||
"Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"************************************************************************************************************************************\n",
|
||||
"*The best f1 model is Random Forest, with a f1 score: 0.8791 in 1513.23 seconds with 0.7 samples in train dataset\n",
|
||||
"************************************************************************************************************************************\n",
|
||||
"Model: Linear Tree\t Time: 25.18 seconds\t f1: 0.7645\n",
|
||||
"Model: Random Forest\t Time: 1513.23 seconds\t f1: 0.8791"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -251,7 +240,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6-final"
|
||||
"version": "3.7.6"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
@@ -305,4 +294,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
139
test2.ipynb
139
test2.ipynb
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -17,7 +17,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -29,28 +29,31 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "*Original Fraud: 0.173% 492\n*Original Valid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807, 1)\n-Generated Fraud: 0.173% 492\n-Generated Valid: 99.827% 284315\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from trees.Stree import Stree\n",
|
||||
"\n",
|
||||
"random_state=1\n",
|
||||
"\n",
|
||||
"def load_creditcard(n_examples=0):\n",
|
||||
" import pandas as pd\n",
|
||||
" import numpy as np\n",
|
||||
" import random\n",
|
||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||
" print(\"*Original Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||
" print(\"*Original Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||
" y = np.expand_dims(df.Class.values, axis=1)\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||
" y = df.Class\n",
|
||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||
" #Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||
" #return Xtrain, Xtest, ytrain, ytest\n",
|
||||
" if n_examples > 0:\n",
|
||||
" # Take first n_examples samples\n",
|
||||
" X = X[:n_examples, :]\n",
|
||||
" y = y[:n_examples, :]\n",
|
||||
" else:\n",
|
||||
" # Take all the positive samples with a number of random negatives\n",
|
||||
" if n_examples < 0:\n",
|
||||
" Xt = X[(y == 1).ravel()]\n",
|
||||
" yt = y[(y == 1).ravel()]\n",
|
||||
@@ -58,59 +61,44 @@
|
||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||
" y = np.append(yt, y[indices], axis=0)\n",
|
||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||
" print(\"-Generated Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"-Generated Valid: {0:.3f}% {1}\".format(len(y[y == 0])*100/X.shape[0], len(y[y == 0])))\n",
|
||||
" return X, y\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||
"\n",
|
||||
"random_state = 1\n",
|
||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||
"data = load_creditcard() # Take all the samples\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Datasets\n",
|
||||
"\n",
|
||||
"#X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, \n",
|
||||
"# n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2,\n",
|
||||
"# class_sep=1.5, flip_y=0,weights=[0.5,0.5], random_state=random_state)\n",
|
||||
"\n",
|
||||
"#X, y = load_wine(return_X_y=True)\n",
|
||||
"#X, y = load_iris(return_X_y=True)\n",
|
||||
"#y[y==2]=0\n",
|
||||
"\n",
|
||||
"X, y = load_creditcard()"
|
||||
"Xtrain = data[0]\n",
|
||||
"Xtest = data[1]\n",
|
||||
"ytrain = data[2]\n",
|
||||
"ytest = data[3]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=0 belief=0.999638 counts=(array([0, 1]), array([284242, 103]))\nroot - Down - Up\nroot - Down - Up - Down\nroot - Down - Up - Down - Down, <cgaf> - Leaf class=0 belief=0.857143 counts=(array([0, 1]), array([18, 3]))\nroot - Down - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up\nroot - Down - Up - Up - Down, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Down - Up - Up - Up, <cgaf> - Leaf class=1 belief=0.862069 counts=(array([0, 1]), array([ 16, 100]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down\nroot - Up - Down - Down - Down, <cgaf> - Leaf class=0 belief=0.920000 counts=(array([0, 1]), array([23, 2]))\nroot - Up - Down - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Up, <cgaf> - Leaf class=1 belief=0.948980 counts=(array([0, 1]), array([ 15, 279]))\n\n41.5053 secs\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t = time.time()\n",
|
||||
"clf = Stree(C=.01, random_state=random_state, use_predictions=False)\n",
|
||||
"clf.fit(X, y)\n",
|
||||
"clf = Stree(C=.01, random_state=random_state)\n",
|
||||
"clf.fit(Xtrain, ytrain)\n",
|
||||
"print(clf)\n",
|
||||
"print()\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Accuracy: 0.999512\n0.2389 secs\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t = time.time()\n",
|
||||
"clf.score(X, y)\n",
|
||||
"print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
|
||||
"print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
@@ -125,53 +113,37 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "0.9991397683343457\n13.6326 secs\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t = time.time()\n",
|
||||
"clf2 = LinearSVC(C=.01, random_state=random_state)\n",
|
||||
"clf2.fit(X, y)\n",
|
||||
"print(clf2.score(X, y))\n",
|
||||
"clf2.fit(Xtrain, ytrain)\n",
|
||||
"print(clf2.score(Xtest, ytest))\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1.0\n18.8308 secs\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"t = time.time()\n",
|
||||
"clf3 = DecisionTreeClassifier(random_state=random_state)\n",
|
||||
"clf3.fit(X, y)\n",
|
||||
"print(clf3.score(X, y))\n",
|
||||
"clf3.fit(Xtrain, ytrain)\n",
|
||||
"print(clf3.score(Xtest, ytest))\n",
|
||||
"print(f\"{time.time() - t:.4f} secs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||
"clf = Stree()\n",
|
||||
"check_estimator(clf)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@@ -182,14 +154,9 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6-final"
|
||||
},
|
||||
"orig_nbformat": 2,
|
||||
"kernelspec": {
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)"
|
||||
"version": "3.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user