mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 15:36:00 +00:00
#2 update benchmark notebook
This commit is contained in:
@@ -17,7 +17,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -29,7 +29,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -45,7 +45,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -64,15 +64,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-06-14 23:45:42\n"
|
||||
]
|
||||
"name": "stdout",
|
||||
"text": "2020-06-15 10:17:17\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -88,7 +86,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -100,16 +98,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fraud: 0.173% 492\n",
|
||||
"Valid: 99.827% 284,315\n"
|
||||
]
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -119,7 +114,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -131,16 +126,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"X shape: (284807, 29)\n",
|
||||
"y shape: (284807,)\n"
|
||||
]
|
||||
"name": "stdout",
|
||||
"text": "X shape: (284807, 29)\ny shape: (284807,)\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -159,7 +151,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -170,7 +162,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -180,7 +172,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -190,7 +182,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -200,7 +192,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -210,7 +202,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -227,7 +219,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@@ -252,179 +244,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"************************** Linear Tree **********************\n",
|
||||
"Train Model Linear Tree took: 13.52 seconds\n",
|
||||
"=========== Linear Tree - Train 199,364 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 1.000000 1.000000 1.000000 199020\n",
|
||||
" 1 1.000000 1.000000 1.000000 344\n",
|
||||
"\n",
|
||||
" accuracy 1.000000 199364\n",
|
||||
" macro avg 1.000000 1.000000 1.000000 199364\n",
|
||||
"weighted avg 1.000000 1.000000 1.000000 199364\n",
|
||||
"\n",
|
||||
"=========== Linear Tree - Test 85,443 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999578 0.999613 0.999596 85295\n",
|
||||
" 1 0.772414 0.756757 0.764505 148\n",
|
||||
"\n",
|
||||
" accuracy 0.999192 85443\n",
|
||||
" macro avg 0.885996 0.878185 0.882050 85443\n",
|
||||
"weighted avg 0.999184 0.999192 0.999188 85443\n",
|
||||
"\n",
|
||||
"Confusion Matrix in Train\n",
|
||||
"[[199020 0]\n",
|
||||
" [ 0 344]]\n",
|
||||
"Confusion Matrix in Test\n",
|
||||
"[[85262 33]\n",
|
||||
" [ 36 112]]\n",
|
||||
"************************** Random Forest **********************\n",
|
||||
"Train Model Random Forest took: 152.5 seconds\n",
|
||||
"=========== Random Forest - Train 199,364 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 1.000000 1.000000 1.000000 199020\n",
|
||||
" 1 1.000000 1.000000 1.000000 344\n",
|
||||
"\n",
|
||||
" accuracy 1.000000 199364\n",
|
||||
" macro avg 1.000000 1.000000 1.000000 199364\n",
|
||||
"weighted avg 1.000000 1.000000 1.000000 199364\n",
|
||||
"\n",
|
||||
"=========== Random Forest - Test 85,443 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999660 0.999965 0.999812 85295\n",
|
||||
" 1 0.975410 0.804054 0.881481 148\n",
|
||||
"\n",
|
||||
" accuracy 0.999625 85443\n",
|
||||
" macro avg 0.987535 0.902009 0.940647 85443\n",
|
||||
"weighted avg 0.999618 0.999625 0.999607 85443\n",
|
||||
"\n",
|
||||
"Confusion Matrix in Train\n",
|
||||
"[[199020 0]\n",
|
||||
" [ 0 344]]\n",
|
||||
"Confusion Matrix in Test\n",
|
||||
"[[85292 3]\n",
|
||||
" [ 29 119]]\n",
|
||||
"************************** Stree (SVM Tree) **********************\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
|
||||
" \"the number of iterations.\", ConvergenceWarning)\n",
|
||||
"/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
|
||||
" \"the number of iterations.\", ConvergenceWarning)\n",
|
||||
"/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
|
||||
" \"the number of iterations.\", ConvergenceWarning)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train Model Stree (SVM Tree) took: 32.55 seconds\n",
|
||||
"=========== Stree (SVM Tree) - Train 199,364 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999623 0.999864 0.999744 199020\n",
|
||||
" 1 0.908784 0.781977 0.840625 344\n",
|
||||
"\n",
|
||||
" accuracy 0.999488 199364\n",
|
||||
" macro avg 0.954204 0.890921 0.920184 199364\n",
|
||||
"weighted avg 0.999467 0.999488 0.999469 199364\n",
|
||||
"\n",
|
||||
"=========== Stree (SVM Tree) - Test 85,443 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999637 0.999918 0.999777 85295\n",
|
||||
" 1 0.943548 0.790541 0.860294 148\n",
|
||||
"\n",
|
||||
" accuracy 0.999555 85443\n",
|
||||
" macro avg 0.971593 0.895229 0.930036 85443\n",
|
||||
"weighted avg 0.999540 0.999555 0.999536 85443\n",
|
||||
"\n",
|
||||
"Confusion Matrix in Train\n",
|
||||
"[[198993 27]\n",
|
||||
" [ 75 269]]\n",
|
||||
"Confusion Matrix in Test\n",
|
||||
"[[85288 7]\n",
|
||||
" [ 31 117]]\n",
|
||||
"************************** AdaBoost model **********************\n",
|
||||
"Train Model AdaBoost model took: 47.34 seconds\n",
|
||||
"=========== AdaBoost model - Train 199,364 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999392 0.999678 0.999535 199020\n",
|
||||
" 1 0.777003 0.648256 0.706815 344\n",
|
||||
"\n",
|
||||
" accuracy 0.999072 199364\n",
|
||||
" macro avg 0.888198 0.823967 0.853175 199364\n",
|
||||
"weighted avg 0.999008 0.999072 0.999030 199364\n",
|
||||
"\n",
|
||||
"=========== AdaBoost model - Test 85,443 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999484 0.999707 0.999596 85295\n",
|
||||
" 1 0.806202 0.702703 0.750903 148\n",
|
||||
"\n",
|
||||
" accuracy 0.999192 85443\n",
|
||||
" macro avg 0.902843 0.851205 0.875249 85443\n",
|
||||
"weighted avg 0.999149 0.999192 0.999165 85443\n",
|
||||
"\n",
|
||||
"Confusion Matrix in Train\n",
|
||||
"[[198956 64]\n",
|
||||
" [ 121 223]]\n",
|
||||
"Confusion Matrix in Test\n",
|
||||
"[[85270 25]\n",
|
||||
" [ 44 104]]\n",
|
||||
"************************** Gradient Boost. **********************\n",
|
||||
"Train Model Gradient Boost. took: 244.1 seconds\n",
|
||||
"=========== Gradient Boost. - Train 199,364 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.999096 0.999854 0.999475 199020\n",
|
||||
" 1 0.849741 0.476744 0.610801 344\n",
|
||||
"\n",
|
||||
" accuracy 0.998952 199364\n",
|
||||
" macro avg 0.924419 0.738299 0.805138 199364\n",
|
||||
"weighted avg 0.998839 0.998952 0.998804 199364\n",
|
||||
"\n",
|
||||
"=========== Gradient Boost. - Test 85,443 samples =============\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" 0 0.998981 0.999730 0.999355 85295\n",
|
||||
" 1 0.726190 0.412162 0.525862 148\n",
|
||||
"\n",
|
||||
" accuracy 0.998713 85443\n",
|
||||
" macro avg 0.862586 0.705946 0.762609 85443\n",
|
||||
"weighted avg 0.998508 0.998713 0.998535 85443\n",
|
||||
"\n",
|
||||
"Confusion Matrix in Train\n",
|
||||
"[[198991 29]\n",
|
||||
" [ 180 164]]\n",
|
||||
"Confusion Matrix in Test\n",
|
||||
"[[85272 23]\n",
|
||||
" [ 87 61]]\n"
|
||||
]
|
||||
"text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Train & Test models\n",
|
||||
"models = {\n",
|
||||
" 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n",
|
||||
" 'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n",
|
||||
" 'AdaBoost model': adaboost\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"best_f1 = 0\n",
|
||||
@@ -440,22 +273,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"**************************************************************************************************************\n",
|
||||
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
|
||||
"**************************************************************************************************************\n",
|
||||
"Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n",
|
||||
"Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
|
||||
"Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n",
|
||||
"Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n",
|
||||
"Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n"
|
||||
]
|
||||
"name": "stdout",
|
||||
"text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 47.21 seconds\t f1: 0.7509\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -466,6 +290,20 @@
|
||||
" print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**************************************************************************************************************\n",
|
||||
"*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n",
|
||||
"**************************************************************************************************************\n",
|
||||
"Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n",
|
||||
"Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n",
|
||||
"Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n",
|
||||
"Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n",
|
||||
"Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@@ -487,9 +325,9 @@
|
||||
"metadata": {
|
||||
"hide_input": false,
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
@@ -501,7 +339,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.6"
|
||||
"version": "3.7.6-final"
|
||||
},
|
||||
"toc": {
|
||||
"base_numbering": 1,
|
||||
@@ -555,4 +393,4 @@
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
@@ -193,8 +193,8 @@ class Splitter:
|
||||
def information_gain(
|
||||
self, labels_up: np.array, labels_dn: np.array
|
||||
) -> float:
|
||||
card_up = labels_up.shape[0]
|
||||
card_dn = labels_dn.shape[0]
|
||||
card_up = labels_up.shape[0] if labels_up is not None else 0
|
||||
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
|
||||
samples = card_up + card_dn
|
||||
up = card_up / samples * self.criterion_function(labels_up)
|
||||
dn = card_dn / samples * self.criterion_function(labels_dn)
|
||||
|
Reference in New Issue
Block a user