mirror of https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00

Compare commits: add_subspa ... enhance-pa
12 Commits

SHA1
ddc0fe15b8
c593b55bec
044918f834
f5706c3159
be552fdd6c
5e3a8e3ec5
554ec03c32
4b7e4a3fb0
76723993fd
ecd0b86f4d
3e52a4746c
a20e45e8e7
@@ -10,5 +10,4 @@ exclude_lines =
    if __name__ == .__main__.:
ignore_errors = True
omit =
    stree/tests/*
    stree/__init__.py
.gitignore (vendored): 4 changes

@@ -130,4 +130,6 @@ dmypy.json

.idea
.vscode
.pre-commit-config.yaml
.pre-commit-config.yaml

**.csv
@@ -17,7 +17,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -39,13 +39,13 @@
"from sklearn.model_selection import train_test_split\n",
"from sklearn import tree\n",
"from sklearn.metrics import classification_report, confusion_matrix, f1_score\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier\n",
"from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier\n",
"from stree import Stree"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -64,13 +64,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"execution_count": 4,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "2020-06-15 10:17:17\n"
"text": [
"2020-11-01 11:14:06\n"
]
}
],
"source": [
@@ -86,7 +90,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -98,13 +102,17 @@
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"execution_count": 6,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
"text": [
"Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
]
}
],
"source": [
@@ -114,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -126,13 +134,17 @@
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "X shape: (284807, 29)\ny shape: (284807,)\n"
"text": [
"X shape: (284807, 29)\ny shape: (284807,)\n"
]
}
],
"source": [
@@ -151,7 +163,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -162,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -172,7 +184,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -182,17 +194,17 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Stree\n",
"stree = Stree(random_state=random_state, C=.01)"
"stree = Stree(random_state=random_state, C=.01, max_iter=1e3)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -202,12 +214,12 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Gradient Boosting\n",
"gradient = GradientBoostingClassifier(random_state=random_state)"
"# Bagging\n",
"bagging = BaggingClassifier(random_state=random_state)"
]
},
{
@@ -219,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -244,20 +256,163 @@
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n"
"text": [
"************************** Linear Tree **********************\n",
"Train Model Linear Tree took: 15.14 seconds\n",
"=========== Linear Tree - Train 199,364 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 1.000000 1.000000 1.000000 199020\n",
" 1 1.000000 1.000000 1.000000 344\n",
"\n",
" accuracy 1.000000 199364\n",
" macro avg 1.000000 1.000000 1.000000 199364\n",
"weighted avg 1.000000 1.000000 1.000000 199364\n",
"\n",
"=========== Linear Tree - Test 85,443 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999578 0.999613 0.999596 85295\n",
" 1 0.772414 0.756757 0.764505 148\n",
"\n",
" accuracy 0.999192 85443\n",
" macro avg 0.885996 0.878185 0.882050 85443\n",
"weighted avg 0.999184 0.999192 0.999188 85443\n",
"\n",
"Confusion Matrix in Train\n",
"[[199020 0]\n",
" [ 0 344]]\n",
"Confusion Matrix in Test\n",
"[[85262 33]\n",
" [ 36 112]]\n",
"************************** Random Forest **********************\n",
"Train Model Random Forest took: 181.1 seconds\n",
"=========== Random Forest - Train 199,364 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 1.000000 1.000000 1.000000 199020\n",
" 1 1.000000 1.000000 1.000000 344\n",
"\n",
" accuracy 1.000000 199364\n",
" macro avg 1.000000 1.000000 1.000000 199364\n",
"weighted avg 1.000000 1.000000 1.000000 199364\n",
"\n",
"=========== Random Forest - Test 85,443 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999660 0.999965 0.999812 85295\n",
" 1 0.975410 0.804054 0.881481 148\n",
"\n",
" accuracy 0.999625 85443\n",
" macro avg 0.987535 0.902009 0.940647 85443\n",
"weighted avg 0.999618 0.999625 0.999607 85443\n",
"\n",
"Confusion Matrix in Train\n",
"[[199020 0]\n",
" [ 0 344]]\n",
"Confusion Matrix in Test\n",
"[[85292 3]\n",
" [ 29 119]]\n",
"************************** Stree (SVM Tree) **********************\n",
"Train Model Stree (SVM Tree) took: 36.6 seconds\n",
"=========== Stree (SVM Tree) - Train 199,364 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999623 0.999864 0.999744 199020\n",
" 1 0.908784 0.781977 0.840625 344\n",
"\n",
" accuracy 0.999488 199364\n",
" macro avg 0.954204 0.890921 0.920184 199364\n",
"weighted avg 0.999467 0.999488 0.999469 199364\n",
"\n",
"=========== Stree (SVM Tree) - Test 85,443 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999637 0.999918 0.999777 85295\n",
" 1 0.943548 0.790541 0.860294 148\n",
"\n",
" accuracy 0.999555 85443\n",
" macro avg 0.971593 0.895229 0.930036 85443\n",
"weighted avg 0.999540 0.999555 0.999536 85443\n",
"\n",
"Confusion Matrix in Train\n",
"[[198993 27]\n",
" [ 75 269]]\n",
"Confusion Matrix in Test\n",
"[[85288 7]\n",
" [ 31 117]]\n",
"************************** AdaBoost model **********************\n",
"Train Model AdaBoost model took: 46.14 seconds\n",
"=========== AdaBoost model - Train 199,364 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999392 0.999678 0.999535 199020\n",
" 1 0.777003 0.648256 0.706815 344\n",
"\n",
" accuracy 0.999072 199364\n",
" macro avg 0.888198 0.823967 0.853175 199364\n",
"weighted avg 0.999008 0.999072 0.999030 199364\n",
"\n",
"=========== AdaBoost model - Test 85,443 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999484 0.999707 0.999596 85295\n",
" 1 0.806202 0.702703 0.750903 148\n",
"\n",
" accuracy 0.999192 85443\n",
" macro avg 0.902843 0.851205 0.875249 85443\n",
"weighted avg 0.999149 0.999192 0.999165 85443\n",
"\n",
"Confusion Matrix in Train\n",
"[[198956 64]\n",
" [ 121 223]]\n",
"Confusion Matrix in Test\n",
"[[85270 25]\n",
" [ 44 104]]\n",
"************************** Bagging model **********************\n",
"Train Model Bagging model took: 77.73 seconds\n",
"=========== Bagging model - Train 199,364 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999864 1.000000 0.999932 199020\n",
" 1 1.000000 0.921512 0.959153 344\n",
"\n",
" accuracy 0.999865 199364\n",
" macro avg 0.999932 0.960756 0.979542 199364\n",
"weighted avg 0.999865 0.999865 0.999862 199364\n",
"\n",
"=========== Bagging model - Test 85,443 samples =============\n",
" precision recall f1-score support\n",
"\n",
" 0 0.999637 0.999953 0.999795 85295\n",
" 1 0.966942 0.790541 0.869888 148\n",
"\n",
" accuracy 0.999590 85443\n",
" macro avg 0.983289 0.895247 0.934842 85443\n",
"weighted avg 0.999580 0.999590 0.999570 85443\n",
"\n",
"Confusion Matrix in Train\n",
"[[199020 0]\n",
" [ 27 317]]\n",
"Confusion Matrix in Test\n",
"[[85291 4]\n",
" [ 31 117]]\n"
]
}
],
"source": [
"# Train & Test models\n",
"models = {\n",
"    'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n",
"    'AdaBoost model': adaboost\n",
"    'AdaBoost model': adaboost, 'Bagging model': bagging\n",
"}\n",
"\n",
"best_f1 = 0\n",
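The hunk above extends the "# Train & Test models" dict with the new Bagging model. For readers skimming the diff, this is a minimal, self-contained sketch of the benchmark loop that cell implements (synthetic data stands in for the credit-card fraud set; only the dict keys and the printed format come from the diff):

```python
import time
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hypothetical stand-in data; the notebook uses the credit-card fraud dataset.
X, y = make_classification(n_samples=2000, weights=[0.95], random_state=2020)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, random_state=2020)

models = {
    "Random Forest": RandomForestClassifier(random_state=2020),
    "Bagging model": BaggingClassifier(random_state=2020),
}

best_f1, best_name = 0.0, None
for name, model in models.items():
    now = time.time()
    model.fit(Xtrain, ytrain)  # "Train Model <name> took: ..." in the output above
    spent = time.time() - now
    f1 = f1_score(ytest, model.predict(Xtest))
    print(f"Model: {name}\t Time: {spent:.2f} seconds\t f1: {f1:.4f}")
    if f1 > best_f1:  # track the winner, as the summary cell reports
        best_f1, best_name = f1, name
print(f"The best f1 model is {best_name}, with a f1 score: {best_f1:.4f}")
```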
@@ -273,13 +428,17 @@
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"execution_count": 17,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 47.21 seconds\t f1: 0.7509\n"
"text": [
"**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 181.07 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 15.14 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 181.07 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 36.60 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 46.14 seconds\t f1: 0.7509\nModel: Bagging model\t Time: 77.73 seconds\t f1: 0.8699\n"
]
}
],
"source": [
@@ -314,20 +473,53 @@
"******************************************************************************************************************\n",
"Model: Linear Tree Time: 23.05 seconds\t f1: 0.7645\n",
"Model: Random Forest\t Time: 218.97 seconds\t f1: 0.8815\n",
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8467\n",
"Model: Stree (SVM Tree)\t Time: 49.45 seconds\t f1: 0.8603\n",
"Model: AdaBoost model\t Time: 73.83 seconds\t f1: 0.7509\n",
"Model: Gradient Boost.\t Time: 388.69 seconds\t f1: 0.5259\n",
"Model: Neural Network\t Time: 25.47 seconds\t f1: 0.8328\n",
"Model: Bagging model\t Time: 77.93 seconds\t f1: 0.8699\n",
"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'C': 0.01,\n",
" 'criterion': 'entropy',\n",
" 'degree': 3,\n",
" 'gamma': 'scale',\n",
" 'kernel': 'linear',\n",
" 'max_depth': None,\n",
" 'max_features': None,\n",
" 'max_iter': 1000.0,\n",
" 'min_samples_split': 0,\n",
" 'random_state': 2020,\n",
" 'split_criteria': 'impurity',\n",
" 'splitter': 'random',\n",
" 'tol': 0.0001}"
]
},
"metadata": {},
"execution_count": 18
}
],
"source": [
"stree.get_params()"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"display_name": "Python 3.8.4 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
"name": "python38464bitgeneralvenv77203c0a6afd4428bd66253ef62753dc"
},
"language_info": {
"codemirror_mode": {
@@ -339,7 +531,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
"version": "3.8.4-final"
},
"toc": {
"base_numbering": 1,

@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test AdaBoost with different configurations"
"# Test Stree with AdaBoost and Bagging with different configurations"
]
},
{
@@ -34,11 +34,8 @@
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree"
]
},
@@ -57,12 +54,20 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
"text": [
"Fraud: 0.173% 492\n",
"Valid: 99.827% 284315\n",
"X.shape (100492, 28) y.shape (100492,)\n",
"Fraud: 0.652% 655\n",
"Valid: 99.348% 99837\n"
]
}
],
"source": [
@@ -117,23 +122,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## STree alone on the whole dataset and linear kernel"
"## STree alone with 100.000 samples and linear kernel"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
"text": [
"Score Train: 0.9985073353804162\nScore Test: 0.9983746848878864\nTook 35.80 seconds\n"
]
}
],
"source": [
"now = time.time()\n",
"clf = Stree(max_depth=3, random_state=random_state)\n",
"clf = Stree(max_depth=3, random_state=random_state, max_iter=1e3)\n",
"clf.fit(Xtrain, ytrain)\n",
"print(\"Score Train: \", clf.score(Xtrain, ytrain))\n",
"print(\"Score Test: \", clf.score(Xtest, ytest))\n",
@@ -144,7 +153,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Different kernels with different configuations"
"## Adaboost"
]
},
{
@@ -161,18 +170,24 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
"text": [
"Kernel: linear\tTime: 49.66 seconds\tScore Train: 0.9983225\tScore Test: 0.9983083\n",
"Kernel: rbf\tTime: 12.73 seconds\tScore Train: 0.9934891\tScore Test: 0.9934656\n",
"Kernel: poly\tTime: 76.24 seconds\tScore Train: 0.9972706\tScore Test: 0.9969152\n"
]
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
"    now = time.time()\n",
"    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
"    clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
"    clf.fit(Xtrain, ytrain)\n",
"    score_train = clf.score(Xtrain, ytrain)\n",
"    score_test = clf.score(Xtest, ytest)\n",
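The hunk above changes the AdaBoost cell in two ways: the Stree base estimator gains an explicit max_iter=1e3, and the ensemble is pinned to algorithm="SAMME", which only needs predict() from the base estimator. A hedged, self-contained sketch of the same wiring (iris stands in for the credit-card data; base_estimator is the keyword of scikit-learn releases contemporary with this diff, later renamed estimator):

```python
import time
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from stree import Stree  # assumes the STree package from this repo is installed

X, y = load_iris(return_X_y=True)  # small stand-in for the credit-card data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, random_state=1)

now = time.time()
clf = AdaBoostClassifier(
    base_estimator=Stree(C=7, kernel="linear", max_depth=3, random_state=1, max_iter=1e3),
    algorithm="SAMME",  # SAMME only requires predict(), not class probabilities
    n_estimators=10,
    random_state=1,
)
clf.fit(Xtrain, ytrain)
print(f"Time: {time.time() - now:.2f} seconds\tScore Test: {clf.score(Xtest, ytest):.7f}")
```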
@@ -183,24 +198,41 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
"## Bagging"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 10\n",
"C = 7\n",
"max_depth = 3"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
"text": [
"Kernel: linear\tTime: 231.51 seconds\tScore Train: 0.9984931\tScore Test: 0.9983083\n",
"Kernel: rbf\tTime: 114.77 seconds\tScore Train: 0.9992323\tScore Test: 0.9983083\n",
"Kernel: poly\tTime: 67.87 seconds\tScore Train: 0.9993319\tScore Test: 0.9985074\n"
]
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
"    now = time.time()\n",
"    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
"    clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state, max_iter=1e3), n_estimators=n_estimators, random_state=random_state)\n",
"    clf.fit(Xtrain, ytrain)\n",
"    score_train = clf.score(Xtrain, ytrain)\n",
"    score_test = clf.score(Xtest, ytest)\n",
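The replacement line above swaps AdaBoost for BaggingClassifier around the same Stree base estimators: each bag fits an independent Stree on a bootstrap sample and predictions are combined by voting. A minimal sketch of that pattern, under the same assumptions as the AdaBoost example:

```python
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from stree import Stree  # assumes the STree package is installed

X, y = load_iris(return_X_y=True)  # stand-in data
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, random_state=1)

# Same keyword caveat as above: newer scikit-learn renames base_estimator to estimator.
clf = BaggingClassifier(
    base_estimator=Stree(C=7, kernel="rbf", max_depth=3, random_state=1, max_iter=1e3),
    n_estimators=10,
    random_state=1,
)
clf.fit(Xtrain, ytrain)
print("Score Test:", clf.score(Xtest, ytest))
```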
@@ -219,12 +251,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
"version": "3.8.4-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
"name": "python38464bitgeneralf6de308d3831407c8bd68d4a5e328a38",
"display_name": "Python 3.8.4 64-bit ('general')"
}
},
"nbformat": 4,

@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test smple_weight, kernels, C, sklearn estimator"
"# Test sample_weight, kernels, C, sklearn estimator"
]
},
{
@@ -47,7 +47,9 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
@@ -59,12 +61,16 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
"text": [
"Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (5492, 28) y.shape (5492,)\nFraud: 9.141% 502\nValid: 90.859% 4990\n[0.09183143 0.09183143 0.09183143 0.09183143] [0.09041262 0.09041262 0.09041262 0.09041262]\n"
]
}
],
"source": [
@@ -94,22 +100,29 @@
"    print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
"    print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
"    print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
"    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
"    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state)\n",
"    return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"data = load_creditcard(-5000) # Take all true samples with up to 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"# data = load_creditcard(-1000) # Take 1000 samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]\n",
"_, data = np.unique(ytrain, return_counts=True)\n",
"wtrain = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
"_, data = np.unique(ytest, return_counts=True)\n",
"wtest = (data[1] / np.sum(data), data[0] / np.sum(data))\n",
"# Set weights inverse to its count class in dataset\n",
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
"weights[ytrain==1] = 1.99755\n",
"weights_test = np.ones(Xtest.shape[0],) * 1.00244\n",
"weights_test[ytest==1] = 1.99755 "
"weights = np.ones(Xtrain.shape[0],)\n",
"weights[ytrain==0] = wtrain[0]\n",
"weights[ytrain==1] = wtrain[1]\n",
"weights_test = np.ones(Xtest.shape[0],)\n",
"weights_test[ytest==0] = wtest[0]\n",
"weights_test[ytest==1] = wtest[1]\n",
"print(weights[:4], weights_test[:4])"
]
},
{
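The rewritten cell above replaces the hard-coded weights (1.00244 / 1.99755) with weights derived from the class counts via np.unique, so each sample is weighted by the relative frequency of the other class. This toy reproduction shows the arithmetic in isolation:

```python
import numpy as np

# Mirror of the weighting scheme in the hunk above: each sample gets the
# relative frequency of the *other* class, so the minority class weighs more.
ytrain = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])  # toy labels, 90% / 10%
_, counts = np.unique(ytrain, return_counts=True)
wtrain = (counts[1] / counts.sum(), counts[0] / counts.sum())

weights = np.ones(ytrain.shape[0])
weights[ytrain == 0] = wtrain[0]  # majority class -> small weight (0.1 here)
weights[ytrain == 1] = wtrain[1]  # minority class -> large weight (0.9 here)
print(weights)
```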
@@ -123,19 +136,26 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test smple_weights\n",
"## Test sample_weights\n",
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy of Train without weights 0.9808429118773946\nAccuracy of Train with weights 0.9904214559386973\nAccuracy of Tests without weights 0.9441964285714286\nAccuracy of Tests with weights 0.9375\n"
"text": [
"Accuracy of Train without weights 0.9851716961498439\n",
"Accuracy of Train with weights 0.986732570239334\n",
"Accuracy of Tests without weights 0.9866504854368932\n",
"Accuracy of Tests with weights 0.9781553398058253\n"
]
}
],
"source": [
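Per the markdown above, this cell compares accuracy with and without per-sample weights. A hedged sketch of that comparison, assuming Stree.fit accepts the standard scikit-learn sample_weight argument (which the estimator checks later in this notebook also exercise):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from stree import Stree  # assumes the STree package is installed

X, y = make_classification(n_samples=1000, weights=[0.9], random_state=1)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, random_state=1)

# Inverse-frequency weights, computed as in the earlier cell.
_, counts = np.unique(ytrain, return_counts=True)
weights = np.where(ytrain == 0, counts[1], counts[0]) / counts.sum()

clf = Stree(random_state=1)
print("Accuracy of Train without weights",
      clf.fit(Xtrain, ytrain).score(Xtrain, ytrain))
print("Accuracy of Train with weights   ",
      clf.fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))
```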
@@ -157,12 +177,18 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Time: 0.13s\tKernel: linear\tAccuracy_train: 0.9693486590038314\tAccuracy_test: 0.9598214285714286\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9913793103448276\tAccuracy_test: 0.9375\n"
|
||||
"text": [
|
||||
"Time: 26.03s\tKernel: linear\tAccuracy_train: 0.9851716961498439\tAccuracy_test: 0.9866504854368932\n",
|
||||
"Time: 0.54s\tKernel: rbf\tAccuracy_train: 0.9947970863683663\tAccuracy_test: 0.9878640776699029\n",
|
||||
"Time: 0.43s\tKernel: poly\tAccuracy_train: 0.9960978147762747\tAccuracy_test: 0.9854368932038835\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -187,15 +213,65 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"tags": [
|
||||
"outputPrepend"
|
||||
]
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9487\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0374\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.984076 impurity=0.0313 counts=(array([0, 1]), array([ 5, 309]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.947874 impurity=0.0988 counts=(array([0, 1]), array([691, 38]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9531\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0192\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.993506 impurity=0.0129 counts=(array([0, 1]), array([ 2, 306]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.944218 impurity=0.1053 counts=(array([0, 1]), array([694, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9643\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0189\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951989 impurity=0.0914 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0250\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([4]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951923 impurity=0.0915 counts=(array([0, 1]), array([693, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9665\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down 
- Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n\n**************************************************\n0.4375 secs\n"
|
||||
"text": [
|
||||
"************** C=0.001 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9828\n",
|
||||
"Classifier's accuracy (test) : 0.9848\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.981716 impurity=0.1317 counts=(array([0, 1]), array([3490, 65]))\n",
|
||||
"root - Up, <cgaf> - Leaf class=1 belief= 0.996540 impurity=0.0333 counts=(array([0, 1]), array([ 1, 288]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=0.01 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9834\n",
|
||||
"Classifier's accuracy (test) : 0.9854\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.982269 impurity=0.1285 counts=(array([0, 1]), array([3490, 63]))\n",
|
||||
"root - Up, <cgaf> - Leaf class=1 belief= 0.996564 impurity=0.0331 counts=(array([0, 1]), array([ 1, 290]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=1 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9847\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.983371 impurity=0.1221 counts=(array([0, 1]), array([3489, 59]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0584 counts=(array([0, 1]), array([ 2, 294]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([2]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([294]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=5 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9852\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"************** C=17 ****************************\n",
|
||||
"Classifier's accuracy (train): 0.9852\n",
|
||||
"Classifier's accuracy (test) : 0.9867\n",
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\n",
|
||||
"root - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\n",
|
||||
"root - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\n",
|
||||
"root - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\n",
|
||||
"root - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\n",
|
||||
"root - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\n",
|
||||
"root - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n",
|
||||
"\n",
|
||||
"**************************************************\n",
|
||||
"64.5792 secs\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -222,12 +298,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -239,12 +319,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.1205 counts=(array([0, 1]), array([3488, 58]))\nroot - Down - Down, <cgaf> - Leaf class=0 belief= 0.983921 impurity=0.1188 counts=(array([0, 1]), array([3488, 57]))\nroot - Down - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0812 counts=(array([0, 1]), array([ 3, 295]))\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up - Up, <pure> - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([295]))\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -263,12 +347,58 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12735b3b0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x1273514d0>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x1273513b0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12734acb0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12734add0>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12734aef0>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12734d050>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12734d170>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x1273564d0>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x1273564d0>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12734d320>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12734d290>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x1273515f0>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x127351290>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x127351710>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12735b290>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12734ab90>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x127351950>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12735b5f0>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x127356050>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x127356a70>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x127351a70>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x127356170>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x127356170>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x127356170>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12735f0e0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x1273449e0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x127356710>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x1273565f0>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12735bc20>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12735f200>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12734d830>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12734d9e0>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12734db00>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12734dc20>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12734dd40>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12735be60>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12735bf80>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12734d440>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12734d710>, 'Stree')\n41 functools.partial(<function 
check_fit_idempotent at 0x12735f3b0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12735f440>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x12735f4d0>, 'Stree')\n"
|
||||
"text": [
|
||||
"1 functools.partial(<function check_no_attributes_set_in_init at 0x125acaee0>, 'Stree')\n",
|
||||
"2 functools.partial(<function check_estimators_dtypes at 0x125ac7040>, 'Stree')\n",
|
||||
"3 functools.partial(<function check_fit_score_takes_y at 0x125ac2ee0>, 'Stree')\n",
|
||||
"4 functools.partial(<function check_sample_weights_pandas_series at 0x125ac0820>, 'Stree')\n",
|
||||
"5 functools.partial(<function check_sample_weights_not_an_array at 0x125ac0940>, 'Stree')\n",
|
||||
"6 functools.partial(<function check_sample_weights_list at 0x125ac0a60>, 'Stree')\n",
|
||||
"7 functools.partial(<function check_sample_weights_shape at 0x125ac0b80>, 'Stree')\n",
|
||||
"8 functools.partial(<function check_sample_weights_invariance at 0x125ac0ca0>, 'Stree')\n",
|
||||
"9 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree')\n",
|
||||
"10 functools.partial(<function check_estimators_fit_returns_self at 0x125aca040>, 'Stree', readonly_memmap=True)\n",
|
||||
"11 functools.partial(<function check_complex_data at 0x125ac0e50>, 'Stree')\n",
|
||||
"12 functools.partial(<function check_dtype_object at 0x125ac0dc0>, 'Stree')\n",
|
||||
"13 functools.partial(<function check_estimators_empty_data_messages at 0x125ac7160>, 'Stree')\n",
|
||||
"14 functools.partial(<function check_pipeline_consistency at 0x125ac2dc0>, 'Stree')\n",
|
||||
"15 functools.partial(<function check_estimators_nan_inf at 0x125ac7280>, 'Stree')\n",
|
||||
"16 functools.partial(<function check_estimators_overwrite_params at 0x125acadc0>, 'Stree')\n",
|
||||
"17 functools.partial(<function check_estimator_sparse_data at 0x125ac0700>, 'Stree')\n",
|
||||
"18 functools.partial(<function check_estimators_pickle at 0x125ac74c0>, 'Stree')\n",
|
||||
"19 functools.partial(<function check_classifier_data_not_an_array at 0x125acd160>, 'Stree')\n",
|
||||
"20 functools.partial(<function check_classifiers_one_label at 0x125ac7b80>, 'Stree')\n",
|
||||
"21 functools.partial(<function check_classifiers_classes at 0x125aca5e0>, 'Stree')\n",
|
||||
"22 functools.partial(<function check_estimators_partial_fit_n_features at 0x125ac75e0>, 'Stree')\n",
|
||||
"23 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree')\n",
|
||||
"24 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True)\n",
|
||||
"25 functools.partial(<function check_classifiers_train at 0x125ac7ca0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n",
|
||||
"26 functools.partial(<function check_classifiers_regression_target at 0x125acdc10>, 'Stree')\n",
|
||||
"27 functools.partial(<function check_supervised_y_no_nan at 0x125aab790>, 'Stree')\n",
|
||||
"28 functools.partial(<function check_supervised_y_2d at 0x125aca280>, 'Stree')\n",
|
||||
"29 functools.partial(<function check_estimators_unfitted at 0x125aca160>, 'Stree')\n",
|
||||
"30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x125acd790>, 'Stree')\n",
|
||||
"31 functools.partial(<function check_decision_proba_consistency at 0x125acdd30>, 'Stree')\n",
|
||||
"32 functools.partial(<function check_fit2d_predict1d at 0x125ac23a0>, 'Stree')\n",
|
||||
"33 functools.partial(<function check_methods_subset_invariance at 0x125ac2550>, 'Stree')\n",
|
||||
"34 functools.partial(<function check_fit2d_1sample at 0x125ac2670>, 'Stree')\n",
|
||||
"35 functools.partial(<function check_fit2d_1feature at 0x125ac2790>, 'Stree')\n",
|
||||
"36 functools.partial(<function check_fit1d at 0x125ac28b0>, 'Stree')\n",
|
||||
"37 functools.partial(<function check_get_params_invariance at 0x125acd9d0>, 'Stree')\n",
|
||||
"38 functools.partial(<function check_set_params at 0x125acdaf0>, 'Stree')\n",
|
||||
"39 functools.partial(<function check_dict_unchanged at 0x125ac0f70>, 'Stree')\n",
|
||||
"40 functools.partial(<function check_dont_overwrite_parameters at 0x125ac2280>, 'Stree')\n",
|
||||
"41 functools.partial(<function check_fit_idempotent at 0x125acdee0>, 'Stree')\n",
|
||||
"42 functools.partial(<function check_n_features_in at 0x125acdf70>, 'Stree')\n",
|
||||
"43 functools.partial(<function check_requires_y_none at 0x125ad1040>, 'Stree')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -301,12 +431,27 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "== Not Weighted ===\nSVC train score ..: 0.9578544061302682\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9508928571428571\nSTree test score .: 0.9553571428571429\n==== Weighted =====\nSVC train score ..: 0.9636015325670498\nSTree train score : 0.9626436781609196\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9447820728419238\n*STree test score : 0.9447820728419238\n"
|
||||
"text": [
|
||||
"== Not Weighted ===\n",
|
||||
"SVC train score ..: 0.9825702393340271\n",
|
||||
"STree train score : 0.9841311134235172\n",
|
||||
"SVC test score ...: 0.9830097087378641\n",
|
||||
"STree test score .: 0.9848300970873787\n",
|
||||
"==== Weighted =====\n",
|
||||
"SVC train score ..: 0.9786680541103018\n",
|
||||
"STree train score : 0.9802289281997919\n",
|
||||
"SVC test score ...: 0.9805825242718447\n",
|
||||
"STree test score .: 0.9817961165048543\n",
|
||||
"*SVC test score ..: 0.9439939825655582\n",
|
||||
"*STree test score : 0.9476832429673473\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -333,12 +478,16 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down, <cgaf> - Leaf class=1 belief= 0.978261 impurity=0.0425 counts=(array([0, 1]), array([ 7, 315]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.955679 impurity=0.0847 counts=(array([0, 1]), array([690, 32]))\n\n"
|
||||
"text": [
|
||||
"root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4426 counts=(array([0, 1]), array([3491, 353]))\nroot - Down, <cgaf> - Leaf class=0 belief= 0.990520 impurity=0.0773 counts=(array([0, 1]), array([3448, 33]))\nroot - Up, <cgaf> - Leaf class=1 belief= 0.881543 impurity=0.5249 counts=(array([0, 1]), array([ 43, 320]))\n\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -355,12 +504,50 @@
|
||||
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "****************************************\nmax_features None = 28\nTrain score : 0.9664750957854407\nTest score .: 0.9642857142857143\nTook 0.09 seconds\n****************************************\nmax_features auto = 5\nTrain score : 0.9511494252873564\nTest score .: 0.9441964285714286\nTook 0.37 seconds\n****************************************\nmax_features log2 = 4\nTrain score : 0.935823754789272\nTest score .: 0.9330357142857143\nTook 0.10 seconds\n****************************************\nmax_features 7 = 7\nTrain score : 0.9568965517241379\nTest score .: 0.9397321428571429\nTook 3.36 seconds\n****************************************\nmax_features 0.5 = 14\nTrain score : 0.960727969348659\nTest score .: 0.9486607142857143\nTook 112.42 seconds\n****************************************\nmax_features 0.1 = 2\nTrain score : 0.8793103448275862\nTest score .: 0.8839285714285714\nTook 0.06 seconds\n****************************************\nmax_features 0.7 = 19\nTrain score : 0.9655172413793104\nTest score .: 0.9553571428571429\nTook 10.59 seconds\n"
"text": [
"****************************************\n",
"max_features None = 28\n",
"Train score : 0.9846514047866806\n",
"Test score .: 0.9866504854368932\n",
"Took 10.18 seconds\n",
"****************************************\n",
"max_features auto = 5\n",
"Train score : 0.9836108220603538\n",
"Test score .: 0.9842233009708737\n",
"Took 5.22 seconds\n",
"****************************************\n",
"max_features log2 = 4\n",
"Train score : 0.9791883454734651\n",
"Test score .: 0.9793689320388349\n",
"Took 2.05 seconds\n",
"****************************************\n",
"max_features 7 = 7\n",
"Train score : 0.9737252861602498\n",
"Test score .: 0.9739077669902912\n",
"Took 2.86 seconds\n",
"****************************************\n",
"max_features 0.5 = 14\n",
"Train score : 0.981789802289282\n",
"Test score .: 0.9824029126213593\n",
"Took 48.35 seconds\n",
"****************************************\n",
"max_features 0.1 = 2\n",
"Train score : 0.9638397502601457\n",
"Test score .: 0.9648058252427184\n",
"Took 0.35 seconds\n",
"****************************************\n",
"max_features 0.7 = 19\n",
"Train score : 0.9841311134235172\n",
"Test score .: 0.9860436893203883\n",
"Took 20.89 seconds\n"
]
}
],
"source": [
@@ -374,13 +561,6 @@
" print(\"Test score .:\", clf.score(Xtest, ytest))\n",
|
||||
" print(f\"Took {time.time() - now:.2f} seconds\")"
|
||||
]
|
||||
},
|
||||
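The diff truncates that cell's source; a minimal sketch of the benchmarking loop that would produce the timing output above, assuming the Xtrain/Xtest/ytrain/ytest split and random_state from the earlier cells (hypothetical reconstruction, not the committed code):

import time
from stree import Stree

for max_features in [None, "auto", "log2", 7, 0.5, 0.1, 0.7]:
    now = time.time()
    print("*" * 40)
    # max_features_ is resolved by fit from the max_features argument
    clf = Stree(random_state=random_state, max_features=max_features)
    clf.fit(Xtrain, ytrain)
    print(f"max_features {max_features} = {clf.max_features_}")
    print("Train score :", clf.score(Xtrain, ytrain))
    print("Test score .:", clf.score(Xtest, ytest))
    print(f"Took {time.time() - now:.2f} seconds")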
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -399,7 +579,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
"version": "3.8.4-final"
}
},
"nbformat": 4,

@@ -66,7 +66,8 @@
"id": "z9Q-YUfBDZEq",
"colab_type": "code",
"colab": {},
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
"tags": []
},
"source": [
"random_state=1\n",
@@ -112,7 +113,9 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
"text": [
"Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
]
}
]
},
@@ -131,31 +134,68 @@
"colab": {}
},
"source": [
"parameters = {\n",
"parameters = [{\n",
" 'base_estimator': [Stree()],\n",
" 'n_estimators': [10, 25],\n",
" 'learning_rate': [.5, 1],\n",
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
" 'base_estimator__tol': [.1, 1e-02],\n",
" 'base_estimator__max_depth': [3, 5],\n",
" 'base_estimator__C': [1, 3],\n",
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
"}"
" 'base_estimator__max_depth': [3, 5, 7],\n",
" 'base_estimator__C': [1, 7, 55],\n",
" 'base_estimator__kernel': ['linear']\n",
"},\n",
"{\n",
" 'base_estimator': [Stree()],\n",
" 'n_estimators': [10, 25],\n",
" 'learning_rate': [.5, 1],\n",
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
" 'base_estimator__tol': [.1, 1e-02],\n",
" 'base_estimator__max_depth': [3, 5, 7],\n",
" 'base_estimator__C': [1, 7, 55],\n",
" 'base_estimator__degree': [3, 5, 7],\n",
" 'base_estimator__kernel': ['poly']\n",
"},\n",
"{\n",
" 'base_estimator': [Stree()],\n",
" 'n_estimators': [10, 25],\n",
" 'learning_rate': [.5, 1],\n",
" 'base_estimator__split_criteria': ['max_samples', 'impurity'],\n",
" 'base_estimator__tol': [.1, 1e-02],\n",
" 'base_estimator__max_depth': [3, 5, 7],\n",
" 'base_estimator__C': [1, 7, 55],\n",
" 'base_estimator__gamma': [.1, 1, 10],\n",
" 'base_estimator__kernel': ['rbf']\n",
"}]"
],
"execution_count": 9,
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
"text/plain": [
"{'C': 1.0,\n",
" 'criterion': 'entropy',\n",
" 'degree': 3,\n",
" 'gamma': 'scale',\n",
" 'kernel': 'linear',\n",
" 'max_depth': None,\n",
" 'max_features': None,\n",
" 'max_iter': 100000.0,\n",
" 'min_samples_split': 0,\n",
" 'random_state': None,\n",
" 'split_criteria': 'impurity',\n",
" 'splitter': 'random',\n",
" 'tol': 0.0001}"
]
},
"metadata": {},
"execution_count": 14
"execution_count": 6
}
],
"source": [
@@ -168,52 +208,214 @@
"id": "CrcB8o6EDZE5",
|
||||
"colab_type": "code",
|
||||
"colab": {},
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
|
||||
"outputId": "7703413a-d563-4289-a13b-532f38f82762",
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"random_state=2020\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
|
||||
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||
"grid.fit(Xtrain, ytrain)"
|
||||
],
|
||||
"execution_count": 11,
|
||||
"execution_count": 7,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
|
||||
"text": [
|
||||
"Fitting 5 folds for each of 1008 candidates, totalling 5040 fits\n",
|
||||
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 3.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 3.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 4.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 5.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 6.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 7.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 8.2s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 9.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 11.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 12.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.3s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.0s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 18.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 20.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 23.4s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 24.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 26.6s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 29.3s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 31.9s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 35.5s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 38.7s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 42.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 496 tasks | elapsed: 46.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 529 tasks | elapsed: 52.7s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 562 tasks | elapsed: 58.1s\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 597 tasks | elapsed: 1.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 632 tasks | elapsed: 1.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 669 tasks | elapsed: 1.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 706 tasks | elapsed: 1.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 745 tasks | elapsed: 1.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 784 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 825 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 866 tasks | elapsed: 1.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 909 tasks | elapsed: 1.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 952 tasks | elapsed: 1.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 997 tasks | elapsed: 2.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1042 tasks | elapsed: 2.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1089 tasks | elapsed: 2.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1136 tasks | elapsed: 2.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1185 tasks | elapsed: 2.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1285 tasks | elapsed: 2.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1336 tasks | elapsed: 2.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1389 tasks | elapsed: 2.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1442 tasks | elapsed: 2.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1497 tasks | elapsed: 2.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1552 tasks | elapsed: 2.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1609 tasks | elapsed: 2.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1666 tasks | elapsed: 2.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1725 tasks | elapsed: 2.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1784 tasks | elapsed: 3.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1845 tasks | elapsed: 3.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1906 tasks | elapsed: 3.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 1969 tasks | elapsed: 3.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2032 tasks | elapsed: 3.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2097 tasks | elapsed: 3.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2162 tasks | elapsed: 3.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2229 tasks | elapsed: 3.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2296 tasks | elapsed: 3.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2365 tasks | elapsed: 3.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2434 tasks | elapsed: 3.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2505 tasks | elapsed: 3.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2576 tasks | elapsed: 3.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2649 tasks | elapsed: 3.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2722 tasks | elapsed: 4.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2797 tasks | elapsed: 4.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2872 tasks | elapsed: 4.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 2949 tasks | elapsed: 4.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3026 tasks | elapsed: 4.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3105 tasks | elapsed: 4.7min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3184 tasks | elapsed: 4.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3265 tasks | elapsed: 5.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3346 tasks | elapsed: 5.2min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3429 tasks | elapsed: 5.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3512 tasks | elapsed: 5.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3597 tasks | elapsed: 5.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3682 tasks | elapsed: 6.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3769 tasks | elapsed: 6.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3856 tasks | elapsed: 6.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 3945 tasks | elapsed: 6.9min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4034 tasks | elapsed: 7.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4125 tasks | elapsed: 7.4min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4216 tasks | elapsed: 7.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4309 tasks | elapsed: 7.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4402 tasks | elapsed: 8.1min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4497 tasks | elapsed: 8.5min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4592 tasks | elapsed: 8.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4689 tasks | elapsed: 9.0min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4786 tasks | elapsed: 9.3min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4885 tasks | elapsed: 9.6min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 4984 tasks | elapsed: 9.8min\n",
|
||||
"[Parallel(n_jobs=-1)]: Done 5040 out of 5040 | elapsed: 10.0min finished\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
|
||||
"text/plain": [
|
||||
"GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n",
|
||||
" n_jobs=-1,\n",
|
||||
" param_grid=[{'base_estimator': [Stree(C=7, max_depth=5,\n",
|
||||
" split_criteria='max_samples',\n",
|
||||
" tol=0.01)],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__kernel': ['linear'],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples',\n",
|
||||
" 'impurity'],\n",
|
||||
" 'base_e...\n",
|
||||
" 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n",
|
||||
" {'base_estimator': [Stree()],\n",
|
||||
" 'base_estimator__C': [1, 7, 55],\n",
|
||||
" 'base_estimator__gamma': [0.1, 1, 10],\n",
|
||||
" 'base_estimator__kernel': ['rbf'],\n",
|
||||
" 'base_estimator__max_depth': [3, 5, 7],\n",
|
||||
" 'base_estimator__split_criteria': ['max_samples',\n",
|
||||
" 'impurity'],\n",
|
||||
" 'base_estimator__tol': [0.1, 0.01],\n",
|
||||
" 'learning_rate': [0.5, 1],\n",
|
||||
" 'n_estimators': [10, 25]}],\n",
|
||||
" return_train_score=True, verbose=10)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 11
|
||||
"execution_count": 7
|
||||
}
|
||||
]
|
||||
},
|
||||
{
"source": [
"GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n",
" n_jobs=-1,\n",
" param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n",
" 'base_estimator__C': [7, 55],\n",
" 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n",
" 'base_estimator__max_depth': [3, 5],\n",
" 'base_estimator__tol': [0.1, 0.01],\n",
" 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n",
" return_train_score=True, verbose=10)"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"metadata": {
"id": "ZjX88NoYDZE8",
"colab_type": "code",
"colab": {},
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
"tags": []
},
"source": [
"print(\"Best estimator: \", grid.best_estimator_)\n",
"print(\"Best hyperparameters: \", grid.best_params_)\n",
"print(\"Best accuracy: \", grid.best_score_)"
],
"execution_count": 16,
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
"text": [
"Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=7, max_depth=5,\n split_criteria='max_samples',\n tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=7, max_depth=5, split_criteria='max_samples', tol=0.01), 'base_estimator__C': 7, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 5, 'base_estimator__split_criteria': 'max_samples', 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9549825174825175\n"
]
}
]
},
{
"source": [
"Best estimator: AdaBoostClassifier(algorithm='SAMME',\n",
" base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n",
" learning_rate=0.5, n_estimators=25, random_state=2020)\n",
"\n",
"Best hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\n",
"\n",
"Best accuracy: 0.9559440559440558"
],
"cell_type": "markdown",
"metadata": {}
},
{
"source": [
"0.9511547662863451"
],
"cell_type": "markdown",
"metadata": {}
}
],
"metadata": {
@@ -227,12 +429,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
"version": "3.8.4-final"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
"name": "python38464bitgeneralvenv77203c0a6afd4428bd66253ef62753dc",
"display_name": "Python 3.8.4 64-bit ('general': venv)"
},
"colab": {
"name": "gridsearch.ipynb",

4 setup.py
@@ -1,6 +1,6 @@
import setuptools

__version__ = "0.9rc4"
__version__ = "0.9rc6"
__author__ = "Ricardo Montañana Gómez"


@@ -25,7 +25,7 @@ setuptools.setup(
classifiers=[
"Development Status :: 4 - Beta",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Natural Language :: English",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Intended Audience :: Science/Research",

241 stree/Strees.py
@@ -10,6 +10,7 @@ import os
import numbers
import random
import warnings
from math import log
from itertools import combinations
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
@@ -39,6 +40,7 @@ class Snode:
features: np.array,
impurity: float,
title: str,
weight: np.ndarray = None,
):
self._clf = clf
self._title = title
@@ -50,9 +52,12 @@ class Snode:
self._up = None
self._class = None
self._feature = None
self._sample_weight = None
self._sample_weight = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features = features
self._impurity = impurity
self._partition_column: int = -1

@classmethod
def copy(cls, node: "Snode") -> "Snode":
@@ -65,6 +70,12 @@ class Snode:
node._title,
)

def set_partition_column(self, col: int):
self._partition_column = col

def get_partition_column(self) -> int:
return self._partition_column

def set_down(self, son):
self._down = son

@@ -89,9 +100,8 @@ class Snode:
classes, card = np.unique(self._y, return_counts=True)
if len(classes) > 1:
max_card = max(card)
min_card = min(card)
self._class = classes[card == max_card][0]
self._belief = max_card / (max_card + min_card)
self._belief = max_card / np.sum(card)
else:
self._belief = 1
try:
@@ -100,24 +110,23 @@ class Snode:
self._class = None

def __str__(self) -> str:
count_values = np.unique(self._y, return_counts=True)
if self.is_leaf():
count_values = np.unique(self._y, return_counts=True)
result = (
return (
f"{self._title} - Leaf class={self._class} belief="
f"{self._belief: .6f} impurity={self._impurity:.4f} "
f"counts={count_values}"
)
return result
else:
return (
f"{self._title} feaures={self._features} impurity="
f"{self._impurity:.4f}"
f"{self._impurity:.4f} "
f"counts={count_values}"
)


class Siterator:
"""Stree preorder iterator
"""
"""Stree preorder iterator"""

def __init__(self, tree: Snode):
self._stack = []
@@ -163,20 +172,22 @@ class Splitter:
f"criterion must be gini or entropy got({criterion})"
)

if criteria not in ["min_distance", "max_samples"]:
if criteria not in [
"max_samples",
"impurity",
]:
raise ValueError(
f"split_criteria has to be min_distance or \
max_samples got ({criteria})"
f"criteria has to be max_samples or impurity; got ({criteria})"
)

if splitter_type not in ["random", "best"]:
raise ValueError(
f"splitter must be either random or best got({splitter_type})"
f"splitter must be either random or best, got({splitter_type})"
)
self.criterion_function = getattr(self, f"_{self._criterion}")
self.decision_criteria = getattr(self, f"_{self._criteria}")

def impurity(self, y: np.array) -> np.array:
def partition_impurity(self, y: np.array) -> np.array:
return self.criterion_function(y)

@staticmethod
@@ -186,24 +197,47 @@ class Splitter:

@staticmethod
def _entropy(y: np.array) -> float:
_, count = np.unique(y, return_counts=True)
proportion = count / np.sum(count)
return -np.sum(proportion * np.log2(proportion))
n_labels = len(y)
if n_labels <= 1:
return 0
counts = np.bincount(y)
proportions = counts / n_labels
n_classes = np.count_nonzero(proportions)
if n_classes <= 1:
return 0
entropy = 0.0
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log(prop, n_classes)
return entropy

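The rewritten _entropy normalizes with a logarithm whose base is the number of distinct classes, so a perfectly balanced split scores 1.0 regardless of how many classes there are. A minimal standalone check (illustrative sketch, not part of the diff):

import numpy as np
from math import log

def entropy(y):
    # Mirrors the new _entropy: proportions over bincount, log base = n_classes
    n_labels = len(y)
    if n_labels <= 1:
        return 0.0
    proportions = np.bincount(y) / n_labels
    n_classes = np.count_nonzero(proportions)
    if n_classes <= 1:
        return 0.0
    return -sum(p * log(p, n_classes) for p in proportions if p != 0.0)

print(entropy(np.array([0, 0, 1, 1, 2, 2, 3, 3])))  # 1.0, as the new test expects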
def information_gain(
self, labels_up: np.array, labels_dn: np.array
self, labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
card_up = labels_up.shape[0] if labels_up is not None else 0
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_prev = self.criterion_function(labels)
card_up = card_dn = imp_up = imp_dn = 0
if labels_up is not None:
card_up = labels_up.shape[0]
imp_up = self.criterion_function(labels_up)
if labels_dn is not None:
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_dn = self.criterion_function(labels_dn)
samples = card_up + card_dn
up = card_up / samples * self.criterion_function(labels_up)
dn = card_dn / samples * self.criterion_function(labels_dn)
return up + dn
if samples == 0:
return 0.0
else:
result = (
imp_prev
- (card_up / samples) * imp_up
- (card_dn / samples) * imp_dn
)
return result

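information_gain now receives the parent labels too and returns the classical gain, parent impurity minus the size-weighted impurity of both children, instead of the weighted child impurity alone. A quick numeric check using the entropy sketch above, reproducing a value asserted in the updated tests:

yu = np.array([0, 1, 1, 1, 1, 1])
yd = np.array([0, 0, 0, 1])
y = np.append(yu, yd)
gain = entropy(y) - 0.6 * entropy(yu) - 0.4 * entropy(yd)
print(gain)  # ~0.2564258916820030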
def _select_best_set(
self, dataset: np.array, labels: np.array, features_sets: list
) -> list:
min_impurity = 1
max_gain = 0
selected = None
warnings.filterwarnings("ignore", category=ConvergenceWarning)
for feature_set in features_sets:
@@ -211,13 +245,13 @@ class Splitter:
node = Snode(
self._clf, dataset, labels, feature_set, 0.0, "subset"
)
self.partition(dataset, node)
self.partition(dataset, node, train=True)
y1, y2 = self.part(labels)
impurity = self.information_gain(y1, y2)
if impurity < min_impurity:
min_impurity = impurity
gain = self.information_gain(labels, y1, y2)
if gain > max_gain:
max_gain = gain
selected = feature_set
return selected
return selected if selected is not None else feature_set

def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int
@@ -226,47 +260,106 @@ class Splitter:
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1:
if self._splitter_type == "random":
return features_sets[random.randint(0, len(features_sets) - 1)]
index = random.randint(0, len(features_sets) - 1)
return features_sets[index]
else:
# get only 3 sets at most
if len(features_sets) > 3:
features_sets = random.sample(features_sets, 3)
return self._select_best_set(dataset, labels, features_sets)
else:
return features_sets[0]

def get_subspace(
self, dataset: np.array, labels: np.array, max_features: int
) -> list:
"""Return the best subspace to make a split
"""
) -> tuple:
"""Return the best/random subspace to make a split"""
indices = self._get_subspaces_set(dataset, labels, max_features)
return dataset[:, indices], indices

@staticmethod
def _min_distance(data: np.array, _) -> np.array:
# chooses the lowest distance of every sample
indices = np.argmin(np.abs(data), axis=1)
return np.array(
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
)
def _impurity(self, data: np.array, y: np.array) -> np.array:
"""return column of dataset to be taken into account to split dataset

:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: vector with the class assigned to each sample values
(can be 0, 1, ...) -1 if none produces information gain
:rtype: np.array shape (m,)
"""
max_gain = 0
selected = -1
for col in range(data.shape[1]):
tup = y[data[:, col] > 0]
tdn = y[data[:, col] <= 0]
info_gain = self.information_gain(y, tup, tdn)
if info_gain > max_gain:
selected = col
max_gain = info_gain
return selected

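A sketch of what the new "impurity" criteria does: each class-hyperplane column partitions the samples by the sign of the distance, and the column whose partition maximizes information gain is selected (-1 when no column gains). Illustrative only, reusing the entropy helper sketched above:

def column_gain(distances, y, col):
    # Information gain of splitting y by the sign of distances[:, col]
    up, dn = y[distances[:, col] > 0], y[distances[:, col] <= 0]
    total = len(up) + len(dn)
    return (entropy(y) - len(up) / total * entropy(up)
            - len(dn) / total * entropy(dn))

distances = np.array([[-0.1, 0.6], [0.4, -0.2], [0.3, 0.5], [-0.7, 0.1]])
y = np.array([0, 1, 1, 0])
gains = [column_gain(distances, y, col) for col in range(distances.shape[1])]
print(int(np.argmax(gains)) if max(gains) > 0 else -1)  # 0: a perfect split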
@staticmethod
def _max_samples(data: np.array, y: np.array) -> np.array:
"""return column of dataset to be taken into account to split dataset

:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: vector with distances to hyperplane (can be positive or neg.)
:rtype: np.array shape (m,)
"""
# select the class with max number of samples
_, samples = np.unique(y, return_counts=True)
selected = np.argmax(samples)
return data[:, selected]
return np.argmax(samples)

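Both criteria now return a column index that partition consumes, rather than a vector of distances. A tiny illustrative check of _max_samples (not from the repo):

_, samples = np.unique(np.array([1, 0, 0, 2, 0]), return_counts=True)
print(int(np.argmax(samples)))  # 0: class 0 has the most samples, so its column is used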
def partition(self, samples: np.array, node: Snode):
"""Set the criteria to split arrays
def partition(self, samples: np.array, node: Snode, train: bool):
"""Set the criteria to split arrays. Compute the indices of the samples
that should go to one side of the tree (down)

"""
# data contains the distances of every sample to every class hyperplane
# array of (m, nc) nc = # classes
data = self._distances(node, samples)
if data.shape[0] < self._min_samples_split:
self._down = np.ones((data.shape[0]), dtype=bool)
# there aren't enough samples to split
self._up = np.ones((data.shape[0]), dtype=bool)
return
if data.ndim > 1:
# split criteria for multiclass
data = self.decision_criteria(data, node._y)
self._down = data > 0
# Convert data to a (m, 1) array selecting values for samples
if train:
# in train time we have to compute the column to take into
# account to split the dataset
col = self.decision_criteria(data, node._y)
node.set_partition_column(col)
else:
# in predict time just use the column computed in train time
# it takes the classifier of class <col>
col = node.get_partition_column()
if col == -1:
# No partition is producing information gain
data = np.ones(data.shape)
data = data[:, col]
self._up = data > 0

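The key behavioural change: at fit time partition computes the splitting column and stores it on the node, while at predict time it reuses the stored column, so both phases route samples identically. Illustrative call pattern (variable names hypothetical, not from the diff):

splitter.partition(X, node, train=True)       # fit: selects and stores the column
X_up, X_down = splitter.part(X)               # split with the freshly computed mask
splitter.partition(X_new, node, train=False)  # predict: reuses node's stored column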
def part(self, origin: np.array) -> list:
"""Split an array in two based on indices (down) and its complement
partition has to be called first to establish down indices

:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
down = ~self._up
return [
origin[self._up] if any(self._up) else None,
origin[down] if any(down) else None,
]

@staticmethod
def _distances(node: Snode, data: np.ndarray) -> np.array:
@@ -276,28 +369,12 @@ class Splitter:
:type node: Snode
:param data: samples to find out distance to hyperplane
:type data: np.ndarray
:return: array of shape (m, 1) with the distances of every sample to
the hyperplane of the node
:return: array of shape (m, nc) with the distances of every sample to
the hyperplane of every class. nc = # of classes
:rtype: np.array
"""
return node._clf.decision_function(data[:, node._features])

def part(self, origin: np.array) -> list:
"""Split an array in two based on indices (down) and its complement

:param origin: dataset to split
:type origin: np.array
:param down: indices to use to split array
:type down: np.array
:return: list with two splits of the array
:rtype: list
"""
up = ~self._down
return [
origin[up] if any(up) else None,
origin[self._down] if any(self._down) else None,
]


class Stree(BaseEstimator, ClassifierMixin):
"""Estimator that is based on binary trees of svm nodes
@@ -311,14 +388,14 @@ class Stree(BaseEstimator, ClassifierMixin):
self,
C: float = 1.0,
kernel: str = "linear",
max_iter: int = 1000,
max_iter: int = 1e5,
random_state: int = None,
max_depth: int = None,
tol: float = 1e-4,
degree: int = 3,
gamma="scale",
split_criteria: str = "max_samples",
criterion: str = "gini",
split_criteria: str = "impurity",
criterion: str = "entropy",
min_samples_split: int = 0,
max_features=None,
splitter: str = "random",
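With these new defaults, a bare Stree() is now equivalent to the following explicit construction (illustrative):

from stree import Stree

clf = Stree(C=1.0, kernel="linear", max_iter=1e5, criterion="entropy",
            split_criteria="impurity", splitter="random",
            min_samples_split=0, max_features=None)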
@@ -379,7 +456,9 @@ class Stree(BaseEstimator, ClassifierMixin):

check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
check_classification_targets(y)
# Initialize computed parameters
self.splitter_ = Splitter(
@@ -439,15 +518,24 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=0.0,
title=title + ", <pure>",
weight=sample_weight,
)
# Train the model
clf = self._build_clf()
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
# solve WARNING: class label 0 specified in weight is not found
# in bagging
if any(sample_weight == 0):
indices = sample_weight == 0
y_next = y[~indices]
# touch weights if removing any class
if np.unique(y_next).shape[0] != self.n_classes_:
sample_weight += 1e-5
clf.fit(Xs, y, sample_weight=sample_weight)
impurity = self.splitter_.impurity(y)
node = Snode(clf, X, y, features, impurity, title)
impurity = self.splitter_.partition_impurity(y)
node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_)
self.splitter_.partition(X, node)
self.splitter_.partition(X, node, True)
X_U, X_D = self.splitter_.part(X)
y_u, y_d = self.splitter_.part(y)
sw_u, sw_d = self.splitter_.part(sample_weight)
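The zero-weight patch above matters under bagging and boosting: when zeroed weights would remove an entire class, every weight is bumped by 1e-5 so the underlying SVC still sees all labels. A standalone sketch mirroring the new test_weights_removing_class:

import numpy as np

y = np.array([0, 0, 0, 1, 1, 1, 0])
sample_weight = np.array([1, 1, 1, 0, 0, 0, 1], dtype="float64")
n_classes = np.unique(y).shape[0]
if any(sample_weight == 0):
    if np.unique(y[sample_weight != 0]).shape[0] != n_classes:
        sample_weight += 1e-5  # class 1 would vanish, so touch every weight
print(sample_weight)  # every weight bumped by 1e-5, zeros become 1e-5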
@@ -460,14 +548,14 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=impurity,
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
return node

def _build_predictor(self):
"""Process the leaves to make them predictors
"""
"""Process the leaves to make them predictors"""

def run_tree(node: Snode):
if node.is_leaf():
@@ -479,8 +567,7 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(self.tree_)

def _build_clf(self):
""" Build the correct classifier for the node
"""
"""Build the correct classifier for the node"""
return (
LinearSVC(
max_iter=self.max_iter,
@@ -535,7 +622,7 @@ class Stree(BaseEstimator, ClassifierMixin):
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices
self.splitter_.partition(xp, node)
self.splitter_.partition(xp, node, train=False)
x_u, x_d = self.splitter_.part(xp)
i_u, i_d = self.splitter_.part(indices)
prx_u, prin_u = predict_class(x_u, i_u, node.get_up())

@@ -33,22 +33,20 @@ class Snode_test(unittest.TestCase):
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
belief = max_card / (max_card + min_card)
else:
belief = 1
self.assertEqual(belief, node._belief)
# Check Class
class_computed = classes[card == max_card]
self.assertEqual(class_computed, node._class)
# Check Partition column
self.assertEqual(node._partition_column, -1)

check_leave(self._clf.tree_)

def test_nodes_coefs(self):
"""Check if the nodes of the tree have the right attributes filled
"""
"""Check if the nodes of the tree have the right attributes filled"""

def run_tree(node: Snode):
if node._belief < 1:
@@ -57,16 +55,19 @@ class Snode_test(unittest.TestCase):
self.assertIsNotNone(node._clf.coef_)
if node.is_leaf():
return
run_tree(node.get_down())
run_tree(node.get_up())
run_tree(node.get_down())

run_tree(self._clf.tree_)
model = Stree(self._random_state)
model.fit(*load_dataset(self._random_state, 3, 4))
run_tree(model.tree_)

def test_make_predictor_on_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
test.make_predictor()
self.assertEqual(1, test._class)
self.assertEqual(0.75, test._belief)
self.assertEqual(-1, test._partition_column)

def test_make_predictor_on_not_leaf(self):
test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test")
@@ -74,11 +75,14 @@ class Snode_test(unittest.TestCase):
test.make_predictor()
self.assertIsNone(test._class)
self.assertEqual(0, test._belief)
self.assertEqual(-1, test._partition_column)
self.assertEqual(-1, test.get_up()._partition_column)

def test_make_predictor_on_leaf_bogus_data(self):
test = Snode(None, [1, 2, 3, 4], [], [], 0.0, "test")
test.make_predictor()
self.assertIsNone(test._class)
self.assertEqual(-1, test._partition_column)

def test_copy_node(self):
px = [1, 2, 3, 4]
@@ -89,3 +93,4 @@ class Snode_test(unittest.TestCase):
self.assertListEqual(computed._y, py)
self.assertEqual("test", computed._title)
self.assertIsInstance(computed._clf, Stree)
self.assertEqual(test._partition_column, computed._partition_column)

@@ -1,11 +1,11 @@
import os
import unittest
import random

import numpy as np
from sklearn.svm import LinearSVC

from sklearn.svm import SVC
from sklearn.datasets import load_wine, load_iris
from stree import Splitter
from .utils import load_dataset


class Splitter_test(unittest.TestCase):
@@ -15,15 +15,15 @@ class Splitter_test(unittest.TestCase):

@staticmethod
def build(
clf=LinearSVC(),
clf=SVC,
min_samples_split=0,
splitter_type="random",
criterion="gini",
criteria="min_distance",
criteria="max_samples",
random_state=None,
):
return Splitter(
clf=clf,
clf=clf(random_state=random_state, kernel="rbf"),
min_samples_split=min_samples_split,
splitter_type=splitter_type,
criterion=criterion,
@@ -43,10 +43,10 @@ class Splitter_test(unittest.TestCase):
with self.assertRaises(ValueError):
self.build(criteria="duck")
with self.assertRaises(ValueError):
self.build(clf=None)
_ = Splitter(clf=None)
for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]:
for criteria in ["min_distance", "max_samples"]:
for criteria in ["max_samples", "impurity"]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
@@ -57,30 +57,74 @@ class Splitter_test(unittest.TestCase):
self.assertEqual(criteria, tcl._criteria)

def test_gini(self):
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
expected = 0.48
self.assertEqual(expected, Splitter._gini(y))
tcl = self.build(criterion="gini")
self.assertEqual(expected, tcl.criterion_function(y))
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
([0], 0),
([1, 1, 1, 1], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._gini(labels))
tcl = self.build(criterion="gini")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))

def test_entropy(self):
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
expected = 0.9709505944546686
self.assertAlmostEqual(expected, Splitter._entropy(y))
tcl = self.build(criterion="entropy")
self.assertEqual(expected, tcl.criterion_function(y))
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
([0, 0, 1, 1, 1, 1, 0, 0], 1),
([0, 0, 1, 1, 2, 2, 3, 3], 1),
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
([1], 0),
([0, 0, 0, 0], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._entropy(labels))
tcl = self.build(criterion="entropy")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))

def test_information_gain(self):
yu = np.array([0, 1, 1, 1, 1, 1])
yd = np.array([0, 0, 0, 1])
values_expected = [
("gini", 0.31666666666666665),
("entropy", 0.7145247027726656),
expected_values = [
(
[0, 1, 1, 1, 1, 1],
[0, 0, 0, 1],
0.16333333333333333,
0.25642589168200297,
),
(
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
[5, 3, 2, 1, 1],
0.007381776239907684,
-0.03328610916207225,
),
([], [], 0.0, 0.0),
([1], [], 0.0, 0.0),
([], [1], 0.0, 0.0),
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
([], [1, 1, 1, 2], 0.0, 0.0),
(None, [1, 2, 3], 0.0, 0.0),
([1, 2, 3], None, 0.0, 0.0),
]
for criterion, expected in values_expected:
tcl = self.build(criterion=criterion)
computed = tcl.information_gain(yu, yd)
self.assertAlmostEqual(expected, computed)
for yu, yd, expected_gini, expected_entropy in expected_values:
yu = np.array(yu, dtype=np.int32) if yu is not None else None
yd = np.array(yd, dtype=np.int32) if yd is not None else None
if yu is not None and yd is not None:
complete = np.append(yu, yd)
elif yd is not None:
complete = yd
else:
complete = yu
tcl = self.build(criterion="gini")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_gini, computed)
tcl = self.build(criterion="entropy")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_entropy, computed)

def test_max_samples(self):
tcl = self.build(criteria="max_samples")
@@ -90,52 +134,82 @@ class Splitter_test(unittest.TestCase):
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
]
)
expected = np.array([0.2, 0.01, -0.9, 0.2])
y = [1, 2, 1, 0]
expected = data[:, 0]
y = [1, 2, 1, 0, 0, 0]
computed = tcl._max_samples(data, y)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
self.assertEqual(0, computed)
computed_data = data[:, computed]
self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist())

def test_min_distance(self):
tcl = self.build()
def test_impurity(self):
tcl = self.build(criteria="impurity")
data = np.array(
[
[-0.1, 0.2, -0.3],
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
[-0.1, 0.2, 0.3],
]
)
expected = np.array([-0.1, 0.01, 0.5, 0.1])
computed = tcl._min_distance(data, None)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
expected = data[:, 2]
y = np.array([1, 2, 1, 0, 0, 0])
computed = tcl._impurity(data, y)
self.assertEqual(2, computed)
computed_data = data[:, computed]
self.assertEqual((6,), computed_data.shape)
self.assertListEqual(expected.tolist(), computed_data.tolist())

def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1)
tcl = self.build(splitter_type="best", random_state=self._random_state)
dataset, computed = tcl.get_subspace(X, y, max_features=2)
self.assertListEqual([0, 2], list(computed))
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())

def test_splitter_parameter(self):
expected_values = [
[1, 7, 9],
[1, 7, 9],
[1, 7, 9],
[1, 7, 9],
[0, 5, 6],
[0, 5, 6],
[0, 5, 6],
[0, 5, 6],
[0, 1, 7, 9], # best entropy max_samples
[3, 8, 10, 11], # best entropy impurity
[0, 2, 8, 12], # best gini max_samples
[1, 2, 5, 12], # best gini impurity
[1, 2, 5, 10], # random entropy max_samples
[4, 8, 9, 12], # random entropy impurity
[3, 9, 11, 12], # random gini max_samples
[1, 5, 6, 9], # random gini impurity
]
X, y = load_dataset(self._random_state, n_features=12)
X, y = load_wine(return_X_y=True)
rn = 0
for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]:
for criteria in ["min_distance", "max_samples"]:
for criterion in ["entropy", "gini"]:
for criteria in [
"max_samples",
"impurity",
]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
criteria=criteria,
random_state=self._random_state,
)
expected = expected_values.pop(0)
dataset, computed = tcl.get_subspace(X, y, max_features=3)
random.seed(rn)
rn += 1
dataset, computed = tcl.get_subspace(X, y, max_features=4)
# print(
# "{}, # {:7s}{:8s}{:15s}".format(
# list(computed),
# splitter_type,
# criterion,
# criteria,
# )
# )
self.assertListEqual(expected, list(computed))
self.assertListEqual(
X[:, computed].tolist(), dataset.tolist()

@@ -1,8 +1,11 @@
|
||||
import os
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from sklearn.datasets import load_iris
|
||||
from sklearn.datasets import load_iris, load_wine
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
from stree import Stree, Snode
|
||||
from .utils import load_dataset
|
||||
@@ -39,53 +42,28 @@ class Stree_test(unittest.TestCase):
|
||||
_, count_u = np.unique(y_up, return_counts=True)
|
||||
#
|
||||
for i in unique_y:
|
||||
number_up = count_u[i]
|
||||
try:
|
||||
number_down = count_d[i]
|
||||
except IndexError:
|
||||
number_down = 0
|
||||
try:
|
||||
number_up = count_u[i]
|
||||
except IndexError:
|
||||
number_up = 0
|
||||
self.assertEqual(count_y[i], number_down + number_up)
|
||||
# Is the partition made the same as the prediction?
|
||||
# as the node is not a leaf...
|
||||
_, count_yp = np.unique(y_prediction, return_counts=True)
|
||||
self.assertEqual(count_yp[0], y_up.shape[0])
|
||||
self.assertEqual(count_yp[1], y_down.shape[0])
|
||||
self.assertEqual(count_yp[1], y_up.shape[0])
|
||||
self.assertEqual(count_yp[0], y_down.shape[0])
|
||||
self._check_tree(node.get_down())
|
||||
self._check_tree(node.get_up())
|
||||
|
||||
def test_build_tree(self):
|
||||
"""Check if the tree is built the same way as predictions of models
|
||||
"""
|
||||
import warnings
|
||||
|
||||
"""Check if the tree is built the same way as predictions of models"""
|
||||
warnings.filterwarnings("ignore")
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(kernel=kernel, random_state=self._random_state)
|
||||
clf.fit(*load_dataset(self._random_state))
|
||||
self._check_tree(clf.tree_)
|
||||
|
||||
@staticmethod
|
||||
def _find_out(px: np.array, x_original: np.array, y_original) -> list:
|
||||
"""Find the original values of y for a given array of samples
|
||||
|
||||
Arguments:
|
||||
px {np.array} -- array of samples to search for
|
||||
x_original {np.array} -- original dataset
|
||||
y_original {[type]} -- original classes
|
||||
|
||||
Returns:
|
||||
np.array -- classes of the given samples
|
||||
"""
|
||||
res = []
|
||||
for needle in px:
|
||||
for row in range(x_original.shape[0]):
|
||||
if all(x_original[row, :] == needle):
|
||||
res.append(y_original[row])
|
||||
return res
|
||||
|
||||
def test_single_prediction(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
for kernel in self._kernels:
|
||||
@@ -102,22 +80,6 @@ class Stree_test(unittest.TestCase):
|
||||
yp = clf.fit(X, y).predict(X[:num, :])
|
||||
self.assertListEqual(y[:num].tolist(), yp.tolist())
|
||||
|
||||
def test_score(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
accuracies = [
|
||||
0.9506666666666667,
|
||||
0.9606666666666667,
|
||||
0.9433333333333334,
|
||||
]
|
||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||
clf = Stree(random_state=self._random_state, kernel=kernel,)
|
||||
clf.fit(X, y)
|
||||
accuracy_score = clf.score(X, y)
|
||||
yp = clf.predict(X)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_single_vs_multiple_prediction(self):
|
||||
"""Check if predicting sample by sample gives the same result as
|
||||
predicting all samples at once
|
||||
@@ -137,20 +99,22 @@ class Stree_test(unittest.TestCase):
|
||||
self.assertListEqual(yp_line.tolist(), yp_once.tolist())
|
||||
|
||||
def test_iterator_and_str(self):
|
||||
"""Check preorder iterator
|
||||
"""
|
||||
"""Check preorder iterator"""
|
||||
expected = [
|
||||
"root feaures=(0, 1, 2) impurity=0.5000",
|
||||
"root - Down feaures=(0, 1, 2) impurity=0.0671",
|
||||
"root - Down - Down, <cgaf> - Leaf class=1 belief= 0.975989 "
|
||||
"impurity=0.0469 counts=(array([0, 1]), array([ 17, 691]))",
|
||||
"root - Down - Up feaures=(0, 1, 2) impurity=0.3967",
|
||||
"root - Down - Up - Down, <cgaf> - Leaf class=1 belief= 0.750000 "
|
||||
"impurity=0.3750 counts=(array([0, 1]), array([1, 3]))",
|
||||
"root - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 "
|
||||
"impurity=0.0000 counts=(array([0]), array([7]))",
|
||||
"root - Up, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.1331"
|
||||
" counts=(array([0, 1]), array([725, 56]))",
|
||||
"root feaures=(0, 1, 2) impurity=1.0000 counts=(array([0, 1]), arr"
|
||||
"ay([750, 750]))",
|
||||
"root - Down, <cgaf> - Leaf class=0 belief= 0.928297 impurity=0.37"
|
||||
"22 counts=(array([0, 1]), array([725, 56]))",
|
||||
"root - Up feaures=(0, 1, 2) impurity=0.2178 counts=(array([0, 1])"
|
||||
", array([ 25, 694]))",
|
||||
"root - Up - Down feaures=(0, 1, 2) impurity=0.8454 counts=(array("
|
||||
"[0, 1]), array([8, 3]))",
|
||||
"root - Up - Down - Down, <pure> - Leaf class=0 belief= 1.000000 i"
|
||||
"mpurity=0.0000 counts=(array([0]), array([7]))",
|
||||
"root - Up - Down - Up, <cgaf> - Leaf class=1 belief= 0.750000 imp"
|
||||
"urity=0.8113 counts=(array([0, 1]), array([1, 3]))",
|
||||
"root - Up - Up, <cgaf> - Leaf class=1 belief= 0.975989 impurity=0"
|
||||
".1634 counts=(array([0, 1]), array([ 17, 691]))",
|
||||
]
|
||||
computed = []
|
||||
expected_string = ""
|
||||
@@ -164,9 +128,6 @@ class Stree_test(unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def test_is_a_sklearn_classifier():
|
||||
import warnings
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
@@ -229,38 +190,43 @@ class Stree_test(unittest.TestCase):
|
||||
def test_muticlass_dataset(self):
|
||||
datasets = {
|
||||
"Synt": load_dataset(random_state=self._random_state, n_classes=3),
|
||||
"Iris": load_iris(return_X_y=True),
|
||||
"Iris": load_wine(return_X_y=True),
|
||||
}
|
||||
outcomes = {
|
||||
"Synt": {
|
||||
"max_samples linear": 0.9533333333333334,
|
||||
"max_samples rbf": 0.836,
|
||||
"max_samples poly": 0.9473333333333334,
|
||||
"min_distance linear": 0.9533333333333334,
|
||||
"min_distance rbf": 0.836,
|
||||
"min_distance poly": 0.9473333333333334,
|
||||
"max_samples linear": 0.9606666666666667,
|
||||
"max_samples rbf": 0.7133333333333334,
|
||||
"max_samples poly": 0.49066666666666664,
|
||||
"impurity linear": 0.9606666666666667,
|
||||
"impurity rbf": 0.7133333333333334,
|
||||
"impurity poly": 0.49066666666666664,
|
||||
},
|
||||
"Iris": {
|
||||
"max_samples linear": 0.98,
|
||||
"max_samples rbf": 1.0,
|
||||
"max_samples poly": 1.0,
|
||||
"min_distance linear": 0.98,
|
||||
"min_distance rbf": 1.0,
|
||||
"min_distance poly": 1.0,
|
||||
"max_samples linear": 1.0,
|
||||
"max_samples rbf": 0.6910112359550562,
|
||||
"max_samples poly": 0.6966292134831461,
|
||||
"impurity linear": 1,
|
||||
"impurity rbf": 0.6910112359550562,
|
||||
"impurity poly": 0.6966292134831461,
|
||||
},
|
||||
}
|
||||
|
||||
for name, dataset in datasets.items():
|
||||
px, py = dataset
|
||||
for criteria in ["max_samples", "min_distance"]:
|
||||
for criteria in ["max_samples", "impurity"]:
|
||||
for kernel in self._kernels:
|
||||
clf = Stree(
|
||||
C=1e4,
|
||||
max_iter=1e4,
|
||||
C=55,
|
||||
max_iter=1e5,
|
||||
kernel=kernel,
|
||||
random_state=self._random_state,
|
||||
)
|
||||
clf.fit(px, py)
|
||||
outcome = outcomes[name][f"{criteria} {kernel}"]
|
||||
# print(
|
||||
# f"{name} {criteria} {kernel} {outcome} {clf.score(px"
|
||||
# ", py)}"
|
||||
# )
|
||||
self.assertAlmostEqual(outcome, clf.score(px, py))
|
||||
|
||||
def test_max_features(self):
|
||||
@@ -322,13 +288,157 @@ class Stree_test(unittest.TestCase):
|
||||
with self.assertRaises(ValueError):
|
||||
clf.predict(X[:, :3])
|
||||
|
||||
# Tests of score
|
||||
|
||||
def test_score_binary(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
accuracies = [
|
||||
0.9506666666666667,
|
||||
0.9606666666666667,
|
||||
0.9433333333333334,
|
||||
]
|
||||
for kernel, accuracy_expected in zip(self._kernels, accuracies):
|
||||
clf = Stree(
|
||||
random_state=self._random_state,
|
||||
kernel=kernel,
|
||||
)
|
||||
clf.fit(X, y)
|
||||
accuracy_score = clf.score(X, y)
|
||||
yp = clf.predict(X)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertAlmostEqual(accuracy_expected, accuracy_score)
|
||||
|
||||
def test_score_max_features(self):
|
||||
X, y = load_dataset(self._random_state)
|
||||
clf = Stree(random_state=self._random_state, max_features=2)
|
||||
clf.fit(X, y)
|
||||
self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
|
||||
self.assertAlmostEqual(0.944, clf.score(X, y))
|
||||
|
||||
    def test_bogus_splitter_parameter(self):
        clf = Stree(splitter="duck")
        with self.assertRaises(ValueError):
            clf.fit(*load_dataset())

    def test_weights_removing_class(self):
        # This patch solves a stderr message from the sklearn svm lib:
        # "WARNING: class label x specified in weight is not found"
        X = np.array(
            [
                [0.1, 0.1],
                [0.1, 0.2],
                [0.2, 0.1],
                [5, 6],
                [8, 9],
                [6, 7],
                [0.2, 0.2],
            ]
        )
        y = np.array([0, 0, 0, 1, 1, 1, 0])
        epsilon = 1e-5
        weights = [1, 1, 1, 0, 0, 0, 1]
        weights = np.array(weights, dtype="float64")
        weights_epsilon = [x + epsilon for x in weights]
        weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
        original = weights_no_zero.copy()
        clf = Stree()
        clf.fit(X, y)
        node = clf.train(
            X,
            y,
            weights,
            1,
            "test",
        )
        # if a class is lost with zero weights the patch adds epsilon
        self.assertListEqual(weights.tolist(), weights_epsilon)
        self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
        # zero weights are ok when they don't erase a class
        _ = clf.train(X, y, weights_no_zero, 1, "test")
        self.assertListEqual(weights_no_zero.tolist(), original.tolist())

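The patch under test: if the per-sample weights passed to `train` would give some class zero total weight, every weight is shifted by epsilon so the underlying SVC still sees all classes (first `assertListEqual`); weights containing zeros that keep every class represented are left untouched (second call). A standalone sketch of that guard; the helper name and the return-a-copy choice are illustrative, not the library's internals:

import numpy as np


def guard_class_weights(y, sample_weight, epsilon=1e-5):
    # hypothetical helper mirroring the behavior the test asserts:
    # if any class ends up with zero total weight, shift every weight
    # by epsilon so no class disappears from the SVC fit
    for label in np.unique(y):
        if sample_weight[y == label].sum() == 0:
            return sample_weight + epsilon
    return sample_weight

With `y = [0, 0, 0, 1, 1, 1, 0]`, the weights `[1, 1, 1, 0, 0, 0, 1]` zero out class 1 and come back shifted by 1e-5, while `[1, 1, 1, 0, 0, 2, 1]` keeps class 1 alive and is returned unchanged, exactly the two cases asserted above.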
    def test_multiclass_classifier_integrity(self):
        """Checks if the multiclass operation is done right"""
        X, y = load_iris(return_X_y=True)
        clf = Stree(random_state=0)
        clf.fit(X, y)
        score = clf.score(X, y)
        # Check accuracy of the whole model
        self.assertAlmostEqual(0.98, score, 5)
        svm = LinearSVC(random_state=0)
        svm.fit(X, y)
        self.assertAlmostEqual(0.9666666666666667, svm.score(X, y), 5)
        data = svm.decision_function(X)
        expected = [
            0.4444444444444444,
            0.35777777777777775,
            0.4569777777777778,
        ]
        ty = data.copy()
        ty[data <= 0] = 0
        ty[data > 0] = 1
        ty = ty.astype(int)
        for i in range(3):
            self.assertAlmostEqual(
                expected[i],
                clf.splitter_._gini(ty[:, i]),
            )
        # 1st Branch
        # up should have 53 samples of classes [1, 2] -> [3, 50]
        # down should have 97 samples of classes [0, 1] -> [50, 47]
        up = data[:, 2] > 0
        resup = np.unique(y[up], return_counts=True)
        resdn = np.unique(y[~up], return_counts=True)
        self.assertListEqual([1, 2], resup[0].tolist())
        self.assertListEqual([3, 50], resup[1].tolist())
        self.assertListEqual([0, 1], resdn[0].tolist())
        self.assertListEqual([50, 47], resdn[1].tolist())
        # 2nd Branch
        # up should have 53 samples of classes [1, 2] [3, 50]
        # down should have 47 samples of class 1
        node_up = clf.tree_.get_down().get_up()
        node_dn = clf.tree_.get_down().get_down()
        resup = np.unique(node_up._y, return_counts=True)
        resdn = np.unique(node_dn._y, return_counts=True)
        self.assertListEqual([1, 2], resup[0].tolist())
        self.assertListEqual([3, 50], resup[1].tolist())
        self.assertListEqual([1], resdn[0].tolist())
        self.assertListEqual([47], resdn[1].tolist())

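The integrity test binarizes each column of LinearSVC's one-vs-rest `decision_function` and checks its impurity. The expected values are consistent with the standard Gini measure, 1 - sum(p_k^2): a column with 50 positives out of 150 gives 1 - ((1/3)^2 + (2/3)^2) = 0.4444... A self-contained sketch in plain NumPy, assuming `splitter_._gini` computes this usual impurity (the expected values agree with it, but the splitter's internal code is not shown here):

import numpy as np

def gini(labels):
    # Gini impurity: 1 - sum of squared class proportions
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1.0 - np.sum(p ** 2)

# 50 of 150 samples positive -> 0.4444..., matching expected[0]
print(gini(np.array([1] * 50 + [0] * 100)))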
    def test_score_multiclass_rbf(self):
        X, y = load_dataset(
            random_state=self._random_state,
            n_classes=3,
            n_features=5,
            n_samples=500,
        )
        clf = Stree(kernel="rbf", random_state=self._random_state)
        self.assertEqual(0.824, clf.fit(X, y).score(X, y))
        X, y = load_wine(return_X_y=True)
        self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))

    def test_score_multiclass_poly(self):
        X, y = load_dataset(
            random_state=self._random_state,
            n_classes=3,
            n_features=5,
            n_samples=500,
        )
        clf = Stree(
            kernel="poly", random_state=self._random_state, C=10, degree=5
        )
        self.assertEqual(0.786, clf.fit(X, y).score(X, y))
        X, y = load_wine(return_X_y=True)
        self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))

    def test_score_multiclass_linear(self):
        X, y = load_dataset(
            random_state=self._random_state,
            n_classes=3,
            n_features=5,
            n_samples=1500,
        )
        clf = Stree(kernel="linear", random_state=self._random_state)
        self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
        X, y = load_wine(return_X_y=True)
        self.assertEqual(0.9550561797752809, clf.fit(X, y).score(X, y))

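These three new tests pin exact multiclass scores per kernel on a synthetic set and on `load_wine`. A condensed sketch of the same sweep on the wine data; note that the poly test above additionally sets `C=10, degree=5`, which this generic loop omits, so its poly score would differ:

from sklearn.datasets import load_wine
from stree import Stree

X, y = load_wine(return_X_y=True)
for kernel in ("linear", "rbf", "poly"):
    clf = Stree(kernel=kernel, random_state=1)
    print(kernel, clf.fit(X, y).score(X, y))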
@@ -1,9 +1,9 @@
from sklearn.datasets import make_classification


def load_dataset(random_state=0, n_classes=2, n_features=3):
def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
    X, y = make_classification(
        n_samples=1500,
        n_samples=n_samples,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
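The `load_dataset` helper gains an `n_samples` parameter so the 500-sample multiclass tests can reuse it. The hunk is cut off after `n_redundant=0`, so the tail of this runnable sketch (forwarding `n_classes` and `random_state`, returning `X, y`) is an assumption, not the file's verbatim content:

from sklearn.datasets import make_classification


def load_dataset(random_state=0, n_classes=2, n_features=3, n_samples=1500):
    # synthetic classification data shared across the test suite
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=3,
        n_redundant=0,
        n_classes=n_classes,        # assumed: forwarded from the signature
        random_state=random_state,  # assumed: forwarded from the signature
    )
    return X, y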