#3 Add sample_weights to score, update notebooks

Update readme to use new names of notebooks
This commit is contained in:
2020-06-09 01:46:38 +02:00
parent 26273e936a
commit 7e932de072
5 changed files with 406 additions and 351 deletions

View File

@@ -18,21 +18,17 @@ pip install git+https://github.com/doctorado-ml/stree
### Jupyter notebooks ### Jupyter notebooks
##### Slow launch but better integration * [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/benchmark.ipynb) Benchmark
* [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Doctorado-ML/STree/master?urlpath=lab/tree/notebooks/test.ipynb) Test notebook * [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/benchmark.ipynb) Benchmark
##### Fast launch but have to run first commented out cell for setup * [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/features.ipynb) Test features
* [![Test](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test.ipynb) Test notebook
* [![Test2](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test2.ipynb) Another Test notebook
* [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost * [![Adaboost](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/adaboost.ipynb) Adaboost
* [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch * [![Gridsearch](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/gridsearch.ipynb) Gridsearch
* [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics notebook * [![Test Graphics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Doctorado-ML/STree/blob/master/notebooks/test_graphs.ipynb) Test Graphics
### Command line ### Command line

370
notebooks/features.ipynb Normal file
View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test smple_weight, kernels, C, sklearn estimator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup\n",
"Uncomment the next cell if STree is not already installed"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]\n",
"# Set weights inverse to its count class in dataset\n",
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
"weights[ytrain==1] = 1.99755\n",
"weights_test = np.ones(Xtest.shape[0],) * 1.00244\n",
"weights_test[ytest==1] = 1.99755 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tests"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test smple_weights\n",
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy of Train without weights 0.9722222222222222\nAccuracy of Train with weights 0.9875478927203065\nAccuracy of Tests without weights 0.9508928571428571\nAccuracy of Tests with weights 0.9486607142857143\n"
}
],
"source": [
"C = 23\n",
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test accuracy with different kernels\n",
"Compute accuracy on train and test set with default hyperparmeters of every kernel"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9712643678160919\tAccuracy_test: 0.953125\nTime: 0.08s\tKernel: rbf\tAccuracy_train: 0.9932950191570882\tAccuracy_test: 0.9620535714285714\nTime: 0.05s\tKernel: poly\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.9419642857142857\n"
}
],
"source": [
"random_state=1\n",
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n",
" accuracy_train = clf.score(Xtrain, ytrain)\n",
" accuracy_test = clf.score(Xtest, ytest)\n",
" time_spent = time.time() - now\n",
" print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test diferent values of C"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.977636 counts=(array([0, 1]), array([ 7, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.945280 counts=(array([0, 1]), array([691, 40]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down, <cgaf> - Leaf class=1 belief= 0.983923 counts=(array([0, 1]), array([ 5, 306]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.945430 counts=(array([0, 1]), array([693, 40]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([4]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951989 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 
counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.957004 counts=(array([0, 1]), array([690, 31]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9799\nClassifier's accuracy (test) : 0.9531\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n\n**************************************************\n0.5032 secs\n"
}
],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
" clf = Stree(C=C, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" print(f\"************** C={C} ****************************\")\n",
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
" print(clf)\n",
" print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test iterator\n",
"Check different weays of using the iterator"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n"
}
],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([15]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([9]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([10]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.969653 counts=(array([0, 1]), array([671, 21]))\n"
}
],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test STree is a sklearn estimator"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x124d443b0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x124d3b4d0>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x124d3b3b0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x124d33cb0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x124d33dd0>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x124d33ef0>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x124d35050>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x124d35170>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x124d3e4d0>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x124d3e4d0>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x124d35320>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x124d35290>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x124d3b5f0>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x124d3b290>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x124d3b710>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x124d44290>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x124d33b90>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x124d3b950>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x124d445f0>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x124d3e050>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x124d3ea70>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x124d3ba70>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 
0x124d3e170>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x124d3e170>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x124d3e170>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x124d480e0>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x124d2d9e0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x124d3e710>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x124d3e5f0>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x124d44c20>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x124d48200>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x124d35830>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x124d359e0>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x124d35b00>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x124d35c20>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x124d35d40>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x124d44e60>, 'Stree')\n38 functools.partial(<function check_set_params at 0x124d44f80>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x124d35440>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x124d35710>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x124d483b0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x124d48440>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x124d484d0>, 'Stree')\n"
}
],
"source": [
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Stree(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"check_estimator(Stree())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compare to SVM"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "== Not Weighted ===\nSVC train score ..: 0.9530651340996169\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9620535714285714\nSTree test score .: 0.9642857142857143\n==== Weighted =====\nSVC train score ..: 0.960727969348659\nSTree train score : 0.960727969348659\nSVC test score ...: 0.953125\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9397723008352139\n*STree test score : 0.9431162390279932\n"
}
],
"source": [
"svc = SVC(C=7, kernel='rbf', gamma=.001, random_state=random_state)\n",
"clf = Stree(C=17, kernel='rbf', gamma=.001, random_state=random_state)\n",
"svc.fit(Xtrain, ytrain)\n",
"clf.fit(Xtrain, ytrain)\n",
"print(\"== Not Weighted ===\")\n",
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
"svc.fit(Xtrain, ytrain, weights)\n",
"clf.fit(Xtrain, ytrain, weights)\n",
"print(\"==== Weighted =====\")\n",
"print(\"SVC train score ..:\", svc.score(Xtrain, ytrain))\n",
"print(\"STree train score :\", clf.score(Xtrain, ytrain))\n",
"print(\"SVC test score ...:\", svc.score(Xtest, ytest))\n",
"print(\"STree test score .:\", clf.score(Xtest, ytest))\n",
"print(\"*SVC test score ..:\", svc.score(Xtest, ytest, weights_test))\n",
"print(\"*STree test score :\", clf.score(Xtest, ytest, weights_test))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down, <cgaf> - Leaf class=1 belief= 0.978056 counts=(array([0, 1]), array([ 7, 312]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.953103 counts=(array([0, 1]), array([691, 34]))\n\n"
}
],
"source": [
"print(clf)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -1,337 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test smple_weight, kernels, C, sklearn estimator"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup\n",
"Uncomment the next cell if STree is not already installed"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#\n",
"# Google Colab setup\n",
"#\n",
"#!pip install git+https://github.com/doctorado-ml/stree"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.svm import SVC\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.datasets import make_classification, load_iris, load_wine\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"if not os.path.isfile('data/creditcard.csv'):\n",
" !wget --no-check-certificate --content-disposition http://nube.jccm.es/index.php/s/Zs7SYtZQJ3RQ2H2/download\n",
" !tar xzf creditcard.tgz"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.177% 495\nValid: 66.823% 997\n"
}
],
"source": [
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]\n",
"# Set weights inverse to its count class in dataset\n",
"weights = np.ones(Xtrain.shape[0],) * 1.00244\n",
"weights[ytrain==1] = 1.99755 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tests"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test smple_weights\n",
"Compute accuracy with weights in samples. The weights are set based on the inverse of the number of samples of each class"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Accuracy of Train without weights 0.9770114942528736\nAccuracy of Train with weights 0.9818007662835249\nAccuracy of Tests without weights 0.953125\nAccuracy of Tests with weights 0.9419642857142857\n"
}
],
"source": [
"C = 23\n",
"print(\"Accuracy of Train without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Train with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtrain, ytrain))\n",
"print(\"Accuracy of Tests without weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain).score(Xtest, ytest))\n",
"print(\"Accuracy of Tests with weights\", Stree(C=C, random_state=1).fit(Xtrain, ytrain, sample_weight=weights).score(Xtest, ytest))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test accuracy with different kernels\n",
"Compute accuracy on train and test set with default hyperparmeters of every kernel"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Time: 0.20s\tKernel: linear\tAccuracy_train: 0.9712643678160919\tAccuracy_test: 0.9575892857142857\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9932950191570882\tAccuracy_test: 0.9620535714285714\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9904214559386973\tAccuracy_test: 0.9508928571428571\n"
}
],
"source": [
"random_state=1\n",
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = Stree(C=7, kernel=kernel, random_state=random_state).fit(Xtrain, ytrain)\n",
" accuracy_train = clf.score(Xtrain, ytrain)\n",
" accuracy_test = clf.score(Xtest, ytest)\n",
" time_spent = time.time() - now\n",
" print(f\"Time: {time_spent:.2f}s\\tKernel: {kernel}\\tAccuracy_train: {accuracy_train}\\tAccuracy_test: {accuracy_test}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test diferent values of C"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"tags": [
"outputPrepend"
]
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.980583 counts=(array([0, 1]), array([ 6, 303]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up, <cgaf> - Leaf class=0 belief= 0.943836 counts=(array([0, 1]), array([689, 41]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9576\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.942935 counts=(array([0, 1]), array([694, 42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([4]))\nroot - Up, <cgaf> - Leaf class=0 belief= 0.951989 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9674\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([312]))\nroot - Down - Up, <pure> - Leaf class=0 belief= 1.000000 
counts=(array([0]), array([4]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.953039 counts=(array([0, 1]), array([690, 34]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9770\nClassifier's accuracy (test) : 0.9509\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([12]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, <pure> 
- Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n\n**************************************************\n0.9578 secs\n"
}
],
"source": [
"t = time.time()\n",
"for C in (.001, .01, 1, 5, 17):\n",
" clf = Stree(C=C, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" print(f\"************** C={C} ****************************\")\n",
" print(f\"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}\")\n",
" print(f\"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}\")\n",
" print(clf)\n",
" print(f\"**************************************************\")\n",
"print(f\"{time.time() - t:.4f} secs\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "[[0.88204928 0.11795072]\n [0.8640131 0.1359869 ]\n [0.94207521 0.05792479]\n [0.90219947 0.09780053]]\n"
}
],
"source": [
"import numpy as np\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"scaler = StandardScaler()\n",
"cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n",
"cclf.fit(Xtrain, ytrain)\n",
"res = cclf.predict_proba(Xtest)\n",
"print(res[:4, :])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test iterator\n",
"Check different ways of using the iterator"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([12]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n"
}
],
"source": [
"#check iterator\n",
"for i in list(clf):\n",
" print(i)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([314]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([12]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief= 0.965714 counts=(array([0, 1]), array([676, 24]))\n"
}
],
"source": [
"#check iterator again\n",
"for i in clf:\n",
" print(i)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test STree is a sklearn estimator"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12bd3b5f0>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12bd31710>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12bd315f0>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12bd21ef0>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12bd2d050>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12bd2d170>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12bd2d290>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12bd2d3b0>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12bd37710>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12bd37710>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12bd2d560>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12bd2d4d0>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12bd31830>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12bd314d0>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12bd31950>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12bd3b4d0>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12bd21dd0>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12bd31b90>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12bd3b830>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12bd37290>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12bd37cb0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12bd31cb0>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 
0x12bd373b0>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12bd373b0>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12bd373b0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12bd40320>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12bd20ef0>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12bd37950>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12bd37830>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12bd3be60>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12bd40440>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12bd2da70>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12bd2dc20>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12bd2dd40>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12bd2de60>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12bd2df80>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12bd400e0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12bd40200>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12bd2d680>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12bd2d950>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x12bd405f0>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12bd40680>, 'Stree')\n43 functools.partial(<function check_requires_y_none at 0x12bd40710>, 'Stree')\n"
}
],
"source": [
"# Make checks one by one\n",
"c = 0\n",
"checks = check_estimator(Stree(), generate_only=True)\n",
"for check in checks:\n",
" c += 1\n",
" print(c, check[1])\n",
" check[1](check[0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
"check_estimator(Stree())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6-final"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -11,6 +11,7 @@ import os
import numpy as np import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC from sklearn.svm import SVC, LinearSVC
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import ( from sklearn.utils.validation import (
check_X_y, check_X_y,
@@ -18,6 +19,8 @@ from sklearn.utils.validation import (
check_is_fitted, check_is_fitted,
_check_sample_weight, _check_sample_weight,
) )
from sklearn.utils.sparsefuncs import count_nonzero
from sklearn.metrics._classification import _weighted_sum, _check_targets
class Snode: class Snode:
@@ -201,6 +204,13 @@ class Stree(BaseEstimator, ClassifierMixin):
) -> "Stree": ) -> "Stree":
"""Build the tree based on the dataset of samples and its labels """Build the tree based on the dataset of samples and its labels
:param X: dataset of samples used to build the tree
:type X: np.array
:param y: samples labels
:type y: np.array
:param sample_weight: weights of the samples. Rescale C per sample.
Higher weights force the classifier to put more emphasis on these points
:type sample_weight: np.array optional
:raises ValueError: if parameters C or max_depth are out of bounds :raises ValueError: if parameters C or max_depth are out of bounds
:return: itself to be able to chain actions: fit().predict() ... :return: itself to be able to chain actions: fit().predict() ...
:rtype: Stree :rtype: Stree
@@ -284,7 +294,8 @@ class Stree(BaseEstimator, ClassifierMixin):
:type X: np.ndarray :type X: np.ndarray
:param y: samples labels :param y: samples labels
:type y: np.ndarray :type y: np.ndarray
:param sample_weight: weight of samples (used in boosting) :param sample_weight: weight of samples. Rescale C per sample.
Higher weights force the classifier to put more emphasis on these points.
:type sample_weight: np.ndarray :type sample_weight: np.ndarray
:param depth: actual depth in the tree :param depth: actual depth in the tree
:type depth: int :type depth: int
@@ -435,20 +446,35 @@ class Stree(BaseEstimator, ClassifierMixin):
result[:, 0] = 1 - result[:, 1] result[:, 0] = 1 - result[:, 1]
return self._reorder_results(result, indices) return self._reorder_results(result, indices)
def score(self, X: np.array, y: np.array) -> float: def score(
self, X: np.array, y: np.array, sample_weight: np.array = None
) -> float:
"""Compute accuracy of the prediction """Compute accuracy of the prediction
:param X: dataset of samples to make predictions :param X: dataset of samples to make predictions
:type X: np.array :type X: np.array
:param y: samples labels :param y: samples labels
:type y: np.array :type y: np.array
:param sample_weight: weights of the samples, used to compute a weighted accuracy.
Higher weights make those samples count more in the resulting score
:type sample_weight: np.array optional
:return: accuracy of the prediction :return: accuracy of the prediction
:rtype: float :rtype: float
""" """
# sklearn check # sklearn check
check_is_fitted(self) check_is_fitted(self)
yp = self.predict(X).reshape(y.shape)
return np.mean(yp == y) y_pred = self.predict(X).reshape(y.shape)
# Compute accuracy for each possible representation
y_type, y_true, y_pred = _check_targets(y, y_pred)
check_consistent_length(y_true, y_pred, sample_weight)
if y_type.startswith("multilabel"):
differing_labels = count_nonzero(y_true - y_pred, axis=1)
score = differing_labels == 0
else:
score = y_true == y_pred
return _weighted_sum(score, sample_weight, normalize=True)
def __iter__(self) -> Siterator: def __iter__(self) -> Siterator:
"""Create an iterator to be able to visit the nodes of the tree in preorder, """Create an iterator to be able to visit the nodes of the tree in preorder,