Refactor split_data adding sample_weight

This commit is contained in:
2020-05-29 18:52:23 +02:00
parent ed98054f0d
commit a22ae81b54
5 changed files with 152 additions and 141 deletions

1
data/.gitignore vendored
View File

@@ -1 +0,0 @@
*

View File

@@ -8,7 +8,8 @@
"source": [
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from stree import Stree"
]
@@ -27,12 +28,51 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
}
],
"source": [
"import pandas as pd\n",
"df = pd.read_csv('data/creditcard.csv')\n",
"y = df.Class.values\n",
"X = df.drop(['Class', 'Time', 'Amount'], axis=1).values"
"random_state=1\n",
"\n",
"def load_creditcard(n_examples=0):\n",
" import pandas as pd\n",
" import numpy as np\n",
" import random\n",
" df = pd.read_csv('data/creditcard.csv')\n",
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
" y = df.Class\n",
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
" if n_examples > 0:\n",
" # Take first n_examples samples\n",
" X = X[:n_examples, :]\n",
" y = y[:n_examples, :]\n",
" else:\n",
" # Take all the positive samples with a number of random negatives\n",
" if n_examples < 0:\n",
" Xt = X[(y == 1).ravel()]\n",
" yt = y[(y == 1).ravel()]\n",
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
" X = np.append(Xt, X[indices], axis=0)\n",
" y = np.append(yt, y[indices], axis=0)\n",
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
" return Xtrain, Xtest, ytrain, ytest\n",
"\n",
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
"data = load_creditcard(-1000) # Take all the samples\n",
"\n",
"Xtrain = data[0]\n",
"Xtest = data[1]\n",
"ytrain = data[2]\n",
"ytest = data[3]"
]
},
{
@@ -43,14 +83,12 @@
{
"output_type": "stream",
"name": "stdout",
"text": "\n"
"text": "root\nroot - Down - Leaf class=1.0 belief=0.976000 counts=(array([0., 1.]), array([ 3, 122]))\nroot - Up - Leaf class=0.0 belief=0.977528 counts=(array([0., 1.]), array([696, 16]))\n\n"
}
],
"source": [
"c = Stree(C=17, max_depth=2)\n",
"print(c)\n",
"c.fit(X, y)\n",
"print(len(list(c)))\n",
"c = Stree(max_depth=2)\n",
"c.fit(Xtrain, ytrain)\n",
"print(c)"
]
},
@@ -62,7 +100,7 @@
"source": [
"#'base_estimator': [DecisionTreeClassifier(max_depth=1), Stree(max_depth=2), Stree(max_depth=3)],\n",
"parameters = {\n",
" 'base_estimator': [Stree(max_depth=2), Stree(max_depth=3)],\n",
" 'base_estimator': [LinearSVC(), Stree(max_depth=2), Stree(max_depth=3)],\n",
" 'n_estimators': [20, 50, 100, 150],\n",
" 'learning_rate': [.5, 1, 1.5] \n",
"}"
@@ -89,41 +127,58 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fitting 5 folds for each of 24 candidates, totalling 120 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.5s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.1837s.) Setting batch_size=2.\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0313s.) Setting batch_size=4.\n[Parallel(n_jobs=-1)]: Done 64 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0302s.) Setting batch_size=8.\n[Parallel(n_jobs=-1)]: Done 92 out of 120 | elapsed: 2.7s remaining: 0.8s\n[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed: 2.7s remaining: 0.0s\n[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 2.7s finished\n"
},
{
"output_type": "error",
"ename": "ValueError",
"evalue": "Stree doesn't support sample_weight.",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-7-1a0e5b8c6bec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mclf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAdaBoostClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgrid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 71\u001b[0m FutureWarning)\n\u001b[1;32m 72\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 763\u001b[0m \u001b[0mrefit_start_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 765\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 766\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 767\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 442\u001b[0m \u001b[0;31m# Fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 443\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 444\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_estimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;31m# Check parameters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_estimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;31m# Clear any previous fit results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36m_validate_estimator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhas_fit_parameter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase_estimator_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"sample_weight\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m raise ValueError(\"%s doesn't support sample_weight.\"\n\u001b[0;32m--> 461\u001b[0;31m % self.base_estimator_.__class__.__name__)\n\u001b[0m\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_boost\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miboost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: Stree doesn't support sample_weight."
]
"text": "(X: numpy.ndarray, y: numpy.ndarray, sample_weight: <built-in function array> = None) -> 'Stree'\n"
}
],
"source": [
"random_state=2020\n",
"clf = AdaBoostClassifier(random_state=random_state)\n",
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
"grid.fit(X, y)"
"from inspect import signature\n",
"print(signature(c.fit))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils.validation import _check_sample_weight"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,\n n_estimators=20, random_state=2020)\n"
"text": "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.1671s.) Setting batch_size=2.\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0413s.) Setting batch_size=4.\n[Parallel(n_jobs=-1)]: Done 50 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too slow (7.7880s.) Setting batch_size=1.\n[Parallel(n_jobs=-1)]: Done 74 tasks | elapsed: 9.2s\n[Parallel(n_jobs=-1)]: Done 121 tasks | elapsed: 48.9s\n[Parallel(n_jobs=-1)]: Done 140 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 161 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.6min finished\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [LinearSVC(), Stree(max_depth=2),\n Stree(max_depth=3)],\n 'learning_rate': [0.5, 1, 1.5],\n 'n_estimators': [20, 50, 100, 150]},\n return_train_score=True, verbose=10)"
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"random_state=2020\n",
"clf = AdaBoostClassifier(random_state=random_state)\n",
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
"grid.fit(Xtrain, ytrain)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,\n n_estimators=150, random_state=2020)\n"
}
],
"source": [
@@ -131,11 +186,12 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": []
"source": [
"AdaBoostClassifier(base_estimator=Stree(max_depth=3), learning_rate=0.5,\n",
" n_estimators=20, random_state=2020)"
]
}
],
"metadata": {

View File

@@ -48,7 +48,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.043% 493\nValid: 66.957% 999\n"
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
}
],
"source": [
@@ -103,7 +103,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "depth: 1\ndepth: 2\ndepth: 2\n************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983766 counts=(array([0, 1]), array([ 5, 303]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.942935 counts=(array([0, 1]), array([694, 42]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 3\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.990196 counts=(array([0, 1]), array([ 3, 303]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.942935 counts=(array([0, 1]), array([694, 42]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 4\ndepth: 3\ndepth: 2\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9688\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.954608 counts=(array([0, 1]), array([694, 33]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 2\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9693\nClassifier's accuracy (test) : 0.9710\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([313]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.955801 counts=(array([0, 1]), array([692, 32]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 4\ndepth: 5\ndepth: 6\ndepth: 6\ndepth: 5\ndepth: 4\ndepth: 3\ndepth: 2\ndepth: 3\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9818\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n\n**************************************************\n0.6576 secs\n"
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9797\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1.0 belief=0.984127 counts=(array([0., 1.]), array([ 2, 124]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up\nroot - Up - Down, <cgaf> - Leaf class=0.0 belief=0.750000 counts=(array([0., 1.]), array([3, 1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.980029 counts=(array([0., 1.]), array([687, 14]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([124]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.977560 counts=(array([0., 1.]), array([697, 16]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9833\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n\n**************************************************\n0.2235 secs\n"
}
],
"source": [
@@ -144,7 +144,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
}
],
"source": [
@@ -161,7 +161,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n"
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
}
],
"source": [
@@ -174,13 +174,7 @@
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "depth: 1\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\n"
}
],
"outputs": [],
"source": [
"# Check if the classifier is a sklearn estimator\n",
"from sklearn.utils.estimator_checks import check_estimator\n",
@@ -195,7 +189,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12959d320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x129595440>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\n3 functools.partial(<function check_fit_score_takes_y at 0x129595320>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12958ed40>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12958ee60>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12958ef80>, 'Stree')\n7 functools.partial(<function check_sample_weights_invariance at 0x1295900e0>, 'Stree')\n8 functools.partial(<function check_estimators_fit_returns_self at 0x129598440>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\n9 functools.partial(<function check_estimators_fit_returns_self at 0x129598440>, 'Stree', readonly_memmap=True)\ndepth: 1\ndepth: 2\ndepth: 2\n10 functools.partial(<function check_complex_data at 0x129590290>, 'Stree')\n11 functools.partial(<function check_dtype_object at 0x129590200>, 'Stree')\ndepth: 1\n12 functools.partial(<function check_estimators_empty_data_messages at 0x129595560>, 'Stree')\n13 functools.partial(<function check_pipeline_consistency at 0x129595200>, 'Stree')\ndepth: 1\ndepth: 1\n14 functools.partial(<function check_estimators_nan_inf at 0x129595680>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n15 functools.partial(<function check_estimators_overwrite_params at 0x12959d200>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\n16 functools.partial(<function check_estimator_sparse_data at 0x12958ec20>, 'Stree')\n17 functools.partial(<function check_estimators_pickle at 0x1295958c0>, 'Stree')\ndepth: 1\n18 functools.partial(<function check_classifier_data_not_an_array at 0x12959d560>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n19 functools.partial(<function check_classifiers_one_label at 0x129595f80>, 'Stree')\n20 functools.partial(<function check_classifiers_classes at 0x1295989e0>, 'Stree')\ndepth: 1\ndepth: 1\ndepth: 1\n21 functools.partial(<function check_estimators_partial_fit_n_features at 0x1295959e0>, 'Stree')\n22 functools.partial(<function check_classifiers_train at 0x1295980e0>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n23 functools.partial(<function check_classifiers_train at 0x1295980e0>, 'Stree', readonly_memmap=True)\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n24 functools.partial(<function check_classifiers_regression_target at 0x12959df80>, 'Stree')\n25 functools.partial(<function check_supervised_y_no_nan at 0x12958cd40>, 'Stree')\n26 functools.partial(<function check_supervised_y_2d at 0x129598680>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\n27 functools.partial(<function check_estimators_unfitted at 0x129598560>, 'Stree')\n28 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12959db00>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\n29 functools.partial(<function check_decision_proba_consistency at 0x1295a20e0>, 'Stree')\n30 functools.partial(<function check_fit2d_predict1d at 0x1295907a0>, 'Stree')\ndepth: 1\n31 functools.partial(<function check_methods_subset_invariance at 0x129590950>, 'Stree')\ndepth: 1\n32 functools.partial(<function check_fit2d_1sample at 0x129590a70>, 'Stree')\n33 functools.partial(<function check_fit2d_1feature at 0x129590b90>, 'Stree')\ndepth: 1\ndepth: 2\n34 functools.partial(<function check_fit1d at 0x129590cb0>, 'Stree')\n35 functools.partial(<function check_get_params_invariance at 0x12959dd40>, 'Stree')\n36 functools.partial(<function check_set_params at 0x12959de60>, 'Stree')\n37 functools.partial(<function check_dict_unchanged at 0x1295903b0>, 'Stree')\ndepth: 1\n38 functools.partial(<function check_dont_overwrite_parameters at 0x129590680>, 'Stree')\ndepth: 1\n39 functools.partial(<function check_fit_idempotent at 0x1295a2290>, 'Stree')\ndepth: 1\ndepth: 1\n"
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12aabb320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12aab0440>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12aab0320>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12aaaac20>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12aaaad40>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12aaaae60>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12aaaaf80>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12aaac0e0>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12aaac290>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12aaac200>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12aab0560>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12aab0200>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12aab0680>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12aabb200>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12aaaab00>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12aab08c0>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12aabb560>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12aab0f80>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12aab69e0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12aab09e0>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12aabf050>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12aaa0c20>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12aab6680>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12aab6560>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12aabbb90>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12aabf170>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12aaac7a0>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12aaac950>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12aaaca70>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12aaacb90>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12aaaccb0>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12aabbdd0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12aabbef0>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12aaac3b0>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12aaac680>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x12aabf320>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12aabf3b0>, 'Stree')\n"
}
],
"source": [
@@ -211,9 +205,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.7.6 64-bit ('general': venv)",
"language": "python",
"name": "python3"
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
},
"language_info": {
"codemirror_mode": {

View File

@@ -13,7 +13,8 @@ import os
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import check_classification_targets
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, _check_sample_weight, check_random_state
class Snode:
@@ -102,9 +103,8 @@ class Siterator:
class Stree(BaseEstimator, ClassifierMixin):
"""
"""
__folder = 'data/'
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0,
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = None,
max_depth: int=None, tol: float=1e-4, use_predictions: bool = False):
self.max_iter = max_iter
self.C = C
@@ -145,25 +145,25 @@ class Stree(BaseEstimator, ClassifierMixin):
return origin[up[:, 0]] if any(up) else None, \
origin[down[:, 0]] if any(down) else None
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
if self.use_predictions:
yp = node._clf.predict(data)
down = (yp == 1).reshape(-1, 1)
res = np.expand_dims(node._clf.decision_function(data), 1)
else:
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
# computes positition of every sample is w.r.t. the hyperplane
res = self._linear_function(data, node)
down = res > 0
data_up, data_down = self._split_array(data, down)
indices_up, indices_down = self._split_array(indices, down)
res_up, res_down = self._split_array(res, down)
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
# data_up, data_down = self._split_array(data, down)
# indices_up, indices_down = self._split_array(indices, down)
# res_up, res_down = self._split_array(res, down)
# weight_up, weight_down = self._split_array(weights, down)
#return [data_up, indices_up, data_down, indices_down, weight_up, weight_down, res_up, res_down]
return res
def fit(self, X: np.ndarray, y: np.ndarray, weighted_samples: np.array=None, **fitparams: dict) -> 'Stree':
from sklearn.utils.multiclass import check_classification_targets
if fitparams is not None:
self.set_params(**fitparams)
def _split_criteria(self, data: np.array) -> np.array:
return data > 0
def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None) -> 'Stree':
# Check parameters are Ok.
if type(y).__name__ == 'np.ndarray':
y = y.ravel()
if self.C < 0:
@@ -173,12 +173,15 @@ class Stree(BaseEstimator, ClassifierMixin):
raise ValueError(f"Maximum depth has to be greater than 1... got (max_depth={self.max_depth})")
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
check_classification_targets(y)
# Initialize computed parameters
#self.random_state = check_random_state(self.random_state)
self.classes_ = np.unique(y)
self.n_iter_ = self.max_iter
self.depth_ = 0
check_classification_targets(y)
self.n_features_in_ = X.shape[1]
self.tree_ = self.train(X, y, 1, 'root')
self.tree_ = self.train(X, y, sample_weight, 1, 'root')
self._build_predictor()
return self
@@ -195,7 +198,7 @@ class Stree(BaseEstimator, ClassifierMixin):
run_tree(self.tree_)
def train(self, X: np.ndarray, y: np.ndarray, depth: int, title: str = 'root') -> Snode:
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, depth: int, title: str) -> Snode:
if depth > self.__max_depth:
return None
@@ -203,21 +206,24 @@ class Stree(BaseEstimator, ClassifierMixin):
# only 1 class => pure dataset
return Snode(None, X, y, title + ', <pure>')
# Train the model
clf = LinearSVC(max_iter=self.max_iter, C=self.C,
random_state=self.random_state)
clf.fit(X, y)
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
C=self.C) #, sample_weight=sample_weight)
clf.fit(X, y, sample_weight=sample_weight)
tree = Snode(clf, X, y, title)
self.depth_ = max(depth, self.depth_)
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
down = self._split_criteria(self._distances(tree, X))
X_U, X_D = self._split_array(X, down)
y_u, y_d = self._split_array(y, down)
sw_u, sw_d = self._split_array(sample_weight, down)
if X_U is None or X_D is None:
# didn't part anything
return Snode(clf, X, y, title + ', <cgaf>')
tree.set_up(self.train(X_U, y_u, depth + 1, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, depth + 1, title + ' - Down'))
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
return tree
def _reorder_results(self, y: np.array, indices: np.array, proba=False) -> np.array:
if proba:
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
if y.ndim > 1 and y.shape[1] > 1:
# if predict_proba return np.array of floats
y_ordered = np.zeros(y.shape, dtype=float)
else:
@@ -236,10 +242,12 @@ class Stree(BaseEstimator, ClassifierMixin):
# set a class for every sample in dataset
prediction = np.full((xp.shape[0], 1), node._class)
return prediction, indices
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, node.get_down())
m, n = predict_class(u, i_u, node.get_up())
return np.append(k, m), np.append(l, n)
down = self._split_criteria(self._distances(node, xp))
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
@@ -276,10 +284,15 @@ class Stree(BaseEstimator, ClassifierMixin):
prediction = np.full((xp.shape[0], 1), node._class)
prediction_proba = dist
return np.append(prediction, prediction_proba, axis=1), indices
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
k, l = predict_class(d, i_d, r_d, node.get_down())
m, n = predict_class(u, i_u, r_u, node.get_up())
return np.append(k, m), np.append(l, n)
distances = self._distances(node, xp)
down = self._split_criteria(distances)
X_U, X_D = self._split_array(xp, down)
i_u, i_d = self._split_array(indices, down)
di_u, di_d = self._split_array(distances, down)
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
# sklearn check
check_is_fitted(self, ['tree_'])
@@ -295,7 +308,7 @@ class Stree(BaseEstimator, ClassifierMixin):
# Probability of being 1
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
result[:, 0] = 1 - result[:, 1] # Probability of being 0
return self._reorder_results(result, indices, proba=True)
return self._reorder_results(result, indices)
def score(self, X: np.array, y: np.array) -> float:
"""Return accuracy
@@ -319,35 +332,3 @@ class Stree(BaseEstimator, ClassifierMixin):
output += str(i) + '\n'
return output
def get_folder(self) -> str:
return self.__folder
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
"""Save the dataset of the node in a csv file
:param tree: node with data to save
:type tree: Snode
:param catalog: catalog file handler
:type catalog: typing.TextIO
:param number: sequential number for the generated file name
:type number: int
"""
data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
name = f"{self.__folder}dataset{number}.csv"
np.savetxt(name, data, delimiter=",")
catalog.write(f"{name}, - {str(tree)}")
if tree.is_leaf():
return
self._save_datasets(tree.get_down(), catalog, number + 1)
self._save_datasets(tree.get_up(), catalog, number + 2)
def get_catalog_name(self):
return self.__folder + "catalog.txt"
def save_sub_datasets(self):
"""Save the every dataset stored in the tree to check with manual classifier
"""
if not os.path.isdir(self.__folder):
os.mkdir(self.__folder)
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
self._save_datasets(self.tree_, catalog, 1)

View File

@@ -107,24 +107,6 @@ class Stree_test(unittest.TestCase):
res.append(y_original[row])
return res
def test_subdatasets(self):
"""Check if the subdatasets files have the same labels as the original dataset
"""
self._clf.save_sub_datasets()
with open(self._clf.get_catalog_name()) as cat_file:
catalog = csv.reader(cat_file, delimiter=',')
for row in catalog:
X, y = self._get_Xy()
x_file, y_file = self._get_file_data(row[0])
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
self.assertTrue(np.array_equal(y_file, y_original))
if os.path.isdir(self._clf.get_folder()):
try:
os.remove(f"{self._clf.get_folder()}*")
os.rmdir(self._clf.get_folder())
except:
pass
def test_single_prediction(self):
X, y = self._get_Xy()
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
@@ -141,10 +123,9 @@ class Stree_test(unittest.TestCase):
X, y = self._get_Xy()
accuracy_score = self._clf.score(X, y)
yp = self._clf.predict(X)
right = (yp == y).astype(int)
accuracy_computed = sum(right) / len(y)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertGreater(accuracy_score, 0.8)
self.assertGreater(accuracy_score, 0.9)
def test_single_predict_proba(self):
"""Check that element 28 has a prediction different that the current label