mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-15 23:46:02 +00:00
Refactor split_data adding sample_weight
This commit is contained in:
1
data/.gitignore
vendored
1
data/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
*
|
@@ -8,7 +8,8 @@
|
||||
"source": [
|
||||
"from sklearn.ensemble import AdaBoostClassifier\n",
|
||||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"from sklearn.svm import LinearSVC\n",
|
||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||
"from sklearn.datasets import load_iris\n",
|
||||
"from stree import Stree"
|
||||
]
|
||||
@@ -27,12 +28,51 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"df = pd.read_csv('data/creditcard.csv')\n",
|
||||
"y = df.Class.values\n",
|
||||
"X = df.drop(['Class', 'Time', 'Amount'], axis=1).values"
|
||||
"random_state=1\n",
|
||||
"\n",
|
||||
"def load_creditcard(n_examples=0):\n",
|
||||
" import pandas as pd\n",
|
||||
" import numpy as np\n",
|
||||
" import random\n",
|
||||
" df = pd.read_csv('data/creditcard.csv')\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(df.Class[df.Class == 1].count()*100/df.shape[0], df.Class[df.Class == 1].count()))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(df.Class[df.Class == 0].count()*100/df.shape[0], df.Class[df.Class == 0].count()))\n",
|
||||
" y = df.Class\n",
|
||||
" X = df.drop(['Class', 'Time', 'Amount'], axis=1).values\n",
|
||||
" if n_examples > 0:\n",
|
||||
" # Take first n_examples samples\n",
|
||||
" X = X[:n_examples, :]\n",
|
||||
" y = y[:n_examples, :]\n",
|
||||
" else:\n",
|
||||
" # Take all the positive samples with a number of random negatives\n",
|
||||
" if n_examples < 0:\n",
|
||||
" Xt = X[(y == 1).ravel()]\n",
|
||||
" yt = y[(y == 1).ravel()]\n",
|
||||
" indices = random.sample(range(X.shape[0]), -1 * n_examples)\n",
|
||||
" X = np.append(Xt, X[indices], axis=0)\n",
|
||||
" y = np.append(yt, y[indices], axis=0)\n",
|
||||
" print(\"X.shape\", X.shape, \" y.shape\", y.shape)\n",
|
||||
" print(\"Fraud: {0:.3f}% {1}\".format(len(y[y == 1])*100/X.shape[0], len(y[y == 1])))\n",
|
||||
" print(\"Valid: {0:.3f}% {1}\".format(len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0])))\n",
|
||||
" Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=random_state, stratify=y)\n",
|
||||
" return Xtrain, Xtest, ytrain, ytest\n",
|
||||
"\n",
|
||||
"# data = load_creditcard(-5000) # Take all true samples + 5000 of the others\n",
|
||||
"# data = load_creditcard(5000) # Take the first 5000 samples\n",
|
||||
"data = load_creditcard(-1000) # Take all the samples\n",
|
||||
"\n",
|
||||
"Xtrain = data[0]\n",
|
||||
"Xtest = data[1]\n",
|
||||
"ytrain = data[2]\n",
|
||||
"ytest = data[3]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -43,14 +83,12 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "\n"
|
||||
"text": "root\nroot - Down - Leaf class=1.0 belief=0.976000 counts=(array([0., 1.]), array([ 3, 122]))\nroot - Up - Leaf class=0.0 belief=0.977528 counts=(array([0., 1.]), array([696, 16]))\n\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"c = Stree(C=17, max_depth=2)\n",
|
||||
"print(c)\n",
|
||||
"c.fit(X, y)\n",
|
||||
"print(len(list(c)))\n",
|
||||
"c = Stree(max_depth=2)\n",
|
||||
"c.fit(Xtrain, ytrain)\n",
|
||||
"print(c)"
|
||||
]
|
||||
},
|
||||
@@ -62,7 +100,7 @@
|
||||
"source": [
|
||||
"#'base_estimator': [DecisionTreeClassifier(max_depth=1), Stree(max_depth=2), Stree(max_depth=3)],\n",
|
||||
"parameters = {\n",
|
||||
" 'base_estimator': [Stree(max_depth=2), Stree(max_depth=3)],\n",
|
||||
" 'base_estimator': [LinearSVC(), Stree(max_depth=2), Stree(max_depth=3)],\n",
|
||||
" 'n_estimators': [20, 50, 100, 150],\n",
|
||||
" 'learning_rate': [.5, 1, 1.5] \n",
|
||||
"}"
|
||||
@@ -89,41 +127,58 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fitting 5 folds for each of 24 candidates, totalling 120 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.5s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 2.6s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.1837s.) Setting batch_size=2.\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0313s.) Setting batch_size=4.\n[Parallel(n_jobs=-1)]: Done 64 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0302s.) Setting batch_size=8.\n[Parallel(n_jobs=-1)]: Done 92 out of 120 | elapsed: 2.7s remaining: 0.8s\n[Parallel(n_jobs=-1)]: Done 118 out of 120 | elapsed: 2.7s remaining: 0.0s\n[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 2.7s finished\n"
|
||||
},
|
||||
{
|
||||
"output_type": "error",
|
||||
"ename": "ValueError",
|
||||
"evalue": "Stree doesn't support sample_weight.",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-7-1a0e5b8c6bec>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mclf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mAdaBoostClassifier\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgrid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mgrid\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 71\u001b[0m FutureWarning)\n\u001b[1;32m 72\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 73\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 74\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0minner_f\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 75\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/model_selection/_search.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups, **fit_params)\u001b[0m\n\u001b[1;32m 763\u001b[0m \u001b[0mrefit_start_time\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 765\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 766\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 767\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_estimator_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 442\u001b[0m \u001b[0;31m# Fit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 443\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 444\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 445\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_estimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 116\u001b[0m \u001b[0;31m# Check parameters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_estimator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 118\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;31m# Clear any previous fit results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;32m~/.virtualenvs/general/lib/python3.7/site-packages/sklearn/ensemble/_weight_boosting.py\u001b[0m in \u001b[0;36m_validate_estimator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhas_fit_parameter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase_estimator_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"sample_weight\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m raise ValueError(\"%s doesn't support sample_weight.\"\n\u001b[0;32m--> 461\u001b[0;31m % self.base_estimator_.__class__.__name__)\n\u001b[0m\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_boost\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0miboost\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[0;31mValueError\u001b[0m: Stree doesn't support sample_weight."
|
||||
]
|
||||
"text": "(X: numpy.ndarray, y: numpy.ndarray, sample_weight: <built-in function array> = None) -> 'Stree'\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"random_state=2020\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||
"grid.fit(X, y)"
|
||||
"from inspect import signature\n",
|
||||
"print(signature(c.fit))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.utils.validation import _check_sample_weight"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,\n n_estimators=20, random_state=2020)\n"
|
||||
"text": "Fitting 5 folds for each of 36 candidates, totalling 180 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.1671s.) Setting batch_size=2.\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 1.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too fast (0.0413s.) Setting batch_size=4.\n[Parallel(n_jobs=-1)]: Done 50 tasks | elapsed: 1.4s\n[Parallel(n_jobs=-1)]: Batch computation too slow (7.7880s.) Setting batch_size=1.\n[Parallel(n_jobs=-1)]: Done 74 tasks | elapsed: 9.2s\n[Parallel(n_jobs=-1)]: Done 121 tasks | elapsed: 48.9s\n[Parallel(n_jobs=-1)]: Done 140 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 161 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 1.6min finished\n"
|
||||
},
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [LinearSVC(), Stree(max_depth=2),\n Stree(max_depth=3)],\n 'learning_rate': [0.5, 1, 1.5],\n 'n_estimators': [20, 50, 100, 150]},\n return_train_score=True, verbose=10)"
|
||||
},
|
||||
"metadata": {},
|
||||
"execution_count": 9
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"random_state=2020\n",
|
||||
"clf = AdaBoostClassifier(random_state=random_state)\n",
|
||||
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
|
||||
"grid.fit(Xtrain, ytrain)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "AdaBoostClassifier(base_estimator=Stree(max_depth=2), learning_rate=0.5,\n n_estimators=150, random_state=2020)\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -131,11 +186,12 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"AdaBoostClassifier(base_estimator=Stree(max_depth=3), learning_rate=0.5,\n",
|
||||
" n_estimators=20, random_state=2020)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@@ -48,7 +48,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.043% 493\nValid: 66.957% 999\n"
|
||||
"text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -103,7 +103,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "depth: 1\ndepth: 2\ndepth: 2\n************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9550\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down, <cgaf> - Leaf class=1 belief=0.983766 counts=(array([0, 1]), array([ 5, 303]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.942935 counts=(array([0, 1]), array([694, 42]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 3\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1 belief=0.990196 counts=(array([0, 1]), array([ 3, 303]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([2]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.942935 counts=(array([0, 1]), array([694, 42]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 4\ndepth: 3\ndepth: 2\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9688\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([4]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.954608 counts=(array([0, 1]), array([694, 33]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 2\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9693\nClassifier's accuracy (test) : 0.9710\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([313]))\nroot - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))\nroot - Up, <cgaf> - Leaf class=0 belief=0.955801 counts=(array([0, 1]), array([692, 32]))\n\n**************************************************\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 4\ndepth: 5\ndepth: 6\ndepth: 6\ndepth: 5\ndepth: 4\ndepth: 3\ndepth: 2\ndepth: 3\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9818\nClassifier's accuracy (test) : 0.9554\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n\n**************************************************\n0.6576 secs\n"
|
||||
"text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9797\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <cgaf> - Leaf class=1.0 belief=0.984127 counts=(array([0., 1.]), array([ 2, 124]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up\nroot - Up - Down, <cgaf> - Leaf class=0.0 belief=0.750000 counts=(array([0., 1.]), array([3, 1]))\nroot - Up - Up\nroot - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.980029 counts=(array([0., 1.]), array([687, 14]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([124]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.977560 counts=(array([0., 1.]), array([697, 16]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, <cgaf> - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9833\nroot\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n\n**************************************************\n0.2235 secs\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -144,7 +144,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n"
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -161,7 +161,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([307]))\nroot - Down - Up\nroot - Down - Up - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([8]))\nroot - Down - Up - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([25]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([3]))\nroot - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([5]))\nroot - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([5]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, <pure> - Leaf class=1 belief=1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Up - Down - Up, <pure> - Leaf class=0 belief=1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up - Up, <cgaf> - Leaf class=0 belief=0.972263 counts=(array([0, 1]), array([666, 19]))\n"
|
||||
"text": "root\nroot - Down\nroot - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, <pure> - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, <pure> - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, <cgaf> - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -174,13 +174,7 @@
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "depth: 1\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\ndepth: 1\ndepth: 1\n"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Check if the classifier is a sklearn estimator\n",
|
||||
"from sklearn.utils.estimator_checks import check_estimator\n",
|
||||
@@ -195,7 +189,7 @@
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12959d320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x129595440>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 1\ndepth: 1\n3 functools.partial(<function check_fit_score_takes_y at 0x129595320>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12958ed40>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12958ee60>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12958ef80>, 'Stree')\n7 functools.partial(<function check_sample_weights_invariance at 0x1295900e0>, 'Stree')\n8 functools.partial(<function check_estimators_fit_returns_self at 0x129598440>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\n9 functools.partial(<function check_estimators_fit_returns_self at 0x129598440>, 'Stree', readonly_memmap=True)\ndepth: 1\ndepth: 2\ndepth: 2\n10 functools.partial(<function check_complex_data at 0x129590290>, 'Stree')\n11 functools.partial(<function check_dtype_object at 0x129590200>, 'Stree')\ndepth: 1\n12 functools.partial(<function check_estimators_empty_data_messages at 0x129595560>, 'Stree')\n13 functools.partial(<function check_pipeline_consistency at 0x129595200>, 'Stree')\ndepth: 1\ndepth: 1\n14 functools.partial(<function check_estimators_nan_inf at 0x129595680>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n15 functools.partial(<function check_estimators_overwrite_params at 0x12959d200>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\n16 functools.partial(<function check_estimator_sparse_data at 0x12958ec20>, 'Stree')\n17 functools.partial(<function check_estimators_pickle at 0x1295958c0>, 'Stree')\ndepth: 1\n18 functools.partial(<function check_classifier_data_not_an_array at 0x12959d560>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n19 functools.partial(<function check_classifiers_one_label at 0x129595f80>, 'Stree')\n20 functools.partial(<function check_classifiers_classes at 0x1295989e0>, 'Stree')\ndepth: 1\ndepth: 1\ndepth: 1\n21 functools.partial(<function check_estimators_partial_fit_n_features at 0x1295959e0>, 'Stree')\n22 functools.partial(<function check_classifiers_train at 0x1295980e0>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n23 functools.partial(<function check_classifiers_train at 0x1295980e0>, 'Stree', readonly_memmap=True)\ndepth: 1\ndepth: 2\ndepth: 2\ndepth: 1\ndepth: 2\ndepth: 2\n24 functools.partial(<function check_classifiers_regression_target at 0x12959df80>, 'Stree')\n25 functools.partial(<function check_supervised_y_no_nan at 0x12958cd40>, 'Stree')\n26 functools.partial(<function check_supervised_y_2d at 0x129598680>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\ndepth: 1\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 2\ndepth: 3\ndepth: 3\ndepth: 4\ndepth: 4\ndepth: 5\n27 functools.partial(<function check_estimators_unfitted at 0x129598560>, 'Stree')\n28 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12959db00>, 'Stree')\ndepth: 1\ndepth: 2\ndepth: 3\n29 functools.partial(<function check_decision_proba_consistency at 0x1295a20e0>, 'Stree')\n30 functools.partial(<function check_fit2d_predict1d at 0x1295907a0>, 'Stree')\ndepth: 1\n31 functools.partial(<function check_methods_subset_invariance at 0x129590950>, 'Stree')\ndepth: 1\n32 functools.partial(<function check_fit2d_1sample at 0x129590a70>, 'Stree')\n33 functools.partial(<function check_fit2d_1feature at 0x129590b90>, 'Stree')\ndepth: 1\ndepth: 2\n34 functools.partial(<function check_fit1d at 0x129590cb0>, 'Stree')\n35 functools.partial(<function check_get_params_invariance at 0x12959dd40>, 'Stree')\n36 functools.partial(<function check_set_params at 0x12959de60>, 'Stree')\n37 functools.partial(<function check_dict_unchanged at 0x1295903b0>, 'Stree')\ndepth: 1\n38 functools.partial(<function check_dont_overwrite_parameters at 0x129590680>, 'Stree')\ndepth: 1\n39 functools.partial(<function check_fit_idempotent at 0x1295a2290>, 'Stree')\ndepth: 1\ndepth: 1\n"
|
||||
"text": "1 functools.partial(<function check_no_attributes_set_in_init at 0x12aabb320>, 'Stree')\n2 functools.partial(<function check_estimators_dtypes at 0x12aab0440>, 'Stree')\n3 functools.partial(<function check_fit_score_takes_y at 0x12aab0320>, 'Stree')\n4 functools.partial(<function check_sample_weights_pandas_series at 0x12aaaac20>, 'Stree')\n5 functools.partial(<function check_sample_weights_not_an_array at 0x12aaaad40>, 'Stree')\n6 functools.partial(<function check_sample_weights_list at 0x12aaaae60>, 'Stree')\n7 functools.partial(<function check_sample_weights_shape at 0x12aaaaf80>, 'Stree')\n8 functools.partial(<function check_sample_weights_invariance at 0x12aaac0e0>, 'Stree')\n9 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree')\n10 functools.partial(<function check_estimators_fit_returns_self at 0x12aab6440>, 'Stree', readonly_memmap=True)\n11 functools.partial(<function check_complex_data at 0x12aaac290>, 'Stree')\n12 functools.partial(<function check_dtype_object at 0x12aaac200>, 'Stree')\n13 functools.partial(<function check_estimators_empty_data_messages at 0x12aab0560>, 'Stree')\n14 functools.partial(<function check_pipeline_consistency at 0x12aab0200>, 'Stree')\n15 functools.partial(<function check_estimators_nan_inf at 0x12aab0680>, 'Stree')\n16 functools.partial(<function check_estimators_overwrite_params at 0x12aabb200>, 'Stree')\n17 functools.partial(<function check_estimator_sparse_data at 0x12aaaab00>, 'Stree')\n18 functools.partial(<function check_estimators_pickle at 0x12aab08c0>, 'Stree')\n19 functools.partial(<function check_classifier_data_not_an_array at 0x12aabb560>, 'Stree')\n20 functools.partial(<function check_classifiers_one_label at 0x12aab0f80>, 'Stree')\n21 functools.partial(<function check_classifiers_classes at 0x12aab69e0>, 'Stree')\n22 functools.partial(<function check_estimators_partial_fit_n_features at 0x12aab09e0>, 'Stree')\n23 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree')\n24 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True)\n25 functools.partial(<function check_classifiers_train at 0x12aab60e0>, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(<function check_classifiers_regression_target at 0x12aabf050>, 'Stree')\n27 functools.partial(<function check_supervised_y_no_nan at 0x12aaa0c20>, 'Stree')\n28 functools.partial(<function check_supervised_y_2d at 0x12aab6680>, 'Stree')\n29 functools.partial(<function check_estimators_unfitted at 0x12aab6560>, 'Stree')\n30 functools.partial(<function check_non_transformer_estimators_n_iter at 0x12aabbb90>, 'Stree')\n31 functools.partial(<function check_decision_proba_consistency at 0x12aabf170>, 'Stree')\n32 functools.partial(<function check_fit2d_predict1d at 0x12aaac7a0>, 'Stree')\n33 functools.partial(<function check_methods_subset_invariance at 0x12aaac950>, 'Stree')\n34 functools.partial(<function check_fit2d_1sample at 0x12aaaca70>, 'Stree')\n35 functools.partial(<function check_fit2d_1feature at 0x12aaacb90>, 'Stree')\n36 functools.partial(<function check_fit1d at 0x12aaaccb0>, 'Stree')\n37 functools.partial(<function check_get_params_invariance at 0x12aabbdd0>, 'Stree')\n38 functools.partial(<function check_set_params at 0x12aabbef0>, 'Stree')\n39 functools.partial(<function check_dict_unchanged at 0x12aaac3b0>, 'Stree')\n40 functools.partial(<function check_dont_overwrite_parameters at 0x12aaac680>, 'Stree')\n41 functools.partial(<function check_fit_idempotent at 0x12aabf320>, 'Stree')\n42 functools.partial(<function check_n_features_in at 0x12aabf3b0>, 'Stree')\n"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
@@ -211,9 +205,9 @@
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3.7.6 64-bit ('general': venv)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
115
stree/Strees.py
115
stree/Strees.py
@@ -13,7 +13,8 @@ import os
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, _check_sample_weight, check_random_state
|
||||
|
||||
|
||||
class Snode:
|
||||
@@ -102,9 +103,8 @@ class Siterator:
|
||||
class Stree(BaseEstimator, ClassifierMixin):
|
||||
"""
|
||||
"""
|
||||
__folder = 'data/'
|
||||
|
||||
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = 0,
|
||||
def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = None,
|
||||
max_depth: int=None, tol: float=1e-4, use_predictions: bool = False):
|
||||
self.max_iter = max_iter
|
||||
self.C = C
|
||||
@@ -145,25 +145,25 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
return origin[up[:, 0]] if any(up) else None, \
|
||||
origin[down[:, 0]] if any(down) else None
|
||||
|
||||
def _split_data(self, node: Snode, data: np.ndarray, indices: np.ndarray) -> list:
|
||||
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
|
||||
if self.use_predictions:
|
||||
yp = node._clf.predict(data)
|
||||
down = (yp == 1).reshape(-1, 1)
|
||||
res = np.expand_dims(node._clf.decision_function(data), 1)
|
||||
else:
|
||||
# doesn't work with multiclass as each sample has to do inner product with its own coeficients
|
||||
# computes positition of every sample is w.r.t. the hyperplane
|
||||
res = self._linear_function(data, node)
|
||||
down = res > 0
|
||||
data_up, data_down = self._split_array(data, down)
|
||||
indices_up, indices_down = self._split_array(indices, down)
|
||||
res_up, res_down = self._split_array(res, down)
|
||||
return [data_up, indices_up, data_down, indices_down, res_up, res_down]
|
||||
# data_up, data_down = self._split_array(data, down)
|
||||
# indices_up, indices_down = self._split_array(indices, down)
|
||||
# res_up, res_down = self._split_array(res, down)
|
||||
# weight_up, weight_down = self._split_array(weights, down)
|
||||
#return [data_up, indices_up, data_down, indices_down, weight_up, weight_down, res_up, res_down]
|
||||
return res
|
||||
|
||||
def fit(self, X: np.ndarray, y: np.ndarray, weighted_samples: np.array=None, **fitparams: dict) -> 'Stree':
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
if fitparams is not None:
|
||||
self.set_params(**fitparams)
|
||||
def _split_criteria(self, data: np.array) -> np.array:
|
||||
return data > 0
|
||||
|
||||
def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None) -> 'Stree':
|
||||
# Check parameters are Ok.
|
||||
if type(y).__name__ == 'np.ndarray':
|
||||
y = y.ravel()
|
||||
if self.C < 0:
|
||||
@@ -173,12 +173,15 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
raise ValueError(f"Maximum depth has to be greater than 1... got (max_depth={self.max_depth})")
|
||||
check_classification_targets(y)
|
||||
X, y = check_X_y(X, y)
|
||||
sample_weight = _check_sample_weight(sample_weight, X)
|
||||
check_classification_targets(y)
|
||||
# Initialize computed parameters
|
||||
#self.random_state = check_random_state(self.random_state)
|
||||
self.classes_ = np.unique(y)
|
||||
self.n_iter_ = self.max_iter
|
||||
self.depth_ = 0
|
||||
check_classification_targets(y)
|
||||
self.n_features_in_ = X.shape[1]
|
||||
self.tree_ = self.train(X, y, 1, 'root')
|
||||
self.tree_ = self.train(X, y, sample_weight, 1, 'root')
|
||||
self._build_predictor()
|
||||
return self
|
||||
|
||||
@@ -195,7 +198,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
|
||||
run_tree(self.tree_)
|
||||
|
||||
def train(self, X: np.ndarray, y: np.ndarray, depth: int, title: str = 'root') -> Snode:
|
||||
def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, depth: int, title: str) -> Snode:
|
||||
|
||||
if depth > self.__max_depth:
|
||||
return None
|
||||
@@ -203,21 +206,24 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
# only 1 class => pure dataset
|
||||
return Snode(None, X, y, title + ', <pure>')
|
||||
# Train the model
|
||||
clf = LinearSVC(max_iter=self.max_iter, C=self.C,
|
||||
random_state=self.random_state)
|
||||
clf.fit(X, y)
|
||||
clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state,
|
||||
C=self.C) #, sample_weight=sample_weight)
|
||||
clf.fit(X, y, sample_weight=sample_weight)
|
||||
tree = Snode(clf, X, y, title)
|
||||
self.depth_ = max(depth, self.depth_)
|
||||
X_U, y_u, X_D, y_d, _, _ = self._split_data(tree, X, y)
|
||||
down = self._split_criteria(self._distances(tree, X))
|
||||
X_U, X_D = self._split_array(X, down)
|
||||
y_u, y_d = self._split_array(y, down)
|
||||
sw_u, sw_d = self._split_array(sample_weight, down)
|
||||
if X_U is None or X_D is None:
|
||||
# didn't part anything
|
||||
return Snode(clf, X, y, title + ', <cgaf>')
|
||||
tree.set_up(self.train(X_U, y_u, depth + 1, title + ' - Up'))
|
||||
tree.set_down(self.train(X_D, y_d, depth + 1, title + ' - Down'))
|
||||
tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + ' - Up'))
|
||||
tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + ' - Down'))
|
||||
return tree
|
||||
|
||||
def _reorder_results(self, y: np.array, indices: np.array, proba=False) -> np.array:
|
||||
if proba:
|
||||
def _reorder_results(self, y: np.array, indices: np.array) -> np.array:
|
||||
if y.ndim > 1 and y.shape[1] > 1:
|
||||
# if predict_proba return np.array of floats
|
||||
y_ordered = np.zeros(y.shape, dtype=float)
|
||||
else:
|
||||
@@ -236,10 +242,12 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
# set a class for every sample in dataset
|
||||
prediction = np.full((xp.shape[0], 1), node._class)
|
||||
return prediction, indices
|
||||
u, i_u, d, i_d, _, _ = self._split_data(node, xp, indices)
|
||||
k, l = predict_class(d, i_d, node.get_down())
|
||||
m, n = predict_class(u, i_u, node.get_up())
|
||||
return np.append(k, m), np.append(l, n)
|
||||
down = self._split_criteria(self._distances(node, xp))
|
||||
X_U, X_D = self._split_array(xp, down)
|
||||
i_u, i_d = self._split_array(indices, down)
|
||||
prx_u, prin_u = predict_class(X_U, i_u, node.get_up())
|
||||
prx_d, prin_d = predict_class(X_D, i_d, node.get_down())
|
||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||
|
||||
# sklearn check
|
||||
check_is_fitted(self, ['tree_'])
|
||||
@@ -276,10 +284,15 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
prediction = np.full((xp.shape[0], 1), node._class)
|
||||
prediction_proba = dist
|
||||
return np.append(prediction, prediction_proba, axis=1), indices
|
||||
u, i_u, d, i_d, r_u, r_d = self._split_data(node, xp, indices)
|
||||
k, l = predict_class(d, i_d, r_d, node.get_down())
|
||||
m, n = predict_class(u, i_u, r_u, node.get_up())
|
||||
return np.append(k, m), np.append(l, n)
|
||||
distances = self._distances(node, xp)
|
||||
down = self._split_criteria(distances)
|
||||
|
||||
X_U, X_D = self._split_array(xp, down)
|
||||
i_u, i_d = self._split_array(indices, down)
|
||||
di_u, di_d = self._split_array(distances, down)
|
||||
prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up())
|
||||
prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down())
|
||||
return np.append(prx_u, prx_d), np.append(prin_u, prin_d)
|
||||
|
||||
# sklearn check
|
||||
check_is_fitted(self, ['tree_'])
|
||||
@@ -295,7 +308,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
# Probability of being 1
|
||||
result[:, 1] = 1 / (1 + np.exp(-result[:, 1]))
|
||||
result[:, 0] = 1 - result[:, 1] # Probability of being 0
|
||||
return self._reorder_results(result, indices, proba=True)
|
||||
return self._reorder_results(result, indices)
|
||||
|
||||
def score(self, X: np.array, y: np.array) -> float:
|
||||
"""Return accuracy
|
||||
@@ -319,35 +332,3 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
output += str(i) + '\n'
|
||||
return output
|
||||
|
||||
def get_folder(self) -> str:
|
||||
return self.__folder
|
||||
|
||||
def _save_datasets(self, tree: Snode, catalog: typing.TextIO, number: int):
|
||||
"""Save the dataset of the node in a csv file
|
||||
|
||||
:param tree: node with data to save
|
||||
:type tree: Snode
|
||||
:param catalog: catalog file handler
|
||||
:type catalog: typing.TextIO
|
||||
:param number: sequential number for the generated file name
|
||||
:type number: int
|
||||
"""
|
||||
data = np.append(tree._X, tree._y.reshape(-1, 1), axis=1)
|
||||
name = f"{self.__folder}dataset{number}.csv"
|
||||
np.savetxt(name, data, delimiter=",")
|
||||
catalog.write(f"{name}, - {str(tree)}")
|
||||
if tree.is_leaf():
|
||||
return
|
||||
self._save_datasets(tree.get_down(), catalog, number + 1)
|
||||
self._save_datasets(tree.get_up(), catalog, number + 2)
|
||||
|
||||
def get_catalog_name(self):
|
||||
return self.__folder + "catalog.txt"
|
||||
|
||||
def save_sub_datasets(self):
|
||||
"""Save the every dataset stored in the tree to check with manual classifier
|
||||
"""
|
||||
if not os.path.isdir(self.__folder):
|
||||
os.mkdir(self.__folder)
|
||||
with open(self.get_catalog_name(), 'w', encoding='utf-8') as catalog:
|
||||
self._save_datasets(self.tree_, catalog, 1)
|
||||
|
@@ -107,24 +107,6 @@ class Stree_test(unittest.TestCase):
|
||||
res.append(y_original[row])
|
||||
return res
|
||||
|
||||
def test_subdatasets(self):
|
||||
"""Check if the subdatasets files have the same labels as the original dataset
|
||||
"""
|
||||
self._clf.save_sub_datasets()
|
||||
with open(self._clf.get_catalog_name()) as cat_file:
|
||||
catalog = csv.reader(cat_file, delimiter=',')
|
||||
for row in catalog:
|
||||
X, y = self._get_Xy()
|
||||
x_file, y_file = self._get_file_data(row[0])
|
||||
y_original = np.array(self._find_out(x_file, X, y), dtype=int)
|
||||
self.assertTrue(np.array_equal(y_file, y_original))
|
||||
if os.path.isdir(self._clf.get_folder()):
|
||||
try:
|
||||
os.remove(f"{self._clf.get_folder()}*")
|
||||
os.rmdir(self._clf.get_folder())
|
||||
except:
|
||||
pass
|
||||
|
||||
def test_single_prediction(self):
|
||||
X, y = self._get_Xy()
|
||||
yp = self._clf.predict((X[0, :].reshape(-1, X.shape[1])))
|
||||
@@ -141,10 +123,9 @@ class Stree_test(unittest.TestCase):
|
||||
X, y = self._get_Xy()
|
||||
accuracy_score = self._clf.score(X, y)
|
||||
yp = self._clf.predict(X)
|
||||
right = (yp == y).astype(int)
|
||||
accuracy_computed = sum(right) / len(y)
|
||||
accuracy_computed = np.mean(yp == y)
|
||||
self.assertEqual(accuracy_score, accuracy_computed)
|
||||
self.assertGreater(accuracy_score, 0.8)
|
||||
self.assertGreater(accuracy_score, 0.9)
|
||||
|
||||
def test_single_predict_proba(self):
|
||||
"""Check that element 28 has a prediction different that the current label
|
||||
|
Reference in New Issue
Block a user