diff --git a/notebooks/ensemble.ipynb b/notebooks/ensemble.ipynb
index a604b32..5462d69 100644
--- a/notebooks/ensemble.ipynb
+++ b/notebooks/ensemble.ipynb
@@ -34,11 +34,8 @@
    "outputs": [],
    "source": [
     "import time\n",
-    "from sklearn.ensemble import AdaBoostClassifier\n",
-    "from sklearn.tree import DecisionTreeClassifier\n",
-    "from sklearn.svm import LinearSVC, SVC\n",
-    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
-    "from sklearn.datasets import load_iris\n",
+    "from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
     "from stree import Stree"
    ]
   },
@@ -64,7 +61,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (284807, 28) y.shape (284807,)\nFraud: 0.173% 492\nValid: 99.827% 284315\n"
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
     }
    ],
    "source": [
@@ -99,8 +96,8 @@
     "\n",
     "# data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
     "# data = load_creditcard(5000) # Take the first 5000 samples\n",
-    "data = load_creditcard(0) # Take all the samples\n",
-    "# data = load_creditcard(-100000)\n",
+    "# data = load_creditcard(0) # Take all the samples\n",
+    "data = load_creditcard(-100000)\n",
     "\n",
     "Xtrain = data[0]\n",
     "Xtest = data[1]\n",
@@ -119,7 +116,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## STree alone on the whole dataset and linear kernel"
+    "## STree alone with 100,000 samples and a linear kernel"
    ]
   },
   {
@@ -132,7 +129,7 @@
     {
      "output_type": "stream",
      "name": "stdout",
-     "text": "Score Train: 0.9994632932726069\nScore Test: 0.9994967405170698\nTook 140.74 seconds\n"
+     "text": "Score Train: 0.9985784146480154\nScore Test: 0.9981093273185617\nTook 73.27 seconds\n"
     }
    ],
    "source": [
@@ -148,7 +145,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Different kernels with different configuations"
+    "## AdaBoost"
    ]
   },
   {
@@ -172,13 +169,54 @@ {
      "output_type": "stream",
      "name": "stdout",
-     "text": "Kernel: linear\tTime: 307.83 seconds\tScore Train: 0.9991924\tScore Test: 0.9994616\nKernel: rbf\tTime: 29.22 seconds\tScore Train: 0.9982745\tScore Test: 0.9982679\nKernel: poly\tTime: 207.48 seconds\tScore Train: 0.9988062\tScore Test: 0.9990403\n"
+     "text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
     }
    ],
    "source": [
     "for kernel in ['linear', 'rbf', 'poly']:\n",
     "    now = time.time()\n",
-    "    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
+    "    clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
     "    clf.fit(Xtrain, ytrain)\n",
     "    score_train = clf.score(Xtrain, ytrain)\n",
     "    score_test = clf.score(Xtest, ytest)\n",
     "    print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Bagging"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_estimators = 10\n",
+    "C = 7\n",
+    "max_depth = 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
+    }
+   ],
+   "source": [
+    "for kernel in ['linear', 'rbf', 'poly']:\n",
+    "    now = time.time()\n",
+    "    clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
+    "    clf.fit(Xtrain, ytrain)\n",
+    "    score_train = clf.score(Xtrain, ytrain)\n",
+    "    score_test = clf.score(Xtest, ytest)\n",
+    "    print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
@@ -201,7 +239,7 @@
   },
   "orig_nbformat": 2,
   "kernelspec": {
-   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
+   "name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
    "display_name": "Python 3.7.6 64-bit ('general': venv)"
   }
  },
diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py
index d38b3bf..8417779 100644
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -4,7 +4,7 @@ import random
 
 import numpy as np
 from sklearn.svm import SVC
-from sklearn.datasets import load_wine
+from sklearn.datasets import load_wine, load_iris
 
 from stree import Splitter
 
@@ -176,6 +176,14 @@ class Splitter_test(unittest.TestCase):
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())
 
+    def test_best_splitter_few_sets(self):
+        X, y = load_iris(return_X_y=True)
+        X = np.delete(X, 3, 1)
+        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        dataset, computed = tcl.get_subspace(X, y, max_features=2)
+        self.assertListEqual([0, 2], list(computed))
+        self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
+
     def test_splitter_parameter(self):
         expected_values = [
             [2, 3, 5, 7],  # best entropy min_distance
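Usage note: the notebook changes above wrap Stree as the base estimator of scikit-learn's AdaBoostClassifier and BaggingClassifier. What follows is a minimal, self-contained sketch of that pattern, not part of the diff itself: it substitutes load_breast_cancer for the credit card CSV (which is not shipped with the repository) and an illustrative random_state, so its printed scores will not match the notebook outputs. It keeps the base_estimator keyword used in the diff; newer scikit-learn releases rename it to estimator.

# Standalone sketch of the ensemble pattern from ensemble.ipynb.
# Assumptions: stree is installed, scikit-learn predates the base_estimator
# rename, and load_breast_cancer stands in for the credit card dataset.
import time

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split
from stree import Stree

random_state = 1  # illustrative seed, not the notebook's
X, y = load_breast_cancer(return_X_y=True)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=random_state
)

# Hyperparameters taken from the notebook cells: C=7, max_depth=3,
# n_estimators=10. Both ensembles clone this estimator, so sharing it is safe.
base = Stree(C=7, kernel="linear", max_depth=3, random_state=random_state)

ensembles = {
    # algorithm="SAMME" boosts on hard class predictions; the default
    # SAMME.R would require predict_proba on the base estimator.
    "AdaBoost": AdaBoostClassifier(
        base_estimator=base,
        algorithm="SAMME",
        n_estimators=10,
        random_state=random_state,
    ),
    "Bagging": BaggingClassifier(
        base_estimator=base,
        n_estimators=10,
        random_state=random_state,
    ),
}

for name, clf in ensembles.items():
    now = time.time()
    clf.fit(Xtrain, ytrain)
    print(
        f"{name}\tTime: {time.time() - now:.2f} seconds"
        f"\tScore Train: {clf.score(Xtrain, ytrain):.7f}"
        f"\tScore Test: {clf.score(Xtest, ytest):.7f}"
    )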