Add test for getting 3 feature_sets in Splitter

Add ensemble notebook
2025-08-15 23:46:02 +00:00 · 2020-06-28 02:45:08 +02:00
parent 5e3a8e3ec5
commit be552fdd6c
2 changed files with 61 additions and 15 deletions
--- a/notebooks/ensemble.ipynb
+++ b/notebooks/ensemble.ipynb
@@ -34,11 +34,8 @@
   "outputs": [],
   "source": [
    "import time\n",
-    "from sklearn.ensemble import AdaBoostClassifier\n",
-    "from sklearn.tree import DecisionTreeClassifier\n",
-    "from sklearn.svm import LinearSVC, SVC\n",
-    "from sklearn.model_selection import GridSearchCV, train_test_split\n",
-    "from sklearn.datasets import load_iris\n",
+    "from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
    "from stree import Stree"
   ]
  },
@@ -64,7 +61,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (284807, 28)  y.shape (284807,)\nFraud: 0.173% 492\nValid: 99.827% 284315\n"
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28)  y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
    }
   ],
   "source": [
@@ -99,8 +96,8 @@
    "\n",
    "# data = load_creditcard(-1000) # Take all true samples + 1000 of the others\n",
    "# data = load_creditcard(5000)  # Take the first 5000 samples\n",
-    "data = load_creditcard(0) # Take all the samples\n",
-    "# data = load_creditcard(-100000)\n",
+    "# data = load_creditcard(0) # Take all the samples\n",
+    "data = load_creditcard(-100000)\n",
    "\n",
    "Xtrain = data[0]\n",
    "Xtest = data[1]\n",
@@ -119,7 +116,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## STree alone on the whole dataset and linear kernel"
+    "## STree alone with 100.000 samples and linear kernel"
   ]
  },
  {
@@ -132,7 +129,7 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "Score Train:  0.9994632932726069\nScore Test:  0.9994967405170698\nTook 140.74 seconds\n"
+     "text": "Score Train:  0.9985784146480154\nScore Test:  0.9981093273185617\nTook 73.27 seconds\n"
    }
   ],
   "source": [
@@ -148,7 +145,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Different kernels with different configuations"
+    "## Adaboost"
   ]
  },
  {
@@ -172,13 +169,54 @@
    {
     "output_type": "stream",
     "name": "stdout",
-     "text": "Kernel: linear\tTime: 307.83 seconds\tScore Train: 0.9991924\tScore Test: 0.9994616\nKernel: rbf\tTime: 29.22 seconds\tScore Train: 0.9982745\tScore Test: 0.9982679\nKernel: poly\tTime: 207.48 seconds\tScore Train: 0.9988062\tScore Test: 0.9990403\n"
+     "text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
    }
   ],
   "source": [
    "for kernel in ['linear', 'rbf', 'poly']:\n",
    "    now = time.time()\n",
-    "    clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
+    "    clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
+    "    clf.fit(Xtrain, ytrain)\n",
+    "    score_train = clf.score(Xtrain, ytrain)\n",
+    "    score_test = clf.score(Xtest, ytest)\n",
+    "    print(f\"Kernel: {kernel}\\tTime: {time.time() - now:.2f} seconds\\tScore Train: {score_train:.7f}\\tScore Test: {score_test:.7f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Bagging"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_estimators = 10\n",
+    "C = 7\n",
+    "max_depth = 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
+    }
+   ],
+   "source": [
+    "for kernel in ['linear', 'rbf', 'poly']:\n",
+    "    now = time.time()\n",
+    "    clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
    "    clf.fit(Xtrain, ytrain)\n",
    "    score_train = clf.score(Xtrain, ytrain)\n",
    "    score_test = clf.score(Xtest, ytest)\n",
@@ -201,7 +239,7 @@
  },
  "orig_nbformat": 2,
  "kernelspec": {
-   "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
+   "name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
   "display_name": "Python 3.7.6 64-bit ('general': venv)"
  }
 },
--- a/stree/tests/Splitter_test.py
+++ b/stree/tests/Splitter_test.py
@@ -4,7 +4,7 @@ import random

 import numpy as np
 from sklearn.svm import SVC
-from sklearn.datasets import load_wine
+from sklearn.datasets import load_wine, load_iris
 from stree import Splitter


@@ -176,6 +176,14 @@ class Splitter_test(unittest.TestCase):
        self.assertEqual((4,), computed.shape)
        self.assertListEqual(expected.tolist(), computed.tolist())

+    def test_best_splitter_few_sets(self):
+        X, y = load_iris(return_X_y=True)
+        X = np.delete(X, 3, 1)  # drop column index 3 (axis 1)
+        tcl = self.build(splitter_type="best", random_state=self._random_state)
+        dataset, computed = tcl.get_subspace(X, y, max_features=2)
+        self.assertListEqual([0, 2], list(computed))  # expected column indices
+        self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
+
    def test_splitter_parameter(self):
        expected_values = [
            [2, 3, 5, 7],  # best   entropy min_distance