Compare commits


9 Commits

SHA1        Message                                                                                    Date
f5706c3159  Update version and notebooks                                                               2020-06-28 10:44:29 +02:00
be552fdd6c  Add test for getting 3 feature_sets in Splitter / Add ensemble notebook                    2020-06-28 02:45:08 +02:00
5e3a8e3ec5  Change adaboost notebook                                                                   2020-06-27 23:34:15 +02:00
554ec03c32  Get only 3 sets for best split / Fix flaky test in Splitter_test                           2020-06-27 18:29:40 +02:00
4b7e4a3fb0  better solution to the sklearn bagging problem / Add better tests / enhance .coveragerc    2020-06-26 11:22:45 +02:00
76723993fd  Solve Warning class label not found when bagging                                            2020-06-25 13:07:50 +02:00
ecd0b86f4d  Solve the mistake of min and max distance (the split criteria functions min_distance and max_distance return classes, while max_samples returns the positive and negative distances to the hyperplane of the class with more samples in the node)    2020-06-17 00:13:52 +02:00
3e52a4746c  Fix entroy and information_gain functions                                                   2020-06-16 13:56:02 +02:00
a20e45e8e7  Merge pull request #10 from Doctorado-ML/add_subspaces / #2 Add subspaces (Ricardo Montañana Gómez)    2020-06-15 11:30:53 +02:00
11 changed files with 518 additions and 225 deletions


@@ -10,5 +10,4 @@ exclude_lines =
if __name__ == .__main__.:
ignore_errors = True
omit =
stree/tests/*
stree/__init__.py

.gitignore

@@ -130,4 +130,6 @@ dmypy.json
.idea
.vscode
.pre-commit-config.yaml
.pre-commit-config.yaml
**.csv

File diff suppressed because one or more lines are too long


@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Test AdaBoost with different configurations"
"# Test Stree with AdaBoost and Bagging with different configurations"
]
},
{
@@ -34,11 +34,8 @@
"outputs": [],
"source": [
"import time\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.svm import LinearSVC, SVC\n",
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from stree import Stree"
]
},
@@ -57,12 +54,14 @@
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.659% 662\nValid: 99.341% 99830\n"
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (100492, 28) y.shape (100492,)\nFraud: 0.644% 647\nValid: 99.356% 99845\n"
}
],
"source": [
@@ -117,18 +116,20 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## STree alone on the whole dataset and linear kernel"
"## STree alone with 100.000 samples and linear kernel"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Score Train: 0.9985499829409757\nScore Test: 0.998407854584052\nTook 39.45 seconds\n"
"text": "Score Train: 0.9985784146480154\nScore Test: 0.9981093273185617\nTook 73.27 seconds\n"
}
],
"source": [
@@ -144,7 +145,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Different kernels with different configuations"
"## Adaboost"
]
},
{
@@ -161,18 +162,20 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 87.00 seconds\tScore Train: 0.9982372\tScore Test: 0.9981425\nKernel: rbf\tTime: 60.60 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 88.08 seconds\tScore Train: 0.9937450\tScore Test: 0.9938968\n"
"text": "Kernel: linear\tTime: 93.78 seconds\tScore Train: 0.9983083\tScore Test: 0.9983083\nKernel: rbf\tTime: 18.32 seconds\tScore Train: 0.9935602\tScore Test: 0.9935651\nKernel: poly\tTime: 69.68 seconds\tScore Train: 0.9973132\tScore Test: 0.9972801\n"
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
" clf = AdaBoostClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), algorithm=\"SAMME\", n_estimators=n_estimators, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" score_train = clf.score(Xtrain, ytrain)\n",
" score_test = clf.score(Xtest, ytest)\n",
@@ -183,24 +186,37 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test algorithm SAMME in AdaBoost to check speed/accuracy"
"## Bagging"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"n_estimators = 10\n",
"C = 7\n",
"max_depth = 3"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Kernel: linear\tTime: 58.75 seconds\tScore Train: 0.9980524\tScore Test: 0.9978771\nKernel: rbf\tTime: 12.49 seconds\tScore Train: 0.9934181\tScore Test: 0.9933992\nKernel: poly\tTime: 97.85 seconds\tScore Train: 0.9972137\tScore Test: 0.9971806\n"
"text": "Kernel: linear\tTime: 387.06 seconds\tScore Train: 0.9985784\tScore Test: 0.9981093\nKernel: rbf\tTime: 144.00 seconds\tScore Train: 0.9992750\tScore Test: 0.9983415\nKernel: poly\tTime: 101.78 seconds\tScore Train: 0.9992466\tScore Test: 0.9981757\n"
}
],
"source": [
"for kernel in ['linear', 'rbf', 'poly']:\n",
" now = time.time()\n",
" clf = AdaBoostClassifier(Stree(C=7, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state, algorithm=\"SAMME\")\n",
" clf = BaggingClassifier(base_estimator=Stree(C=C, kernel=kernel, max_depth=max_depth, random_state=random_state), n_estimators=n_estimators, random_state=random_state)\n",
" clf.fit(Xtrain, ytrain)\n",
" score_train = clf.score(Xtrain, ytrain)\n",
" score_test = clf.score(Xtest, ytest)\n",
@@ -223,7 +239,7 @@
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39",
"name": "python37664bitgeneralvenve3128601eb614c5da59c5055670b6040",
"display_name": "Python 3.7.6 64-bit ('general': venv)"
}
},

File diff suppressed because one or more lines are too long


@@ -66,7 +66,8 @@
"id": "z9Q-YUfBDZEq",
"colab_type": "code",
"colab": {},
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b"
"outputId": "afc822fb-f16a-4302-8a67-2b9e2880159b",
"tags": []
},
"source": [
"random_state=1\n",
@@ -112,7 +113,7 @@
{
"output_type": "stream",
"name": "stdout",
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n"
"text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 32.976% 492\nValid: 67.024% 1000\n"
}
]
},
@@ -137,25 +138,25 @@
" 'learning_rate': [.5, 1],\n",
" 'base_estimator__tol': [.1, 1e-02],\n",
" 'base_estimator__max_depth': [3, 5],\n",
" 'base_estimator__C': [1, 3],\n",
" 'base_estimator__C': [7, 55],\n",
" 'base_estimator__kernel': ['linear', 'poly', 'rbf']\n",
"}"
],
"execution_count": 9,
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": "{'C': 1.0,\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'tol': 0.0001}"
"text/plain": "{'C': 1.0,\n 'criterion': 'gini',\n 'degree': 3,\n 'gamma': 'scale',\n 'kernel': 'linear',\n 'max_depth': None,\n 'max_features': None,\n 'max_iter': 1000,\n 'min_samples_split': 0,\n 'random_state': None,\n 'split_criteria': 'max_samples',\n 'splitter': 'random',\n 'tol': 0.0001}"
},
"metadata": {},
"execution_count": 14
"execution_count": 6
}
],
"source": [
@@ -168,28 +169,29 @@
"id": "CrcB8o6EDZE5",
"colab_type": "code",
"colab": {},
"outputId": "7703413a-d563-4289-a13b-532f38f82762"
"outputId": "7703413a-d563-4289-a13b-532f38f82762",
"tags": []
},
"source": [
"random_state=2020\n",
"clf = AdaBoostClassifier(random_state=random_state)\n",
"clf = AdaBoostClassifier(random_state=random_state, algorithm=\"SAMME\")\n",
"grid = GridSearchCV(clf, parameters, verbose=10, n_jobs=-1, return_train_score=True)\n",
"grid.fit(Xtrain, ytrain)"
],
"execution_count": 11,
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 3.6s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 4.2s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 4.8s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 6.2s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 7.2s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 8.9s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 10.7s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 12.7s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 16.7s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 19.4s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 24.4s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 29.3s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 32.7s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 36.4s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 39.7s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 43.7s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 46.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 48.8s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 52.0s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 55.9s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 1.0min\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 1.2min\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.3min\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 1.4min\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 1.5min finished\n"
"text": "Fitting 5 folds for each of 96 candidates, totalling 480 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done 2 tasks | elapsed: 2.0s\n[Parallel(n_jobs=-1)]: Done 9 tasks | elapsed: 2.4s\n[Parallel(n_jobs=-1)]: Done 16 tasks | elapsed: 2.7s\n[Parallel(n_jobs=-1)]: Done 25 tasks | elapsed: 3.3s\n[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 4.3s\n[Parallel(n_jobs=-1)]: Done 45 tasks | elapsed: 5.3s\n[Parallel(n_jobs=-1)]: Done 56 tasks | elapsed: 6.6s\n[Parallel(n_jobs=-1)]: Done 69 tasks | elapsed: 8.1s\n[Parallel(n_jobs=-1)]: Done 82 tasks | elapsed: 9.4s\n[Parallel(n_jobs=-1)]: Done 97 tasks | elapsed: 10.1s\n[Parallel(n_jobs=-1)]: Done 112 tasks | elapsed: 11.1s\n[Parallel(n_jobs=-1)]: Done 129 tasks | elapsed: 12.3s\n[Parallel(n_jobs=-1)]: Done 146 tasks | elapsed: 13.6s\n[Parallel(n_jobs=-1)]: Done 165 tasks | elapsed: 14.9s\n[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 16.2s\n[Parallel(n_jobs=-1)]: Done 205 tasks | elapsed: 17.6s\n[Parallel(n_jobs=-1)]: Done 226 tasks | elapsed: 19.1s\n[Parallel(n_jobs=-1)]: Done 249 tasks | elapsed: 21.6s\n[Parallel(n_jobs=-1)]: Done 272 tasks | elapsed: 25.9s\n[Parallel(n_jobs=-1)]: Done 297 tasks | elapsed: 30.4s\n[Parallel(n_jobs=-1)]: Done 322 tasks | elapsed: 36.7s\n[Parallel(n_jobs=-1)]: Done 349 tasks | elapsed: 38.1s\n[Parallel(n_jobs=-1)]: Done 376 tasks | elapsed: 39.6s\n[Parallel(n_jobs=-1)]: Done 405 tasks | elapsed: 41.9s\n[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 44.9s\n[Parallel(n_jobs=-1)]: Done 465 tasks | elapsed: 48.2s\n[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 49.2s finished\n"
},
{
"output_type": "execute_result",
"data": {
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(random_state=2020), n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=1, max_depth=3, tol=0.1)],\n 'base_estimator__C': [1, 3],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
"text/plain": "GridSearchCV(estimator=AdaBoostClassifier(algorithm='SAMME', random_state=2020),\n n_jobs=-1,\n param_grid={'base_estimator': [Stree(C=55, max_depth=3, tol=0.01)],\n 'base_estimator__C': [7, 55],\n 'base_estimator__kernel': ['linear', 'poly', 'rbf'],\n 'base_estimator__max_depth': [3, 5],\n 'base_estimator__tol': [0.1, 0.01],\n 'learning_rate': [0.5, 1], 'n_estimators': [10, 25]},\n return_train_score=True, verbose=10)"
},
"metadata": {},
"execution_count": 11
"execution_count": 7
}
]
},
@@ -199,19 +201,20 @@
"id": "ZjX88NoYDZE8",
"colab_type": "code",
"colab": {},
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344"
"outputId": "285163c8-fa33-4915-8ae7-61c4f7844344",
"tags": []
},
"source": [
"print(\"Best estimator: \", grid.best_estimator_)\n",
"print(\"Best hyperparameters: \", grid.best_params_)\n",
"print(\"Best accuracy: \", grid.best_score_)"
],
"execution_count": 16,
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Best estimator: AdaBoostClassifier(base_estimator=Stree(C=1, max_depth=3, tol=0.1),\n learning_rate=0.5, n_estimators=10, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=1, max_depth=3, tol=0.1), 'base_estimator__C': 1, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.1, 'learning_rate': 0.5, 'n_estimators': 10}\nBest accuracy: 0.9492316893632683\n"
"text": "Best estimator: AdaBoostClassifier(algorithm='SAMME',\n base_estimator=Stree(C=55, max_depth=3, tol=0.01),\n learning_rate=0.5, n_estimators=25, random_state=2020)\nBest hyperparameters: {'base_estimator': Stree(C=55, max_depth=3, tol=0.01), 'base_estimator__C': 55, 'base_estimator__kernel': 'linear', 'base_estimator__max_depth': 3, 'base_estimator__tol': 0.01, 'learning_rate': 0.5, 'n_estimators': 25}\nBest accuracy: 0.9559440559440558\n"
}
]
}


@@ -1,6 +1,6 @@
import setuptools
__version__ = "0.9rc4"
__version__ = "0.9rc5"
__author__ = "Ricardo Montañana Gómez"


@@ -10,6 +10,7 @@ import os
import numbers
import random
import warnings
from math import log
from itertools import combinations
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
@@ -39,6 +40,7 @@ class Snode:
features: np.array,
impurity: float,
title: str,
weight: np.ndarray = None,
):
self._clf = clf
self._title = title
@@ -50,7 +52,9 @@ class Snode:
self._up = None
self._class = None
self._feature = None
self._sample_weight = None
self._sample_weight = (
weight if os.environ.get("TESTING", "NS") != "NS" else None
)
self._features = features
self._impurity = impurity
@@ -163,10 +167,10 @@ class Splitter:
f"criterion must be gini or entropy got({criterion})"
)
if criteria not in ["min_distance", "max_samples"]:
if criteria not in ["min_distance", "max_samples", "max_distance"]:
raise ValueError(
f"split_criteria has to be min_distance or \
max_samples got ({criteria})"
"split_criteria has to be min_distance "
f"max_distance or max_samples got ({criteria})"
)
if splitter_type not in ["random", "best"]:
@@ -186,24 +190,47 @@ class Splitter:
@staticmethod
def _entropy(y: np.array) -> float:
_, count = np.unique(y, return_counts=True)
proportion = count / np.sum(count)
return -np.sum(proportion * np.log2(proportion))
n_labels = len(y)
if n_labels <= 1:
return 0
counts = np.bincount(y)
proportions = counts / n_labels
n_classes = np.count_nonzero(proportions)
if n_classes <= 1:
return 0
entropy = 0.0
# Compute standard entropy.
for prop in proportions:
if prop != 0.0:
entropy -= prop * log(prop, n_classes)
return entropy
def information_gain(
self, labels_up: np.array, labels_dn: np.array
self, labels: np.array, labels_up: np.array, labels_dn: np.array
) -> float:
card_up = labels_up.shape[0] if labels_up is not None else 0
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_prev = self.criterion_function(labels)
card_up = card_dn = imp_up = imp_dn = 0
if labels_up is not None:
card_up = labels_up.shape[0]
imp_up = self.criterion_function(labels_up)
if labels_dn is not None:
card_dn = labels_dn.shape[0] if labels_dn is not None else 0
imp_dn = self.criterion_function(labels_dn)
samples = card_up + card_dn
up = card_up / samples * self.criterion_function(labels_up)
dn = card_dn / samples * self.criterion_function(labels_dn)
return up + dn
if samples == 0:
return 0.0
else:
result = (
imp_prev
- (card_up / samples) * imp_up
- (card_dn / samples) * imp_dn
)
return result
def _select_best_set(
self, dataset: np.array, labels: np.array, features_sets: list
) -> list:
min_impurity = 1
max_gain = 0
selected = None
warnings.filterwarnings("ignore", category=ConvergenceWarning)
for feature_set in features_sets:
@@ -213,11 +240,11 @@ class Splitter:
)
self.partition(dataset, node)
y1, y2 = self.part(labels)
impurity = self.information_gain(y1, y2)
if impurity < min_impurity:
min_impurity = impurity
gain = self.information_gain(labels, y1, y2)
if gain > max_gain:
max_gain = gain
selected = feature_set
return selected
return selected if selected is not None else feature_set
def _get_subspaces_set(
self, dataset: np.array, labels: np.array, max_features: int
@@ -226,8 +253,12 @@ class Splitter:
features_sets = list(combinations(features, max_features))
if len(features_sets) > 1:
if self._splitter_type == "random":
return features_sets[random.randint(0, len(features_sets) - 1)]
index = random.randint(0, len(features_sets) - 1)
return features_sets[index]
else:
# get only 3 sets at most
if len(features_sets) > 3:
features_sets = random.sample(features_sets, 3)
return self._select_best_set(dataset, labels, features_sets)
else:
return features_sets[0]
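With this change the best splitter scores at most three candidate feature combinations, sampled at random, instead of every combination (the change behind commit 554ec03c32). A rough illustration of just that sampling step, using only the standard library:

    import random
    from itertools import combinations

    n_features, max_features = 6, 3
    features_sets = list(combinations(range(n_features), max_features))  # 20 candidate subsets
    if len(features_sets) > 3:
        # keep the information gain evaluation affordable: score 3 subsets at most
        features_sets = random.sample(features_sets, 3)
    print(features_sets)  # e.g. [(1, 3, 5), (0, 2, 4), (0, 1, 2)]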
@@ -242,21 +273,56 @@ class Splitter:
@staticmethod
def _min_distance(data: np.array, _) -> np.array:
# chooses the lowest distance of every sample
indices = np.argmin(np.abs(data), axis=1)
return np.array(
[data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
)
"""Assign class to min distances
return a vector of classes so partition can separate class 0 from
the rest of classes, ie. class 0 goes to one splitted node and the
rest of classes go to the other
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param _: enable call compat with other measures
:type _: None
:return: vector with the class assigned to each sample
:rtype: np.array shape (m,)
"""
return np.argmin(data, axis=1)
@staticmethod
def _max_distance(data: np.array, _) -> np.array:
"""Assign class to max distances
return a vector of classes so partition can separate class 0 from
the rest of classes, ie. class 0 goes to one splitted node and the
rest of classes go to the other
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param _: enable call compat with other measures
:type _: None
:return: vector with the class assigned to each sample values
(can be 0, 1, ...)
:rtype: np.array shape (m,)
"""
return np.argmax(data, axis=1)
@staticmethod
def _max_samples(data: np.array, y: np.array) -> np.array:
"""return distances of the class with more samples
:param data: distances to hyper plane of every class
:type data: np.array (m, n_classes)
:param y: vector of labels (classes)
:type y: np.array (m,)
:return: vector with distances to hyperplane (can be positive or neg.)
:rtype: np.array shape (m,)
"""
# select the class with max number of samples
_, samples = np.unique(y, return_counts=True)
selected = np.argmax(samples)
return data[:, selected]
def partition(self, samples: np.array, node: Snode):
"""Set the criteria to split arrays
"""Set the criteria to split arrays. Compute the indices of the samples
that should go to one side of the tree (down)
"""
data = self._distances(node, samples)
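The docstrings above describe the fix recorded in commit ecd0b86f4d: min_distance and max_distance now return class labels (the index of the smallest or largest signed distance per sample), while max_samples returns the signed distances of the class with more samples in the node. A quick numpy illustration, reusing the distance matrix from the tests further down together with a label vector made up here:

    import numpy as np

    # one row per sample, one column per class: signed distances to each class hyperplane
    data = np.array(
        [
            [-0.1, 0.2, -0.3],
            [0.7, 0.01, -0.1],
            [0.7, -0.9, 0.5],
            [0.1, 0.2, 0.3],
        ]
    )
    y = np.array([1, 2, 1, 0])  # hypothetical labels, only used by max_samples

    print(np.argmin(data, axis=1))  # min_distance -> class labels [2 2 1 0]
    print(np.argmax(data, axis=1))  # max_distance -> class labels [1 0 0 2]

    # max_samples keeps the signed distances of the majority class (here class 1)
    _, counts = np.unique(y, return_counts=True)
    print(data[:, np.argmax(counts)])  # [ 0.2   0.01  -0.9   0.2 ]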
@@ -379,7 +445,9 @@ class Stree(BaseEstimator, ClassifierMixin):
check_classification_targets(y)
X, y = check_X_y(X, y)
sample_weight = _check_sample_weight(sample_weight, X)
sample_weight = _check_sample_weight(
sample_weight, X, dtype=np.float64
)
check_classification_targets(y)
# Initialize computed parameters
self.splitter_ = Splitter(
@@ -439,13 +507,22 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=0.0,
title=title + ", <pure>",
weight=sample_weight,
)
# Train the model
clf = self._build_clf()
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
# solve WARNING: class label 0 specified in weight is not found
# in bagging
if any(sample_weight == 0):
indices = sample_weight == 0
y_next = y[~indices]
# touch weights if removing any class
if np.unique(y_next).shape[0] != self.n_classes_:
sample_weight += 1e-5
clf.fit(Xs, y, sample_weight=sample_weight)
impurity = self.splitter_.impurity(y)
node = Snode(clf, X, y, features, impurity, title)
node = Snode(clf, X, y, features, impurity, title, sample_weight)
self.depth_ = max(depth, self.depth_)
self.splitter_.partition(X, node)
X_U, X_D = self.splitter_.part(X)
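The hunk above is the workaround for the libsvm message "WARNING: class label x specified in weight is not found", which appeared in bagging when the sample weights passed to fit contained zeros that silenced a whole class: in that case every weight gets a small epsilon so the class keeps a tiny presence. A hedged standalone sketch of just that guard, reusing the toy data from test_weights_removing_class below (patch_weights is a name invented here, not part of Stree, and it returns a new array instead of modifying the weights in place as the patched train does):

    import numpy as np

    def patch_weights(y: np.ndarray, sample_weight: np.ndarray) -> np.ndarray:
        # if the zero-weighted samples are the only carriers of some class,
        # add a small epsilon to every weight so the class label survives
        zeroed = sample_weight == 0
        if zeroed.any() and np.unique(y[~zeroed]).shape[0] != np.unique(y).shape[0]:
            return sample_weight + 1e-5
        return sample_weight

    y = np.array([0, 0, 0, 1, 1, 1, 0])
    w = np.array([1, 1, 1, 0, 0, 0, 1], dtype=np.float64)  # class 1 would vanish
    print(patch_weights(y, w))  # every weight raised by 1e-5, class 1 is kept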
@@ -460,6 +537,7 @@ class Stree(BaseEstimator, ClassifierMixin):
features=X.shape[1],
impurity=impurity,
title=title + ", <cgaf>",
weight=sample_weight,
)
node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))


@@ -33,10 +33,7 @@ class Snode_test(unittest.TestCase):
max_card = max(card)
min_card = min(card)
if len(classes) > 1:
try:
belief = max_card / (max_card + min_card)
except ZeroDivisionError:
belief = 0.0
belief = max_card / (max_card + min_card)
else:
belief = 1
self.assertEqual(belief, node._belief)


@@ -1,11 +1,11 @@
import os
import unittest
import random
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.datasets import load_wine, load_iris
from stree import Splitter
from .utils import load_dataset
class Splitter_test(unittest.TestCase):
@@ -15,7 +15,7 @@ class Splitter_test(unittest.TestCase):
@staticmethod
def build(
clf=LinearSVC(),
clf=SVC,
min_samples_split=0,
splitter_type="random",
criterion="gini",
@@ -23,7 +23,7 @@ class Splitter_test(unittest.TestCase):
random_state=None,
):
return Splitter(
clf=clf,
clf=clf(random_state=random_state, kernel="rbf"),
min_samples_split=min_samples_split,
splitter_type=splitter_type,
criterion=criterion,
@@ -43,10 +43,14 @@ class Splitter_test(unittest.TestCase):
with self.assertRaises(ValueError):
self.build(criteria="duck")
with self.assertRaises(ValueError):
self.build(clf=None)
_ = Splitter(clf=None)
for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]:
for criteria in ["min_distance", "max_samples"]:
for criteria in [
"min_distance",
"max_samples",
"max_distance",
]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
@@ -57,30 +61,74 @@ class Splitter_test(unittest.TestCase):
self.assertEqual(criteria, tcl._criteria)
def test_gini(self):
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
expected = 0.48
self.assertEqual(expected, Splitter._gini(y))
tcl = self.build(criterion="gini")
self.assertEqual(expected, tcl.criterion_function(y))
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.48),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.7777777777777778),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.520408163265306),
([0, 0, 1, 1, 1, 1, 0, 0], 0.5),
([0, 0, 1, 1, 2, 2, 3, 3], 0.75),
([0, 0, 1, 1, 1, 1, 1, 1], 0.375),
([0], 0),
([1, 1, 1, 1], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._gini(labels))
tcl = self.build(criterion="gini")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
def test_entropy(self):
y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
expected = 0.9709505944546686
self.assertAlmostEqual(expected, Splitter._entropy(y))
tcl = self.build(criterion="entropy")
self.assertEqual(expected, tcl.criterion_function(y))
expected_values = [
([0, 1, 1, 1, 1, 1, 0, 0, 0, 1], 0.9709505944546686),
([0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1], 0.9111886696810589),
([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2], 0.8120406807940999),
([0, 0, 1, 1, 1, 1, 0, 0], 1),
([0, 0, 1, 1, 2, 2, 3, 3], 1),
([0, 0, 1, 1, 1, 1, 1, 1], 0.8112781244591328),
([1], 0),
([0, 0, 0, 0], 0),
]
for labels, expected in expected_values:
self.assertAlmostEqual(expected, Splitter._entropy(labels))
tcl = self.build(criterion="entropy")
self.assertAlmostEqual(expected, tcl.criterion_function(labels))
def test_information_gain(self):
yu = np.array([0, 1, 1, 1, 1, 1])
yd = np.array([0, 0, 0, 1])
values_expected = [
("gini", 0.31666666666666665),
("entropy", 0.7145247027726656),
expected_values = [
(
[0, 1, 1, 1, 1, 1],
[0, 0, 0, 1],
0.16333333333333333,
0.25642589168200297,
),
(
[0, 1, 1, 2, 2, 3, 4, 5, 3, 2, 1, 1],
[5, 3, 2, 1, 1],
0.007381776239907684,
-0.03328610916207225,
),
([], [], 0.0, 0.0),
([1], [], 0.0, 0.0),
([], [1], 0.0, 0.0),
([0, 0, 0, 0], [0, 0], 0.0, 0.0),
([], [1, 1, 1, 2], 0.0, 0.0),
(None, [1, 2, 3], 0.0, 0.0),
([1, 2, 3], None, 0.0, 0.0),
]
for criterion, expected in values_expected:
tcl = self.build(criterion=criterion)
computed = tcl.information_gain(yu, yd)
self.assertAlmostEqual(expected, computed)
for yu, yd, expected_gini, expected_entropy in expected_values:
yu = np.array(yu, dtype=np.int32) if yu is not None else None
yd = np.array(yd, dtype=np.int32) if yd is not None else None
if yu is not None and yd is not None:
complete = np.append(yu, yd)
elif yd is not None:
complete = yd
else:
complete = yu
tcl = self.build(criterion="gini")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_gini, computed)
tcl = self.build(criterion="entropy")
computed = tcl.information_gain(complete, yu, yd)
self.assertAlmostEqual(expected_entropy, computed)
def test_max_samples(self):
tcl = self.build(criteria="max_samples")
@@ -108,34 +156,73 @@ class Splitter_test(unittest.TestCase):
[0.1, 0.2, 0.3],
]
)
expected = np.array([-0.1, 0.01, 0.5, 0.1])
expected = np.array([2, 2, 1, 0])
computed = tcl._min_distance(data, None)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_max_distance(self):
tcl = self.build(criteria="max_distance")
data = np.array(
[
[-0.1, 0.2, -0.3],
[0.7, 0.01, -0.1],
[0.7, -0.9, 0.5],
[0.1, 0.2, 0.3],
]
)
expected = np.array([1, 0, 0, 2])
computed = tcl._max_distance(data, None)
self.assertEqual((4,), computed.shape)
self.assertListEqual(expected.tolist(), computed.tolist())
def test_best_splitter_few_sets(self):
X, y = load_iris(return_X_y=True)
X = np.delete(X, 3, 1)
tcl = self.build(splitter_type="best", random_state=self._random_state)
dataset, computed = tcl.get_subspace(X, y, max_features=2)
self.assertListEqual([0, 2], list(computed))
self.assertListEqual(X[:, computed].tolist(), dataset.tolist())
def test_splitter_parameter(self):
expected_values = [
[1, 7, 9],
[1, 7, 9],
[1, 7, 9],
[1, 7, 9],
[0, 5, 6],
[0, 5, 6],
[0, 5, 6],
[0, 5, 6],
[2, 3, 5, 7], # best entropy min_distance
[0, 2, 4, 5], # best entropy max_samples
[0, 2, 8, 12], # best entropy max_distance
[1, 2, 5, 12], # best gini min_distance
[0, 3, 4, 10], # best gini max_samples
[1, 2, 9, 12], # best gini max_distance
[3, 9, 11, 12], # random entropy min_distance
[1, 5, 6, 9], # random entropy max_samples
[1, 2, 4, 8], # random entropy max_distance
[2, 6, 7, 12], # random gini min_distance
[3, 9, 10, 11], # random gini max_samples
[2, 5, 8, 12], # random gini max_distance
]
X, y = load_dataset(self._random_state, n_features=12)
X, y = load_wine(return_X_y=True)
rn = 0
for splitter_type in ["best", "random"]:
for criterion in ["gini", "entropy"]:
for criteria in ["min_distance", "max_samples"]:
for criterion in ["entropy", "gini"]:
for criteria in [
"min_distance",
"max_samples",
"max_distance",
]:
tcl = self.build(
splitter_type=splitter_type,
criterion=criterion,
criteria=criteria,
random_state=self._random_state,
)
expected = expected_values.pop(0)
dataset, computed = tcl.get_subspace(X, y, max_features=3)
random.seed(rn)
rn += 1
dataset, computed = tcl.get_subspace(X, y, max_features=4)
# print(
# "{}, # {:7s}{:8s}{:15s}".format(
# list(computed), splitter_type, criterion,
# criteria,
# )
# )
self.assertListEqual(expected, list(computed))
self.assertListEqual(
X[:, computed].tolist(), dataset.tolist()


@@ -1,8 +1,10 @@
import os
import unittest
import warnings
import numpy as np
from sklearn.datasets import load_iris
from sklearn.datasets import load_iris, load_wine
from sklearn.exceptions import ConvergenceWarning
from stree import Stree, Snode
from .utils import load_dataset
@@ -39,10 +41,7 @@ class Stree_test(unittest.TestCase):
_, count_u = np.unique(y_up, return_counts=True)
#
for i in unique_y:
try:
number_down = count_d[i]
except IndexError:
number_down = 0
number_down = count_d[i]
try:
number_up = count_u[i]
except IndexError:
@@ -59,33 +58,12 @@ class Stree_test(unittest.TestCase):
def test_build_tree(self):
"""Check if the tree is built the same way as predictions of models
"""
import warnings
warnings.filterwarnings("ignore")
for kernel in self._kernels:
clf = Stree(kernel=kernel, random_state=self._random_state)
clf.fit(*load_dataset(self._random_state))
self._check_tree(clf.tree_)
@staticmethod
def _find_out(px: np.array, x_original: np.array, y_original) -> list:
"""Find the original values of y for a given array of samples
Arguments:
px {np.array} -- array of samples to search for
x_original {np.array} -- original dataset
y_original {[type]} -- original classes
Returns:
np.array -- classes of the given samples
"""
res = []
for needle in px:
for row in range(x_original.shape[0]):
if all(x_original[row, :] == needle):
res.append(y_original[row])
return res
def test_single_prediction(self):
X, y = load_dataset(self._random_state)
for kernel in self._kernels:
@@ -102,22 +80,6 @@ class Stree_test(unittest.TestCase):
yp = clf.fit(X, y).predict(X[:num, :])
self.assertListEqual(y[:num].tolist(), yp.tolist())
def test_score(self):
X, y = load_dataset(self._random_state)
accuracies = [
0.9506666666666667,
0.9606666666666667,
0.9433333333333334,
]
for kernel, accuracy_expected in zip(self._kernels, accuracies):
clf = Stree(random_state=self._random_state, kernel=kernel,)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_single_vs_multiple_prediction(self):
"""Check if predicting sample by sample gives the same result as
predicting all samples at once
@@ -164,9 +126,6 @@ class Stree_test(unittest.TestCase):
@staticmethod
def test_is_a_sklearn_classifier():
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sklearn.utils.estimator_checks import check_estimator
@@ -239,6 +198,9 @@ class Stree_test(unittest.TestCase):
"min_distance linear": 0.9533333333333334,
"min_distance rbf": 0.836,
"min_distance poly": 0.9473333333333334,
"max_distance linear": 0.9533333333333334,
"max_distance rbf": 0.836,
"max_distance poly": 0.9473333333333334,
},
"Iris": {
"max_samples linear": 0.98,
@@ -247,11 +209,14 @@ class Stree_test(unittest.TestCase):
"min_distance linear": 0.98,
"min_distance rbf": 1.0,
"min_distance poly": 1.0,
"max_distance linear": 0.98,
"max_distance rbf": 1.0,
"max_distance poly": 1.0,
},
}
for name, dataset in datasets.items():
px, py = dataset
for criteria in ["max_samples", "min_distance"]:
for criteria in ["max_samples", "min_distance", "max_distance"]:
for kernel in self._kernels:
clf = Stree(
C=1e4,
@@ -322,13 +287,130 @@ class Stree_test(unittest.TestCase):
with self.assertRaises(ValueError):
clf.predict(X[:, :3])
# Tests of score
def test_score_binary(self):
X, y = load_dataset(self._random_state)
accuracies = [
0.9506666666666667,
0.9606666666666667,
0.9433333333333334,
]
for kernel, accuracy_expected in zip(self._kernels, accuracies):
clf = Stree(random_state=self._random_state, kernel=kernel,)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_score_max_features(self):
X, y = load_dataset(self._random_state)
clf = Stree(random_state=self._random_state, max_features=2)
clf.fit(X, y)
self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
def test_score_multi_class(self):
warnings.filterwarnings("ignore")
accuracies = [
0.8258427, # Wine linear min_distance
0.6741573, # Wine linear max_distance
0.8314607, # Wine linear max_samples
0.6629213, # Wine rbf min_distance
1.0000000, # Wine rbf max_distance
0.4044944, # Wine rbf max_samples
0.9157303, # Wine poly min_distance
1.0000000, # Wine poly max_distance
0.7640449, # Wine poly max_samples
0.9933333, # Iris linear min_distance
0.9666667, # Iris linear max_distance
0.9666667, # Iris linear max_samples
0.9800000, # Iris rbf min_distance
0.9800000, # Iris rbf max_distance
0.9800000, # Iris rbf max_samples
1.0000000, # Iris poly min_distance
1.0000000, # Iris poly max_distance
1.0000000, # Iris poly max_samples
0.8993333, # Synthetic linear min_distance
0.6533333, # Synthetic linear max_distance
0.9313333, # Synthetic linear max_samples
0.8320000, # Synthetic rbf min_distance
0.6660000, # Synthetic rbf max_distance
0.8320000, # Synthetic rbf max_samples
0.6066667, # Synthetic poly min_distance
0.6840000, # Synthetic poly max_distance
0.6340000, # Synthetic poly max_samples
]
datasets = [
("Wine", load_wine(return_X_y=True)),
("Iris", load_iris(return_X_y=True)),
(
"Synthetic",
load_dataset(self._random_state, n_classes=3, n_features=5),
),
]
for dataset_name, dataset in datasets:
X, y = dataset
for kernel in self._kernels:
for criteria in [
"min_distance",
"max_distance",
"max_samples",
]:
clf = Stree(
C=17,
random_state=self._random_state,
kernel=kernel,
split_criteria=criteria,
degree=5,
gamma="auto",
)
clf.fit(X, y)
accuracy_score = clf.score(X, y)
yp = clf.predict(X)
accuracy_computed = np.mean(yp == y)
# print(
# "{:.7f}, # {:7} {:5} {}".format(
# accuracy_score, dataset_name, kernel, criteria
# )
# )
accuracy_expected = accuracies.pop(0)
self.assertEqual(accuracy_score, accuracy_computed)
self.assertAlmostEqual(accuracy_expected, accuracy_score)
def test_bogus_splitter_parameter(self):
clf = Stree(splitter="duck")
with self.assertRaises(ValueError):
clf.fit(*load_dataset())
def test_weights_removing_class(self):
# This patch solves an stderr message from sklearn svm lib
# "WARNING: class label x specified in weight is not found"
X = np.array(
[
[0.1, 0.1],
[0.1, 0.2],
[0.2, 0.1],
[5, 6],
[8, 9],
[6, 7],
[0.2, 0.2],
]
)
y = np.array([0, 0, 0, 1, 1, 1, 0])
epsilon = 1e-5
weights = [1, 1, 1, 0, 0, 0, 1]
weights = np.array(weights, dtype="float64")
weights_epsilon = [x + epsilon for x in weights]
weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
original = weights_no_zero.copy()
clf = Stree()
clf.fit(X, y)
node = clf.train(X, y, weights, 1, "test",)
# if a class is lost with zero weights the patch adds epsilon
self.assertListEqual(weights.tolist(), weights_epsilon)
self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
# zero weights are ok when they don't erase a class
_ = clf.train(X, y, weights_no_zero, 1, "test")
self.assertListEqual(weights_no_zero.tolist(), original.tolist())