From 5a36c5d29b76a3e3fe19d4d131b8dd82b46ac7ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Mon, 10 May 2021 09:10:39 +0200
Subject: [PATCH] Implement ovo strategy

Add kernel liblinear with LinearSVC classifier
Set ovo strategy as default
---
 README.md                      |  33 +++---
 docs/source/hyperparameters.md |  35 +++---
 stree/Strees.py                |  28 ++++-
 stree/__init__.py              |   2 +-
 stree/tests/Snode_test.py      |   6 +-
 stree/tests/Stree_test.py      | 203 +++++++++++++++++++++++++++------
 6 files changed, 230 insertions(+), 77 deletions(-)

diff --git a/README.md b/README.md
index f8c4139..b8ac6e9 100644
--- a/README.md
+++ b/README.md
@@ -34,22 +34,23 @@ Can be found in

 ## Hyperparameters

-| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
-| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
-| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
-| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.

Pass an int for reproducible output across multiple function calls |
-| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
-| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
-| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
-| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.

if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features. | -| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* | -| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).
Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
-| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
-| | max_features | \<int\>, \<float\>


or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features. | -| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features < num_features).
Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.
The algorithm generates 5 candidates at most to choose from if random is selected. If best is selected sklearn SelectKBest algorithm is used in every node to choose the _max_features_ best features |
-| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
+| --- | ------------------- | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. ‘liblinear’ uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library, while the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
+| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
+| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.

Pass an int for reproducible output across multiple function calls |
+| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
+| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
+| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
+| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.

if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features. |
+| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (only in multiclass classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the 'ovo' multiclass_strategy |
+| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).

Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
+| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
+| | max_features | \<int\>, \<float\>


or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features. |
+| | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and chooses one randomly. **"mutual"**: Chooses the best features w.r.t. their mutual information with the label |
+| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets. **"ovo"**: one versus one; **"ovr"**: one versus rest |

\* Hyperparameter used by the support vector classifier of every node

diff --git a/docs/source/hyperparameters.md b/docs/source/hyperparameters.md
index a4fa6f7..6ffbc3d 100644
--- a/docs/source/hyperparameters.md
+++ b/docs/source/hyperparameters.md
@@ -1,21 +1,22 @@
-# Hyperparameters
+## Hyperparameters

-| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
-| --- | ------------------ | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
-| \* | kernel | {"linear", "poly", "rbf"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’ or ‘rbf’. |
-| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
-| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.

Pass an int for reproducible output across multiple function calls |
-| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
-| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
-| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
-| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.

if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features. | -| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (just in case of a multi class classification) which column (class) use to split the dataset in a node\*\* | -| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).
Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
-| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
-| | max_features | \<int\>, \<float\>


or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features. | -| | splitter | {"best", "random"} | random | The strategy used to choose the feature set at each node (only used if max_features != num_features).
Supported strategies are “best” to choose the best feature set and “random” to choose a random combination.
The algorithm generates 5 candidates at most to choose from in both strategies. |
-| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| | **Hyperparameter** | **Type/Values** | **Default** | **Meaning** |
+| --- | ------------------- | ------------------------------------------------------ | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| \* | C | \<float\> | 1.0 | Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. |
+| \* | kernel | {"liblinear", "linear", "poly", "rbf", "sigmoid"} | linear | Specifies the kernel type to be used in the algorithm. It must be one of ‘liblinear’, ‘linear’, ‘poly’, ‘rbf’ or ‘sigmoid’. ‘liblinear’ uses the [liblinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) library, while the rest use the [libsvm](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) library through scikit-learn |
+| \* | max_iter | \<int\> | 1e5 | Hard limit on iterations within solver, or -1 for no limit. |
+| \* | random_state | \<int\> | None | Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False.

Pass an int for reproducible output across multiple function calls |
+| | max_depth | \<int\> | None | Specifies the maximum depth of the tree |
+| \* | tol | \<float\> | 1e-4 | Tolerance for stopping criterion. |
+| \* | degree | \<int\> | 3 | Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels. |
+| \* | gamma | {"scale", "auto"} or \<float\> | scale | Kernel coefficient for ‘rbf’ and ‘poly’.

if gamma='scale' (default) is passed then it uses 1 / (n_features \* X.var()) as value of gamma,
if ‘auto’, uses 1 / n_features. |
+| | split_criteria | {"impurity", "max_samples"} | impurity | Decides (only in multiclass classification) which column (class) to use to split the dataset in a node\*\*. max_samples is incompatible with the 'ovo' multiclass_strategy |
+| | criterion | {“gini”, “entropy”} | entropy | The function to measure the quality of a split (only used if max_features != num_features).

Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain. |
+| | min_samples_split | \<int\> | 0 | The minimum number of samples required to split an internal node. 0 (default) for any |
+| | max_features | \<int\>, \<float\>


or {“auto”, “sqrt”, “log2”} | None | The number of features to consider when looking for the split:
If int, then consider max_features features at each split.
If float, then max_features is a fraction and int(max_features \* n_features) features are considered at each split.
If “auto”, then max_features=sqrt(n_features).
If “sqrt”, then max_features=sqrt(n_features).
If “log2”, then max_features=log2(n_features).
If None, then max_features=n_features. |
+| | splitter | {"best", "random", "mutual"} | "random" | The strategy used to choose the feature set at each node (only used if max_features < num_features). Supported strategies are: **“best”**: sklearn SelectKBest algorithm is used in every node to choose the max_features best features. **“random”**: The algorithm generates 5 candidates and chooses one randomly. **"mutual"**: Chooses the best features w.r.t. their mutual information with the label |
+| | normalize | \<bool\> | False | If standardization of features should be applied on each node with the samples that reach it |
+| \* | multiclass_strategy | {"ovo", "ovr"} | "ovo" | Strategy to use with multiclass datasets. **"ovo"**: one versus one; **"ovr"**: one versus rest |

\* Hyperparameter used by the support vector classifier of every node

diff --git a/stree/Strees.py b/stree/Strees.py
index 15c794e..32c9ff2 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -373,16 +373,17 @@ class Splitter:
         tuple
             indices of the features selected
         """
+        # No feature reduction
         if dataset.shape[1] == max_features:
-            # No feature reduction applies
             return tuple(range(dataset.shape[1]))
+        # Random feature reduction
         if self._feature_select == "random":
             features_sets = self._generate_spaces(
                 dataset.shape[1], max_features
             )
             return self._select_best_set(dataset, labels, features_sets)
+        # Return the KBest features
        if self._feature_select == "best":
-            # Take KBest features
             return (
                 SelectKBest(k=max_features)
                 .fit(dataset, labels)
@@ -569,6 +570,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         min_samples_split: int = 0,
         max_features=None,
         splitter: str = "random",
+        multiclass_strategy: str = "ovo",
         normalize: bool = False,
     ):
         self.max_iter = max_iter
@@ -585,6 +587,7 @@
         self.criterion = criterion
         self.splitter = splitter
         self.normalize = normalize
+        self.multiclass_strategy = multiclass_strategy

     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -629,7 +632,23 @@ class Stree(BaseEstimator, ClassifierMixin):
                 f"Maximum depth has to be greater than 1... 

got (max_depth=\
{self.max_depth})"
            )
-        kernels = ["linear", "rbf", "poly", "sigmoid"]
+        if self.multiclass_strategy not in ["ovr", "ovo"]:
+            raise ValueError(
+                "multiclass_strategy has to be either ovr or ovo"
+                f" but got {self.multiclass_strategy}"
+            )
+        if self.multiclass_strategy == "ovo":
+            if self.kernel == "liblinear":
+                raise ValueError(
+                    "The kernel liblinear is incompatible with ovo "
+                    "multiclass_strategy"
+                )
+            if self.split_criteria == "max_samples":
+                raise ValueError(
+                    "The multiclass_strategy 'ovo' is incompatible with "
+                    "split_criteria 'max_samples'"
+                )
+        kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
        if self.kernel not in kernels:
            raise ValueError(f"Kernel {self.kernel} not in {kernels}")
        check_classification_targets(y)
@@ -749,7 +768,7 @@
                C=self.C,
                tol=self.tol,
            )
-            if self.kernel == "linear"
+            if self.kernel == "liblinear"
            else SVC(
                kernel=self.kernel,
                max_iter=self.max_iter,
@@ -758,6 +777,7 @@
                gamma=self.gamma,
                degree=self.degree,
                random_state=self.random_state,
+                decision_function_shape=self.multiclass_strategy,
            )
        )

diff --git a/stree/__init__.py b/stree/__init__.py
index eddafae..3450f0b 100644
--- a/stree/__init__.py
+++ b/stree/__init__.py
@@ -1,6 +1,6 @@
 from .Strees import Stree, Snode, Siterator, Splitter

-__version__ = "1.0"
+__version__ = "1.1"

 __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"

diff --git a/stree/tests/Snode_test.py b/stree/tests/Snode_test.py
index d60cbfc..51bcca5 100644
--- a/stree/tests/Snode_test.py
+++ b/stree/tests/Snode_test.py
@@ -8,7 +8,11 @@ from .utils import load_dataset
 class Snode_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._clf = Stree(random_state=self._random_state)
+        self._clf = Stree(
+            random_state=self._random_state,
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+        )
         self._clf.fit(*load_dataset(self._random_state))
         super().__init__(*args, **kwargs)

diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index d05b322..5581b66 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -14,7 +14,7 @@ from .utils import load_dataset
 class Stree_test(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         self._random_state = 1
-        self._kernels = ["linear", "rbf", "poly"]
+        self._kernels = ["liblinear", "linear", "rbf", "poly", "sigmoid"]
         super().__init__(*args, **kwargs)

     @classmethod
@@ -22,10 +22,9 @@
         os.environ["TESTING"] = "1"

     def test_valid_kernels(self):
-        valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
         X, y = load_dataset()
-        for kernel in valid_kernels:
-            clf = Stree(kernel=kernel)
+        for kernel in self._kernels:
+            clf = Stree(kernel=kernel, multiclass_strategy="ovr")
             clf.fit(X, y)
             self.assertIsNotNone(clf.tree_)

@@ -55,14 +54,19 @@
         # i.e. 

The partition algorithm didn't forget any sample
         self.assertEqual(node._y.shape[0], y_down.shape[0] + y_up.shape[0])
         unique_y, count_y = np.unique(node._y, return_counts=True)
-        _, count_d = np.unique(y_down, return_counts=True)
-        _, count_u = np.unique(y_up, return_counts=True)
+        labels_d, count_d = np.unique(y_down, return_counts=True)
+        labels_u, count_u = np.unique(y_up, return_counts=True)
+        dict_d = {label: count_d[i] for i, label in enumerate(labels_d)}
+        dict_u = {label: count_u[i] for i, label in enumerate(labels_u)}
         #
         for i in unique_y:
-            number_up = count_u[i]
             try:
-                number_down = count_d[i]
-            except IndexError:
+                number_up = dict_u[i]
+            except KeyError:
+                number_up = 0
+            try:
+                number_down = dict_d[i]
+            except KeyError:
                 number_down = 0
             self.assertEqual(count_y[i], number_down + number_up)
         # Is the partition made the same as the prediction?
@@ -77,14 +81,22 @@
         """Check if the tree is built the same way as predictions of models"""
         warnings.filterwarnings("ignore")
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(*load_dataset(self._random_state))
             self._check_tree(clf.tree_)

     def test_single_prediction(self):
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1])))
             self.assertEqual(yp[0], y[0])

@@ -92,8 +104,12 @@
         # First 27 elements the predictions are the same as the truth
         num = 27
         X, y = load_dataset(self._random_state)
-        for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+        for kernel in ["liblinear", "linear", "rbf", "poly"]:
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             yp = clf.fit(X, y).predict(X[:num, :])
             self.assertListEqual(y[:num].tolist(), yp.tolist())

@@ -103,7 +119,11 @@
         """
         X, y = load_dataset(self._random_state)
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
+            clf = Stree(
+                kernel=kernel,
+                multiclass_strategy="ovr" if kernel == "liblinear" else "ovo",
+                random_state=self._random_state,
+            )
             clf.fit(X, y)
             # Compute prediction line by line
             yp_line = np.array([], dtype=int)
@@ -135,7 +155,11 @@
         ]
         computed = []
         expected_string = ""
-        clf = Stree(kernel="linear", random_state=self._random_state)
+        clf = Stree(
+            kernel="liblinear",
+            multiclass_strategy="ovr",
+            random_state=self._random_state,
+        )
         clf.fit(*load_dataset(self._random_state))
         for node in clf:
             computed.append(str(node))
@@ -173,7 +197,12 @@
     def test_check_max_depth(self):
         depths = (3, 4)
         for depth in depths:
-            tcl = Stree(random_state=self._random_state, max_depth=depth)
+            tcl = Stree(
+                kernel="liblinear",
+                multiclass_strategy="ovr",
+                random_state=self._random_state,
+                max_depth=depth,
+            )
             tcl.fit(*load_dataset(self._random_state))
             self.assertEqual(depth, tcl.depth_)

@@ -194,7 +223,7 @@
         for kernel in self._kernels:
             clf = Stree(
                 kernel=kernel,
-                

split_criteria="max_samples", + multiclass_strategy="ovr" if kernel == "liblinear" else "ovo", random_state=self._random_state, ) px = [[1, 2], [5, 6], [9, 10]] @@ -205,26 +234,36 @@ class Stree_test(unittest.TestCase): self.assertListEqual(py, clf.classes_.tolist()) def test_muticlass_dataset(self): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) datasets = { "Synt": load_dataset(random_state=self._random_state, n_classes=3), "Iris": load_wine(return_X_y=True), } outcomes = { "Synt": { - "max_samples linear": 0.9606666666666667, + "max_samples liblinear": 0.9606666666666667, + "max_samples linear": 0.786, "max_samples rbf": 0.7133333333333334, "max_samples poly": 0.618, - "impurity linear": 0.9606666666666667, + "max_samples sigmoid": 0.8826666666666667, + "impurity liblinear": 0.9606666666666667, + "impurity linear": 0.786, "impurity rbf": 0.7133333333333334, "impurity poly": 0.618, + "impurity sigmoid": 0.8826666666666667, }, "Iris": { + "max_samples liblinear": 1.0, "max_samples linear": 1.0, "max_samples rbf": 0.6910112359550562, "max_samples poly": 0.6966292134831461, + "max_samples sigmoid": 0.6573033707865169, + "impurity liblinear": 1, "impurity linear": 1, "impurity rbf": 0.6910112359550562, "impurity poly": 0.6966292134831461, + "impurity sigmoid": 0.6573033707865169, }, } @@ -235,14 +274,15 @@ class Stree_test(unittest.TestCase): clf = Stree( C=55, max_iter=1e5, + multiclass_strategy="ovr", kernel=kernel, random_state=self._random_state, ) clf.fit(px, py) outcome = outcomes[name][f"{criteria} {kernel}"] # print( - # f"{name} {criteria} {kernel} {outcome} {clf.score(px" - # ", py)}" + # f"{name} {criteria} {kernel} {outcome} " + # f"{clf.score(px, py)}" # ) self.assertAlmostEqual(outcome, clf.score(px, py)) @@ -312,17 +352,19 @@ class Stree_test(unittest.TestCase): clf.predict(X[:, :3]) # Tests of score - def test_score_binary(self): X, y = load_dataset(self._random_state) accuracies = [ 0.9506666666666667, + 0.9493333333333334, 0.9606666666666667, 0.9433333333333334, + 0.9153333333333333, ] for kernel, accuracy_expected in zip(self._kernels, accuracies): clf = Stree( random_state=self._random_state, + multiclass_strategy="ovr" if kernel == "liblinear" else "ovo", kernel=kernel, ) clf.fit(X, y) @@ -334,7 +376,12 @@ class Stree_test(unittest.TestCase): def test_score_max_features(self): X, y = load_dataset(self._random_state) - clf = Stree(random_state=self._random_state, max_features=2) + clf = Stree( + kernel="liblinear", + multiclass_strategy="ovr", + random_state=self._random_state, + max_features=2, + ) clf.fit(X, y) self.assertAlmostEqual(0.9453333333333334, clf.score(X, y)) @@ -346,7 +393,9 @@ class Stree_test(unittest.TestCase): def test_multiclass_classifier_integrity(self): """Checks if the multiclass operation is done right""" X, y = load_iris(return_X_y=True) - clf = Stree(random_state=0) + clf = Stree( + kernel="liblinear", multiclass_strategy="ovr", random_state=0 + ) clf.fit(X, y) score = clf.score(X, y) # Check accuracy of the whole model @@ -402,10 +451,10 @@ class Stree_test(unittest.TestCase): clf2 = Stree( kernel="rbf", random_state=self._random_state, normalize=True ) - self.assertEqual(0.768, clf.fit(X, y).score(X, y)) - self.assertEqual(0.814, clf2.fit(X, y).score(X, y)) + self.assertEqual(0.966, clf.fit(X, y).score(X, y)) + self.assertEqual(0.964, clf2.fit(X, y).score(X, y)) X, y = load_wine(return_X_y=True) - self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y)) + 
self.assertEqual(0.6685393258426966, clf.fit(X, y).score(X, y)) self.assertEqual(1.0, clf2.fit(X, y).score(X, y)) def test_score_multiclass_poly(self): @@ -423,24 +472,78 @@ class Stree_test(unittest.TestCase): random_state=self._random_state, normalize=True, ) - self.assertEqual(0.786, clf.fit(X, y).score(X, y)) - self.assertEqual(0.818, clf2.fit(X, y).score(X, y)) + self.assertEqual(0.946, clf.fit(X, y).score(X, y)) + self.assertEqual(0.972, clf2.fit(X, y).score(X, y)) X, y = load_wine(return_X_y=True) - self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y)) - self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y)) + self.assertEqual(0.7808988764044944, clf.fit(X, y).score(X, y)) + self.assertEqual(1.0, clf2.fit(X, y).score(X, y)) + + def test_score_multiclass_liblinear(self): + X, y = load_dataset( + random_state=self._random_state, + n_classes=3, + n_features=5, + n_samples=500, + ) + clf = Stree( + kernel="liblinear", + multiclass_strategy="ovr", + random_state=self._random_state, + C=10, + ) + clf2 = Stree( + kernel="liblinear", + multiclass_strategy="ovr", + random_state=self._random_state, + normalize=True, + ) + self.assertEqual(0.968, clf.fit(X, y).score(X, y)) + self.assertEqual(0.97, clf2.fit(X, y).score(X, y)) + X, y = load_wine(return_X_y=True) + self.assertEqual(1.0, clf.fit(X, y).score(X, y)) + self.assertEqual(1.0, clf2.fit(X, y).score(X, y)) + + def test_score_multiclass_sigmoid(self): + X, y = load_dataset( + random_state=self._random_state, + n_classes=3, + n_features=5, + n_samples=500, + ) + clf = Stree(kernel="sigmoid", random_state=self._random_state, C=10) + clf2 = Stree( + kernel="sigmoid", + random_state=self._random_state, + normalize=True, + C=10, + ) + self.assertEqual(0.796, clf.fit(X, y).score(X, y)) + self.assertEqual(0.952, clf2.fit(X, y).score(X, y)) + X, y = load_wine(return_X_y=True) + self.assertEqual(0.6910112359550562, clf.fit(X, y).score(X, y)) + self.assertEqual(0.9662921348314607, clf2.fit(X, y).score(X, y)) def test_score_multiclass_linear(self): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + warnings.filterwarnings("ignore", category=RuntimeWarning) X, y = load_dataset( random_state=self._random_state, n_classes=3, n_features=5, n_samples=1500, ) - clf = Stree(kernel="linear", random_state=self._random_state) + clf = Stree( + kernel="liblinear", + multiclass_strategy="ovr", + random_state=self._random_state, + ) self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y)) # Check with context based standardization clf2 = Stree( - kernel="linear", random_state=self._random_state, normalize=True + kernel="liblinear", + multiclass_strategy="ovr", + random_state=self._random_state, + normalize=True, ) self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y)) X, y = load_wine(return_X_y=True) @@ -467,7 +570,7 @@ class Stree_test(unittest.TestCase): ] ) y = np.array([1, 1, 1, 2, 2, 2, 5, 5, 5]) - yw = np.array([1, 1, 1, 5, 5, 5, 5, 5, 5]) + yw = np.array([1, 1, 1, 1, 1, 1, 5, 5, 5]) w = [1, 1, 1, 0, 0, 0, 1, 1, 1] model1 = Stree().fit(X, y) model2 = Stree().fit(X, y, w) @@ -504,14 +607,14 @@ class Stree_test(unittest.TestCase): clf = Stree(random_state=self._random_state) clf.fit(X, y) nodes, leaves = clf.nodes_leaves() - self.assertEqual(25, nodes) - self.assertEqual(13, leaves) + self.assertEqual(31, nodes) + self.assertEqual(16, leaves) X, y = load_wine(return_X_y=True) clf = Stree(random_state=self._random_state) clf.fit(X, y) nodes, leaves = clf.nodes_leaves() - self.assertEqual(9, nodes) - 
self.assertEqual(5, leaves) + self.assertEqual(11, nodes) + self.assertEqual(6, leaves) def test_nodes_leaves_artificial(self): n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1") @@ -530,3 +633,27 @@ class Stree_test(unittest.TestCase): nodes, leaves = clf.nodes_leaves() self.assertEqual(6, nodes) self.assertEqual(2, leaves) + + def test_bogus_multiclass_strategy(self): + clf = Stree(multiclass_strategy="other") + X, y = load_wine(return_X_y=True) + with self.assertRaises(ValueError): + clf.fit(X, y) + + def test_multiclass_strategy(self): + X, y = load_wine(return_X_y=True) + clf_o = Stree(multiclass_strategy="ovo") + clf_r = Stree(multiclass_strategy="ovr") + score_o = clf_o.fit(X, y).score(X, y) + score_r = clf_r.fit(X, y).score(X, y) + self.assertEqual(1.0, score_o) + self.assertEqual(0.9269662921348315, score_r) + + def test_incompatible_hyperparameters(self): + X, y = load_wine(return_X_y=True) + clf = Stree(kernel="liblinear", multiclass_strategy="ovo") + with self.assertRaises(ValueError): + clf.fit(X, y) + clf = Stree(multiclass_strategy="ovo", split_criteria="max_samples") + with self.assertRaises(ValueError): + clf.fit(X, y)
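
A quick usage sketch of what this patch introduces (illustrative only, not part of the patch itself; it mirrors the combinations exercised in `test_multiclass_strategy` and `test_incompatible_hyperparameters` above — the dataset and `random_state` value are arbitrary choices):

```python
from sklearn.datasets import load_wine

from stree import Stree

X, y = load_wine(return_X_y=True)

# New default: one-vs-one multiclass strategy with the libsvm-based kernels
ovo = Stree(kernel="linear", multiclass_strategy="ovo", random_state=0)
print("ovo score:", ovo.fit(X, y).score(X, y))

# The new liblinear kernel (scikit-learn's LinearSVC) works one-vs-rest
ovr = Stree(kernel="liblinear", multiclass_strategy="ovr", random_state=0)
print("ovr score:", ovr.fit(X, y).score(X, y))

# Incompatible combinations are rejected at fit time with a ValueError,
# e.g. kernel="liblinear" with "ovo", or split_criteria="max_samples"
# together with the "ovo" strategy
try:
    Stree(kernel="liblinear", multiclass_strategy="ovo").fit(X, y)
except ValueError as error:
    print(error)
```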