From ae1c199e210c50891bcd897aee0166ade6fa9fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 13 Jun 2020 17:58:45 +0200 Subject: [PATCH 1/6] # 2 - add max_features parameters --- stree/Strees.py | 68 ++++++++++++++++++++++++++++++++++----- stree/tests/Stree_test.py | 44 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 8 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index eeffe65..e36ac1d 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -7,7 +7,9 @@ Build an oblique tree classifier based on SVM Trees """ import os - +import numbers +import random +from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC @@ -127,8 +129,9 @@ class Stree(BaseEstimator, ClassifierMixin): tol: float = 1e-4, degree: int = 3, gamma="scale", - split_criteria="max_samples", + split_criteria: str = "max_samples", min_samples_split: int = 0, + max_features=None, ): self.max_iter = max_iter self.C = C @@ -140,6 +143,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.degree = degree self.min_samples_split = min_samples_split self.split_criteria = split_criteria + self.max_features = max_features def _more_tags(self) -> dict: """Required by sklearn to supply features of the classifier @@ -160,10 +164,10 @@ class Stree(BaseEstimator, ClassifierMixin): :rtype: list """ up = ~down - return ( + return [ origin[up] if any(up) else None, origin[down] if any(down) else None, - ) + ] def _distances(self, node: Snode, data: np.ndarray) -> np.array: """Compute distances of the samples to the hyperplane of the node @@ -257,7 +261,8 @@ class Stree(BaseEstimator, ClassifierMixin): self.n_classes_ = self.classes_.shape[0] self.n_iter_ = self.max_iter self.depth_ = 0 - self.n_features_in_ = X.shape[1] + self.n_features_ = X.shape[1] + self.max_features_ = self._initialize_max_features() self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -294,10 +299,11 @@ class Stree(BaseEstimator, ClassifierMixin): return Snode(None, X, y, title + ", ") # Train the model clf = self._build_clf() - clf.fit(X, y, sample_weight=sample_weight) - node = Snode(clf, X, y, title) + Xs, indices_subset = self._get_subspace(X) + clf.fit(Xs, y, sample_weight=sample_weight) + node = Snode(clf, Xs, y, title) self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(node, X), node) + down = self._split_criteria(self._distances(node, Xs), node) X_U, X_D = self._split_array(X, down) y_u, y_d = self._split_array(y, down) sw_u, sw_d = self._split_array(sample_weight, down) @@ -446,3 +452,49 @@ class Stree(BaseEstimator, ClassifierMixin): for i in self: output += str(i) + "\n" return output + + def _initialize_max_features(self) -> int: + if isinstance(self.max_features, str): + if self.max_features == "auto": + max_features = max(1, int(np.sqrt(self.n_features_))) + elif self.max_features == "sqrt": + max_features = max(1, int(np.sqrt(self.n_features_))) + elif self.max_features == "log2": + max_features = max(1, int(np.log2(self.n_features_))) + else: + raise ValueError( + "Invalid value for max_features. " + "Allowed string values are 'auto', " + "'sqrt' or 'log2'." 
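+                )
+        # "auto" and "sqrt" both resolve to max(1, int(sqrt(n_features_)))
+        # while "log2" gives max(1, int(log2(n_features_))); e.g. with 16
+        # features all three yield 4, as exercised in test_max_features.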
+        elif self.max_features is None:
+            max_features = self.n_features_
+        elif isinstance(self.max_features, numbers.Integral):
+            max_features = self.max_features
+        else:  # float
+            if 0.0 < self.max_features <= 1.0:
+                max_features = max(
+                    1, int(self.max_features * self.n_features_)
+                )
+            else:
+                raise ValueError(
+                    "Invalid value for max_features. "
+                    "Allowed float must be in range (0, 1] "
+                    f"got ({self.max_features})"
+                )
+        return max_features
+
+    def _get_subspace(self, dataset: np.array) -> list:
+        """Return the best subspace to make a split
+        """
+
+        def get_subspaces_set(dataset: np.array) -> np.array:
+            features = range(dataset.shape[1])
+            features_sets = list(combinations(features, self.max_features_))
+            if len(features_sets) > 1:
+                return features_sets[random.randint(0, len(features_sets))]
+            else:
+                return features_sets[0]
+
+        indices = get_subspaces_set(dataset)
+        return dataset[:, indices], indices
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index a921838..0c809b3 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -295,3 +295,47 @@ class Stree_test(unittest.TestCase):
         computed = clf._max_samples(data, y)
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_max_features(self):
+        n_features = 16
+        expected_values = [
+            ("auto", 4),
+            ("log2", 4),
+            ("sqrt", 4),
+            (0.5, 8),
+            (3, 3),
+            (None, 16),
+        ]
+        clf = Stree()
+        clf.n_features_ = n_features
+        for max_features, expected in expected_values:
+            clf.set_params(**dict(max_features=max_features))
+            computed = clf._initialize_max_features()
+            self.assertEqual(expected, computed)
+        # Check bogus max_features
+        values = ["duck", -0.1, 0.0]
+        for max_features in values:
+            clf.set_params(**dict(max_features=max_features))
+            with self.assertRaises(ValueError):
+                _ = clf._initialize_max_features()
+
+    def test_get_subspaces(self):
+        dataset = np.random.random((10, 16))
+        y = np.random.randint(0, 2, 10)
+        expected_values = [
+            ("auto", 4),
+            ("log2", 4),
+            ("sqrt", 4),
+            (0.5, 8),
+            (3, 3),
+            (None, 16),
+        ]
+        clf = Stree()
+        for max_features, expected in expected_values:
+            clf.set_params(**dict(max_features=max_features))
+            clf.fit(dataset, y)
+            computed, indices = clf._get_subspace(dataset)
+            self.assertListEqual(
+                dataset[:, indices].tolist(), computed.tolist()
+            )
+            self.assertEqual(expected, len(indices))

From f1ee4de37beeea80390d63c7bce8afe7ba9e966a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sun, 14 Jun 2020 03:08:55 +0200
Subject: [PATCH 2/6] #2 - Add gini and entropy measures

rename get_dataset to load_dataset
add features and impurity to __str__ of node
---
 main.py                   | 67 +++----------------------------
 stree/Strees.py           | 70 +++++++++++++++++++++++++++++----
 stree/tests/Snode_test.py | 14 ++++----
 stree/tests/Stree_test.py | 63 +++++++++++++++++++------------
 stree/tests/utils.py      |  2 +-
 5 files changed, 118 insertions(+), 98 deletions(-)

diff --git a/main.py b/main.py
index 30d36de..e4722c7 100644
--- a/main.py
+++ b/main.py
@@ -1,72 +1,15 @@
 import time
 from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_iris
 from stree import Stree
 
 random_state = 1
+X, y = load_iris(return_X_y=True)
 
-
-def load_creditcard(n_examples=0):
-    import pandas as pd
-    import numpy as np
-    import random
-
-    df = pd.read_csv("data/creditcard.csv")
-    print(
-        "Fraud: {0:.3f}% {1}".format(
-            df.Class[df.Class == 1].count() * 100 / df.shape[0],
-            df.Class[df.Class == 1].count(),
) - ) - print( - "Valid: {0:.3f}% {1}".format( - df.Class[df.Class == 0].count() * 100 / df.shape[0], - df.Class[df.Class == 0].count(), - ) - ) - y = np.expand_dims(df.Class.values, axis=1) - X = df.drop(["Class", "Time", "Amount"], axis=1).values - if n_examples > 0: - # Take first n_examples samples - X = X[:n_examples, :] - y = y[:n_examples, :] - else: - # Take all the positive samples with a number of random negatives - if n_examples < 0: - Xt = X[(y == 1).ravel()] - yt = y[(y == 1).ravel()] - indices = random.sample(range(X.shape[0]), -1 * n_examples) - X = np.append(Xt, X[indices], axis=0) - y = np.append(yt, y[indices], axis=0) - print("X.shape", X.shape, " y.shape", y.shape) - print( - "Fraud: {0:.3f}% {1}".format( - len(y[y == 1]) * 100 / X.shape[0], len(y[y == 1]) - ) - ) - print( - "Valid: {0:.3f}% {1}".format( - len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0]) - ) - ) - Xtrain, Xtest, ytrain, ytest = train_test_split( - X, - y, - train_size=0.7, - shuffle=True, - random_state=random_state, - stratify=y, - ) - return Xtrain, Xtest, ytrain, ytest - - -# data = load_creditcard(-5000) # Take all true samples + 5000 of the others -# data = load_creditcard(5000) # Take the first 5000 samples -data = load_creditcard() # Take all the samples - -Xtrain = data[0] -Xtest = data[1] -ytrain = data[2] -ytest = data[3] +Xtrain, Xtest, ytrain, ytest = train_test_split( + X, y, test_size=0.2, random_state=random_state +) now = time.time() clf = Stree(C=0.01, random_state=random_state) diff --git a/stree/Strees.py b/stree/Strees.py index e36ac1d..cb8731f 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -29,7 +29,15 @@ class Snode: dataset assigned to it """ - def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str): + def __init__( + self, + clf: SVC, + X: np.ndarray, + y: np.ndarray, + features: np.array, + impurity: float, + title: str, + ): self._clf = clf self._title = title self._belief = 0.0 @@ -39,10 +47,21 @@ class Snode: self._down = None self._up = None self._class = None + self._feature = None + self._sample_weight = None + self._features = features + self._impurity = impurity @classmethod def copy(cls, node: "Snode") -> "Snode": - return cls(node._clf, node._X, node._y, node._title) + return cls( + node._clf, + node._X, + node._y, + node._features, + node._impurity, + node._title, + ) def set_down(self, son): self._down = son @@ -83,11 +102,15 @@ class Snode: count_values = np.unique(self._y, return_counts=True) result = ( f"{self._title} - Leaf class={self._class} belief=" - f"{self._belief: .6f} counts={count_values}" + f"{self._belief: .6f} impurity={self._impurity:.4f} " + f"counts={count_values}" ) return result else: - return f"{self._title}" + return ( + f"{self._title} feaures={self._features} impurity=" + f"{self._impurity:.4f}" + ) class Siterator: @@ -130,6 +153,7 @@ class Stree(BaseEstimator, ClassifierMixin): degree: int = 3, gamma="scale", split_criteria: str = "max_samples", + criterion: str = "gini", min_samples_split: int = 0, max_features=None, ): @@ -144,6 +168,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.min_samples_split = min_samples_split self.split_criteria = split_criteria self.max_features = max_features + self.criterion = criterion def _more_tags(self) -> dict: """Required by sklearn to supply features of the classifier @@ -251,6 +276,10 @@ class Stree(BaseEstimator, ClassifierMixin): f"split_criteria has to be min_distance or \ max_samples got ({self.split_criteria})" ) + if self.criterion not in ["gini", "entropy"]: + raise 
ValueError( + f"criterion must be gini or entropy got({self.criterion})" + ) check_classification_targets(y) X, y = check_X_y(X, y) @@ -263,6 +292,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.depth_ = 0 self.n_features_ = X.shape[1] self.max_features_ = self._initialize_max_features() + self.criterion_function_ = getattr(self, f"_{self.criterion}") self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -296,12 +326,20 @@ class Stree(BaseEstimator, ClassifierMixin): return None if np.unique(y).shape[0] == 1: # only 1 class => pure dataset - return Snode(None, X, y, title + ", ") + return Snode( + clf=None, + X=X, + y=y, + features=X.shape[1], + impurity=0.0, + title=title + ", ", + ) # Train the model clf = self._build_clf() Xs, indices_subset = self._get_subspace(X) clf.fit(Xs, y, sample_weight=sample_weight) - node = Snode(clf, Xs, y, title) + impurity = self.criterion_function_(y) + node = Snode(clf, X, y, indices_subset, impurity, title) self.depth_ = max(depth, self.depth_) down = self._split_criteria(self._distances(node, Xs), node) X_U, X_D = self._split_array(X, down) @@ -309,7 +347,14 @@ class Stree(BaseEstimator, ClassifierMixin): sw_u, sw_d = self._split_array(sample_weight, down) if X_U is None or X_D is None: # didn't part anything - return Snode(clf, X, y, title + ", ") + return Snode( + clf, + X, + y, + features=X.shape[1], + impurity=impurity, + title=title + ", ", + ) node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up")) node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down")) return node @@ -484,6 +529,17 @@ class Stree(BaseEstimator, ClassifierMixin): ) return max_features + @staticmethod + def _gini(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + return 1 - np.sum(np.square(count / np.sum(count))) + + @staticmethod + def _entropy(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + proportion = count / np.sum(count) + return -np.sum(proportion * np.log2(proportion)) + def _get_subspace(self, dataset: np.array) -> list: """Return the best subspace to make a split """ diff --git a/stree/tests/Snode_test.py b/stree/tests/Snode_test.py index c82bd99..6f3c4d2 100644 --- a/stree/tests/Snode_test.py +++ b/stree/tests/Snode_test.py @@ -4,14 +4,14 @@ import unittest import numpy as np from stree import Stree, Snode -from .utils import get_dataset +from .utils import load_dataset class Snode_test(unittest.TestCase): def __init__(self, *args, **kwargs): self._random_state = 1 self._clf = Stree(random_state=self._random_state) - self._clf.fit(*get_dataset(self._random_state)) + self._clf.fit(*load_dataset(self._random_state)) super().__init__(*args, **kwargs) @classmethod @@ -63,27 +63,27 @@ class Snode_test(unittest.TestCase): run_tree(self._clf.tree_) def test_make_predictor_on_leaf(self): - test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test") + test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") test.make_predictor() self.assertEqual(1, test._class) self.assertEqual(0.75, test._belief) def test_make_predictor_on_not_leaf(self): - test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test") - test.set_up(Snode(None, [1], [1], "another_test")) + test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") + test.set_up(Snode(None, [1], [1], [], 0.0, "another_test")) test.make_predictor() self.assertIsNone(test._class) self.assertEqual(0, test._belief) def test_make_predictor_on_leaf_bogus_data(self): - test = Snode(None, [1, 2, 3, 4], [], "test") + test 
= Snode(None, [1, 2, 3, 4], [], [], 0.0, "test") test.make_predictor() self.assertIsNone(test._class) def test_copy_node(self): px = [1, 2, 3, 4] py = [1] - test = Snode(Stree(), px, py, "test") + test = Snode(Stree(), px, py, [], 0.0, "test") computed = Snode.copy(test) self.assertListEqual(computed._X, px) self.assertListEqual(computed._y, py) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0c809b3..a3fb3d1 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.datasets import load_iris from stree import Stree, Snode -from .utils import get_dataset +from .utils import load_dataset class Stree_test(unittest.TestCase): @@ -64,7 +64,7 @@ class Stree_test(unittest.TestCase): warnings.filterwarnings("ignore") for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) - clf.fit(*get_dataset(self._random_state)) + clf.fit(*load_dataset(self._random_state)) self._check_tree(clf.tree_) def _find_out( @@ -88,7 +88,7 @@ class Stree_test(unittest.TestCase): return res def test_single_prediction(self): - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1]))) @@ -97,14 +97,14 @@ class Stree_test(unittest.TestCase): def test_multiple_prediction(self): # First 27 elements the predictions are the same as the truth num = 27 - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) yp = clf.fit(X, y).predict(X[:num, :]) self.assertListEqual(y[:num].tolist(), yp.tolist()) def test_score(self): - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) accuracies = [ 0.9506666666666667, 0.9606666666666667, @@ -123,7 +123,7 @@ class Stree_test(unittest.TestCase): """Check if predicting sample by sample gives the same result as predicting all samples at once """ - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) clf.fit(X, y) @@ -141,22 +141,22 @@ class Stree_test(unittest.TestCase): """Check preorder iterator """ expected = [ - "root", - "root - Down", - "root - Down - Down, - Leaf class=1 belief= 0.975989 counts" - "=(array([0, 1]), array([ 17, 691]))", - "root - Down - Up", + "root feaures=(0, 1, 2) impurity=0.5000", + "root - Down feaures=(0, 1, 2) impurity=0.0671", + "root - Down - Down, - Leaf class=1 belief= 0.975989 " + "impurity=0.0469 counts=(array([0, 1]), array([ 17, 691]))", + "root - Down - Up feaures=(0, 1, 2) impurity=0.3967", "root - Down - Up - Down, - Leaf class=1 belief= 0.750000 " - "counts=(array([0, 1]), array([1, 3]))", + "impurity=0.3750 counts=(array([0, 1]), array([1, 3]))", "root - Down - Up - Up, - Leaf class=0 belief= 1.000000 " - "counts=(array([0]), array([7]))", - "root - Up, - Leaf class=0 belief= 0.928297 counts=(array(" - "[0, 1]), array([725, 56]))", + "impurity=0.0000 counts=(array([0]), array([7]))", + "root - Up, - Leaf class=0 belief= 0.928297 impurity=0.1331" + " counts=(array([0, 1]), array([725, 56]))", ] computed = [] expected_string = "" clf = Stree(kernel="linear", random_state=self._random_state) - clf.fit(*get_dataset(self._random_state)) + clf.fit(*load_dataset(self._random_state)) for node in clf: 
computed.append(str(node)) expected_string += str(node) + "\n" @@ -176,12 +176,12 @@ class Stree_test(unittest.TestCase): def test_exception_if_C_is_negative(self): tclf = Stree(C=-1) with self.assertRaises(ValueError): - tclf.fit(*get_dataset(self._random_state)) + tclf.fit(*load_dataset(self._random_state)) def test_exception_if_bogus_split_criteria(self): tclf = Stree(split_criteria="duck") with self.assertRaises(ValueError): - tclf.fit(*get_dataset(self._random_state)) + tclf.fit(*load_dataset(self._random_state)) def test_check_max_depth_is_positive_or_None(self): tcl = Stree() @@ -190,13 +190,13 @@ class Stree_test(unittest.TestCase): self.assertGreaterEqual(1, tcl.max_depth) with self.assertRaises(ValueError): tcl = Stree(max_depth=-1) - tcl.fit(*get_dataset(self._random_state)) + tcl.fit(*load_dataset(self._random_state)) def test_check_max_depth(self): depths = (3, 4) for depth in depths: tcl = Stree(random_state=self._random_state, max_depth=depth) - tcl.fit(*get_dataset(self._random_state)) + tcl.fit(*load_dataset(self._random_state)) self.assertEqual(depth, tcl.depth_) def test_unfitted_tree_is_iterable(self): @@ -230,7 +230,7 @@ class Stree_test(unittest.TestCase): def test_muticlass_dataset(self): datasets = { - "Synt": get_dataset(random_state=self._random_state, n_classes=3), + "Synt": load_dataset(random_state=self._random_state, n_classes=3), "Iris": load_iris(return_X_y=True), } outcomes = { @@ -339,3 +339,24 @@ class Stree_test(unittest.TestCase): dataset[:, indices].tolist(), computed.tolist() ) self.assertEqual(expected, len(indices)) + + def test_bogus_criterion(self): + clf = Stree(criterion="duck") + with self.assertRaises(ValueError): + clf.fit(*load_dataset()) + + def test_gini(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.48 + self.assertEqual(expected, Stree._gini(y)) + clf = Stree(criterion="gini") + clf.fit(*load_dataset()) + self.assertEqual(expected, clf.criterion_function_(y)) + + def test_entropy(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.9709505944546686 + self.assertAlmostEqual(expected, Stree._entropy(y)) + clf = Stree(criterion="entropy") + clf.fit(*load_dataset()) + self.assertEqual(expected, clf.criterion_function_(y)) diff --git a/stree/tests/utils.py b/stree/tests/utils.py index 7b47642..a371e88 100644 --- a/stree/tests/utils.py +++ b/stree/tests/utils.py @@ -1,7 +1,7 @@ from sklearn.datasets import make_classification -def get_dataset(random_state=0, n_classes=2): +def load_dataset(random_state=0, n_classes=2): X, y = make_classification( n_samples=1500, n_features=3, From 502ee72799c98dfe7b8588f2f3fb98305b74ab34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 14 Jun 2020 14:00:21 +0200 Subject: [PATCH 3/6] #2 Add predict and score support Add a test in features notebook Show max_features in main.py --- main.py | 9 +++++++ notebooks/features.ipynb | 55 ++++++++++++++++++++++++++++++++------- stree/Strees.py | 18 +++++++++---- stree/tests/Stree_test.py | 14 ++++++++++ 4 files changed, 82 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index e4722c7..7b40929 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,15 @@ Xtrain, Xtest, ytrain, ytest = train_test_split( ) now = time.time() +print("Predicting with max_features=sqrt(n_features)") +clf = Stree(C=0.01, random_state=random_state, max_features="auto") +clf.fit(Xtrain, ytrain) +print(f"Took {time.time() - now:.2f} seconds to train") +print(clf) +print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}") 
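+# NOTE: `now` is only set once, before the first model, so the second
+# "Took ... seconds to train" message below reports the cumulative time of
+# both runs; resetting the timer (now = time.time()) right before the
+# second clf.fit(Xtrain, ytrain) would report the time of that fit alone.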
+print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}") +print("=" * 40) +print("Predicting with max_features=n_features") clf = Stree(C=0.01, random_state=random_state) clf.fit(Xtrain, ytrain) print(f"Took {time.time() - now:.2f} seconds to train") diff --git a/notebooks/features.ipynb b/notebooks/features.ipynb index 9eda9b0..c7d0611 100644 --- a/notebooks/features.ipynb +++ b/notebooks/features.ipynb @@ -64,7 +64,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n" + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n" } ], "source": [ @@ -135,7 +135,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Accuracy of Train without weights 0.9789272030651341\nAccuracy of Train with weights 0.9952107279693486\nAccuracy of Tests without weights 0.9598214285714286\nAccuracy of Tests with weights 0.9508928571428571\n" + "text": "Accuracy of Train without weights 0.9808429118773946\nAccuracy of Train with weights 0.9904214559386973\nAccuracy of Tests without weights 0.9441964285714286\nAccuracy of Tests with weights 0.9375\n" } ], "source": [ @@ -162,7 +162,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9683908045977011\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9875478927203065\tAccuracy_test: 0.9598214285714286\nTime: 0.06s\tKernel: poly\tAccuracy_train: 0.9885057471264368\tAccuracy_test: 0.9464285714285714\n" + "text": "Time: 0.13s\tKernel: linear\tAccuracy_train: 0.9693486590038314\tAccuracy_test: 0.9598214285714286\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9913793103448276\tAccuracy_test: 0.9375\n" } ], "source": [ @@ -195,7 +195,7 @@ { "output_type": "stream", "name": "stdout", - "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9531\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, - Leaf class=1 belief= 0.983713 counts=(array([0, 1]), array([ 5, 302]))\nroot - Up, - Leaf class=0 belief= 0.940299 counts=(array([0, 1]), array([693, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Up, - Leaf class=0 belief= 0.943012 counts=(array([0, 1]), array([695, 42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9655\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up, - Leaf class=0 belief= 0.950617 counts=(array([0, 1]), array([693, 36]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - 
Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0 belief= 0.954039 counts=(array([0, 1]), array([685, 33]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9751\nClassifier's accuracy (test) : 0.9464\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n\n**************************************************\n0.6869 secs\n" + "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9487\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0374\nroot - Down - Down, - Leaf class=1 belief= 0.984076 impurity=0.0313 counts=(array([0, 1]), array([ 5, 309]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.947874 impurity=0.0988 counts=(array([0, 1]), array([691, 38]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9531\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0192\nroot - Down - Down, - Leaf class=1 belief= 0.993506 impurity=0.0129 counts=(array([0, 1]), array([ 2, 306]))\nroot - 
Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.944218 impurity=0.1053 counts=(array([0, 1]), array([694, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9643\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0189\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up, - Leaf class=0 belief= 0.951989 impurity=0.0914 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0250\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([4]))\nroot - Up, - Leaf class=0 belief= 0.951923 impurity=0.0915 counts=(array([0, 1]), array([693, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9665\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n\n**************************************************\n0.4375 secs\n" } ], "source": [ @@ -227,7 +227,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - 
Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n" } ], "source": [ @@ -244,7 +244,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n" } ], "source": [ @@ -268,7 +268,7 @@ { "output_type": "stream", "name": "stdout", - "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" + "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" } ], "source": [ @@ -306,7 +306,7 @@ { "output_type": "stream", "name": "stdout", - "text": "== Not Weighted ===\nSVC train score ..: 0.9521072796934866\nSTree train score 
: 0.9578544061302682\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9575892857142857\n==== Weighted =====\nSVC train score ..: 0.9616858237547893\nSTree train score : 0.9616858237547893\nSVC test score ...: 0.9642857142857143\nSTree test score .: 0.9598214285714286\n*SVC test score ..: 0.951413553411694\n*STree test score : 0.9480517444389333\n" + "text": "== Not Weighted ===\nSVC train score ..: 0.9578544061302682\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9508928571428571\nSTree test score .: 0.9553571428571429\n==== Weighted =====\nSVC train score ..: 0.9636015325670498\nSTree train score : 0.9626436781609196\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9447820728419238\n*STree test score : 0.9447820728419238\n" } ], "source": [ @@ -338,12 +338,49 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 0.969325 counts=(array([0, 1]), array([ 10, 316]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.958159 counts=(array([0, 1]), array([687, 30]))\n\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down, - Leaf class=1 belief= 0.978261 impurity=0.0425 counts=(array([0, 1]), array([ 7, 315]))\nroot - Up, - Leaf class=0 belief= 0.955679 impurity=0.0847 counts=(array([0, 1]), array([690, 32]))\n\n" } ], "source": [ "print(clf)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test max_features" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "****************************************\nmax_features None = 28\nTrain score : 0.9664750957854407\nTest score .: 0.9642857142857143\nTook 0.09 seconds\n****************************************\nmax_features auto = 5\nTrain score : 0.9511494252873564\nTest score .: 0.9441964285714286\nTook 0.37 seconds\n****************************************\nmax_features log2 = 4\nTrain score : 0.935823754789272\nTest score .: 0.9330357142857143\nTook 0.10 seconds\n****************************************\nmax_features 7 = 7\nTrain score : 0.9568965517241379\nTest score .: 0.9397321428571429\nTook 3.36 seconds\n****************************************\nmax_features 0.5 = 14\nTrain score : 0.960727969348659\nTest score .: 0.9486607142857143\nTook 112.42 seconds\n****************************************\nmax_features 0.1 = 2\nTrain score : 0.8793103448275862\nTest score .: 0.8839285714285714\nTook 0.06 seconds\n****************************************\nmax_features 0.7 = 19\nTrain score : 0.9655172413793104\nTest score .: 0.9553571428571429\nTook 10.59 seconds\n" + } + ], + "source": [ + "for max_features in [None, \"auto\", \"log2\", 7, .5, .1, .7]:\n", + " now = time.time()\n", + " print(\"*\"*40)\n", + " clf = Stree(random_state=random_state, max_features=max_features)\n", + " clf.fit(Xtrain, ytrain)\n", + " print(f\"max_features {max_features} = {clf.max_features_}\")\n", + " print(\"Train score :\", clf.score(Xtrain, ytrain))\n", + " print(\"Test score .:\", clf.score(Xtest, ytest))\n", + " print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/stree/Strees.py 
b/stree/Strees.py
index cb8731f..1a37b4e 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -205,7 +205,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         the hyperplane of the node
         :rtype: np.array
         """
-        return node._clf.decision_function(data)
+        return node._clf.decision_function(data[:, node._features])
 
     def _min_distance(self, data: np.array, _) -> np.array:
         # chooses the lowest distance of every sample
@@ -286,11 +286,14 @@ class Stree(BaseEstimator, ClassifierMixin):
         sample_weight = _check_sample_weight(sample_weight, X)
         check_classification_targets(y)
         # Initialize computed parameters
+        if self.random_state is not None:
+            random.seed(self.random_state)
         self.classes_, y = np.unique(y, return_inverse=True)
         self.n_classes_ = self.classes_.shape[0]
         self.n_iter_ = self.max_iter
         self.depth_ = 0
         self.n_features_ = X.shape[1]
+        self.n_features_in_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
         self.criterion_function_ = getattr(self, f"_{self.criterion}")
         self.tree_ = self.train(X, y, sample_weight, 1, "root")
         self._build_predictor()
         return self
@@ -336,12 +339,12 @@ class Stree(BaseEstimator, ClassifierMixin):
             )
         # Train the model
         clf = self._build_clf()
-        Xs, indices_subset = self._get_subspace(X)
+        Xs, features = self._get_subspace(X)
         clf.fit(Xs, y, sample_weight=sample_weight)
         impurity = self.criterion_function_(y)
-        node = Snode(clf, X, y, indices_subset, impurity, title)
+        node = Snode(clf, X, y, features, impurity, title)
         self.depth_ = max(depth, self.depth_)
-        down = self._split_criteria(self._distances(node, Xs), node)
+        down = self._split_criteria(self._distances(node, X), node)
         X_U, X_D = self._split_array(X, down)
         y_u, y_d = self._split_array(y, down)
         sw_u, sw_d = self._split_array(sample_weight, down)
@@ -439,6 +442,11 @@ class Stree(BaseEstimator, ClassifierMixin):
         check_is_fitted(self, ["tree_"])
         # Input validation
         X = check_array(X)
+        if X.shape[1] != self.n_features_:
+            raise ValueError(
+                f"Expected {self.n_features_} features but got "
+                f"({X.shape[1]})"
+            )
         # setup prediction & make it happen
         indices = np.arange(X.shape[0])
         result = (
@@ -548,7 +556,7 @@ class Stree(BaseEstimator, ClassifierMixin):
             features = range(dataset.shape[1])
             features_sets = list(combinations(features, self.max_features_))
             if len(features_sets) > 1:
-                return features_sets[random.randint(0, len(features_sets))]
+                return features_sets[random.randint(0, len(features_sets) - 1)]
             else:
                 return features_sets[0]
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index a3fb3d1..371e1d0 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -360,3 +360,17 @@ class Stree_test(unittest.TestCase):
         clf = Stree(criterion="entropy")
         clf.fit(*load_dataset())
         self.assertEqual(expected, clf.criterion_function_(y))
+
+    def test_predict_feature_dimensions(self):
+        X = np.random.rand(10, 5)
+        y = np.random.randint(0, 2, 10)
+        clf = Stree()
+        clf.fit(X, y)
+        with self.assertRaises(ValueError):
+            clf.predict(X[:, :3])
+
+    def test_score_max_features(self):
+        X, y = load_dataset(self._random_state)
+        clf = Stree(random_state=self._random_state, max_features=2)
+        clf.fit(X, y)
+        self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))

From c94bc068bd0a923360ea0ff3bf224672cec0595b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Mon, 15 Jun 2020 00:22:57 +0200
Subject: [PATCH 4/6] #2 Refactor Stree & create Splitter

Add and test splitter parameter
---
 notebooks/benchmark.ipynb    | 200 ++++++++++++++++++++++--
 stree/Strees.py              | 290 ++++++++++++++++++++++-----------
stree/__init__.py | 4 +- stree/tests/Splitter_test.py | 142 +++++++++++++++++ stree/tests/Stree_test.py | 62 ++------ stree/tests/__init__.py | 3 +- stree/tests/utils.py | 4 +- 7 files changed, 529 insertions(+), 176 deletions(-) create mode 100644 stree/tests/Splitter_test.py diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb index 76901aa..9e19ff0 100644 --- a/notebooks/benchmark.ipynb +++ b/notebooks/benchmark.ipynb @@ -68,9 +68,11 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "2020-05-23 19:42:08\n" + "output_type": "stream", + "text": [ + "2020-06-14 23:45:42\n" + ] } ], "source": [ @@ -102,9 +104,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n" + "output_type": "stream", + "text": [ + "Fraud: 0.173% 492\n", + "Valid: 99.827% 284,315\n" + ] } ], "source": [ @@ -130,9 +135,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "X shape: (284807, 29)\ny shape: (284807,)\n" + "output_type": "stream", + "text": [ + "X shape: (284807, 29)\n", + "y shape: (284807,)\n" + ] } ], "source": [ @@ -248,9 +256,168 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 16.99 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 175.7 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 39.64 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999869 0.999741 199020\n 1 0.911263 0.776163 0.838305 344\n\n accuracy 0.999483 199364\n macro avg 0.955438 0.888016 0.919023 199364\nweighted avg 0.999461 0.999483 0.999463 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999883 0.999748 85295\n 1 0.920000 0.777027 0.842491 148\n\n accuracy 0.999497 85443\n macro avg 
0.959807 0.888455 0.921119 85443\nweighted avg 0.999475 0.999497 0.999476 85443\n\nConfusion Matrix in Train\n[[198994 26]\n [ 77 267]]\nConfusion Matrix in Test\n[[85285 10]\n [ 33 115]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 48.29 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n************************** Gradient Boost. **********************\nTrain Model Gradient Boost. took: 251.6 seconds\n=========== Gradient Boost. - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999096 0.999854 0.999475 199020\n 1 0.849741 0.476744 0.610801 344\n\n accuracy 0.998952 199364\n macro avg 0.924419 0.738299 0.805138 199364\nweighted avg 0.998839 0.998952 0.998804 199364\n\n=========== Gradient Boost. - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.998981 0.999730 0.999355 85295\n 1 0.726190 0.412162 0.525862 148\n\n accuracy 0.998713 85443\n macro avg 0.862586 0.705946 0.762609 85443\nweighted avg 0.998508 0.998713 0.998535 85443\n\nConfusion Matrix in Train\n[[198991 29]\n [ 180 164]]\nConfusion Matrix in Test\n[[85272 23]\n [ 87 61]]\n" + "output_type": "stream", + "text": [ + "************************** Linear Tree **********************\n", + "Train Model Linear Tree took: 13.52 seconds\n", + "=========== Linear Tree - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Linear Tree - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999578 0.999613 0.999596 85295\n", + " 1 0.772414 0.756757 0.764505 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.885996 0.878185 0.882050 85443\n", + "weighted avg 0.999184 0.999192 0.999188 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85262 33]\n", + " [ 36 112]]\n", + "************************** Random Forest **********************\n", + "Train Model Random Forest took: 152.5 seconds\n", + "=========== Random Forest - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Random Forest - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999660 0.999965 0.999812 85295\n", + " 1 0.975410 0.804054 0.881481 148\n", + "\n", + " accuracy 
0.999625 85443\n", + " macro avg 0.987535 0.902009 0.940647 85443\n", + "weighted avg 0.999618 0.999625 0.999607 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85292 3]\n", + " [ 29 119]]\n", + "************************** Stree (SVM Tree) **********************\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Model Stree (SVM Tree) took: 32.55 seconds\n", + "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999623 0.999864 0.999744 199020\n", + " 1 0.908784 0.781977 0.840625 344\n", + "\n", + " accuracy 0.999488 199364\n", + " macro avg 0.954204 0.890921 0.920184 199364\n", + "weighted avg 0.999467 0.999488 0.999469 199364\n", + "\n", + "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999637 0.999918 0.999777 85295\n", + " 1 0.943548 0.790541 0.860294 148\n", + "\n", + " accuracy 0.999555 85443\n", + " macro avg 0.971593 0.895229 0.930036 85443\n", + "weighted avg 0.999540 0.999555 0.999536 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198993 27]\n", + " [ 75 269]]\n", + "Confusion Matrix in Test\n", + "[[85288 7]\n", + " [ 31 117]]\n", + "************************** AdaBoost model **********************\n", + "Train Model AdaBoost model took: 47.34 seconds\n", + "=========== AdaBoost model - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999392 0.999678 0.999535 199020\n", + " 1 0.777003 0.648256 0.706815 344\n", + "\n", + " accuracy 0.999072 199364\n", + " macro avg 0.888198 0.823967 0.853175 199364\n", + "weighted avg 0.999008 0.999072 0.999030 199364\n", + "\n", + "=========== AdaBoost model - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999484 0.999707 0.999596 85295\n", + " 1 0.806202 0.702703 0.750903 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.902843 0.851205 0.875249 85443\n", + "weighted avg 0.999149 0.999192 0.999165 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198956 64]\n", + " [ 121 223]]\n", + "Confusion Matrix in Test\n", + "[[85270 25]\n", + " [ 44 104]]\n", + "************************** Gradient Boost. **********************\n", + "Train Model Gradient Boost. took: 244.1 seconds\n", + "=========== Gradient Boost. 
- Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999096 0.999854 0.999475 199020\n", + " 1 0.849741 0.476744 0.610801 344\n", + "\n", + " accuracy 0.998952 199364\n", + " macro avg 0.924419 0.738299 0.805138 199364\n", + "weighted avg 0.998839 0.998952 0.998804 199364\n", + "\n", + "=========== Gradient Boost. - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.998981 0.999730 0.999355 85295\n", + " 1 0.726190 0.412162 0.525862 148\n", + "\n", + " accuracy 0.998713 85443\n", + " macro avg 0.862586 0.705946 0.762609 85443\n", + "weighted avg 0.998508 0.998713 0.998535 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198991 29]\n", + " [ 180 164]]\n", + "Confusion Matrix in Test\n", + "[[85272 23]\n", + " [ 87 61]]\n" + ] } ], "source": [ @@ -277,9 +444,18 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 175.717 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 16.99 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 175.72 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 39.64 seconds\t f1: 0.8425\nModel: AdaBoost model\t Time: 48.29 seconds\t f1: 0.7509\nModel: Gradient Boost.\t Time: 251.58 seconds\t f1: 0.5259\n" + "output_type": "stream", + "text": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" + ] } ], "source": [ @@ -325,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6-final" + "version": "3.7.6" }, "toc": { "base_numbering": 1, @@ -379,4 +555,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/stree/Strees.py b/stree/Strees.py index 1a37b4e..e2e33c3 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -9,12 +9,14 @@ Build an oblique tree classifier based on SVM Trees import os import numbers import random +import warnings from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import check_classification_targets +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.validation import ( check_X_y, check_array, @@ -134,6 +136,168 @@ class Siterator: return node +class Splitter: + def __init__( + self, + clf: SVC = None, + criterion: str = None, + splitter_type: str = None, + criteria: str = None, + min_samples_split: int = None, + random_state=None, + ): + self._clf = clf + self._random_state = random_state + if random_state is not 
None:
+            random.seed(random_state)
+        self._criterion = criterion
+        self._min_samples_split = min_samples_split
+        self._criteria = criteria
+        self._splitter_type = splitter_type
+
+        if clf is None:
+            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
+
+        if criterion not in ["gini", "entropy"]:
+            raise ValueError(
+                f"criterion must be gini or entropy got({criterion})"
+            )
+
+        if criteria not in ["min_distance", "max_samples"]:
+            raise ValueError(
+                "split_criteria has to be min_distance or "
+                f"max_samples got ({criteria})"
+            )
+
+        if splitter_type not in ["random", "best"]:
+            raise ValueError(
+                f"splitter must be either random or best got({splitter_type})"
+            )
+        self.criterion_function = getattr(self, f"_{self._criterion}")
+        self.decision_criteria = getattr(self, f"_{self._criteria}")
+
+    def impurity(self, y: np.array) -> np.array:
+        return self.criterion_function(y)
+
+    @staticmethod
+    def _gini(y: np.array) -> float:
+        _, count = np.unique(y, return_counts=True)
+        return 1 - np.sum(np.square(count / np.sum(count)))
+
+    @staticmethod
+    def _entropy(y: np.array) -> float:
+        _, count = np.unique(y, return_counts=True)
+        proportion = count / np.sum(count)
+        return -np.sum(proportion * np.log2(proportion))
+
+    def information_gain(
+        self, labels_up: np.array, labels_dn: np.array
+    ) -> float:
+        card_up = labels_up.shape[0]
+        card_dn = labels_dn.shape[0]
+        samples = card_up + card_dn
+        up = card_up / samples * self.criterion_function(labels_up)
+        dn = card_dn / samples * self.criterion_function(labels_dn)
+        return up + dn
+
+    def _select_best_set(
+        self, dataset: np.array, labels: np.array, features_sets: list
+    ) -> list:
+        # entropy of multiclass labels can exceed 1, so the search must
+        # not start capped at 1 or no set would ever be selected
+        min_impurity = float("inf")
+        selected = None
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        for feature_set in features_sets:
+            self._clf.fit(dataset[:, feature_set], labels)
+            node = Snode(
+                self._clf, dataset, labels, feature_set, 0.0, "subset"
+            )
+            self.partition(dataset, node)
+            y1, y2 = self.part(labels)
+            impurity = self.information_gain(y1, y2)
+            if impurity < min_impurity:
+                min_impurity = impurity
+                selected = feature_set
+        return selected
+
+    def _get_subspaces_set(
+        self, dataset: np.array, labels: np.array, max_features: int
+    ) -> np.array:
+        features = range(dataset.shape[1])
+        features_sets = list(combinations(features, max_features))
+        if len(features_sets) > 1:
+            if self._splitter_type == "random":
+                return features_sets[random.randint(0, len(features_sets) - 1)]
+            else:
+                return self._select_best_set(dataset, labels, features_sets)
+        else:
+            return features_sets[0]
+
+    def get_subspace(
+        self, dataset: np.array, labels: np.array, max_features: int
+    ) -> list:
+        """Return the best subspace to make a split
+        """
+        indices = self._get_subspaces_set(dataset, labels, max_features)
+        return dataset[:, indices], indices
+
+    @staticmethod
+    def _min_distance(data: np.array, _) -> np.array:
+        # chooses the lowest distance of every sample
+        indices = np.argmin(np.abs(data), axis=1)
+        return np.array(
+            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        )
+
+    @staticmethod
+    def _max_samples(data: np.array, y: np.array) -> np.array:
+        # select the class with max number of samples
+        _, samples = np.unique(y, return_counts=True)
+        selected = np.argmax(samples)
+        return data[:, selected]
+
+    def partition(self, samples: np.array, node: Snode):
+        """Compute the boolean mask of the samples that go to the down
+        branch and store it in self._down
+        """
+        data = self._distances(node, samples)
+        if data.shape[0] < self._min_samples_split:
+            self._down = np.ones((data.shape[0]), dtype=bool)
+            return
+        if data.ndim > 1:
+            # split criteria for multiclass
+            data = self.decision_criteria(data, node._y)
+        self._down = data > 0
+
+    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
+        """Compute distances of the samples to the hyperplane of the node
+
+        :param node: node containing the svm classifier
+        :type node: Snode
+        :param data: samples to find out distance to hyperplane
+        :type data: np.ndarray
+        :return: array of shape (m, 1) with the distances of every sample to
+        the hyperplane of the node
+        :rtype: np.array
+        """
+        return node._clf.decision_function(data[:, node._features])
+
+    def part(self, origin: np.array) -> list:
+        """Split an array in two based on the down mask computed by
+        partition and its complement
+
+        :param origin: dataset to split
+        :type origin: np.array
+        :return: list with two splits of the array
+        :rtype: list
+        """
+        up = ~self._down
+        return [
+            origin[up] if any(up) else None,
+            origin[self._down] if any(self._down) else None,
+        ]
+
+
 class Stree(BaseEstimator, ClassifierMixin):
     """Estimator that is based on binary trees of svm nodes
     can deal with sample_weights in predict, used in boosting sklearn methods
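The class above concentrates everything a node needs: the impurity measure (criterion), the choice of feature subspace (splitter_type) and the partition criteria. A minimal usage sketch, illustrative and not part of the patch, assuming numpy and scikit-learn are installed and relying on the Splitter export that stree/__init__.py gains below:

import numpy as np
from sklearn.svm import LinearSVC

from stree import Splitter

X = np.random.random((20, 5))
y = np.random.randint(0, 2, 20)
splitter = Splitter(
    clf=LinearSVC(random_state=0),
    criterion="gini",
    splitter_type="random",
    criteria="max_samples",
    min_samples_split=0,
    random_state=0,
)
# gini impurity of the labels: four 0s and six 1s yield 0.48
print(splitter.impurity(np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])))
# draw a 3-column subspace; returns the projected data and the indices
Xs, indices = splitter.get_subspace(X, y, max_features=3)
print(Xs.shape, indices)

With splitter_type="best" the same call fits the classifier on every 3-column combination and keeps the one whose split yields the lowest weighted impurity.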
@@ -156,6 +320,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         criterion: str = "gini",
         min_samples_split: int = 0,
         max_features=None,
+        splitter: str = "random",
     ):
         self.max_iter = max_iter
         self.C = C
@@ -169,6 +334,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.split_criteria = split_criteria
         self.max_features = max_features
         self.criterion = criterion
+        self.splitter = splitter
 
     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -178,68 +344,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         """
         return {"requires_y": True}
 
-    def _split_array(self, origin: np.array, down: np.array) -> list:
-        """Split an array in two based on indices (down) and its complement
-
-        :param origin: dataset to split
-        :type origin: np.array
-        :param down: indices to use to split array
-        :type down: np.array
-        :return: list with two splits of the array
-        :rtype: list
-        """
-        up = ~down
-        return [
-            origin[up] if any(up) else None,
-            origin[down] if any(down) else None,
-        ]
-
-    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
-        """Compute distances of the samples to the hyperplane of the node
-
-        :param node: node containing the svm classifier
-        :type node: Snode
-        :param data: samples to find out distance to hyperplane
-        :type data: np.ndarray
-        :return: array of shape (m, 1) with the distances of every sample to
-        the hyperplane of the node
-        :rtype: np.array
-        """
-        return node._clf.decision_function(data[:, node._features])
-
-    def _min_distance(self, data: np.array, _) -> np.array:
-        # chooses the lowest distance of every sample
-        indices = np.argmin(np.abs(data), axis=1)
-        return np.array(
-            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
-        )
-
-    def _max_samples(self, data: np.array, y: np.array) -> np.array:
-        # select the class with max number of samples
-        _, samples = np.unique(y, return_counts=True)
-        selected = np.argmax(samples)
-        return data[:, selected]
-
-    def _split_criteria(self, data: np.array, node: Snode) -> np.array:
-        """Set the criteria to split arrays
-
-        :param data: distances of samples to hyperplanes shape (m, nclasses)
-        if nclasses > 2 else (m,)
-        :type data: np.array
-        :param node: node containing the svm classifier
-        :type node: Snode
-        :return: array of booleans of samples under or above 
zero - :rtype: np.array - """ - - if data.shape[0] < self.min_samples_split: - return np.ones((data.shape[0]), dtype=bool) - if data.ndim > 1: - # split criteria for multiclass - data = getattr(self, f"_{self.split_criteria}")(data, node._y) - res = data > 0 - return res - def fit( self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None ) -> "Stree": @@ -271,21 +375,20 @@ class Stree(BaseEstimator, ClassifierMixin): f"Maximum depth has to be greater than 1... got (max_depth=\ {self.max_depth})" ) - if self.split_criteria not in ["min_distance", "max_samples"]: - raise ValueError( - f"split_criteria has to be min_distance or \ - max_samples got ({self.split_criteria})" - ) - if self.criterion not in ["gini", "entropy"]: - raise ValueError( - f"criterion must be gini or entropy got({self.criterion})" - ) check_classification_targets(y) X, y = check_X_y(X, y) sample_weight = _check_sample_weight(sample_weight, X) check_classification_targets(y) # Initialize computed parameters + self.splitter_ = Splitter( + clf=self._build_clf(), + criterion=self.criterion, + splitter_type=self.splitter, + criteria=self.split_criteria, + random_state=self.random_state, + min_samples_split=self.min_samples_split, + ) if self.random_state is not None: random.seed(self.random_state) self.classes_, y = np.unique(y, return_inverse=True) @@ -295,7 +398,6 @@ class Stree(BaseEstimator, ClassifierMixin): self.n_features_ = X.shape[1] self.n_features_in_ = X.shape[1] self.max_features_ = self._initialize_max_features() - self.criterion_function_ = getattr(self, f"_{self.criterion}") self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -339,15 +441,15 @@ class Stree(BaseEstimator, ClassifierMixin): ) # Train the model clf = self._build_clf() - Xs, features = self._get_subspace(X) + Xs, features = self.splitter_.get_subspace(X, y, self.max_features_) clf.fit(Xs, y, sample_weight=sample_weight) - impurity = self.criterion_function_(y) + impurity = self.splitter_.impurity(y) node = Snode(clf, X, y, features, impurity, title) self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(node, X), node) - X_U, X_D = self._split_array(X, down) - y_u, y_d = self._split_array(y, down) - sw_u, sw_d = self._split_array(sample_weight, down) + self.splitter_.partition(X, node) + X_U, X_D = self.splitter_.part(X) + y_u, y_d = self.splitter_.part(y) + sw_u, sw_d = self.splitter_.part(sample_weight) if X_U is None or X_D is None: # didn't part anything return Snode( @@ -431,9 +533,9 @@ class Stree(BaseEstimator, ClassifierMixin): # set a class for every sample in dataset prediction = np.full((xp.shape[0], 1), node._class) return prediction, indices - down = self._split_criteria(self._distances(node, xp), node) - x_u, x_d = self._split_array(xp, down) - i_u, i_d = self._split_array(indices, down) + self.splitter_.partition(xp, node) + x_u, x_d = self.splitter_.part(xp) + i_u, i_d = self.splitter_.part(indices) prx_u, prin_u = predict_class(x_u, i_u, node.get_up()) prx_d, prin_d = predict_class(x_d, i_d, node.get_down()) return np.append(prx_u, prx_d), np.append(prin_u, prin_d) @@ -536,29 +638,3 @@ class Stree(BaseEstimator, ClassifierMixin): f"got ({self.max_features})" ) return max_features - - @staticmethod - def _gini(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - return 1 - np.sum(np.square(count / np.sum(count))) - - @staticmethod - def _entropy(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - proportion = count 
/ np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
-
-    def _get_subspace(self, dataset: np.array) -> list:
-        """Return the best subspace to make a split
-        """
-
-        def get_subspaces_set(dataset: np.array) -> np.array:
-            features = range(dataset.shape[1])
-            features_sets = list(combinations(features, self.max_features_))
-            if len(features_sets) > 1:
-                return features_sets[
-                    random.randint(0, len(features_sets) - 1)
-                ]
-            else:
-                return features_sets[0]
-
-        indices = get_subspaces_set(dataset)
-        return dataset[:, indices], indices
diff --git a/stree/__init__.py b/stree/__init__.py
index 03b8a2c..6768b82 100644
--- a/stree/__init__.py
+++ b/stree/__init__.py
@@ -1,3 +1,3 @@
-from .Strees import Stree, Snode, Siterator
+from .Strees import Stree, Snode, Siterator, Splitter
 
-__all__ = ["Stree", "Snode", "Siterator"]
+__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py
new file mode 100644
index 0000000..b620ce1
--- /dev/null
+++ b/stree/tests/Splitter_test.py
@@ -0,0 +1,142 @@
+import os
+import unittest
+
+import numpy as np
+from sklearn.svm import LinearSVC
+
+from stree import Splitter
+from .utils import load_dataset
+
+
+class Splitter_test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        self._random_state = 1
+        super().__init__(*args, **kwargs)
+
+    def build(
+        self,
+        clf=LinearSVC(),
+        min_samples_split=0,
+        splitter_type="random",
+        criterion="gini",
+        criteria="min_distance",
+        random_state=None,
+    ):
+        return Splitter(
+            clf=clf,
+            min_samples_split=min_samples_split,
+            splitter_type=splitter_type,
+            criterion=criterion,
+            criteria=criteria,
+            random_state=random_state,
+        )
+
+    @classmethod
+    def setUp(cls):
+        os.environ["TESTING"] = "1"
+
+    def test_init(self):
+        with self.assertRaises(ValueError):
+            self.build(criterion="duck")
+        with self.assertRaises(ValueError):
+            self.build(splitter_type="duck")
+        with self.assertRaises(ValueError):
+            self.build(criteria="duck")
+        with self.assertRaises(ValueError):
+            self.build(clf=None)
+        for splitter_type in ["best", "random"]:
+            for criterion in ["gini", "entropy"]:
+                for criteria in ["min_distance", "max_samples"]:
+                    tcl = self.build(
+                        splitter_type=splitter_type,
+                        criterion=criterion,
+                        criteria=criteria,
+                    )
+                    self.assertEqual(splitter_type, tcl._splitter_type)
+                    self.assertEqual(criterion, tcl._criterion)
+                    self.assertEqual(criteria, tcl._criteria)
+
+    def test_gini(self):
+        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
+        expected = 0.48
+        self.assertEqual(expected, Splitter._gini(y))
+        tcl = self.build(criterion="gini")
+        self.assertEqual(expected, tcl.criterion_function(y))
+
+    def test_entropy(self):
+        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
+        expected = 0.9709505944546686
+        self.assertAlmostEqual(expected, Splitter._entropy(y))
+        tcl = self.build(criterion="entropy")
+        self.assertEqual(expected, tcl.criterion_function(y))
+
+    def test_information_gain(self):
+        yu = np.array([0, 1, 1, 1, 1, 1])
+        yd = np.array([0, 0, 0, 1])
+        values_expected = [
+            ("gini", 0.31666666666666665),
+            ("entropy", 0.7145247027726656),
+        ]
+        for criterion, expected in values_expected:
+            tcl = self.build(criterion=criterion)
+            computed = tcl.information_gain(yu, yd)
+            self.assertAlmostEqual(expected, computed)
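The expected values in test_information_gain can be checked by hand. Note that, despite its name, information_gain as committed here returns the size-weighted average impurity of the two branches, so lower is better, which is why _select_best_set keeps the minimum. A quick sketch of the gini case (illustrative, not part of the patch):

# yu holds one 0 and five 1s, yd holds three 0s and one 1
gini_up = 1 - ((1 / 6) ** 2 + (5 / 6) ** 2)  # 0.2777...
gini_dn = 1 - ((3 / 4) ** 2 + (1 / 4) ** 2)  # 0.375
print(6 / 10 * gini_up + 4 / 10 * gini_dn)  # 0.31666666666666665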
+
+    def test_max_samples(self):
+        tcl = self.build(criteria="max_samples")
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([0.2, 0.01, -0.9, 0.2])
+        y = [1, 2, 1, 0]
+        computed = tcl._max_samples(data, y)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_min_distance(self):
+        tcl = self.build()
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([-0.1, 0.01, 0.5, 0.1])
+        computed = tcl._min_distance(data, None)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_splitter_parameter(self):
+        expected_values = [
+            [1, 7, 9],
+            [1, 7, 9],
+            [1, 7, 9],
+            [1, 7, 9],
+            [0, 5, 6],
+            [0, 5, 6],
+            [0, 5, 6],
+            [0, 5, 6],
+        ]
+        X, y = load_dataset(self._random_state, n_features=12)
+        for splitter_type in ["best", "random"]:
+            for criterion in ["gini", "entropy"]:
+                for criteria in ["min_distance", "max_samples"]:
+                    tcl = self.build(
+                        splitter_type=splitter_type,
+                        criterion=criterion,
+                        criteria=criteria,
+                        random_state=self._random_state,
+                    )
+                    expected = expected_values.pop(0)
+                    dataset, computed = tcl.get_subspace(X, y, max_features=3)
+                    self.assertListEqual(expected, list(computed))
+                    self.assertListEqual(
+                        X[:, computed].tolist(), dataset.tolist()
+                    )
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index 371e1d0..0fea9e5 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -204,13 +204,11 @@ class Stree_test(unittest.TestCase):
         self.assertEqual(0, len(list(tcl)))
 
     def test_min_samples_split(self):
-        tcl_split = Stree(min_samples_split=3)
-        tcl_nosplit = Stree(min_samples_split=4)
         dataset = [[1], [2], [3]], [1, 1, 0]
-        tcl_split.fit(*dataset)
+        tcl_split = Stree(min_samples_split=3).fit(*dataset)
         self.assertIsNotNone(tcl_split.tree_.get_down())
         self.assertIsNotNone(tcl_split.tree_.get_up())
-        tcl_nosplit.fit(*dataset)
+        tcl_nosplit = Stree(min_samples_split=4).fit(*dataset)
         self.assertIsNone(tcl_nosplit.tree_.get_down())
         self.assertIsNone(tcl_nosplit.tree_.get_up())
 
@@ -265,37 +263,6 @@ class Stree_test(unittest.TestCase):
                     outcome = outcomes[name][f"{criteria} {kernel}"]
                     self.assertAlmostEqual(outcome, clf.score(px, py))
 
-    def test_min_distance(self):
-        clf = Stree()
-        data = np.array(
-            [
-                [-0.1, 0.2, -0.3],
-                [0.7, 0.01, -0.1],
-                [0.7, -0.9, 0.5],
-                [0.1, 0.2, 0.3],
-            ]
-        )
-        expected = np.array([-0.1, 0.01, 0.5, 0.1])
-        computed = clf._min_distance(data, None)
-        self.assertEqual((4,), computed.shape)
-        self.assertListEqual(expected.tolist(), computed.tolist())
-
-    def test_max_samples(self):
-        clf = Stree()
-        data = np.array(
-            [
-                [-0.1, 0.2, -0.3],
-                [0.7, 0.01, -0.1],
-                [0.7, -0.9, 0.5],
-                [0.1, 0.2, 0.3],
-            ]
-        )
-        expected = np.array([0.2, 0.01, -0.9, 0.2])
-        y = [1, 2, 1, 0]
-        computed = clf._max_samples(data, y)
-        self.assertEqual((4,), computed.shape)
-        self.assertListEqual(expected.tolist(), computed.tolist())
-
     def test_max_features(self):
         n_features = 16
         expected_values = [
@@ -334,7 +301,9 @@ class Stree_test(unittest.TestCase):
         for max_features, expected in expected_values:
             clf.set_params(**dict(max_features=max_features))
             clf.fit(dataset, y)
-            computed, indices = clf._get_subspace(dataset)
+            computed, indices = clf.splitter_.get_subspace(
+                dataset, y, clf.max_features_
+            )
             self.assertListEqual(
                 dataset[:, indices].tolist(), computed.tolist()
             )
@@ -345,22 +314,6 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             clf.fit(*load_dataset())
 
-    def test_gini(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.48
-        self.assertEqual(expected, Stree._gini(y))
-        clf = Stree(criterion="gini")
-        clf.fit(*load_dataset())
-        self.assertEqual(expected, clf.criterion_function_(y))
-
-    def test_entropy(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.9709505944546686
-        self.assertAlmostEqual(expected, Stree._entropy(y))
-        clf = Stree(criterion="entropy")
-        clf.fit(*load_dataset())
-        self.assertEqual(expected, clf.criterion_function_(y))
-
     def test_predict_feature_dimensions(self):
         X = np.random.rand(10, 5)
         y = np.random.randint(0, 2, 10)
@@ -374,3 +327,8 @@ class Stree_test(unittest.TestCase):
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
         self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
+
+    def test_bogus_splitter_parameter(self):
+        clf = Stree(splitter="duck")
+        with self.assertRaises(ValueError):
+            clf.fit(*load_dataset())
diff --git a/stree/tests/__init__.py b/stree/tests/__init__.py
index 625eea9..32e7a88 100644
--- a/stree/tests/__init__.py
+++ b/stree/tests/__init__.py
@@ -1,4 +1,5 @@
 from .Stree_test import Stree_test
 from .Snode_test import Snode_test
+from .Splitter_test import Splitter_test
 
-__all__ = ["Stree_test", "Snode_test"]
+__all__ = ["Stree_test", "Snode_test", "Splitter_test"]
diff --git a/stree/tests/utils.py b/stree/tests/utils.py
index a371e88..94b0506 100644
--- a/stree/tests/utils.py
+++ b/stree/tests/utils.py
@@ -1,10 +1,10 @@
 from sklearn.datasets import make_classification
 
 
-def load_dataset(random_state=0, n_classes=2):
+def load_dataset(random_state=0, n_classes=2, n_features=3):
     X, y = make_classification(
         n_samples=1500,
-        n_features=3,
+        n_features=n_features,
         n_informative=3,
         n_redundant=0,
         n_repeated=0,
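Taken together, this commit leaves Stree delegating impurity, subspace selection and node partitioning to Splitter through the public parameters criterion, splitter and split_criteria. An end-to-end sketch, illustrative and not part of the patch, with an arbitrary dataset and parameter values:

from sklearn.datasets import make_classification

from stree import Stree

X, y = make_classification(
    n_samples=500, n_features=6, n_informative=3, random_state=0
)
clf = Stree(
    criterion="entropy",  # impurity measure: gini or entropy
    splitter="best",  # evaluate every feature subset instead of one at random
    split_criteria="max_samples",
    max_features=3,  # size of the subspace fitted at each node
    random_state=0,
)
clf.fit(X, y)
print(clf.score(X, y))

An invalid value for any of the three parameters raises ValueError at fit time, as test_bogus_splitter_parameter above exercises for splitter.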
From 736ab7ef2073830da806b920ab1bb59a289ce117 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Mon, 15 Jun 2020 10:33:51 +0200
Subject: [PATCH 5/6] #2 update benchmark notebook

---
 notebooks/benchmark.ipynb | 254 +++++++------------------------------
 stree/Strees.py           |   4 +-
 2 files changed, 48 insertions(+), 210 deletions(-)

diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb
index 9e19ff0..b87cf36 100644
--- a/notebooks/benchmark.ipynb
+++ b/notebooks/benchmark.ipynb
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,15 +64,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "2020-06-14 23:45:42\n"
-     ]
+     "name": "stdout",
+     "text": "2020-06-15 10:17:17\n"
     }
    ],
    "source": [
@@ -88,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,16 +98,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "Fraud: 0.173% 492\n",
-      "Valid: 99.827% 284,315\n"
-     ]
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
     }
    ],
    "source": [
@@ -119,7 +114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,16 +126,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
"metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "X shape: (284807, 29)\n", - "y shape: (284807,)\n" - ] + "name": "stdout", + "text": "X shape: (284807, 29)\ny shape: (284807,)\n" } ], "source": [ @@ -159,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -252,179 +244,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { + "output_type": "stream", "name": "stdout", - "output_type": "stream", - "text": [ - "************************** Linear Tree **********************\n", - "Train Model Linear Tree took: 13.52 seconds\n", - "=========== Linear Tree - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Linear Tree - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999578 0.999613 0.999596 85295\n", - " 1 0.772414 0.756757 0.764505 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.885996 0.878185 0.882050 85443\n", - "weighted avg 0.999184 0.999192 0.999188 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85262 33]\n", - " [ 36 112]]\n", - "************************** Random Forest **********************\n", - "Train Model Random Forest took: 152.5 seconds\n", - "=========== Random Forest - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Random Forest - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999660 0.999965 0.999812 85295\n", - " 1 0.975410 0.804054 0.881481 148\n", - "\n", - " accuracy 0.999625 85443\n", - " macro avg 0.987535 0.902009 0.940647 85443\n", - "weighted avg 0.999618 0.999625 0.999607 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85292 3]\n", - " [ 29 119]]\n", - "************************** Stree (SVM Tree) **********************\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Model Stree (SVM Tree) took: 32.55 seconds\n", - "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999623 0.999864 0.999744 199020\n", - " 1 0.908784 0.781977 0.840625 344\n", - "\n", - " accuracy 0.999488 199364\n", - " macro avg 0.954204 0.890921 0.920184 199364\n", - "weighted avg 0.999467 0.999488 0.999469 199364\n", - "\n", - "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999637 0.999918 0.999777 85295\n", - " 1 0.943548 0.790541 0.860294 148\n", - "\n", - " accuracy 0.999555 85443\n", - " macro avg 0.971593 0.895229 0.930036 85443\n", - "weighted avg 0.999540 0.999555 0.999536 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198993 27]\n", - " [ 75 269]]\n", - "Confusion Matrix in Test\n", - "[[85288 7]\n", - " [ 31 117]]\n", - "************************** AdaBoost model **********************\n", - "Train Model AdaBoost model took: 47.34 seconds\n", - "=========== AdaBoost model - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999392 0.999678 0.999535 199020\n", - " 1 0.777003 0.648256 0.706815 344\n", - "\n", - " accuracy 0.999072 199364\n", - " macro avg 0.888198 0.823967 0.853175 199364\n", - "weighted avg 0.999008 0.999072 0.999030 199364\n", - "\n", - "=========== AdaBoost model - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999484 0.999707 0.999596 85295\n", - " 1 0.806202 0.702703 0.750903 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.902843 0.851205 0.875249 85443\n", - "weighted avg 0.999149 0.999192 0.999165 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198956 64]\n", - " [ 121 223]]\n", - "Confusion Matrix in Test\n", - "[[85270 25]\n", - " [ 44 104]]\n", - "************************** Gradient Boost. **********************\n", - "Train Model Gradient Boost. took: 244.1 seconds\n", - "=========== Gradient Boost. - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999096 0.999854 0.999475 199020\n", - " 1 0.849741 0.476744 0.610801 344\n", - "\n", - " accuracy 0.998952 199364\n", - " macro avg 0.924419 0.738299 0.805138 199364\n", - "weighted avg 0.998839 0.998952 0.998804 199364\n", - "\n", - "=========== Gradient Boost. 
- Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.998981 0.999730 0.999355 85295\n", - " 1 0.726190 0.412162 0.525862 148\n", - "\n", - " accuracy 0.998713 85443\n", - " macro avg 0.862586 0.705946 0.762609 85443\n", - "weighted avg 0.998508 0.998713 0.998535 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198991 29]\n", - " [ 180 164]]\n", - "Confusion Matrix in Test\n", - "[[85272 23]\n", - " [ 87 61]]\n" - ] + "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 
0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n" } ], "source": [ "# Train & Test models\n", "models = {\n", " 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n", - " 'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n", + " 'AdaBoost model': adaboost\n", "}\n", "\n", "best_f1 = 0\n", @@ -440,22 +273,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "**************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", - "**************************************************************************************************************\n", - "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", - "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", - "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", - "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" - ] + "name": "stdout", + "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 47.21 seconds\t f1: 0.7509\n" } ], "source": [ @@ -466,6 +290,20 @@ " print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")" ] }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -487,9 +325,9 @@ "metadata": { "hide_input": false, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.6 64-bit ('general': venv)", "language": "python", - "name": "python3" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" }, "language_info": { "codemirror_mode": { @@ -501,7 +339,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" }, "toc": { "base_numbering": 1, @@ -555,4 +393,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/stree/Strees.py b/stree/Strees.py index e2e33c3..7624972 100644 --- a/stree/Strees.py +++ 
b/stree/Strees.py @@ -193,8 +193,8 @@ class Splitter: def information_gain( self, labels_up: np.array, labels_dn: np.array ) -> float: - card_up = labels_up.shape[0] - card_dn = labels_dn.shape[0] + card_up = labels_up.shape[0] if labels_up is not None else 0 + card_dn = labels_dn.shape[0] if labels_dn is not None else 0 samples = card_up + card_dn up = card_up / samples * self.criterion_function(labels_up) dn = card_dn / samples * self.criterion_function(labels_dn) From 9334951d1b84d9fb3420054b4be370d7dad91bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 15 Jun 2020 11:09:11 +0200 Subject: [PATCH 6/6] #2 Cosmetic and style updates --- stree/Strees.py | 8 +++++--- stree/tests/Splitter_test.py | 2 +- stree/tests/Stree_test.py | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index 7624972..ceeed7a 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -268,7 +268,8 @@ class Splitter: data = self.decision_criteria(data, node._y) self._down = data > 0 - def _distances(self, node: Snode, data: np.ndarray) -> np.array: + @staticmethod + def _distances(node: Snode, data: np.ndarray) -> np.array: """Compute distances of the samples to the hyperplane of the node :param node: node containing the svm classifier @@ -498,7 +499,8 @@ class Stree(BaseEstimator, ClassifierMixin): ) ) - def _reorder_results(self, y: np.array, indices: np.array) -> np.array: + @staticmethod + def _reorder_results(y: np.array, indices: np.array) -> np.array: """Reorder an array based on the array of indices passed :param y: data untidy @@ -579,7 +581,7 @@ class Stree(BaseEstimator, ClassifierMixin): X, y = check_X_y(X, y) y_pred = self.predict(X).reshape(y.shape) # Compute accuracy for each possible representation - y_type, y_true, y_pred = _check_targets(y, y_pred) + _, y_true, y_pred = _check_targets(y, y_pred) check_consistent_length(y_true, y_pred, sample_weight) score = y_true == y_pred return _weighted_sum(score, sample_weight, normalize=True) diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py index b620ce1..68c6123 100644 --- a/stree/tests/Splitter_test.py +++ b/stree/tests/Splitter_test.py @@ -13,8 +13,8 @@ class Splitter_test(unittest.TestCase): self._random_state = 1 super().__init__(*args, **kwargs) + @staticmethod def build( - self, clf=LinearSVC(), min_samples_split=0, splitter_type="random", diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0fea9e5..ccc0442 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -67,9 +67,8 @@ class Stree_test(unittest.TestCase): clf.fit(*load_dataset(self._random_state)) self._check_tree(clf.tree_) - def _find_out( - self, px: np.array, x_original: np.array, y_original - ) -> list: + @staticmethod + def _find_out(px: np.array, x_original: np.array, y_original) -> list: """Find the original values of y for a given array of samples Arguments: @@ -163,7 +162,8 @@ class Stree_test(unittest.TestCase): self.assertListEqual(expected, computed) self.assertEqual(expected_string, str(clf)) - def test_is_a_sklearn_classifier(self): + @staticmethod + def test_is_a_sklearn_classifier(): import warnings from sklearn.exceptions import ConvergenceWarning