From ae1c199e210c50891bcd897aee0166ade6fa9fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 13 Jun 2020 17:58:45 +0200 Subject: [PATCH 1/6] # 2 - add max_features parameters --- stree/Strees.py | 68 ++++++++++++++++++++++++++++++++++----- stree/tests/Stree_test.py | 44 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 8 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index eeffe65..e36ac1d 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -7,7 +7,9 @@ Build an oblique tree classifier based on SVM Trees """ import os - +import numbers +import random +from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC @@ -127,8 +129,9 @@ class Stree(BaseEstimator, ClassifierMixin): tol: float = 1e-4, degree: int = 3, gamma="scale", - split_criteria="max_samples", + split_criteria: str = "max_samples", min_samples_split: int = 0, + max_features=None, ): self.max_iter = max_iter self.C = C @@ -140,6 +143,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.degree = degree self.min_samples_split = min_samples_split self.split_criteria = split_criteria + self.max_features = max_features def _more_tags(self) -> dict: """Required by sklearn to supply features of the classifier @@ -160,10 +164,10 @@ class Stree(BaseEstimator, ClassifierMixin): :rtype: list """ up = ~down - return ( + return [ origin[up] if any(up) else None, origin[down] if any(down) else None, - ) + ] def _distances(self, node: Snode, data: np.ndarray) -> np.array: """Compute distances of the samples to the hyperplane of the node @@ -257,7 +261,8 @@ class Stree(BaseEstimator, ClassifierMixin): self.n_classes_ = self.classes_.shape[0] self.n_iter_ = self.max_iter self.depth_ = 0 - self.n_features_in_ = X.shape[1] + self.n_features_ = X.shape[1] + self.max_features_ = self._initialize_max_features() self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -294,10 +299,11 @@ class Stree(BaseEstimator, ClassifierMixin): return Snode(None, X, y, title + ", ") # Train the model clf = self._build_clf() - clf.fit(X, y, sample_weight=sample_weight) - node = Snode(clf, X, y, title) + Xs, indices_subset = self._get_subspace(X) + clf.fit(Xs, y, sample_weight=sample_weight) + node = Snode(clf, Xs, y, title) self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(node, X), node) + down = self._split_criteria(self._distances(node, Xs), node) X_U, X_D = self._split_array(X, down) y_u, y_d = self._split_array(y, down) sw_u, sw_d = self._split_array(sample_weight, down) @@ -446,3 +452,49 @@ class Stree(BaseEstimator, ClassifierMixin): for i in self: output += str(i) + "\n" return output + + def _initialize_max_features(self) -> int: + if isinstance(self.max_features, str): + if self.max_features == "auto": + max_features = max(1, int(np.sqrt(self.n_features_))) + elif self.max_features == "sqrt": + max_features = max(1, int(np.sqrt(self.n_features_))) + elif self.max_features == "log2": + max_features = max(1, int(np.log2(self.n_features_))) + else: + raise ValueError( + "Invalid value for max_features. " + "Allowed string values are 'auto', " + "'sqrt' or 'log2'." 
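+                )
+        # "auto" and "sqrt" both resolve to max(1, int(sqrt(n_features_)))
+        # while "log2" gives max(1, int(log2(n_features_))); e.g. with 16
+        # features all three yield 4, as exercised in test_max_features.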
+        elif self.max_features is None:
+            max_features = self.n_features_
+        elif isinstance(self.max_features, numbers.Integral):
+            max_features = self.max_features
+        else:  # float
+            if 0.0 < self.max_features <= 1.0:
+                max_features = max(
+                    1, int(self.max_features * self.n_features_)
+                )
+            else:
+                raise ValueError(
+                    "Invalid value for max_features. "
+                    "Allowed float must be in range (0, 1] "
+                    f"got ({self.max_features})"
+                )
+        return max_features
+
+    def _get_subspace(self, dataset: np.array) -> list:
+        """Return the best subspace to make a split
+        """
+
+        def get_subspaces_set(dataset: np.array) -> np.array:
+            features = range(dataset.shape[1])
+            features_sets = list(combinations(features, self.max_features_))
+            if len(features_sets) > 1:
+                return features_sets[random.randint(0, len(features_sets))]
+            else:
+                return features_sets[0]
+
+        indices = get_subspaces_set(dataset)
+        return dataset[:, indices], indices
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index a921838..0c809b3 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -295,3 +295,47 @@ class Stree_test(unittest.TestCase):
         computed = clf._max_samples(data, y)
         self.assertEqual((4,), computed.shape)
         self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_max_features(self):
+        n_features = 16
+        expected_values = [
+            ("auto", 4),
+            ("log2", 4),
+            ("sqrt", 4),
+            (0.5, 8),
+            (3, 3),
+            (None, 16),
+        ]
+        clf = Stree()
+        clf.n_features_ = n_features
+        for max_features, expected in expected_values:
+            clf.set_params(**dict(max_features=max_features))
+            computed = clf._initialize_max_features()
+            self.assertEqual(expected, computed)
+        # Check bogus max_features
+        values = ["duck", -0.1, 0.0]
+        for max_features in values:
+            clf.set_params(**dict(max_features=max_features))
+            with self.assertRaises(ValueError):
+                _ = clf._initialize_max_features()
+
+    def test_get_subspaces(self):
+        dataset = np.random.random((10, 16))
+        y = np.random.randint(0, 2, 10)
+        expected_values = [
+            ("auto", 4),
+            ("log2", 4),
+            ("sqrt", 4),
+            (0.5, 8),
+            (3, 3),
+            (None, 16),
+        ]
+        clf = Stree()
+        for max_features, expected in expected_values:
+            clf.set_params(**dict(max_features=max_features))
+            clf.fit(dataset, y)
+            computed, indices = clf._get_subspace(dataset)
+            self.assertListEqual(
+                dataset[:, indices].tolist(), computed.tolist()
+            )
+            self.assertEqual(expected, len(indices))

From f1ee4de37beeea80390d63c7bce8afe7ba9e966a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sun, 14 Jun 2020 03:08:55 +0200
Subject: [PATCH 2/6] #2 - Add gini and entropy measures

rename get_dataset to load_dataset
add features and impurity to __str__ of node
---
 main.py                   | 67 +++----------------------------
 stree/Strees.py           | 70 +++++++++++++++++++++++++++++----
 stree/tests/Snode_test.py | 14 ++++----
 stree/tests/Stree_test.py | 63 +++++++++++++++++++------------
 stree/tests/utils.py      |  2 +-
 5 files changed, 118 insertions(+), 98 deletions(-)

diff --git a/main.py b/main.py
index 30d36de..e4722c7 100644
--- a/main.py
+++ b/main.py
@@ -1,72 +1,15 @@
 import time
 from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_iris
 from stree import Stree
 
 random_state = 1
+X, y = load_iris(return_X_y=True)
 
-
-def load_creditcard(n_examples=0):
-    import pandas as pd
-    import numpy as np
-    import random
-
-    df = pd.read_csv("data/creditcard.csv")
-    print(
-        "Fraud: {0:.3f}% {1}".format(
-            df.Class[df.Class == 1].count() * 100 / df.shape[0],
-            df.Class[df.Class == 1].count(),
) - ) - print( - "Valid: {0:.3f}% {1}".format( - df.Class[df.Class == 0].count() * 100 / df.shape[0], - df.Class[df.Class == 0].count(), - ) - ) - y = np.expand_dims(df.Class.values, axis=1) - X = df.drop(["Class", "Time", "Amount"], axis=1).values - if n_examples > 0: - # Take first n_examples samples - X = X[:n_examples, :] - y = y[:n_examples, :] - else: - # Take all the positive samples with a number of random negatives - if n_examples < 0: - Xt = X[(y == 1).ravel()] - yt = y[(y == 1).ravel()] - indices = random.sample(range(X.shape[0]), -1 * n_examples) - X = np.append(Xt, X[indices], axis=0) - y = np.append(yt, y[indices], axis=0) - print("X.shape", X.shape, " y.shape", y.shape) - print( - "Fraud: {0:.3f}% {1}".format( - len(y[y == 1]) * 100 / X.shape[0], len(y[y == 1]) - ) - ) - print( - "Valid: {0:.3f}% {1}".format( - len(y[y == 0]) * 100 / X.shape[0], len(y[y == 0]) - ) - ) - Xtrain, Xtest, ytrain, ytest = train_test_split( - X, - y, - train_size=0.7, - shuffle=True, - random_state=random_state, - stratify=y, - ) - return Xtrain, Xtest, ytrain, ytest - - -# data = load_creditcard(-5000) # Take all true samples + 5000 of the others -# data = load_creditcard(5000) # Take the first 5000 samples -data = load_creditcard() # Take all the samples - -Xtrain = data[0] -Xtest = data[1] -ytrain = data[2] -ytest = data[3] +Xtrain, Xtest, ytrain, ytest = train_test_split( + X, y, test_size=0.2, random_state=random_state +) now = time.time() clf = Stree(C=0.01, random_state=random_state) diff --git a/stree/Strees.py b/stree/Strees.py index e36ac1d..cb8731f 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -29,7 +29,15 @@ class Snode: dataset assigned to it """ - def __init__(self, clf: SVC, X: np.ndarray, y: np.ndarray, title: str): + def __init__( + self, + clf: SVC, + X: np.ndarray, + y: np.ndarray, + features: np.array, + impurity: float, + title: str, + ): self._clf = clf self._title = title self._belief = 0.0 @@ -39,10 +47,21 @@ class Snode: self._down = None self._up = None self._class = None + self._feature = None + self._sample_weight = None + self._features = features + self._impurity = impurity @classmethod def copy(cls, node: "Snode") -> "Snode": - return cls(node._clf, node._X, node._y, node._title) + return cls( + node._clf, + node._X, + node._y, + node._features, + node._impurity, + node._title, + ) def set_down(self, son): self._down = son @@ -83,11 +102,15 @@ class Snode: count_values = np.unique(self._y, return_counts=True) result = ( f"{self._title} - Leaf class={self._class} belief=" - f"{self._belief: .6f} counts={count_values}" + f"{self._belief: .6f} impurity={self._impurity:.4f} " + f"counts={count_values}" ) return result else: - return f"{self._title}" + return ( + f"{self._title} feaures={self._features} impurity=" + f"{self._impurity:.4f}" + ) class Siterator: @@ -130,6 +153,7 @@ class Stree(BaseEstimator, ClassifierMixin): degree: int = 3, gamma="scale", split_criteria: str = "max_samples", + criterion: str = "gini", min_samples_split: int = 0, max_features=None, ): @@ -144,6 +168,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.min_samples_split = min_samples_split self.split_criteria = split_criteria self.max_features = max_features + self.criterion = criterion def _more_tags(self) -> dict: """Required by sklearn to supply features of the classifier @@ -251,6 +276,10 @@ class Stree(BaseEstimator, ClassifierMixin): f"split_criteria has to be min_distance or \ max_samples got ({self.split_criteria})" ) + if self.criterion not in ["gini", "entropy"]: + raise 
ValueError( + f"criterion must be gini or entropy got({self.criterion})" + ) check_classification_targets(y) X, y = check_X_y(X, y) @@ -263,6 +292,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.depth_ = 0 self.n_features_ = X.shape[1] self.max_features_ = self._initialize_max_features() + self.criterion_function_ = getattr(self, f"_{self.criterion}") self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -296,12 +326,20 @@ class Stree(BaseEstimator, ClassifierMixin): return None if np.unique(y).shape[0] == 1: # only 1 class => pure dataset - return Snode(None, X, y, title + ", ") + return Snode( + clf=None, + X=X, + y=y, + features=X.shape[1], + impurity=0.0, + title=title + ", ", + ) # Train the model clf = self._build_clf() Xs, indices_subset = self._get_subspace(X) clf.fit(Xs, y, sample_weight=sample_weight) - node = Snode(clf, Xs, y, title) + impurity = self.criterion_function_(y) + node = Snode(clf, X, y, indices_subset, impurity, title) self.depth_ = max(depth, self.depth_) down = self._split_criteria(self._distances(node, Xs), node) X_U, X_D = self._split_array(X, down) @@ -309,7 +347,14 @@ class Stree(BaseEstimator, ClassifierMixin): sw_u, sw_d = self._split_array(sample_weight, down) if X_U is None or X_D is None: # didn't part anything - return Snode(clf, X, y, title + ", ") + return Snode( + clf, + X, + y, + features=X.shape[1], + impurity=impurity, + title=title + ", ", + ) node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up")) node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down")) return node @@ -484,6 +529,17 @@ class Stree(BaseEstimator, ClassifierMixin): ) return max_features + @staticmethod + def _gini(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + return 1 - np.sum(np.square(count / np.sum(count))) + + @staticmethod + def _entropy(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + proportion = count / np.sum(count) + return -np.sum(proportion * np.log2(proportion)) + def _get_subspace(self, dataset: np.array) -> list: """Return the best subspace to make a split """ diff --git a/stree/tests/Snode_test.py b/stree/tests/Snode_test.py index c82bd99..6f3c4d2 100644 --- a/stree/tests/Snode_test.py +++ b/stree/tests/Snode_test.py @@ -4,14 +4,14 @@ import unittest import numpy as np from stree import Stree, Snode -from .utils import get_dataset +from .utils import load_dataset class Snode_test(unittest.TestCase): def __init__(self, *args, **kwargs): self._random_state = 1 self._clf = Stree(random_state=self._random_state) - self._clf.fit(*get_dataset(self._random_state)) + self._clf.fit(*load_dataset(self._random_state)) super().__init__(*args, **kwargs) @classmethod @@ -63,27 +63,27 @@ class Snode_test(unittest.TestCase): run_tree(self._clf.tree_) def test_make_predictor_on_leaf(self): - test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test") + test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") test.make_predictor() self.assertEqual(1, test._class) self.assertEqual(0.75, test._belief) def test_make_predictor_on_not_leaf(self): - test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], "test") - test.set_up(Snode(None, [1], [1], "another_test")) + test = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test") + test.set_up(Snode(None, [1], [1], [], 0.0, "another_test")) test.make_predictor() self.assertIsNone(test._class) self.assertEqual(0, test._belief) def test_make_predictor_on_leaf_bogus_data(self): - test = Snode(None, [1, 2, 3, 4], [], "test") + test 
= Snode(None, [1, 2, 3, 4], [], [], 0.0, "test") test.make_predictor() self.assertIsNone(test._class) def test_copy_node(self): px = [1, 2, 3, 4] py = [1] - test = Snode(Stree(), px, py, "test") + test = Snode(Stree(), px, py, [], 0.0, "test") computed = Snode.copy(test) self.assertListEqual(computed._X, px) self.assertListEqual(computed._y, py) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0c809b3..a3fb3d1 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -5,7 +5,7 @@ import numpy as np from sklearn.datasets import load_iris from stree import Stree, Snode -from .utils import get_dataset +from .utils import load_dataset class Stree_test(unittest.TestCase): @@ -64,7 +64,7 @@ class Stree_test(unittest.TestCase): warnings.filterwarnings("ignore") for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) - clf.fit(*get_dataset(self._random_state)) + clf.fit(*load_dataset(self._random_state)) self._check_tree(clf.tree_) def _find_out( @@ -88,7 +88,7 @@ class Stree_test(unittest.TestCase): return res def test_single_prediction(self): - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1]))) @@ -97,14 +97,14 @@ class Stree_test(unittest.TestCase): def test_multiple_prediction(self): # First 27 elements the predictions are the same as the truth num = 27 - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) yp = clf.fit(X, y).predict(X[:num, :]) self.assertListEqual(y[:num].tolist(), yp.tolist()) def test_score(self): - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) accuracies = [ 0.9506666666666667, 0.9606666666666667, @@ -123,7 +123,7 @@ class Stree_test(unittest.TestCase): """Check if predicting sample by sample gives the same result as predicting all samples at once """ - X, y = get_dataset(self._random_state) + X, y = load_dataset(self._random_state) for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) clf.fit(X, y) @@ -141,22 +141,22 @@ class Stree_test(unittest.TestCase): """Check preorder iterator """ expected = [ - "root", - "root - Down", - "root - Down - Down, - Leaf class=1 belief= 0.975989 counts" - "=(array([0, 1]), array([ 17, 691]))", - "root - Down - Up", + "root feaures=(0, 1, 2) impurity=0.5000", + "root - Down feaures=(0, 1, 2) impurity=0.0671", + "root - Down - Down, - Leaf class=1 belief= 0.975989 " + "impurity=0.0469 counts=(array([0, 1]), array([ 17, 691]))", + "root - Down - Up feaures=(0, 1, 2) impurity=0.3967", "root - Down - Up - Down, - Leaf class=1 belief= 0.750000 " - "counts=(array([0, 1]), array([1, 3]))", + "impurity=0.3750 counts=(array([0, 1]), array([1, 3]))", "root - Down - Up - Up, - Leaf class=0 belief= 1.000000 " - "counts=(array([0]), array([7]))", - "root - Up, - Leaf class=0 belief= 0.928297 counts=(array(" - "[0, 1]), array([725, 56]))", + "impurity=0.0000 counts=(array([0]), array([7]))", + "root - Up, - Leaf class=0 belief= 0.928297 impurity=0.1331" + " counts=(array([0, 1]), array([725, 56]))", ] computed = [] expected_string = "" clf = Stree(kernel="linear", random_state=self._random_state) - clf.fit(*get_dataset(self._random_state)) + clf.fit(*load_dataset(self._random_state)) for node in clf: 
computed.append(str(node)) expected_string += str(node) + "\n" @@ -176,12 +176,12 @@ class Stree_test(unittest.TestCase): def test_exception_if_C_is_negative(self): tclf = Stree(C=-1) with self.assertRaises(ValueError): - tclf.fit(*get_dataset(self._random_state)) + tclf.fit(*load_dataset(self._random_state)) def test_exception_if_bogus_split_criteria(self): tclf = Stree(split_criteria="duck") with self.assertRaises(ValueError): - tclf.fit(*get_dataset(self._random_state)) + tclf.fit(*load_dataset(self._random_state)) def test_check_max_depth_is_positive_or_None(self): tcl = Stree() @@ -190,13 +190,13 @@ class Stree_test(unittest.TestCase): self.assertGreaterEqual(1, tcl.max_depth) with self.assertRaises(ValueError): tcl = Stree(max_depth=-1) - tcl.fit(*get_dataset(self._random_state)) + tcl.fit(*load_dataset(self._random_state)) def test_check_max_depth(self): depths = (3, 4) for depth in depths: tcl = Stree(random_state=self._random_state, max_depth=depth) - tcl.fit(*get_dataset(self._random_state)) + tcl.fit(*load_dataset(self._random_state)) self.assertEqual(depth, tcl.depth_) def test_unfitted_tree_is_iterable(self): @@ -230,7 +230,7 @@ class Stree_test(unittest.TestCase): def test_muticlass_dataset(self): datasets = { - "Synt": get_dataset(random_state=self._random_state, n_classes=3), + "Synt": load_dataset(random_state=self._random_state, n_classes=3), "Iris": load_iris(return_X_y=True), } outcomes = { @@ -339,3 +339,24 @@ class Stree_test(unittest.TestCase): dataset[:, indices].tolist(), computed.tolist() ) self.assertEqual(expected, len(indices)) + + def test_bogus_criterion(self): + clf = Stree(criterion="duck") + with self.assertRaises(ValueError): + clf.fit(*load_dataset()) + + def test_gini(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.48 + self.assertEqual(expected, Stree._gini(y)) + clf = Stree(criterion="gini") + clf.fit(*load_dataset()) + self.assertEqual(expected, clf.criterion_function_(y)) + + def test_entropy(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.9709505944546686 + self.assertAlmostEqual(expected, Stree._entropy(y)) + clf = Stree(criterion="entropy") + clf.fit(*load_dataset()) + self.assertEqual(expected, clf.criterion_function_(y)) diff --git a/stree/tests/utils.py b/stree/tests/utils.py index 7b47642..a371e88 100644 --- a/stree/tests/utils.py +++ b/stree/tests/utils.py @@ -1,7 +1,7 @@ from sklearn.datasets import make_classification -def get_dataset(random_state=0, n_classes=2): +def load_dataset(random_state=0, n_classes=2): X, y = make_classification( n_samples=1500, n_features=3, From 502ee72799c98dfe7b8588f2f3fb98305b74ab34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 14 Jun 2020 14:00:21 +0200 Subject: [PATCH 3/6] #2 Add predict and score support Add a test in features notebook Show max_features in main.py --- main.py | 9 +++++++ notebooks/features.ipynb | 55 ++++++++++++++++++++++++++++++++------- stree/Strees.py | 18 +++++++++---- stree/tests/Stree_test.py | 14 ++++++++++ 4 files changed, 82 insertions(+), 14 deletions(-) diff --git a/main.py b/main.py index e4722c7..7b40929 100644 --- a/main.py +++ b/main.py @@ -12,6 +12,15 @@ Xtrain, Xtest, ytrain, ytest = train_test_split( ) now = time.time() +print("Predicting with max_features=sqrt(n_features)") +clf = Stree(C=0.01, random_state=random_state, max_features="auto") +clf.fit(Xtrain, ytrain) +print(f"Took {time.time() - now:.2f} seconds to train") +print(clf) +print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}") 
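+# NOTE: `now` is only set once, before the first model, so the second
+# "Took ... seconds to train" message below reports the cumulative time of
+# both runs; resetting the timer (now = time.time()) right before the
+# second clf.fit(Xtrain, ytrain) would report the time of that fit alone.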
+print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}") +print("=" * 40) +print("Predicting with max_features=n_features") clf = Stree(C=0.01, random_state=random_state) clf.fit(Xtrain, ytrain) print(f"Took {time.time() - now:.2f} seconds to train") diff --git a/notebooks/features.ipynb b/notebooks/features.ipynb index 9eda9b0..c7d0611 100644 --- a/notebooks/features.ipynb +++ b/notebooks/features.ipynb @@ -64,7 +64,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.110% 494\nValid: 66.890% 998\n" + "text": "Fraud: 0.173% 492\nValid: 99.827% 284315\nX.shape (1492, 28) y.shape (1492,)\nFraud: 33.244% 496\nValid: 66.756% 996\n" } ], "source": [ @@ -135,7 +135,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Accuracy of Train without weights 0.9789272030651341\nAccuracy of Train with weights 0.9952107279693486\nAccuracy of Tests without weights 0.9598214285714286\nAccuracy of Tests with weights 0.9508928571428571\n" + "text": "Accuracy of Train without weights 0.9808429118773946\nAccuracy of Train with weights 0.9904214559386973\nAccuracy of Tests without weights 0.9441964285714286\nAccuracy of Tests with weights 0.9375\n" } ], "source": [ @@ -162,7 +162,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Time: 0.27s\tKernel: linear\tAccuracy_train: 0.9683908045977011\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9875478927203065\tAccuracy_test: 0.9598214285714286\nTime: 0.06s\tKernel: poly\tAccuracy_train: 0.9885057471264368\tAccuracy_test: 0.9464285714285714\n" + "text": "Time: 0.13s\tKernel: linear\tAccuracy_train: 0.9693486590038314\tAccuracy_test: 0.9598214285714286\nTime: 0.09s\tKernel: rbf\tAccuracy_train: 0.9923371647509579\tAccuracy_test: 0.953125\nTime: 0.09s\tKernel: poly\tAccuracy_train: 0.9913793103448276\tAccuracy_test: 0.9375\n" } ], "source": [ @@ -195,7 +195,7 @@ { "output_type": "stream", "name": "stdout", - "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9531\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, - Leaf class=1 belief= 0.983713 counts=(array([0, 1]), array([ 5, 302]))\nroot - Up, - Leaf class=0 belief= 0.940299 counts=(array([0, 1]), array([693, 44]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9569\nClassifier's accuracy (test) : 0.9621\nroot\nroot - Down, - Leaf class=1 belief= 0.990228 counts=(array([0, 1]), array([ 3, 304]))\nroot - Up, - Leaf class=0 belief= 0.943012 counts=(array([0, 1]), array([695, 42]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9655\nClassifier's accuracy (test) : 0.9643\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([310]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([5]))\nroot - Up, - Leaf class=0 belief= 0.950617 counts=(array([0, 1]), array([693, 36]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9684\nClassifier's accuracy (test) : 0.9598\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([311]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - 
Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0 belief= 0.954039 counts=(array([0, 1]), array([685, 33]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9751\nClassifier's accuracy (test) : 0.9464\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n\n**************************************************\n0.6869 secs\n" + "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9487\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0374\nroot - Down - Down, - Leaf class=1 belief= 0.984076 impurity=0.0313 counts=(array([0, 1]), array([ 5, 309]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.947874 impurity=0.0988 counts=(array([0, 1]), array([691, 38]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9588\nClassifier's accuracy (test) : 0.9531\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0192\nroot - Down - Down, - Leaf class=1 belief= 0.993506 impurity=0.0129 counts=(array([0, 1]), array([ 2, 306]))\nroot - 
Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.944218 impurity=0.1053 counts=(array([0, 1]), array([694, 41]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9643\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0189\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([3]))\nroot - Up, - Leaf class=0 belief= 0.951989 impurity=0.0914 counts=(array([0, 1]), array([694, 35]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9665\nClassifier's accuracy (test) : 0.9621\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0250\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([312]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([4]))\nroot - Up, - Leaf class=0 belief= 0.951923 impurity=0.0915 counts=(array([0, 1]), array([693, 35]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9703\nClassifier's accuracy (test) : 0.9665\nroot feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n\n**************************************************\n0.4375 secs\n" } ], "source": [ @@ -227,7 +227,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - 
Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n" } ], "source": [ @@ -244,7 +244,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([304]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([8]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Down\nroot - Up - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([3]))\nroot - Up - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([3]))\nroot - Up - Up - Up - Up - Up\nroot - Up - Up - Up - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Up - Up - Up, - Leaf class=0 belief= 0.963225 counts=(array([0, 1]), array([681, 26]))\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0367\nroot - Down - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([315]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 impurity=0.0000 counts=(array([0]), array([6]))\nroot - Up feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.0846\nroot - Up - Down, - Leaf class=1 belief= 1.000000 impurity=0.0000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.957064 impurity=0.0822 counts=(array([0, 1]), array([691, 31]))\n" } ], "source": [ @@ -268,7 +268,7 @@ { "output_type": "stream", "name": "stdout", - "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" + "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" } ], "source": [ @@ -306,7 +306,7 @@ { "output_type": "stream", "name": "stdout", - "text": "== Not Weighted ===\nSVC train score ..: 0.9521072796934866\nSTree train score 
: 0.9578544061302682\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9575892857142857\n==== Weighted =====\nSVC train score ..: 0.9616858237547893\nSTree train score : 0.9616858237547893\nSVC test score ...: 0.9642857142857143\nSTree test score .: 0.9598214285714286\n*SVC test score ..: 0.951413553411694\n*STree test score : 0.9480517444389333\n" + "text": "== Not Weighted ===\nSVC train score ..: 0.9578544061302682\nSTree train score : 0.960727969348659\nSVC test score ...: 0.9508928571428571\nSTree test score .: 0.9553571428571429\n==== Weighted =====\nSVC train score ..: 0.9636015325670498\nSTree train score : 0.9626436781609196\nSVC test score ...: 0.9553571428571429\nSTree test score .: 0.9553571428571429\n*SVC test score ..: 0.9447820728419238\n*STree test score : 0.9447820728419238\n" } ], "source": [ @@ -338,12 +338,49 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 0.969325 counts=(array([0, 1]), array([ 10, 316]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up, - Leaf class=0 belief= 0.958159 counts=(array([0, 1]), array([687, 30]))\n\n" + "text": "root feaures=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27) impurity=0.4438\nroot - Down, - Leaf class=1 belief= 0.978261 impurity=0.0425 counts=(array([0, 1]), array([ 7, 315]))\nroot - Up, - Leaf class=0 belief= 0.955679 impurity=0.0847 counts=(array([0, 1]), array([690, 32]))\n\n" } ], "source": [ "print(clf)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Test max_features" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "****************************************\nmax_features None = 28\nTrain score : 0.9664750957854407\nTest score .: 0.9642857142857143\nTook 0.09 seconds\n****************************************\nmax_features auto = 5\nTrain score : 0.9511494252873564\nTest score .: 0.9441964285714286\nTook 0.37 seconds\n****************************************\nmax_features log2 = 4\nTrain score : 0.935823754789272\nTest score .: 0.9330357142857143\nTook 0.10 seconds\n****************************************\nmax_features 7 = 7\nTrain score : 0.9568965517241379\nTest score .: 0.9397321428571429\nTook 3.36 seconds\n****************************************\nmax_features 0.5 = 14\nTrain score : 0.960727969348659\nTest score .: 0.9486607142857143\nTook 112.42 seconds\n****************************************\nmax_features 0.1 = 2\nTrain score : 0.8793103448275862\nTest score .: 0.8839285714285714\nTook 0.06 seconds\n****************************************\nmax_features 0.7 = 19\nTrain score : 0.9655172413793104\nTest score .: 0.9553571428571429\nTook 10.59 seconds\n" + } + ], + "source": [ + "for max_features in [None, \"auto\", \"log2\", 7, .5, .1, .7]:\n", + " now = time.time()\n", + " print(\"*\"*40)\n", + " clf = Stree(random_state=random_state, max_features=max_features)\n", + " clf.fit(Xtrain, ytrain)\n", + " print(f\"max_features {max_features} = {clf.max_features_}\")\n", + " print(\"Train score :\", clf.score(Xtrain, ytrain))\n", + " print(\"Test score .:\", clf.score(Xtest, ytest))\n", + " print(f\"Took {time.time() - now:.2f} seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/stree/Strees.py 
b/stree/Strees.py
index cb8731f..1a37b4e 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -205,7 +205,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         the hyperplane of the node
         :rtype: np.array
         """
-        return node._clf.decision_function(data)
+        return node._clf.decision_function(data[:, node._features])
 
     def _min_distance(self, data: np.array, _) -> np.array:
         # chooses the lowest distance of every sample
@@ -286,11 +286,14 @@ class Stree(BaseEstimator, ClassifierMixin):
         sample_weight = _check_sample_weight(sample_weight, X)
         check_classification_targets(y)
         # Initialize computed parameters
+        if self.random_state is not None:
+            random.seed(self.random_state)
         self.classes_, y = np.unique(y, return_inverse=True)
         self.n_classes_ = self.classes_.shape[0]
         self.n_iter_ = self.max_iter
         self.depth_ = 0
         self.n_features_ = X.shape[1]
+        self.n_features_in_ = X.shape[1]
         self.max_features_ = self._initialize_max_features()
         self.criterion_function_ = getattr(self, f"_{self.criterion}")
         self.tree_ = self.train(X, y, sample_weight, 1, "root")
         self._build_predictor()
         return self
@@ -336,12 +339,12 @@ class Stree(BaseEstimator, ClassifierMixin):
             )
         # Train the model
         clf = self._build_clf()
-        Xs, indices_subset = self._get_subspace(X)
+        Xs, features = self._get_subspace(X)
         clf.fit(Xs, y, sample_weight=sample_weight)
         impurity = self.criterion_function_(y)
-        node = Snode(clf, X, y, indices_subset, impurity, title)
+        node = Snode(clf, X, y, features, impurity, title)
         self.depth_ = max(depth, self.depth_)
-        down = self._split_criteria(self._distances(node, Xs), node)
+        down = self._split_criteria(self._distances(node, X), node)
         X_U, X_D = self._split_array(X, down)
         y_u, y_d = self._split_array(y, down)
         sw_u, sw_d = self._split_array(sample_weight, down)
@@ -439,6 +442,11 @@ class Stree(BaseEstimator, ClassifierMixin):
         check_is_fitted(self, ["tree_"])
         # Input validation
         X = check_array(X)
+        if X.shape[1] != self.n_features_:
+            raise ValueError(
+                f"Expected {self.n_features_} features but got "
+                f"({X.shape[1]})"
+            )
         # setup prediction & make it happen
         indices = np.arange(X.shape[0])
         result = (
@@ -548,7 +556,7 @@ class Stree(BaseEstimator, ClassifierMixin):
             features = range(dataset.shape[1])
             features_sets = list(combinations(features, self.max_features_))
             if len(features_sets) > 1:
-                return features_sets[random.randint(0, len(features_sets))]
+                return features_sets[random.randint(0, len(features_sets) - 1)]
             else:
                 return features_sets[0]
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index a3fb3d1..371e1d0 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -360,3 +360,17 @@ class Stree_test(unittest.TestCase):
         clf = Stree(criterion="entropy")
         clf.fit(*load_dataset())
         self.assertEqual(expected, clf.criterion_function_(y))
+
+    def test_predict_feature_dimensions(self):
+        X = np.random.rand(10, 5)
+        y = np.random.randint(0, 2, 10)
+        clf = Stree()
+        clf.fit(X, y)
+        with self.assertRaises(ValueError):
+            clf.predict(X[:, :3])
+
+    def test_score_max_features(self):
+        X, y = load_dataset(self._random_state)
+        clf = Stree(random_state=self._random_state, max_features=2)
+        clf.fit(X, y)
+        self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))

From c94bc068bd0a923360ea0ff3bf224672cec0595b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Mon, 15 Jun 2020 00:22:57 +0200
Subject: [PATCH 4/6] #2 Refactor Stree & create Splitter

Add and test splitter parameter
---
 notebooks/benchmark.ipynb    | 200 ++++++++++++++++++++++--
 stree/Strees.py              | 290 ++++++++++++++++++++++-----------
stree/__init__.py | 4 +- stree/tests/Splitter_test.py | 142 +++++++++++++++++ stree/tests/Stree_test.py | 62 ++------ stree/tests/__init__.py | 3 +- stree/tests/utils.py | 4 +- 7 files changed, 529 insertions(+), 176 deletions(-) create mode 100644 stree/tests/Splitter_test.py diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb index 76901aa..9e19ff0 100644 --- a/notebooks/benchmark.ipynb +++ b/notebooks/benchmark.ipynb @@ -68,9 +68,11 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "2020-05-23 19:42:08\n" + "output_type": "stream", + "text": [ + "2020-06-14 23:45:42\n" + ] } ], "source": [ @@ -102,9 +104,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n" + "output_type": "stream", + "text": [ + "Fraud: 0.173% 492\n", + "Valid: 99.827% 284,315\n" + ] } ], "source": [ @@ -130,9 +135,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "X shape: (284807, 29)\ny shape: (284807,)\n" + "output_type": "stream", + "text": [ + "X shape: (284807, 29)\n", + "y shape: (284807,)\n" + ] } ], "source": [ @@ -248,9 +256,168 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 16.99 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 175.7 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 39.64 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999869 0.999741 199020\n 1 0.911263 0.776163 0.838305 344\n\n accuracy 0.999483 199364\n macro avg 0.955438 0.888016 0.919023 199364\nweighted avg 0.999461 0.999483 0.999463 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999883 0.999748 85295\n 1 0.920000 0.777027 0.842491 148\n\n accuracy 0.999497 85443\n macro avg 
0.959807 0.888455 0.921119 85443\nweighted avg 0.999475 0.999497 0.999476 85443\n\nConfusion Matrix in Train\n[[198994 26]\n [ 77 267]]\nConfusion Matrix in Test\n[[85285 10]\n [ 33 115]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 48.29 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n************************** Gradient Boost. **********************\nTrain Model Gradient Boost. took: 251.6 seconds\n=========== Gradient Boost. - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999096 0.999854 0.999475 199020\n 1 0.849741 0.476744 0.610801 344\n\n accuracy 0.998952 199364\n macro avg 0.924419 0.738299 0.805138 199364\nweighted avg 0.998839 0.998952 0.998804 199364\n\n=========== Gradient Boost. - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.998981 0.999730 0.999355 85295\n 1 0.726190 0.412162 0.525862 148\n\n accuracy 0.998713 85443\n macro avg 0.862586 0.705946 0.762609 85443\nweighted avg 0.998508 0.998713 0.998535 85443\n\nConfusion Matrix in Train\n[[198991 29]\n [ 180 164]]\nConfusion Matrix in Test\n[[85272 23]\n [ 87 61]]\n" + "output_type": "stream", + "text": [ + "************************** Linear Tree **********************\n", + "Train Model Linear Tree took: 13.52 seconds\n", + "=========== Linear Tree - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Linear Tree - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999578 0.999613 0.999596 85295\n", + " 1 0.772414 0.756757 0.764505 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.885996 0.878185 0.882050 85443\n", + "weighted avg 0.999184 0.999192 0.999188 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85262 33]\n", + " [ 36 112]]\n", + "************************** Random Forest **********************\n", + "Train Model Random Forest took: 152.5 seconds\n", + "=========== Random Forest - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Random Forest - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999660 0.999965 0.999812 85295\n", + " 1 0.975410 0.804054 0.881481 148\n", + "\n", + " accuracy 
0.999625 85443\n", + " macro avg 0.987535 0.902009 0.940647 85443\n", + "weighted avg 0.999618 0.999625 0.999607 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85292 3]\n", + " [ 29 119]]\n", + "************************** Stree (SVM Tree) **********************\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Model Stree (SVM Tree) took: 32.55 seconds\n", + "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999623 0.999864 0.999744 199020\n", + " 1 0.908784 0.781977 0.840625 344\n", + "\n", + " accuracy 0.999488 199364\n", + " macro avg 0.954204 0.890921 0.920184 199364\n", + "weighted avg 0.999467 0.999488 0.999469 199364\n", + "\n", + "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999637 0.999918 0.999777 85295\n", + " 1 0.943548 0.790541 0.860294 148\n", + "\n", + " accuracy 0.999555 85443\n", + " macro avg 0.971593 0.895229 0.930036 85443\n", + "weighted avg 0.999540 0.999555 0.999536 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198993 27]\n", + " [ 75 269]]\n", + "Confusion Matrix in Test\n", + "[[85288 7]\n", + " [ 31 117]]\n", + "************************** AdaBoost model **********************\n", + "Train Model AdaBoost model took: 47.34 seconds\n", + "=========== AdaBoost model - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999392 0.999678 0.999535 199020\n", + " 1 0.777003 0.648256 0.706815 344\n", + "\n", + " accuracy 0.999072 199364\n", + " macro avg 0.888198 0.823967 0.853175 199364\n", + "weighted avg 0.999008 0.999072 0.999030 199364\n", + "\n", + "=========== AdaBoost model - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999484 0.999707 0.999596 85295\n", + " 1 0.806202 0.702703 0.750903 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.902843 0.851205 0.875249 85443\n", + "weighted avg 0.999149 0.999192 0.999165 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198956 64]\n", + " [ 121 223]]\n", + "Confusion Matrix in Test\n", + "[[85270 25]\n", + " [ 44 104]]\n", + "************************** Gradient Boost. **********************\n", + "Train Model Gradient Boost. took: 244.1 seconds\n", + "=========== Gradient Boost. 
- Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999096 0.999854 0.999475 199020\n", + " 1 0.849741 0.476744 0.610801 344\n", + "\n", + " accuracy 0.998952 199364\n", + " macro avg 0.924419 0.738299 0.805138 199364\n", + "weighted avg 0.998839 0.998952 0.998804 199364\n", + "\n", + "=========== Gradient Boost. - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.998981 0.999730 0.999355 85295\n", + " 1 0.726190 0.412162 0.525862 148\n", + "\n", + " accuracy 0.998713 85443\n", + " macro avg 0.862586 0.705946 0.762609 85443\n", + "weighted avg 0.998508 0.998713 0.998535 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198991 29]\n", + " [ 180 164]]\n", + "Confusion Matrix in Test\n", + "[[85272 23]\n", + " [ 87 61]]\n" + ] } ], "source": [ @@ -277,9 +444,18 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 175.717 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 16.99 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 175.72 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 39.64 seconds\t f1: 0.8425\nModel: AdaBoost model\t Time: 48.29 seconds\t f1: 0.7509\nModel: Gradient Boost.\t Time: 251.58 seconds\t f1: 0.5259\n" + "output_type": "stream", + "text": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" + ] } ], "source": [ @@ -325,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6-final" + "version": "3.7.6" }, "toc": { "base_numbering": 1, @@ -379,4 +555,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/stree/Strees.py b/stree/Strees.py index 1a37b4e..e2e33c3 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -9,12 +9,14 @@ Build an oblique tree classifier based on SVM Trees import os import numbers import random +import warnings from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import check_classification_targets +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.validation import ( check_X_y, check_array, @@ -134,6 +136,168 @@ class Siterator: return node +class Splitter: + def __init__( + self, + clf: SVC = None, + criterion: str = None, + splitter_type: str = None, + criteria: str = None, + min_samples_split: int = None, + random_state=None, + ): + self._clf = clf + self._random_state = random_state + if random_state is not 
None:
+            random.seed(random_state)
+        self._criterion = criterion
+        self._min_samples_split = min_samples_split
+        self._criteria = criteria
+        self._splitter_type = splitter_type
+
+        if clf is None:
+            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
+
+        if criterion not in ["gini", "entropy"]:
+            raise ValueError(
+                f"criterion must be gini or entropy got({criterion})"
+            )
+
+        if criteria not in ["min_distance", "max_samples"]:
+            raise ValueError(
+                "split_criteria has to be min_distance or "
+                f"max_samples got ({criteria})"
+            )
+
+        if splitter_type not in ["random", "best"]:
+            raise ValueError(
+                f"splitter must be either random or best got({splitter_type})"
+            )
+        self.criterion_function = getattr(self, f"_{self._criterion}")
+        self.decision_criteria = getattr(self, f"_{self._criteria}")
+
+    def impurity(self, y: np.array) -> np.array:
+        return self.criterion_function(y)
+
+    @staticmethod
+    def _gini(y: np.array) -> float:
+        _, count = np.unique(y, return_counts=True)
+        return 1 - np.sum(np.square(count / np.sum(count)))
+
+    @staticmethod
+    def _entropy(y: np.array) -> float:
+        _, count = np.unique(y, return_counts=True)
+        proportion = count / np.sum(count)
+        return -np.sum(proportion * np.log2(proportion))
+
+    def information_gain(
+        self, labels_up: np.array, labels_dn: np.array
+    ) -> float:
+        card_up = labels_up.shape[0]
+        card_dn = labels_dn.shape[0]
+        samples = card_up + card_dn
+        up = card_up / samples * self.criterion_function(labels_up)
+        dn = card_dn / samples * self.criterion_function(labels_dn)
+        return up + dn
+
+    def _select_best_set(
+        self, dataset: np.array, labels: np.array, features_sets: list
+    ) -> list:
+        # entropy of multiclass labels can exceed 1, so the search must
+        # not start capped at 1 or no set would ever be selected
+        min_impurity = float("inf")
+        selected = None
+        warnings.filterwarnings("ignore", category=ConvergenceWarning)
+        for feature_set in features_sets:
+            self._clf.fit(dataset[:, feature_set], labels)
+            node = Snode(
+                self._clf, dataset, labels, feature_set, 0.0, "subset"
+            )
+            self.partition(dataset, node)
+            y1, y2 = self.part(labels)
+            impurity = self.information_gain(y1, y2)
+            if impurity < min_impurity:
+                min_impurity = impurity
+                selected = feature_set
+        return selected
+
+    def _get_subspaces_set(
+        self, dataset: np.array, labels: np.array, max_features: int
+    ) -> np.array:
+        features = range(dataset.shape[1])
+        features_sets = list(combinations(features, max_features))
+        if len(features_sets) > 1:
+            if self._splitter_type == "random":
+                return features_sets[random.randint(0, len(features_sets) - 1)]
+            else:
+                return self._select_best_set(dataset, labels, features_sets)
+        else:
+            return features_sets[0]
+
+    def get_subspace(
+        self, dataset: np.array, labels: np.array, max_features: int
+    ) -> list:
+        """Return the best subspace to make a split
+        """
+        indices = self._get_subspaces_set(dataset, labels, max_features)
+        return dataset[:, indices], indices
+
+    @staticmethod
+    def _min_distance(data: np.array, _) -> np.array:
+        # chooses the lowest distance of every sample
+        indices = np.argmin(np.abs(data), axis=1)
+        return np.array(
+            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
+        )
+
+    @staticmethod
+    def _max_samples(data: np.array, y: np.array) -> np.array:
+        # select the class with max number of samples
+        _, samples = np.unique(y, return_counts=True)
+        selected = np.argmax(samples)
+        return data[:, selected]
+
+    def partition(self, samples: np.array, node: Snode):
+        """Compute the boolean mask of the samples that go to the down
+        branch and store it in self._down
+        """
+        data = self._distances(node, samples)
+        if data.shape[0] < self._min_samples_split:
+            self._down = np.ones((data.shape[0]), dtype=bool)
+            return
+        if data.ndim > 1:
+            # split criteria for multiclass
+            data = self.decision_criteria(data, node._y)
+        self._down = data > 0
+
+    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
+        """Compute distances of the samples to the hyperplane of the node
+
+        :param node: node containing the svm classifier
+        :type node: Snode
+        :param data: samples to find out distance to hyperplane
+        :type data: np.ndarray
+        :return: array of shape (m, 1) with the distances of every sample to
+        the hyperplane of the node
+        :rtype: np.array
+        """
+        return node._clf.decision_function(data[:, node._features])
+
+    def part(self, origin: np.array) -> list:
+        """Split an array in two based on the down mask computed by
+        partition and its complement
+
+        :param origin: dataset to split
+        :type origin: np.array
+        :return: list with two splits of the array
+        :rtype: list
+        """
+        up = ~self._down
+        return [
+            origin[up] if any(up) else None,
+            origin[self._down] if any(self._down) else None,
+        ]
+
+
 class Stree(BaseEstimator, ClassifierMixin):
     """Estimator that is based on binary trees of svm nodes
     can deal with sample_weights in predict, used in boosting sklearn methods
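The class above concentrates everything a node needs: the impurity measure (criterion), the choice of feature subspace (splitter_type) and the partition criteria. A minimal usage sketch, illustrative and not part of the patch, assuming numpy and scikit-learn are installed and relying on the Splitter export that stree/__init__.py gains below:

import numpy as np
from sklearn.svm import LinearSVC

from stree import Splitter

X = np.random.random((20, 5))
y = np.random.randint(0, 2, 20)
splitter = Splitter(
    clf=LinearSVC(random_state=0),
    criterion="gini",
    splitter_type="random",
    criteria="max_samples",
    min_samples_split=0,
    random_state=0,
)
# gini impurity of the labels: four 0s and six 1s yield 0.48
print(splitter.impurity(np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])))
# draw a 3-column subspace; returns the projected data and the indices
Xs, indices = splitter.get_subspace(X, y, max_features=3)
print(Xs.shape, indices)

With splitter_type="best" the same call fits the classifier on every 3-column combination and keeps the one whose split yields the lowest weighted impurity.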
@@ -156,6 +320,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         criterion: str = "gini",
         min_samples_split: int = 0,
         max_features=None,
+        splitter: str = "random",
     ):
         self.max_iter = max_iter
         self.C = C
@@ -169,6 +334,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.split_criteria = split_criteria
         self.max_features = max_features
         self.criterion = criterion
+        self.splitter = splitter
 
     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -178,68 +344,6 @@ class Stree(BaseEstimator, ClassifierMixin):
         """
         return {"requires_y": True}
 
-    def _split_array(self, origin: np.array, down: np.array) -> list:
-        """Split an array in two based on indices (down) and its complement
-
-        :param origin: dataset to split
-        :type origin: np.array
-        :param down: indices to use to split array
-        :type down: np.array
-        :return: list with two splits of the array
-        :rtype: list
-        """
-        up = ~down
-        return [
-            origin[up] if any(up) else None,
-            origin[down] if any(down) else None,
-        ]
-
-    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
-        """Compute distances of the samples to the hyperplane of the node
-
-        :param node: node containing the svm classifier
-        :type node: Snode
-        :param data: samples to find out distance to hyperplane
-        :type data: np.ndarray
-        :return: array of shape (m, 1) with the distances of every sample to
-        the hyperplane of the node
-        :rtype: np.array
-        """
-        return node._clf.decision_function(data[:, node._features])
-
-    def _min_distance(self, data: np.array, _) -> np.array:
-        # chooses the lowest distance of every sample
-        indices = np.argmin(np.abs(data), axis=1)
-        return np.array(
-            [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)]
-        )
-
-    def _max_samples(self, data: np.array, y: np.array) -> np.array:
-        # select the class with max number of samples
-        _, samples = np.unique(y, return_counts=True)
-        selected = np.argmax(samples)
-        return data[:, selected]
-
-    def _split_criteria(self, data: np.array, node: Snode) -> np.array:
-        """Set the criteria to split arrays
-
-        :param data: distances of samples to hyperplanes shape (m, nclasses)
-        if nclasses > 2 else (m,)
-        :type data: np.array
-        :param node: node containing the svm classifier
-        :type node: Snode
-        :return: array of booleans of samples under or above 
zero - :rtype: np.array - """ - - if data.shape[0] < self.min_samples_split: - return np.ones((data.shape[0]), dtype=bool) - if data.ndim > 1: - # split criteria for multiclass - data = getattr(self, f"_{self.split_criteria}")(data, node._y) - res = data > 0 - return res - def fit( self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None ) -> "Stree": @@ -271,21 +375,20 @@ class Stree(BaseEstimator, ClassifierMixin): f"Maximum depth has to be greater than 1... got (max_depth=\ {self.max_depth})" ) - if self.split_criteria not in ["min_distance", "max_samples"]: - raise ValueError( - f"split_criteria has to be min_distance or \ - max_samples got ({self.split_criteria})" - ) - if self.criterion not in ["gini", "entropy"]: - raise ValueError( - f"criterion must be gini or entropy got({self.criterion})" - ) check_classification_targets(y) X, y = check_X_y(X, y) sample_weight = _check_sample_weight(sample_weight, X) check_classification_targets(y) # Initialize computed parameters + self.splitter_ = Splitter( + clf=self._build_clf(), + criterion=self.criterion, + splitter_type=self.splitter, + criteria=self.split_criteria, + random_state=self.random_state, + min_samples_split=self.min_samples_split, + ) if self.random_state is not None: random.seed(self.random_state) self.classes_, y = np.unique(y, return_inverse=True) @@ -295,7 +398,6 @@ class Stree(BaseEstimator, ClassifierMixin): self.n_features_ = X.shape[1] self.n_features_in_ = X.shape[1] self.max_features_ = self._initialize_max_features() - self.criterion_function_ = getattr(self, f"_{self.criterion}") self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -339,15 +441,15 @@ class Stree(BaseEstimator, ClassifierMixin): ) # Train the model clf = self._build_clf() - Xs, features = self._get_subspace(X) + Xs, features = self.splitter_.get_subspace(X, y, self.max_features_) clf.fit(Xs, y, sample_weight=sample_weight) - impurity = self.criterion_function_(y) + impurity = self.splitter_.impurity(y) node = Snode(clf, X, y, features, impurity, title) self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(node, X), node) - X_U, X_D = self._split_array(X, down) - y_u, y_d = self._split_array(y, down) - sw_u, sw_d = self._split_array(sample_weight, down) + self.splitter_.partition(X, node) + X_U, X_D = self.splitter_.part(X) + y_u, y_d = self.splitter_.part(y) + sw_u, sw_d = self.splitter_.part(sample_weight) if X_U is None or X_D is None: # didn't part anything return Snode( @@ -431,9 +533,9 @@ class Stree(BaseEstimator, ClassifierMixin): # set a class for every sample in dataset prediction = np.full((xp.shape[0], 1), node._class) return prediction, indices - down = self._split_criteria(self._distances(node, xp), node) - x_u, x_d = self._split_array(xp, down) - i_u, i_d = self._split_array(indices, down) + self.splitter_.partition(xp, node) + x_u, x_d = self.splitter_.part(xp) + i_u, i_d = self.splitter_.part(indices) prx_u, prin_u = predict_class(x_u, i_u, node.get_up()) prx_d, prin_d = predict_class(x_d, i_d, node.get_down()) return np.append(prx_u, prx_d), np.append(prin_u, prin_d) @@ -536,29 +638,3 @@ class Stree(BaseEstimator, ClassifierMixin): f"got ({self.max_features})" ) return max_features - - @staticmethod - def _gini(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - return 1 - np.sum(np.square(count / np.sum(count))) - - @staticmethod - def _entropy(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - proportion = count 
/ np.sum(count)
-        return -np.sum(proportion * np.log2(proportion))
-
-    def _get_subspace(self, dataset: np.array) -> list:
-        """Return the best subspace to make a split
-        """
-
-        def get_subspaces_set(dataset: np.array) -> np.array:
-            features = range(dataset.shape[1])
-            features_sets = list(combinations(features, self.max_features_))
-            if len(features_sets) > 1:
-                return features_sets[
-                    random.randint(0, len(features_sets) - 1)
-                ]
-            else:
-                return features_sets[0]
-
-        indices = get_subspaces_set(dataset)
-        return dataset[:, indices], indices
diff --git a/stree/__init__.py b/stree/__init__.py
index 03b8a2c..6768b82 100644
--- a/stree/__init__.py
+++ b/stree/__init__.py
@@ -1,3 +1,3 @@
-from .Strees import Stree, Snode, Siterator
+from .Strees import Stree, Snode, Siterator, Splitter
 
-__all__ = ["Stree", "Snode", "Siterator"]
+__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py
new file mode 100644
index 0000000..b620ce1
--- /dev/null
+++ b/stree/tests/Splitter_test.py
@@ -0,0 +1,142 @@
+import os
+import unittest
+
+import numpy as np
+from sklearn.svm import LinearSVC
+
+from stree import Splitter
+from .utils import load_dataset
+
+
+class Splitter_test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        self._random_state = 1
+        super().__init__(*args, **kwargs)
+
+    def build(
+        self,
+        clf=LinearSVC(),
+        min_samples_split=0,
+        splitter_type="random",
+        criterion="gini",
+        criteria="min_distance",
+        random_state=None,
+    ):
+        return Splitter(
+            clf=clf,
+            min_samples_split=min_samples_split,
+            splitter_type=splitter_type,
+            criterion=criterion,
+            criteria=criteria,
+            random_state=random_state,
+        )
+
+    @classmethod
+    def setUp(cls):
+        os.environ["TESTING"] = "1"
+
+    def test_init(self):
+        with self.assertRaises(ValueError):
+            self.build(criterion="duck")
+        with self.assertRaises(ValueError):
+            self.build(splitter_type="duck")
+        with self.assertRaises(ValueError):
+            self.build(criteria="duck")
+        with self.assertRaises(ValueError):
+            self.build(clf=None)
+        for splitter_type in ["best", "random"]:
+            for criterion in ["gini", "entropy"]:
+                for criteria in ["min_distance", "max_samples"]:
+                    tcl = self.build(
+                        splitter_type=splitter_type,
+                        criterion=criterion,
+                        criteria=criteria,
+                    )
+                    self.assertEqual(splitter_type, tcl._splitter_type)
+                    self.assertEqual(criterion, tcl._criterion)
+                    self.assertEqual(criteria, tcl._criteria)
+
+    def test_gini(self):
+        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
+        expected = 0.48
+        self.assertEqual(expected, Splitter._gini(y))
+        tcl = self.build(criterion="gini")
+        self.assertEqual(expected, tcl.criterion_function(y))
+
+    def test_entropy(self):
+        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
+        expected = 0.9709505944546686
+        self.assertAlmostEqual(expected, Splitter._entropy(y))
+        tcl = self.build(criterion="entropy")
+        self.assertEqual(expected, tcl.criterion_function(y))
+
+    def test_information_gain(self):
+        yu = np.array([0, 1, 1, 1, 1, 1])
+        yd = np.array([0, 0, 0, 1])
+        values_expected = [
+            ("gini", 0.31666666666666665),
+            ("entropy", 0.7145247027726656),
+        ]
+        for criterion, expected in values_expected:
+            tcl = self.build(criterion=criterion)
+            computed = tcl.information_gain(yu, yd)
+            self.assertAlmostEqual(expected, computed)
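The expected values in test_information_gain can be checked by hand. Note that, despite its name, information_gain as committed here returns the size-weighted average impurity of the two branches, so lower is better, which is why _select_best_set keeps the minimum. A quick sketch of the gini case (illustrative, not part of the patch):

# yu holds one 0 and five 1s, yd holds three 0s and one 1
gini_up = 1 - ((1 / 6) ** 2 + (5 / 6) ** 2)  # 0.2777...
gini_dn = 1 - ((3 / 4) ** 2 + (1 / 4) ** 2)  # 0.375
print(6 / 10 * gini_up + 4 / 10 * gini_dn)  # 0.31666666666666665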
+
+    def test_max_samples(self):
+        tcl = self.build(criteria="max_samples")
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([0.2, 0.01, -0.9, 0.2])
+        y = [1, 2, 1, 0]
+        computed = tcl._max_samples(data, y)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_min_distance(self):
+        tcl = self.build()
+        data = np.array(
+            [
+                [-0.1, 0.2, -0.3],
+                [0.7, 0.01, -0.1],
+                [0.7, -0.9, 0.5],
+                [0.1, 0.2, 0.3],
+            ]
+        )
+        expected = np.array([-0.1, 0.01, 0.5, 0.1])
+        computed = tcl._min_distance(data, None)
+        self.assertEqual((4,), computed.shape)
+        self.assertListEqual(expected.tolist(), computed.tolist())
+
+    def test_splitter_parameter(self):
+        expected_values = [
+            [1, 7, 9],
+            [1, 7, 9],
+            [1, 7, 9],
+            [1, 7, 9],
+            [0, 5, 6],
+            [0, 5, 6],
+            [0, 5, 6],
+            [0, 5, 6],
+        ]
+        X, y = load_dataset(self._random_state, n_features=12)
+        for splitter_type in ["best", "random"]:
+            for criterion in ["gini", "entropy"]:
+                for criteria in ["min_distance", "max_samples"]:
+                    tcl = self.build(
+                        splitter_type=splitter_type,
+                        criterion=criterion,
+                        criteria=criteria,
+                        random_state=self._random_state,
+                    )
+                    expected = expected_values.pop(0)
+                    dataset, computed = tcl.get_subspace(X, y, max_features=3)
+                    self.assertListEqual(expected, list(computed))
+                    self.assertListEqual(
+                        X[:, computed].tolist(), dataset.tolist()
+                    )
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index 371e1d0..0fea9e5 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -204,13 +204,11 @@ class Stree_test(unittest.TestCase):
         self.assertEqual(0, len(list(tcl)))
 
     def test_min_samples_split(self):
-        tcl_split = Stree(min_samples_split=3)
-        tcl_nosplit = Stree(min_samples_split=4)
         dataset = [[1], [2], [3]], [1, 1, 0]
-        tcl_split.fit(*dataset)
+        tcl_split = Stree(min_samples_split=3).fit(*dataset)
         self.assertIsNotNone(tcl_split.tree_.get_down())
         self.assertIsNotNone(tcl_split.tree_.get_up())
-        tcl_nosplit.fit(*dataset)
+        tcl_nosplit = Stree(min_samples_split=4).fit(*dataset)
         self.assertIsNone(tcl_nosplit.tree_.get_down())
         self.assertIsNone(tcl_nosplit.tree_.get_up())
 
@@ -265,37 +263,6 @@ class Stree_test(unittest.TestCase):
                     outcome = outcomes[name][f"{criteria} {kernel}"]
                     self.assertAlmostEqual(outcome, clf.score(px, py))
 
-    def test_min_distance(self):
-        clf = Stree()
-        data = np.array(
-            [
-                [-0.1, 0.2, -0.3],
-                [0.7, 0.01, -0.1],
-                [0.7, -0.9, 0.5],
-                [0.1, 0.2, 0.3],
-            ]
-        )
-        expected = np.array([-0.1, 0.01, 0.5, 0.1])
-        computed = clf._min_distance(data, None)
-        self.assertEqual((4,), computed.shape)
-        self.assertListEqual(expected.tolist(), computed.tolist())
-
-    def test_max_samples(self):
-        clf = Stree()
-        data = np.array(
-            [
-                [-0.1, 0.2, -0.3],
-                [0.7, 0.01, -0.1],
-                [0.7, -0.9, 0.5],
-                [0.1, 0.2, 0.3],
-            ]
-        )
-        expected = np.array([0.2, 0.01, -0.9, 0.2])
-        y = [1, 2, 1, 0]
-        computed = clf._max_samples(data, y)
-        self.assertEqual((4,), computed.shape)
-        self.assertListEqual(expected.tolist(), computed.tolist())
-
     def test_max_features(self):
         n_features = 16
         expected_values = [
@@ -334,7 +301,9 @@ class Stree_test(unittest.TestCase):
         for max_features, expected in expected_values:
             clf.set_params(**dict(max_features=max_features))
             clf.fit(dataset, y)
-            computed, indices = clf._get_subspace(dataset)
+            computed, indices = clf.splitter_.get_subspace(
+                dataset, y, clf.max_features_
+            )
             self.assertListEqual(
                 dataset[:, indices].tolist(), computed.tolist()
             )
@@ -345,22 +314,6 @@ class Stree_test(unittest.TestCase):
         with self.assertRaises(ValueError):
             clf.fit(*load_dataset())
 
-    def test_gini(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.48
-        self.assertEqual(expected, Stree._gini(y))
-        clf = Stree(criterion="gini")
-        clf.fit(*load_dataset())
-        self.assertEqual(expected, clf.criterion_function_(y))
-
-    def test_entropy(self):
-        y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
-        expected = 0.9709505944546686
-        self.assertAlmostEqual(expected, Stree._entropy(y))
-        clf = Stree(criterion="entropy")
-        clf.fit(*load_dataset())
-        self.assertEqual(expected, clf.criterion_function_(y))
-
     def test_predict_feature_dimensions(self):
         X = np.random.rand(10, 5)
         y = np.random.randint(0, 2, 10)
@@ -374,3 +327,8 @@ class Stree_test(unittest.TestCase):
         clf = Stree(random_state=self._random_state, max_features=2)
         clf.fit(X, y)
         self.assertAlmostEqual(0.9426666666666667, clf.score(X, y))
+
+    def test_bogus_splitter_parameter(self):
+        clf = Stree(splitter="duck")
+        with self.assertRaises(ValueError):
+            clf.fit(*load_dataset())
diff --git a/stree/tests/__init__.py b/stree/tests/__init__.py
index 625eea9..32e7a88 100644
--- a/stree/tests/__init__.py
+++ b/stree/tests/__init__.py
@@ -1,4 +1,5 @@
 from .Stree_test import Stree_test
 from .Snode_test import Snode_test
+from .Splitter_test import Splitter_test
 
-__all__ = ["Stree_test", "Snode_test"]
+__all__ = ["Stree_test", "Snode_test", "Splitter_test"]
diff --git a/stree/tests/utils.py b/stree/tests/utils.py
index a371e88..94b0506 100644
--- a/stree/tests/utils.py
+++ b/stree/tests/utils.py
@@ -1,10 +1,10 @@
 from sklearn.datasets import make_classification
 
 
-def load_dataset(random_state=0, n_classes=2):
+def load_dataset(random_state=0, n_classes=2, n_features=3):
     X, y = make_classification(
         n_samples=1500,
-        n_features=3,
+        n_features=n_features,
         n_informative=3,
         n_redundant=0,
         n_repeated=0,
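Taken together, this commit leaves Stree delegating impurity, subspace selection and node partitioning to Splitter through the public parameters criterion, splitter and split_criteria. An end-to-end sketch, illustrative and not part of the patch, with an arbitrary dataset and parameter values:

from sklearn.datasets import make_classification

from stree import Stree

X, y = make_classification(
    n_samples=500, n_features=6, n_informative=3, random_state=0
)
clf = Stree(
    criterion="entropy",  # impurity measure: gini or entropy
    splitter="best",  # evaluate every feature subset instead of one at random
    split_criteria="max_samples",
    max_features=3,  # size of the subspace fitted at each node
    random_state=0,
)
clf.fit(X, y)
print(clf.score(X, y))

An invalid value for any of the three parameters raises ValueError at fit time, as test_bogus_splitter_parameter above exercises for splitter.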
From 736ab7ef2073830da806b920ab1bb59a289ce117 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Mon, 15 Jun 2020 10:33:51 +0200
Subject: [PATCH 5/6] #2 update benchmark notebook

---
 notebooks/benchmark.ipynb | 254 +++++++------------------------------
 stree/Strees.py           |   4 +-
 2 files changed, 48 insertions(+), 210 deletions(-)

diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb
index 9e19ff0..b87cf36 100644
--- a/notebooks/benchmark.ipynb
+++ b/notebooks/benchmark.ipynb
@@ -17,7 +17,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -64,15 +64,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "2020-06-14 23:45:42\n"
-     ]
+     "name": "stdout",
+     "text": "2020-06-15 10:17:17\n"
     }
    ],
    "source": [
@@ -88,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,16 +98,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
      "output_type": "stream",
-     "text": [
-      "Fraud: 0.173% 492\n",
-      "Valid: 99.827% 284,315\n"
-     ]
+     "name": "stdout",
+     "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n"
     }
    ],
    "source": [
@@ -119,7 +114,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,16 +126,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
"metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "X shape: (284807, 29)\n", - "y shape: (284807,)\n" - ] + "name": "stdout", + "text": "X shape: (284807, 29)\ny shape: (284807,)\n" } ], "source": [ @@ -159,7 +151,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -190,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -200,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -210,7 +202,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -227,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -252,179 +244,20 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [ { + "output_type": "stream", "name": "stdout", - "output_type": "stream", - "text": [ - "************************** Linear Tree **********************\n", - "Train Model Linear Tree took: 13.52 seconds\n", - "=========== Linear Tree - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Linear Tree - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999578 0.999613 0.999596 85295\n", - " 1 0.772414 0.756757 0.764505 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.885996 0.878185 0.882050 85443\n", - "weighted avg 0.999184 0.999192 0.999188 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85262 33]\n", - " [ 36 112]]\n", - "************************** Random Forest **********************\n", - "Train Model Random Forest took: 152.5 seconds\n", - "=========== Random Forest - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 1.000000 1.000000 1.000000 199020\n", - " 1 1.000000 1.000000 1.000000 344\n", - "\n", - " accuracy 1.000000 199364\n", - " macro avg 1.000000 1.000000 1.000000 199364\n", - "weighted avg 1.000000 1.000000 1.000000 199364\n", - "\n", - "=========== Random Forest - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999660 0.999965 0.999812 85295\n", - " 1 0.975410 0.804054 0.881481 148\n", - "\n", - " accuracy 0.999625 85443\n", - " macro avg 0.987535 0.902009 0.940647 85443\n", - "weighted avg 0.999618 0.999625 0.999607 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[199020 0]\n", - " [ 0 344]]\n", - "Confusion Matrix in Test\n", - "[[85292 3]\n", - " [ 29 119]]\n", - "************************** Stree (SVM Tree) **********************\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n", - "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", - " \"the number of iterations.\", ConvergenceWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train Model Stree (SVM Tree) took: 32.55 seconds\n", - "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999623 0.999864 0.999744 199020\n", - " 1 0.908784 0.781977 0.840625 344\n", - "\n", - " accuracy 0.999488 199364\n", - " macro avg 0.954204 0.890921 0.920184 199364\n", - "weighted avg 0.999467 0.999488 0.999469 199364\n", - "\n", - "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999637 0.999918 0.999777 85295\n", - " 1 0.943548 0.790541 0.860294 148\n", - "\n", - " accuracy 0.999555 85443\n", - " macro avg 0.971593 0.895229 0.930036 85443\n", - "weighted avg 0.999540 0.999555 0.999536 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198993 27]\n", - " [ 75 269]]\n", - "Confusion Matrix in Test\n", - "[[85288 7]\n", - " [ 31 117]]\n", - "************************** AdaBoost model **********************\n", - "Train Model AdaBoost model took: 47.34 seconds\n", - "=========== AdaBoost model - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999392 0.999678 0.999535 199020\n", - " 1 0.777003 0.648256 0.706815 344\n", - "\n", - " accuracy 0.999072 199364\n", - " macro avg 0.888198 0.823967 0.853175 199364\n", - "weighted avg 0.999008 0.999072 0.999030 199364\n", - "\n", - "=========== AdaBoost model - Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999484 0.999707 0.999596 85295\n", - " 1 0.806202 0.702703 0.750903 148\n", - "\n", - " accuracy 0.999192 85443\n", - " macro avg 0.902843 0.851205 0.875249 85443\n", - "weighted avg 0.999149 0.999192 0.999165 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198956 64]\n", - " [ 121 223]]\n", - "Confusion Matrix in Test\n", - "[[85270 25]\n", - " [ 44 104]]\n", - "************************** Gradient Boost. **********************\n", - "Train Model Gradient Boost. took: 244.1 seconds\n", - "=========== Gradient Boost. - Train 199,364 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.999096 0.999854 0.999475 199020\n", - " 1 0.849741 0.476744 0.610801 344\n", - "\n", - " accuracy 0.998952 199364\n", - " macro avg 0.924419 0.738299 0.805138 199364\n", - "weighted avg 0.998839 0.998952 0.998804 199364\n", - "\n", - "=========== Gradient Boost. 
- Test 85,443 samples =============\n", - " precision recall f1-score support\n", - "\n", - " 0 0.998981 0.999730 0.999355 85295\n", - " 1 0.726190 0.412162 0.525862 148\n", - "\n", - " accuracy 0.998713 85443\n", - " macro avg 0.862586 0.705946 0.762609 85443\n", - "weighted avg 0.998508 0.998713 0.998535 85443\n", - "\n", - "Confusion Matrix in Train\n", - "[[198991 29]\n", - " [ 180 164]]\n", - "Confusion Matrix in Test\n", - "[[85272 23]\n", - " [ 87 61]]\n" - ] + "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 13.91 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 173.1 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 38.4 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999623 0.999864 0.999744 199020\n 1 0.908784 0.781977 0.840625 344\n\n accuracy 0.999488 199364\n macro avg 0.954204 0.890921 0.920184 199364\nweighted avg 0.999467 0.999488 0.999469 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999637 0.999918 0.999777 85295\n 1 0.943548 0.790541 0.860294 148\n\n accuracy 0.999555 85443\n macro avg 0.971593 0.895229 0.930036 85443\nweighted avg 0.999540 0.999555 0.999536 85443\n\nConfusion Matrix in Train\n[[198993 27]\n [ 75 269]]\nConfusion Matrix in Test\n[[85288 7]\n [ 31 117]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model took: 47.21 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 
0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n" } ], "source": [ "# Train & Test models\n", "models = {\n", " 'Linear Tree':linear_tree, 'Random Forest': random_forest, 'Stree (SVM Tree)': stree, \n", - " 'AdaBoost model': adaboost, 'Gradient Boost.': gradient\n", + " 'AdaBoost model': adaboost\n", "}\n", "\n", "best_f1 = 0\n", @@ -440,22 +273,13 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [ { - "name": "stdout", "output_type": "stream", - "text": [ - "**************************************************************************************************************\n", - "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", - "**************************************************************************************************************\n", - "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", - "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", - "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", - "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", - "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" - ] + "name": "stdout", + "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 173.095 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 13.91 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 173.09 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 38.40 seconds\t f1: 0.8603\nModel: AdaBoost model\t Time: 47.21 seconds\t f1: 0.7509\n" } ], "source": [ @@ -466,6 +290,20 @@ " print(f\"Model: {name}\\t Time: {time_spent:6.2f} seconds\\t f1: {f1:.4}\")" ] }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -487,9 +325,9 @@ "metadata": { "hide_input": false, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.6 64-bit ('general': venv)", "language": "python", - "name": "python3" + "name": "python37664bitgeneralvenvfbd0a23e74cf4e778460f5ffc6761f39" }, "language_info": { "codemirror_mode": { @@ -501,7 +339,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.6-final" }, "toc": { "base_numbering": 1, @@ -555,4 +393,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/stree/Strees.py b/stree/Strees.py index e2e33c3..7624972 100644 --- a/stree/Strees.py +++ 
b/stree/Strees.py @@ -193,8 +193,8 @@ class Splitter: def information_gain( self, labels_up: np.array, labels_dn: np.array ) -> float: - card_up = labels_up.shape[0] - card_dn = labels_dn.shape[0] + card_up = labels_up.shape[0] if labels_up is not None else 0 + card_dn = labels_dn.shape[0] if labels_dn is not None else 0 samples = card_up + card_dn up = card_up / samples * self.criterion_function(labels_up) dn = card_dn / samples * self.criterion_function(labels_dn) From 9334951d1b84d9fb3420054b4be370d7dad91bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 15 Jun 2020 11:09:11 +0200 Subject: [PATCH 6/6] #2 Cosmetic and style updates --- stree/Strees.py | 8 +++++--- stree/tests/Splitter_test.py | 2 +- stree/tests/Stree_test.py | 8 ++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/stree/Strees.py b/stree/Strees.py index 7624972..ceeed7a 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -268,7 +268,8 @@ class Splitter: data = self.decision_criteria(data, node._y) self._down = data > 0 - def _distances(self, node: Snode, data: np.ndarray) -> np.array: + @staticmethod + def _distances(node: Snode, data: np.ndarray) -> np.array: """Compute distances of the samples to the hyperplane of the node :param node: node containing the svm classifier @@ -498,7 +499,8 @@ class Stree(BaseEstimator, ClassifierMixin): ) ) - def _reorder_results(self, y: np.array, indices: np.array) -> np.array: + @staticmethod + def _reorder_results(y: np.array, indices: np.array) -> np.array: """Reorder an array based on the array of indices passed :param y: data untidy @@ -579,7 +581,7 @@ class Stree(BaseEstimator, ClassifierMixin): X, y = check_X_y(X, y) y_pred = self.predict(X).reshape(y.shape) # Compute accuracy for each possible representation - y_type, y_true, y_pred = _check_targets(y, y_pred) + _, y_true, y_pred = _check_targets(y, y_pred) check_consistent_length(y_true, y_pred, sample_weight) score = y_true == y_pred return _weighted_sum(score, sample_weight, normalize=True) diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py index b620ce1..68c6123 100644 --- a/stree/tests/Splitter_test.py +++ b/stree/tests/Splitter_test.py @@ -13,8 +13,8 @@ class Splitter_test(unittest.TestCase): self._random_state = 1 super().__init__(*args, **kwargs) + @staticmethod def build( - self, clf=LinearSVC(), min_samples_split=0, splitter_type="random", diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 0fea9e5..ccc0442 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -67,9 +67,8 @@ class Stree_test(unittest.TestCase): clf.fit(*load_dataset(self._random_state)) self._check_tree(clf.tree_) - def _find_out( - self, px: np.array, x_original: np.array, y_original - ) -> list: + @staticmethod + def _find_out(px: np.array, x_original: np.array, y_original) -> list: """Find the original values of y for a given array of samples Arguments: @@ -163,7 +162,8 @@ class Stree_test(unittest.TestCase): self.assertListEqual(expected, computed) self.assertEqual(expected_string, str(clf)) - def test_is_a_sklearn_classifier(self): + @staticmethod + def test_is_a_sklearn_classifier(): import warnings from sklearn.exceptions import ConvergenceWarning