From 8a18c998df77f5e43c9ee7c769cfbe94695ea1bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
Date: Sun, 18 Apr 2021 18:57:39 +0200
Subject: [PATCH] Implement hyperparam. context based normalization (#32)

---
 stree/Strees.py           | 24 ++++++++++++++++++++----
 stree/tests/Snode_test.py |  4 ++--
 stree/tests/Stree_test.py | 18 ++++++++++++++++++
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/stree/Strees.py b/stree/Strees.py
index d44da0f..d81a3ec 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -15,6 +15,7 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_consistent_length
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
@@ -41,6 +42,7 @@ class Snode:
         impurity: float,
         title: str,
         weight: np.ndarray = None,
+        scaler: StandardScaler = None,
     ):
         self._clf = clf
         self._title = title
@@ -58,6 +60,7 @@ class Snode:
         self._features = features
         self._impurity = impurity
         self._partition_column: int = -1
+        self._scaler = scaler
 
     @classmethod
     def copy(cls, node: "Snode") -> "Snode":
@@ -68,6 +71,8 @@ class Snode:
             node._features,
             node._impurity,
             node._title,
+            node._sample_weight,
+            node._scaler,
         )
 
     def set_partition_column(self, col: int):
@@ -178,6 +183,7 @@ class Splitter:
         criteria: str = None,
         min_samples_split: int = None,
         random_state=None,
+        normalize=False,
     ):
         self._clf = clf
         self._random_state = random_state
@@ -187,6 +193,7 @@ class Splitter:
         self._min_samples_split = min_samples_split
         self._criteria = criteria
         self._splitter_type = splitter_type
+        self._normalize = normalize
 
         if clf is None:
             raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
@@ -486,8 +493,7 @@ class Splitter:
             origin[down] if any(down) else None,
         ]
 
-    @staticmethod
-    def _distances(node: Snode, data: np.ndarray) -> np.array:
+    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
         """Compute distances of the samples to the hyperplane of the node
 
         Parameters
@@ -503,7 +509,10 @@ class Splitter:
             array of shape (m, nc) with the distances of every sample to
             the hyperplane of every class. nc = # of classes
         """
-        return node._clf.decision_function(data[:, node._features])
+        X_transformed = data[:, node._features]
+        if self._normalize:
+            X_transformed = node._scaler.transform(X_transformed)
+        return node._clf.decision_function(X_transformed)
 
 
 class Stree(BaseEstimator, ClassifierMixin):
@@ -529,6 +538,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         min_samples_split: int = 0,
         max_features=None,
         splitter: str = "random",
+        normalize: bool = False,
     ):
         self.max_iter = max_iter
         self.C = C
@@ -543,6 +553,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.max_features = max_features
         self.criterion = criterion
         self.splitter = splitter
+        self.normalize = normalize
 
     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -606,6 +617,7 @@ class Stree(BaseEstimator, ClassifierMixin):
             criteria=self.split_criteria,
             random_state=self.random_state,
             min_samples_split=self.min_samples_split,
+            normalize=self.normalize,
         )
         if self.random_state is not None:
             random.seed(self.random_state)
@@ -660,7 +672,8 @@ class Stree(BaseEstimator, ClassifierMixin):
             y = y[~indices_zero]
             sample_weight = sample_weight[~indices_zero]
         self.depth_ = max(depth, self.depth_)
-        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight)
+        scaler = StandardScaler()
+        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight, scaler)
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
             node.set_title(title + ", <pure>")
@@ -668,6 +681,9 @@ class Stree(BaseEstimator, ClassifierMixin):
         # Train the model
         clf = self._build_clf()
         Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
+        if self.normalize:
+            scaler.fit(Xs)
+            Xs = scaler.transform(Xs)
         clf.fit(Xs, y, sample_weight=sample_weight)
         node.set_impurity(self.splitter_.partition_impurity(y))
         node.set_classifier(clf)
diff --git a/stree/tests/Snode_test.py b/stree/tests/Snode_test.py
index b1e2728..d60cbfc 100644
--- a/stree/tests/Snode_test.py
+++ b/stree/tests/Snode_test.py
@@ -1,8 +1,6 @@
 import os
 import unittest
-
 import numpy as np
-
 from stree import Stree, Snode
 
 from .utils import load_dataset
@@ -119,3 +117,5 @@ class Snode_test(unittest.TestCase):
         self.assertEqual("test", computed._title)
         self.assertIsInstance(computed._clf, Stree)
         self.assertEqual(test._partition_column, computed._partition_column)
+        self.assertEqual(test._sample_weight, computed._sample_weight)
+        self.assertEqual(test._scaler, computed._scaler)
diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py
index ce50ae5..c954126 100644
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -378,9 +378,14 @@ class Stree_test(unittest.TestCase):
             n_samples=500,
         )
         clf = Stree(kernel="rbf", random_state=self._random_state)
+        clf2 = Stree(
+            kernel="rbf", random_state=self._random_state, normalize=True
+        )
         self.assertEqual(0.768, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_poly(self):
         X, y = load_dataset(
@@ -392,9 +397,16 @@ class Stree_test(unittest.TestCase):
         clf = Stree(
             kernel="poly", random_state=self._random_state, C=10, degree=5
         )
+        clf2 = Stree(
+            kernel="poly",
+            random_state=self._random_state,
+            normalize=True,
+        )
         self.assertEqual(0.786, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_linear(self):
         X, y = load_dataset(
@@ -405,8 +417,14 @@ class Stree_test(unittest.TestCase):
         )
         clf = Stree(kernel="linear", random_state=self._random_state)
         self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
+        # Check with context based standardization
+        clf2 = Stree(
+            kernel="linear", random_state=self._random_state, normalize=True
+        )
+        self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
 
     def test_zero_all_sample_weights(self):
         X, y = load_dataset(self._random_state)
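
What the patch does, in isolation: each Snode now owns a StandardScaler; Stree.train() fits it on the feature subspace that node is trained on (only when normalize=True), and Splitter._distances() reuses the stored scaler before asking the node's classifier for decision_function values. The snippet below is a minimal, self-contained sketch of that pattern using only scikit-learn; node_features and incoming are illustrative names, not part of the stree API.

from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)
node_features = [0, 3, 5]  # feature subspace drawn for one node

# Stree.train(): with normalize=True the node's own scaler is fitted on the
# subspace the node trains on, and the SVC sees the standardized data.
scaler = StandardScaler()
Xs = scaler.fit_transform(X[:, node_features])
clf = SVC(kernel="rbf").fit(Xs, y)

# Splitter._distances(): the stored scaler is applied again before calling
# decision_function, so training and partitioning see the same scaling.
incoming = X[:10]
distances = clf.decision_function(scaler.transform(incoming[:, node_features]))
print(distances.shape)  # (10, 3): one column per wine class

The scaler travels with the node (and is now passed along by Snode.copy) precisely so that the transform applied at split and predict time matches the one used at fit time.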
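
Usage: normalize defaults to False, so existing pipelines keep their behaviour and per-node standardization is opt-in. A minimal usage sketch, assuming the patched stree package is importable (the random_state value here is arbitrary):

from sklearn.datasets import load_wine
from stree import Stree

X, y = load_wine(return_X_y=True)

clf = Stree(kernel="rbf", random_state=1)                   # raw subspaces
clf2 = Stree(kernel="rbf", random_state=1, normalize=True)  # per-node scaling

print(clf.fit(X, y).score(X, y))
print(clf2.fit(X, y).score(X, y))

The updated Stree_test cases above record how the scores move on the test datasets (for the rbf kernel on load_wine, 0.6741573033707865 without normalization versus 1.0 with it).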