Implement hyperparam. context based normalization

2021-04-15 02:13:30 +02:00
parent b55f59a3ec
commit 9cb69ebc75
3 changed files with 40 additions and 6 deletions
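In short, this commit adds a normalize hyperparameter to Stree: when enabled, each node fits a StandardScaler on its own feature subspace before training its SVC, and reuses that same scaler when computing distances to the node's hyperplane ("context based" normalization, one scaler per node rather than one global scaler). A minimal usage sketch follows; the wine dataset and normalize=True mirror the updated tests, and the printed score will depend on the data and random_state:

# Minimal usage sketch of the new hyperparameter (not part of the commit).
from sklearn.datasets import load_wine
from stree import Stree

X, y = load_wine(return_X_y=True)
clf = Stree(kernel="linear", random_state=0, normalize=True)
print(clf.fit(X, y).score(X, y))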

@@ -15,6 +15,7 @@ from typing import Optional
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
+from sklearn.preprocessing import StandardScaler
 from sklearn.utils import check_consistent_length
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
@@ -41,6 +42,7 @@ class Snode:
         impurity: float,
         title: str,
         weight: np.ndarray = None,
+        scaler: StandardScaler = None,
     ):
         self._clf = clf
         self._title = title
@@ -58,6 +60,7 @@ class Snode:
         self._features = features
         self._impurity = impurity
         self._partition_column: int = -1
+        self._scaler = scaler
 
     @classmethod
     def copy(cls, node: "Snode") -> "Snode":
@@ -68,6 +71,8 @@ class Snode:
             node._features,
             node._impurity,
             node._title,
+            node._sample_weight,
+            node._scaler,
         )
 
     def set_partition_column(self, col: int):
@@ -178,6 +183,7 @@ class Splitter:
         criteria: str = None,
         min_samples_split: int = None,
         random_state=None,
+        normalize=False,
     ):
         self._clf = clf
         self._random_state = random_state
@@ -187,6 +193,7 @@ class Splitter:
         self._min_samples_split = min_samples_split
         self._criteria = criteria
         self._splitter_type = splitter_type
+        self._normalize = normalize
 
         if clf is None:
             raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
@@ -486,8 +493,7 @@ class Splitter:
             origin[down] if any(down) else None,
         ]
 
-    @staticmethod
-    def _distances(node: Snode, data: np.ndarray) -> np.array:
+    def _distances(self, node: Snode, data: np.ndarray) -> np.array:
         """Compute distances of the samples to the hyperplane of the node
 
         Parameters
@@ -503,7 +509,10 @@ class Splitter:
         array of shape (m, nc) with the distances of every sample to
         the hyperplane of every class. nc = # of classes
         """
-        return node._clf.decision_function(data[:, node._features])
+        X_transformed = data[:, node._features]
+        if self._normalize:
+            X_transformed = node._scaler.transform(X_transformed)
+        return node._clf.decision_function(X_transformed)
 
 
 class Stree(BaseEstimator, ClassifierMixin):
@@ -529,6 +538,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         min_samples_split: int = 0,
         max_features=None,
         splitter: str = "random",
+        normalize: bool = False,
     ):
         self.max_iter = max_iter
         self.C = C
@@ -543,6 +553,7 @@ class Stree(BaseEstimator, ClassifierMixin):
         self.max_features = max_features
         self.criterion = criterion
         self.splitter = splitter
+        self.normalize = normalize
 
     def _more_tags(self) -> dict:
         """Required by sklearn to supply features of the classifier
@@ -606,6 +617,7 @@ class Stree(BaseEstimator, ClassifierMixin):
             criteria=self.split_criteria,
             random_state=self.random_state,
             min_samples_split=self.min_samples_split,
+            normalize=self.normalize,
         )
         if self.random_state is not None:
             random.seed(self.random_state)
@@ -660,7 +672,8 @@ class Stree(BaseEstimator, ClassifierMixin):
             y = y[~indices_zero]
             sample_weight = sample_weight[~indices_zero]
         self.depth_ = max(depth, self.depth_)
-        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight)
+        scaler = StandardScaler()
+        node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight, scaler)
         if np.unique(y).shape[0] == 1:
             # only 1 class => pure dataset
             node.set_title(title + ", <pure>")
@@ -668,6 +681,9 @@ class Stree(BaseEstimator, ClassifierMixin):
         # Train the model
         clf = self._build_clf()
         Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
+        if self.normalize:
+            scaler.fit(Xs)
+            Xs = scaler.transform(Xs)
         clf.fit(Xs, y, sample_weight=sample_weight)
         node.set_impurity(self.splitter_.partition_impurity(y))
         node.set_classifier(clf)
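Taken together, the changes to Snode, Splitter, and Stree above amount to a per-node pattern: fit a scaler on the node's sampled feature subspace before training the node's classifier, store the scaler on the node, and apply it again in _distances before calling decision_function. Below is a condensed, standalone sketch of that pattern; the Node, fit_node, and distances names are illustrative only, not the library's API:

# Standalone sketch of per-node ("context based") normalization; illustrative names only.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


class Node:
    def __init__(self, clf, scaler, features):
        self._clf = clf          # classifier trained on this node's scaled subspace
        self._scaler = scaler    # scaler fitted only on this node's samples
        self._features = features


def fit_node(X, y, features, normalize=True):
    Xs = X[:, features]
    scaler = StandardScaler()
    if normalize:
        Xs = scaler.fit_transform(Xs)  # statistics come from this node's context only
    clf = SVC(kernel="linear").fit(Xs, y)
    return Node(clf, scaler, features)


def distances(node, data, normalize=True):
    Xs = data[:, node._features]
    if normalize:
        Xs = node._scaler.transform(Xs)  # reuse the node's own scaler at split time
    return node._clf.decision_function(Xs)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 4)) * np.array([1.0, 10.0, 100.0, 1000.0])
    y = (X[:, 0] + X[:, 1] / 10.0 > 0).astype(int)
    node = fit_node(X, y, features=[0, 1])
    print(distances(node, X).shape)  # (100,) for a binary problem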

@@ -1,8 +1,6 @@
 import os
 import unittest
 import numpy as np
 
 from stree import Stree, Snode
 from .utils import load_dataset
@@ -119,3 +117,5 @@ class Snode_test(unittest.TestCase):
self.assertEqual("test", computed._title) self.assertEqual("test", computed._title)
self.assertIsInstance(computed._clf, Stree) self.assertIsInstance(computed._clf, Stree)
self.assertEqual(test._partition_column, computed._partition_column) self.assertEqual(test._partition_column, computed._partition_column)
self.assertEqual(test._sample_weight, computed._sample_weight)
self.assertEqual(test._scaler, computed._scaler)

@@ -378,9 +378,14 @@ class Stree_test(unittest.TestCase):
             n_samples=500,
         )
         clf = Stree(kernel="rbf", random_state=self._random_state)
+        clf2 = Stree(
+            kernel="rbf", random_state=self._random_state, normalize=True
+        )
         self.assertEqual(0.768, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_poly(self):
         X, y = load_dataset(
@@ -392,9 +397,16 @@ class Stree_test(unittest.TestCase):
         clf = Stree(
             kernel="poly", random_state=self._random_state, C=10, degree=5
         )
+        clf2 = Stree(
+            kernel="poly",
+            random_state=self._random_state,
+            normalize=True,
+        )
         self.assertEqual(0.786, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
+        self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
 
     def test_score_multiclass_linear(self):
         X, y = load_dataset(
@@ -405,8 +417,14 @@ class Stree_test(unittest.TestCase):
         )
         clf = Stree(kernel="linear", random_state=self._random_state)
         self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
+        # Check with context based standardization
+        clf2 = Stree(
+            kernel="linear", random_state=self._random_state, normalize=True
+        )
+        self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
         X, y = load_wine(return_X_y=True)
         self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
+        self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
 
     def test_zero_all_sample_weights(self):
         X, y = load_dataset(self._random_state)