mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-17 16:36:01 +00:00
Compare commits
1 Commits
graphviz
...
context-sc
Author | SHA1 | Date | |
---|---|---|---|
9cb69ebc75
|
@@ -15,6 +15,7 @@ from typing import Optional
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
from sklearn.svm import SVC, LinearSVC
|
from sklearn.svm import SVC, LinearSVC
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
from sklearn.utils import check_consistent_length
|
from sklearn.utils import check_consistent_length
|
||||||
from sklearn.utils.multiclass import check_classification_targets
|
from sklearn.utils.multiclass import check_classification_targets
|
||||||
from sklearn.exceptions import ConvergenceWarning
|
from sklearn.exceptions import ConvergenceWarning
|
||||||
@@ -41,6 +42,7 @@ class Snode:
|
|||||||
impurity: float,
|
impurity: float,
|
||||||
title: str,
|
title: str,
|
||||||
weight: np.ndarray = None,
|
weight: np.ndarray = None,
|
||||||
|
scaler: StandardScaler = None,
|
||||||
):
|
):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._title = title
|
self._title = title
|
||||||
@@ -58,6 +60,7 @@ class Snode:
|
|||||||
self._features = features
|
self._features = features
|
||||||
self._impurity = impurity
|
self._impurity = impurity
|
||||||
self._partition_column: int = -1
|
self._partition_column: int = -1
|
||||||
|
self._scaler = scaler
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def copy(cls, node: "Snode") -> "Snode":
|
def copy(cls, node: "Snode") -> "Snode":
|
||||||
@@ -68,6 +71,8 @@ class Snode:
|
|||||||
node._features,
|
node._features,
|
||||||
node._impurity,
|
node._impurity,
|
||||||
node._title,
|
node._title,
|
||||||
|
node._sample_weight,
|
||||||
|
node._scaler,
|
||||||
)
|
)
|
||||||
|
|
||||||
def set_partition_column(self, col: int):
|
def set_partition_column(self, col: int):
|
||||||
@@ -178,6 +183,7 @@ class Splitter:
|
|||||||
criteria: str = None,
|
criteria: str = None,
|
||||||
min_samples_split: int = None,
|
min_samples_split: int = None,
|
||||||
random_state=None,
|
random_state=None,
|
||||||
|
normalize=False,
|
||||||
):
|
):
|
||||||
self._clf = clf
|
self._clf = clf
|
||||||
self._random_state = random_state
|
self._random_state = random_state
|
||||||
@@ -187,6 +193,7 @@ class Splitter:
|
|||||||
self._min_samples_split = min_samples_split
|
self._min_samples_split = min_samples_split
|
||||||
self._criteria = criteria
|
self._criteria = criteria
|
||||||
self._splitter_type = splitter_type
|
self._splitter_type = splitter_type
|
||||||
|
self._normalize = normalize
|
||||||
|
|
||||||
if clf is None:
|
if clf is None:
|
||||||
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
raise ValueError(f"clf has to be a sklearn estimator, got({clf})")
|
||||||
@@ -486,8 +493,7 @@ class Splitter:
|
|||||||
origin[down] if any(down) else None,
|
origin[down] if any(down) else None,
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
def _distances(self, node: Snode, data: np.ndarray) -> np.array:
|
||||||
def _distances(node: Snode, data: np.ndarray) -> np.array:
|
|
||||||
"""Compute distances of the samples to the hyperplane of the node
|
"""Compute distances of the samples to the hyperplane of the node
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
@@ -503,7 +509,10 @@ class Splitter:
|
|||||||
array of shape (m, nc) with the distances of every sample to
|
array of shape (m, nc) with the distances of every sample to
|
||||||
the hyperplane of every class. nc = # of classes
|
the hyperplane of every class. nc = # of classes
|
||||||
"""
|
"""
|
||||||
return node._clf.decision_function(data[:, node._features])
|
X_transformed = data[:, node._features]
|
||||||
|
if self._normalize:
|
||||||
|
X_transformed = node._scaler.transform(X_transformed)
|
||||||
|
return node._clf.decision_function(X_transformed)
|
||||||
|
|
||||||
|
|
||||||
class Stree(BaseEstimator, ClassifierMixin):
|
class Stree(BaseEstimator, ClassifierMixin):
|
||||||
@@ -529,6 +538,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
min_samples_split: int = 0,
|
min_samples_split: int = 0,
|
||||||
max_features=None,
|
max_features=None,
|
||||||
splitter: str = "random",
|
splitter: str = "random",
|
||||||
|
normalize: bool = False,
|
||||||
):
|
):
|
||||||
self.max_iter = max_iter
|
self.max_iter = max_iter
|
||||||
self.C = C
|
self.C = C
|
||||||
@@ -543,6 +553,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
self.max_features = max_features
|
self.max_features = max_features
|
||||||
self.criterion = criterion
|
self.criterion = criterion
|
||||||
self.splitter = splitter
|
self.splitter = splitter
|
||||||
|
self.normalize = normalize
|
||||||
|
|
||||||
def _more_tags(self) -> dict:
|
def _more_tags(self) -> dict:
|
||||||
"""Required by sklearn to supply features of the classifier
|
"""Required by sklearn to supply features of the classifier
|
||||||
@@ -606,6 +617,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
criteria=self.split_criteria,
|
criteria=self.split_criteria,
|
||||||
random_state=self.random_state,
|
random_state=self.random_state,
|
||||||
min_samples_split=self.min_samples_split,
|
min_samples_split=self.min_samples_split,
|
||||||
|
normalize=self.normalize,
|
||||||
)
|
)
|
||||||
if self.random_state is not None:
|
if self.random_state is not None:
|
||||||
random.seed(self.random_state)
|
random.seed(self.random_state)
|
||||||
@@ -660,7 +672,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
y = y[~indices_zero]
|
y = y[~indices_zero]
|
||||||
sample_weight = sample_weight[~indices_zero]
|
sample_weight = sample_weight[~indices_zero]
|
||||||
self.depth_ = max(depth, self.depth_)
|
self.depth_ = max(depth, self.depth_)
|
||||||
node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight)
|
scaler = StandardScaler()
|
||||||
|
node = Snode(None, X, y, X.shape[1], 0.0, title, sample_weight, scaler)
|
||||||
if np.unique(y).shape[0] == 1:
|
if np.unique(y).shape[0] == 1:
|
||||||
# only 1 class => pure dataset
|
# only 1 class => pure dataset
|
||||||
node.set_title(title + ", <pure>")
|
node.set_title(title + ", <pure>")
|
||||||
@@ -668,6 +681,9 @@ class Stree(BaseEstimator, ClassifierMixin):
|
|||||||
# Train the model
|
# Train the model
|
||||||
clf = self._build_clf()
|
clf = self._build_clf()
|
||||||
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
|
||||||
|
if self.normalize:
|
||||||
|
scaler.fit(Xs)
|
||||||
|
Xs = scaler.transform(Xs)
|
||||||
clf.fit(Xs, y, sample_weight=sample_weight)
|
clf.fit(Xs, y, sample_weight=sample_weight)
|
||||||
node.set_impurity(self.splitter_.partition_impurity(y))
|
node.set_impurity(self.splitter_.partition_impurity(y))
|
||||||
node.set_classifier(clf)
|
node.set_classifier(clf)
|
||||||
|
@@ -1,8 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from stree import Stree, Snode
|
from stree import Stree, Snode
|
||||||
from .utils import load_dataset
|
from .utils import load_dataset
|
||||||
|
|
||||||
@@ -119,3 +117,5 @@ class Snode_test(unittest.TestCase):
|
|||||||
self.assertEqual("test", computed._title)
|
self.assertEqual("test", computed._title)
|
||||||
self.assertIsInstance(computed._clf, Stree)
|
self.assertIsInstance(computed._clf, Stree)
|
||||||
self.assertEqual(test._partition_column, computed._partition_column)
|
self.assertEqual(test._partition_column, computed._partition_column)
|
||||||
|
self.assertEqual(test._sample_weight, computed._sample_weight)
|
||||||
|
self.assertEqual(test._scaler, computed._scaler)
|
||||||
|
@@ -378,9 +378,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
n_samples=500,
|
n_samples=500,
|
||||||
)
|
)
|
||||||
clf = Stree(kernel="rbf", random_state=self._random_state)
|
clf = Stree(kernel="rbf", random_state=self._random_state)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="rbf", random_state=self._random_state, normalize=True
|
||||||
|
)
|
||||||
self.assertEqual(0.768, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.768, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.814, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.6741573033707865, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_poly(self):
|
def test_score_multiclass_poly(self):
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
@@ -392,9 +397,16 @@ class Stree_test(unittest.TestCase):
|
|||||||
clf = Stree(
|
clf = Stree(
|
||||||
kernel="poly", random_state=self._random_state, C=10, degree=5
|
kernel="poly", random_state=self._random_state, C=10, degree=5
|
||||||
)
|
)
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="poly",
|
||||||
|
random_state=self._random_state,
|
||||||
|
normalize=True,
|
||||||
|
)
|
||||||
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.786, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.818, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.702247191011236, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(0.6067415730337079, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_score_multiclass_linear(self):
|
def test_score_multiclass_linear(self):
|
||||||
X, y = load_dataset(
|
X, y = load_dataset(
|
||||||
@@ -405,8 +417,14 @@ class Stree_test(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
clf = Stree(kernel="linear", random_state=self._random_state)
|
clf = Stree(kernel="linear", random_state=self._random_state)
|
||||||
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
|
||||||
|
# Check with context based standardization
|
||||||
|
clf2 = Stree(
|
||||||
|
kernel="linear", random_state=self._random_state, normalize=True
|
||||||
|
)
|
||||||
|
self.assertEqual(0.9526666666666667, clf2.fit(X, y).score(X, y))
|
||||||
X, y = load_wine(return_X_y=True)
|
X, y = load_wine(return_X_y=True)
|
||||||
self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
|
self.assertEqual(0.9831460674157303, clf.fit(X, y).score(X, y))
|
||||||
|
self.assertEqual(1.0, clf2.fit(X, y).score(X, y))
|
||||||
|
|
||||||
def test_zero_all_sample_weights(self):
|
def test_zero_all_sample_weights(self):
|
||||||
X, y = load_dataset(self._random_state)
|
X, y = load_dataset(self._random_state)
|
||||||
|
Reference in New Issue
Block a user