mirror of
https://github.com/Doctorado-ML/STree.git
synced 2025-08-16 16:06:01 +00:00
* Add first doc info to sources * Update doc to separate classes in api * Refactor build_predictor * Fix random_state issue in non linear kernels * Refactor score method using base class implementation * Some quality refactoring * Fix codecov config. * Add sigmoid kernel * Refactor setup and add Makefile
This commit is contained in:
committed by
GitHub
parent
02de394c96
commit
e19d10f6a7
@@ -1,9 +1,5 @@
|
||||
"""
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
|
||||
__license__ = "MIT"
|
||||
__version__ = "0.9"
|
||||
Build an oblique tree classifier based on SVM nodes
|
||||
Oblique decision tree classifier based on SVM nodes
|
||||
"""
|
||||
|
||||
import os
|
||||
@@ -17,7 +13,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.svm import SVC, LinearSVC
|
||||
from sklearn.feature_selection import SelectKBest
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.utils import check_consistent_length
|
||||
from sklearn.utils.multiclass import check_classification_targets
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils.validation import (
|
||||
@@ -26,7 +21,6 @@ from sklearn.utils.validation import (
|
||||
check_is_fitted,
|
||||
_check_sample_weight,
|
||||
)
|
||||
from sklearn.metrics._classification import _weighted_sum, _check_targets
|
||||
|
||||
|
||||
class Snode:
|
||||
@@ -147,12 +141,11 @@ class Snode:
|
||||
f"{self._belief: .6f} impurity={self._impurity:.4f} "
|
||||
f"counts={count_values}"
|
||||
)
|
||||
else:
|
||||
return (
|
||||
f"{self._title} feaures={self._features} impurity="
|
||||
f"{self._impurity:.4f} "
|
||||
f"counts={count_values}"
|
||||
)
|
||||
return (
|
||||
f"{self._title} feaures={self._features} impurity="
|
||||
f"{self._impurity:.4f} "
|
||||
f"counts={count_values}"
|
||||
)
|
||||
|
||||
|
||||
class Siterator:
|
||||
@@ -298,6 +291,23 @@ class Splitter:
|
||||
def _select_best_set(
|
||||
self, dataset: np.array, labels: np.array, features_sets: list
|
||||
) -> list:
|
||||
"""Return the best set of features among feature_sets, the criterion is
|
||||
the information gain
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : np.array
|
||||
array of samples (# samples, # features)
|
||||
labels : np.array
|
||||
array of labels
|
||||
features_sets : list
|
||||
list of features sets to check
|
||||
|
||||
Returns
|
||||
-------
|
||||
list
|
||||
best feature set
|
||||
"""
|
||||
max_gain = 0
|
||||
selected = None
|
||||
warnings.filterwarnings("ignore", category=ConvergenceWarning)
|
||||
@@ -451,6 +461,15 @@ class Splitter:
|
||||
def partition(self, samples: np.array, node: Snode, train: bool):
|
||||
"""Set the criteria to split arrays. Compute the indices of the samples
|
||||
that should go to one side of the tree (up)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
samples : np.array
|
||||
array of samples (# samples, # features)
|
||||
node : Snode
|
||||
Node of the tree where partition is going to be made
|
||||
train : bool
|
||||
Train time - True / Test time - False
|
||||
"""
|
||||
# data contains the distances of every sample to every class hyperplane
|
||||
# array of (m, nc) nc = # classes
|
||||
@@ -602,7 +621,9 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
f"Maximum depth has to be greater than 1... got (max_depth=\
|
||||
{self.max_depth})"
|
||||
)
|
||||
|
||||
kernels = ["linear", "rbf", "poly", "sigmoid"]
|
||||
if self.kernel not in kernels:
|
||||
raise ValueError(f"Kernel {self.kernel} not in {kernels}")
|
||||
check_classification_targets(y)
|
||||
X, y = check_X_y(X, y)
|
||||
sample_weight = _check_sample_weight(
|
||||
@@ -633,7 +654,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
self.n_features_in_ = X.shape[1]
|
||||
self.max_features_ = self._initialize_max_features()
|
||||
self.tree_ = self.train(X, y, sample_weight, 1, "root")
|
||||
self._build_predictor()
|
||||
self.X_ = X
|
||||
self.y_ = y
|
||||
return self
|
||||
@@ -681,6 +701,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
if np.unique(y).shape[0] == 1:
|
||||
# only 1 class => pure dataset
|
||||
node.set_title(title + ", <pure>")
|
||||
node.make_predictor()
|
||||
return node
|
||||
# Train the model
|
||||
clf = self._build_clf()
|
||||
@@ -699,6 +720,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
if X_U is None or X_D is None:
|
||||
# didn't part anything
|
||||
node.set_title(title + ", <cgaf>")
|
||||
node.make_predictor()
|
||||
return node
|
||||
node.set_up(
|
||||
self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
|
||||
@@ -710,20 +732,8 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
)
|
||||
return node
|
||||
|
||||
def _build_predictor(self):
|
||||
"""Process the leaves to make them predictors"""
|
||||
|
||||
def run_tree(node: Snode):
|
||||
if node.is_leaf():
|
||||
node.make_predictor()
|
||||
return
|
||||
run_tree(node.get_down())
|
||||
run_tree(node.get_up())
|
||||
|
||||
run_tree(self.tree_)
|
||||
|
||||
def _build_clf(self):
|
||||
"""Build the correct classifier for the node"""
|
||||
"""Build the right classifier for the node"""
|
||||
return (
|
||||
LinearSVC(
|
||||
max_iter=self.max_iter,
|
||||
@@ -739,6 +749,7 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
C=self.C,
|
||||
gamma=self.gamma,
|
||||
degree=self.degree,
|
||||
random_state=self.random_state,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -820,36 +831,6 @@ class Stree(BaseEstimator, ClassifierMixin):
|
||||
)
|
||||
return self.classes_[result]
|
||||
|
||||
def score(
|
||||
self, X: np.array, y: np.array, sample_weight: np.array = None
|
||||
) -> float:
|
||||
"""Compute accuracy of the prediction
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : np.array
|
||||
dataset of samples to make predictions
|
||||
y : np.array
|
||||
samples labels
|
||||
sample_weight : np.array, optional
|
||||
weights of the samples. Rescale C per sample, by default None
|
||||
|
||||
Returns
|
||||
-------
|
||||
float
|
||||
accuracy of the prediction
|
||||
"""
|
||||
# sklearn check
|
||||
check_is_fitted(self)
|
||||
check_classification_targets(y)
|
||||
X, y = check_X_y(X, y)
|
||||
y_pred = self.predict(X).reshape(y.shape)
|
||||
# Compute accuracy for each possible representation
|
||||
_, y_true, y_pred = _check_targets(y, y_pred)
|
||||
check_consistent_length(y_true, y_pred, sample_weight)
|
||||
score = y_true == y_pred
|
||||
return _weighted_sum(score, sample_weight, normalize=True)
|
||||
|
||||
def nodes_leaves(self) -> tuple:
|
||||
"""Compute the number of nodes and leaves in the built tree
|
||||
|
||||
|
@@ -1,3 +1,11 @@
|
||||
from .Strees import Stree, Snode, Siterator, Splitter
|
||||
|
||||
__version__ = "1.0"
|
||||
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
|
||||
__license__ = "MIT License"
|
||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||
__url__ = "https://github.com/doctorado-ml/stree"
|
||||
|
||||
__all__ = ["Stree", "Snode", "Siterator", "Splitter"]
|
||||
|
@@ -21,6 +21,21 @@ class Stree_test(unittest.TestCase):
|
||||
def setUp(cls):
|
||||
os.environ["TESTING"] = "1"
|
||||
|
||||
def test_valid_kernels(self):
|
||||
valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
|
||||
X, y = load_dataset()
|
||||
for kernel in valid_kernels:
|
||||
clf = Stree(kernel=kernel)
|
||||
clf.fit(X, y)
|
||||
self.assertIsNotNone(clf.tree_)
|
||||
|
||||
def test_bogus_kernel(self):
|
||||
kernel = "other"
|
||||
X, y = load_dataset()
|
||||
clf = Stree(kernel=kernel)
|
||||
with self.assertRaises(ValueError):
|
||||
clf.fit(X, y)
|
||||
|
||||
def _check_tree(self, node: Snode):
|
||||
"""Check recursively that the nodes that are not leaves have the
|
||||
correct number of labels and its sons have the right number of elements
|
||||
@@ -484,13 +499,13 @@ class Stree_test(unittest.TestCase):
|
||||
clf.fit(X, y)
|
||||
nodes, leaves = clf.nodes_leaves()
|
||||
self.assertEqual(25, nodes)
|
||||
self.assertEquals(13, leaves)
|
||||
self.assertEqual(13, leaves)
|
||||
X, y = load_wine(return_X_y=True)
|
||||
clf = Stree(random_state=self._random_state)
|
||||
clf.fit(X, y)
|
||||
nodes, leaves = clf.nodes_leaves()
|
||||
self.assertEqual(9, nodes)
|
||||
self.assertEquals(5, leaves)
|
||||
self.assertEqual(5, leaves)
|
||||
|
||||
def test_nodes_leaves_artificial(self):
|
||||
n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")
|
||||
|
Reference in New Issue
Block a user