Package doc #7 (#34)

* Add first doc info to sources * Update doc to separate classes in api * Refactor build_predictor * Fix random_state issue in non linear kernels * Refactor score method using base class implementation * Some quality refactoring * Fix codecov config. * Add sigmoid kernel * Refactor setup and add Makefile
2025-08-16 16:06:01 +00:00 · 2021-04-26 09:10:01 +02:00
parent 02de394c96
commit e19d10f6a7
23 changed files with 369 additions and 106 deletions
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -1,9 +1,5 @@
 """
-__author__ = "Ricardo Montañana Gómez"
-__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
-__license__ = "MIT"
-__version__ = "0.9"
-Build an oblique tree classifier based on SVM nodes
+Oblique decision tree classifier based on SVM nodes
 """

 import os
@@ -17,7 +13,6 @@ from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.svm import SVC, LinearSVC
 from sklearn.feature_selection import SelectKBest
 from sklearn.preprocessing import StandardScaler
-from sklearn.utils import check_consistent_length
 from sklearn.utils.multiclass import check_classification_targets
 from sklearn.exceptions import ConvergenceWarning
 from sklearn.utils.validation import (
@@ -26,7 +21,6 @@ from sklearn.utils.validation import (
    check_is_fitted,
    _check_sample_weight,
 )
-from sklearn.metrics._classification import _weighted_sum, _check_targets


 class Snode:
@@ -147,12 +141,11 @@ class Snode:
                f"{self._belief: .6f} impurity={self._impurity:.4f} "
                f"counts={count_values}"
            )
-        else:
-            return (
-                f"{self._title} feaures={self._features} impurity="
-                f"{self._impurity:.4f} "
-                f"counts={count_values}"
-            )
+        return (
+            f"{self._title} feaures={self._features} impurity="
+            f"{self._impurity:.4f} "
+            f"counts={count_values}"
+        )


 class Siterator:
@@ -298,6 +291,23 @@ class Splitter:
    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
+        """Return the best set of features among feature_sets, the criterion is
+        the information gain
+
+        Parameters
+        ----------
+        dataset : np.array
+            array of samples (# samples, # features)
+        labels : np.array
+            array of labels
+        features_sets : list
+            list of features sets to check
+
+        Returns
+        -------
+        list
+            best feature set
+        """
        max_gain = 0
        selected = None
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
@@ -451,6 +461,15 @@ class Splitter:
    def partition(self, samples: np.array, node: Snode, train: bool):
        """Set the criteria to split arrays. Compute the indices of the samples
        that should go to one side of the tree (up)
+
+        Parameters
+        ----------
+        samples : np.array
+            array of samples (# samples, # features)
+        node : Snode
+            Node of the tree where partition is going to be made
+        train : bool
+            Train time - True / Test time - False
        """
        # data contains the distances of every sample to every class hyperplane
        # array of (m, nc) nc = # classes
@@ -602,7 +621,9 @@ class Stree(BaseEstimator, ClassifierMixin):
                f"Maximum depth has to be greater than 1... got (max_depth=\
                    {self.max_depth})"
            )
-
+        kernels = ["linear", "rbf", "poly", "sigmoid"]
+        if self.kernel not in kernels:
+            raise ValueError(f"Kernel {self.kernel} not in {kernels}")
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(
@@ -633,7 +654,6 @@ class Stree(BaseEstimator, ClassifierMixin):
        self.n_features_in_ = X.shape[1]
        self.max_features_ = self._initialize_max_features()
        self.tree_ = self.train(X, y, sample_weight, 1, "root")
-        self._build_predictor()
        self.X_ = X
        self.y_ = y
        return self
@@ -681,6 +701,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
            node.set_title(title + ", <pure>")
+            node.make_predictor()
            return node
        # Train the model
        clf = self._build_clf()
@@ -699,6 +720,7 @@ class Stree(BaseEstimator, ClassifierMixin):
        if X_U is None or X_D is None:
            # didn't part anything
            node.set_title(title + ", <cgaf>")
+            node.make_predictor()
            return node
        node.set_up(
            self.train(X_U, y_u, sw_u, depth + 1, title + f" - Up({depth+1})")
@@ -710,20 +732,8 @@ class Stree(BaseEstimator, ClassifierMixin):
        )
        return node

-    def _build_predictor(self):
-        """Process the leaves to make them predictors"""
-
-        def run_tree(node: Snode):
-            if node.is_leaf():
-                node.make_predictor()
-                return
-            run_tree(node.get_down())
-            run_tree(node.get_up())
-
-        run_tree(self.tree_)
-
    def _build_clf(self):
-        """Build the correct classifier for the node"""
+        """Build the right classifier for the node"""
        return (
            LinearSVC(
                max_iter=self.max_iter,
@@ -739,6 +749,7 @@ class Stree(BaseEstimator, ClassifierMixin):
                C=self.C,
                gamma=self.gamma,
                degree=self.degree,
+                random_state=self.random_state,
            )
        )

@@ -820,36 +831,6 @@ class Stree(BaseEstimator, ClassifierMixin):
        )
        return self.classes_[result]

-    def score(
-        self, X: np.array, y: np.array, sample_weight: np.array = None
-    ) -> float:
-        """Compute accuracy of the prediction
-
-        Parameters
-        ----------
-        X : np.array
-            dataset of samples to make predictions
-        y : np.array
-            samples labels
-        sample_weight : np.array, optional
-            weights of the samples. Rescale C per sample, by default None
-
-        Returns
-        -------
-        float
-            accuracy of the prediction
-        """
-        # sklearn check
-        check_is_fitted(self)
-        check_classification_targets(y)
-        X, y = check_X_y(X, y)
-        y_pred = self.predict(X).reshape(y.shape)
-        # Compute accuracy for each possible representation
-        _, y_true, y_pred = _check_targets(y, y_pred)
-        check_consistent_length(y_true, y_pred, sample_weight)
-        score = y_true == y_pred
-        return _weighted_sum(score, sample_weight, normalize=True)
-
    def nodes_leaves(self) -> tuple:
        """Compute the number of nodes and leaves in the built tree

--- a/stree/init.py
+++ b/stree/init.py
@@ -1,3 +1,11 @@
 from .Strees import Stree, Snode, Siterator, Splitter

+__version__ = "1.0"
+
+__author__ = "Ricardo Montañana Gómez"
+__copyright__ = "Copyright 2020-2021, Ricardo Montañana Gómez"
+__license__ = "MIT License"
+__author_email__ = "ricardo.montanana@alu.uclm.es"
+__url__ = "https://github.com/doctorado-ml/stree"
+
 __all__ = ["Stree", "Snode", "Siterator", "Splitter"]
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -21,6 +21,21 @@ class Stree_test(unittest.TestCase):
    def setUp(cls):
        os.environ["TESTING"] = "1"

+    def test_valid_kernels(self):
+        valid_kernels = ["linear", "rbf", "poly", "sigmoid"]
+        X, y = load_dataset()
+        for kernel in valid_kernels:
+            clf = Stree(kernel=kernel)
+            clf.fit(X, y)
+            self.assertIsNotNone(clf.tree_)
+
+    def test_bogus_kernel(self):
+        kernel = "other"
+        X, y = load_dataset()
+        clf = Stree(kernel=kernel)
+        with self.assertRaises(ValueError):
+            clf.fit(X, y)
+
    def _check_tree(self, node: Snode):
        """Check recursively that the nodes that are not leaves have the
        correct number of labels and its sons have the right number of elements
@@ -484,13 +499,13 @@ class Stree_test(unittest.TestCase):
        clf.fit(X, y)
        nodes, leaves = clf.nodes_leaves()
        self.assertEqual(25, nodes)
-        self.assertEquals(13, leaves)
+        self.assertEqual(13, leaves)
        X, y = load_wine(return_X_y=True)
        clf = Stree(random_state=self._random_state)
        clf.fit(X, y)
        nodes, leaves = clf.nodes_leaves()
        self.assertEqual(9, nodes)
-        self.assertEquals(5, leaves)
+        self.assertEqual(5, leaves)

    def test_nodes_leaves_artificial(self):
        n1 = Snode(None, [1, 2, 3, 4], [1, 0, 1, 1], [], 0.0, "test1")