Complete source comments (#22)

* Add Hyperparameters description to README Comment get_subspace method Add environment info for binder (runtime.txt) * Complete source comments Change docstring type to numpy update hyperparameters table and explanation * Update Jupyter notebooks
2025-08-16 07:56:06 +00:00 · 2021-01-19 10:44:59 +01:00
parent e4ac5075e5
commit 3bdac9bd60
10 changed files with 958 additions and 875 deletions
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -3,7 +3,7 @@ __author__ = "Ricardo Montañana Gómez"
 __copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
 __license__ = "MIT"
 __version__ = "0.9"
-Build an oblique tree classifier based on SVM Trees
+Build an oblique tree classifier based on SVM nodes
 """

 import os
@@ -197,6 +197,18 @@ class Splitter:

    @staticmethod
    def _entropy(y: np.array) -> float:
+        """Compute entropy of a labels set
+
+        Parameters
+        ----------
+        y : np.array
+            set of labels
+
+        Returns
+        -------
+        float
+            entropy
+        """
        n_labels = len(y)
        if n_labels <= 1:
            return 0
@@ -215,6 +227,22 @@ class Splitter:
    def information_gain(
        self, labels: np.array, labels_up: np.array, labels_dn: np.array
    ) -> float:
+        """Compute information gain of a split candidate
+
+        Parameters
+        ----------
+        labels : np.array
+            labels of the dataset
+        labels_up : np.array
+            labels of one side
+        labels_dn : np.array
+            labels on the other side
+
+        Returns
+        -------
+        float
+            information gain
+        """
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
@@ -255,6 +283,20 @@ class Splitter:

    @staticmethod
    def _generate_spaces(features: int, max_features: int) -> list:
+        """Generate at most 5 feature random combinations
+
+        Parameters
+        ----------
+        features : int
+            number of features in dataset
+        max_features : int
+            number of features in each combination
+
+        Returns
+        -------
+        list
+            list with up to 5 randomly selected combinations of features
+        """
        comb = set()
        # Generate at most 5 combinations
        if max_features == features:
@@ -273,6 +315,24 @@ class Splitter:
    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> np.array:
+        """Compute the indices of the features selected by splitter depending
+        on the self._splitter_type hyperparameter
+
+        Parameters
+        ----------
+        dataset : np.array
+            array of samples
+        labels : np.array
+            labels of the dataset
+        max_features : int
+            number of features of the subspace
+            (<= number of features in dataset)
+
+        Returns
+        -------
+        np.array
+            indices of the features selected
+        """
        features_sets = self._generate_spaces(dataset.shape[1], max_features)
        if len(features_sets) > 1:
            if self._splitter_type == "random":
@@ -286,19 +346,41 @@ class Splitter:
    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> tuple:
-        """Return the best/random subspace to make a split"""
+        """Return a subspace of the selected dataset of max_features length.
+        Features are chosen according to self._splitter_type
+
+        Parameters
+        ----------
+        dataset : np.array
+            array of samples (# samples, # features)
+        labels : np.array
+            labels of the dataset
+        max_features : int
+            number of features to form the subspace
+
+        Returns
+        -------
+        tuple
+            tuple with the dataset with only the features selected and the
+            indices of the features selected
+        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices

    def _impurity(self, data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

-        :param data: distances to hyper plane of every class
-        :type data: np.array (m, n_classes)
-        :param y: vector of labels (classes)
-        :type y: np.array (m,)
-        :return: column of dataset to be taken into account to split dataset
-        :rtype: int
+        Parameters
+        ----------
+        data : np.array
+            distances to hyper plane of every class
+        y : np.array
+            vector of labels (classes)
+
+        Returns
+        -------
+        np.array
+            column of dataset to be taken into account to split dataset
        """
        max_gain = 0
        selected = -1
@@ -315,12 +397,17 @@ class Splitter:
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """return column of dataset to be taken into account to split dataset

-        :param data: distances to hyper plane of every class
-        :type data: np.array (m, n_classes)
-        :param y: vector of labels (classes)
-        :type y: np.array (m,)
-        :return: column of dataset to be taken into account to split dataset
-        :rtype: int
+        Parameters
+        ----------
+        data : np.array
+            distances to hyper plane of every class
+        y : np.array
+            vector of labels (classes)
+
+        Returns
+        -------
+        np.array
+            column of dataset to be taken into account to split dataset
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
@@ -328,8 +415,7 @@ class Splitter:

    def partition(self, samples: np.array, node: Snode, train: bool):
        """Set the criteria to split arrays. Compute the indices of the samples
-        that should go to one side of the tree (down)
-
+        that should go to one side of the tree (up)
        """
        # data contains the distances of every sample to every class hyperplane
        # array of (m, nc) nc = # classes
@@ -357,15 +443,18 @@ class Splitter:
        self._up = data > 0

    def part(self, origin: np.array) -> list:
-        """Split an array in two based on indices (down) and its complement
-        partition has to be called first to establish down indices
+        """Split an array in two based on indices (self._up) and its complement
+        partition has to be called first to establish up indices

-        :param origin: dataset to split
-        :type origin: np.array
-        :param down: indices to use to split array
-        :type down: np.array
-        :return: list with two splits of the array
-        :rtype: list
+        Parameters
+        ----------
+        origin : np.array
+            dataset to split
+
+        Returns
+        -------
+        list
+            list with two splits of the array
        """
        down = ~self._up
        return [
@@ -377,13 +466,18 @@ class Splitter:
    def _distances(node: Snode, data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

-        :param node: node containing the svm classifier
-        :type node: Snode
-        :param data: samples to find out distance to hyperplane
-        :type data: np.ndarray
-        :return: array of shape (m, nc) with the distances of every sample to
-        the hyperplane of every class. nc = # of classes
-        :rtype: np.array
+        Parameters
+        ----------
+        node : Snode
+            node containing the svm classifier
+        data : np.ndarray
+            samples to compute distance to hyperplane
+
+        Returns
+        -------
+        np.array
+            array of shape (m, nc) with the distances of every sample to
+            the hyperplane of every class. nc = # of classes
        """
        return node._clf.decision_function(data[:, node._features])

@@ -428,6 +522,7 @@ class Stree(BaseEstimator, ClassifierMixin):

    def _more_tags(self) -> dict:
        """Required by sklearn to supply features of the classifier
+        and make the labels array mandatory

        :return: the tag required
        :rtype: dict
@@ -439,16 +534,19 @@ class Stree(BaseEstimator, ClassifierMixin):
    ) -> "Stree":
        """Build the tree based on the dataset of samples and its labels

-        :param X: dataset of samples to make predictions
-        :type X: np.array
-        :param y: samples labels
-        :type y: np.array
-        :param sample_weight: weights of the samples. Rescale C per sample.
-        Hi' weights force the classifier to put more emphasis on these points
-        :type sample_weight: np.array optional
-        :raises ValueError: if parameters C or max_depth are out of bounds
-        :return: itself to be able to chain actions: fit().predict() ...
-        :rtype: Stree
+        Returns
+        -------
+        Stree
+            itself to be able to chain actions: fit().predict() ...
+
+        Raises
+        ------
+        ValueError
+            if C < 0
+        ValueError
+            if max_depth < 1
+        ValueError
+            if all samples have 0 or negative weights
        """
        # Check parameters are Ok.
        if self.C < 0:
@@ -471,6 +569,10 @@ class Stree(BaseEstimator, ClassifierMixin):
        sample_weight = _check_sample_weight(
            sample_weight, X, dtype=np.float64
        )
+        if not any(sample_weight):
+            raise ValueError(
+                "Invalid input - all samples have zero or negative weights."
+            )
        check_classification_targets(y)
        # Initialize computed parameters
        self.splitter_ = Splitter(
@@ -492,6 +594,8 @@ class Stree(BaseEstimator, ClassifierMixin):
        self.max_features_ = self._initialize_max_features()
        self.tree_ = self.train(X, y, sample_weight, 1, "root")
        self._build_predictor()
+        self.X_ = X
+        self.y_ = y
        return self

    def train(
@@ -505,19 +609,23 @@ class Stree(BaseEstimator, ClassifierMixin):
        """Recursive function to split the original dataset into predictor
        nodes (leaves)

-        :param X: samples dataset
-        :type X: np.ndarray
-        :param y: samples labels
-        :type y: np.ndarray
-        :param sample_weight: weight of samples. Rescale C per sample.
-        Hi weights force the classifier to put more emphasis on these points.
-        :type sample_weight: np.ndarray
-        :param depth: actual depth in the tree
-        :type depth: int
-        :param title: description of the node
-        :type title: str
-        :return: binary tree
-        :rtype: Snode
+        Parameters
+        ----------
+        X : np.ndarray
+            samples dataset
+        y : np.ndarray
+            samples labels
+        sample_weight : np.ndarray
+            weight of samples. Rescale C per sample.
+        depth : int
+            current depth in the tree
+        title : str
+            description of the node
+
+        Returns
+        -------
+        Optional[Snode]
+            binary tree
        """
        if depth > self.__max_depth:
            return None
@@ -602,12 +710,17 @@ class Stree(BaseEstimator, ClassifierMixin):
    def _reorder_results(y: np.array, indices: np.array) -> np.array:
        """Reorder an array based on the array of indices passed

-        :param y: data untidy
-        :type y: np.array
-        :param indices: indices used to set order
-        :type indices: np.array
-        :return: array y ordered
-        :rtype: np.array
+        Parameters
+        ----------
+        y : np.array
+            data untidy
+        indices : np.array
+            indices used to set order
+
+        Returns
+        -------
+        np.array
+            array y ordered
        """
        # return array of same type given in y
        y_ordered = y.copy()
@@ -619,10 +732,22 @@ class Stree(BaseEstimator, ClassifierMixin):
    def predict(self, X: np.array) -> np.array:
        """Predict labels for each sample in dataset passed

-        :param X: dataset of samples
-        :type X: np.array
-        :return: array of labels
-        :rtype: np.array
+        Parameters
+        ----------
+        X : np.array
+            dataset of samples
+
+        Returns
+        -------
+        np.array
+            array of labels
+
+        Raises
+        ------
+        ValueError
+            if the dataset has an inconsistent number of features
+        NotFittedError
+            if model is not fitted
        """

        def predict_class(
@@ -664,15 +789,19 @@ class Stree(BaseEstimator, ClassifierMixin):
    ) -> float:
        """Compute accuracy of the prediction

-        :param X: dataset of samples to make predictions
-        :type X: np.array
-        :param y_true: samples labels
-        :type y_true: np.array
-        :param sample_weight: weights of the samples. Rescale C per sample.
-        Hi' weights force the classifier to put more emphasis on these points
-        :type sample_weight: np.array optional
-        :return: accuracy of the prediction
-        :rtype: float
+        Parameters
+        ----------
+        X : np.array
+            dataset of samples to make predictions
+        y : np.array
+            samples labels
+        sample_weight : np.array, optional
+            weights of the samples. Rescale C per sample, by default None
+
+        Returns
+        -------
+        float
+            accuracy of the prediction
        """
        # sklearn check
        check_is_fitted(self)
@@ -689,8 +818,10 @@ class Stree(BaseEstimator, ClassifierMixin):
        """Create an iterator to be able to visit the nodes of the tree in
        preorder, can make a list with all the nodes in preorder

-        :return: an iterator, can for i in... and list(...)
-        :rtype: Siterator
+        Returns
+        -------
+        Siterator
+            an iterator; supports ``for i in ...`` and ``list(...)``
        """
        try:
            tree = self.tree_
@@ -701,8 +832,10 @@ class Stree(BaseEstimator, ClassifierMixin):
    def __str__(self) -> str:
        """String representation of the tree

-        :return: description of nodes in the tree in preorder
-        :rtype: str
+        Returns
+        -------
+        str
+            description of nodes in the tree in preorder
        """
        output = ""
        for i in self:
--- a/stree/tests/Stree_test.py
+++ b/stree/tests/Stree_test.py
@@ -26,8 +26,10 @@ class Stree_test(unittest.TestCase):
        correct number of labels and its sons have the right number of elements
        in their dataset

-        Arguments:
-            node {Snode} -- node to check
+        Parameters
+        ----------
+        node : Snode
+            node to check
        """
        if node.is_leaf():
            return
@@ -320,43 +322,6 @@ class Stree_test(unittest.TestCase):
        with self.assertRaises(ValueError):
            clf.fit(*load_dataset())

-    def test_weights_removing_class(self):
-        # This patch solves an stderr message from sklearn svm lib
-        # "WARNING: class label x specified in weight is not found"
-        X = np.array(
-            [
-                [0.1, 0.1],
-                [0.1, 0.2],
-                [0.2, 0.1],
-                [5, 6],
-                [8, 9],
-                [6, 7],
-                [0.2, 0.2],
-            ]
-        )
-        y = np.array([0, 0, 0, 1, 1, 1, 0])
-        epsilon = 1e-5
-        weights = [1, 1, 1, 0, 0, 0, 1]
-        weights = np.array(weights, dtype="float64")
-        weights_epsilon = [x + epsilon for x in weights]
-        weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
-        original = weights_no_zero.copy()
-        clf = Stree()
-        clf.fit(X, y)
-        node = clf.train(
-            X,
-            y,
-            weights,
-            1,
-            "test",
-        )
-        # if a class is lost with zero weights the patch adds epsilon
-        self.assertListEqual(weights.tolist(), weights_epsilon)
-        self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
-        # zero weights are ok when they don't erase a class
-        _ = clf.train(X, y, weights_no_zero, 1, "test")
-        self.assertListEqual(weights_no_zero.tolist(), original.tolist())
-
    def test_multiclass_classifier_integrity(self):
        """Checks if the multiclass operation is done right"""
        X, y = load_iris(return_X_y=True)
@@ -442,3 +407,45 @@ class Stree_test(unittest.TestCase):
        self.assertEqual(0.9533333333333334, clf.fit(X, y).score(X, y))
        X, y = load_wine(return_X_y=True)
        self.assertEqual(0.9550561797752809, clf.fit(X, y).score(X, y))
+
+    def test_zero_all_sample_weights(self):
+        X, y = load_dataset(self._random_state)
+        with self.assertRaises(ValueError):
+            Stree().fit(X, y, np.zeros(len(y)))
+
+    def test_weights_removing_class(self):
+        # This patch solves an stderr message from sklearn svm lib
+        # "WARNING: class label x specified in weight is not found"
+        X = np.array(
+            [
+                [0.1, 0.1],
+                [0.1, 0.2],
+                [0.2, 0.1],
+                [5, 6],
+                [8, 9],
+                [6, 7],
+                [0.2, 0.2],
+            ]
+        )
+        y = np.array([0, 0, 0, 1, 1, 1, 0])
+        epsilon = 1e-5
+        weights = [1, 1, 1, 0, 0, 0, 1]
+        weights = np.array(weights, dtype="float64")
+        weights_epsilon = [x + epsilon for x in weights]
+        weights_no_zero = np.array([1, 1, 1, 0, 0, 2, 1])
+        original = weights_no_zero.copy()
+        clf = Stree()
+        clf.fit(X, y)
+        node = clf.train(
+            X,
+            y,
+            weights,
+            1,
+            "test",
+        )
+        # if a class is lost with zero weights the patch adds epsilon
+        self.assertListEqual(weights.tolist(), weights_epsilon)
+        self.assertListEqual(node._sample_weight.tolist(), weights_epsilon)
+        # zero weights are ok when they don't erase a class
+        _ = clf.train(X, y, weights_no_zero, 1, "test")
+        self.assertListEqual(weights_no_zero.tolist(), original.tolist())