stree/stree/Strees.py

"""
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020, Ricardo Montañana Gómez"
__license__ = "MIT"
__version__ = "0.9"
Build an oblique tree classifier based on SVM Trees
"""

import os
import numbers
import random
import warnings
from math import log
from itertools import combinations
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import check_consistent_length
from sklearn.utils.multiclass import check_classification_targets
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.validation import (
    check_X_y,
    check_array,
    check_is_fitted,
    _check_sample_weight,
)
from sklearn.metrics._classification import _weighted_sum, _check_targets


class Snode:
    """Nodes of the tree that keeps the svm classifier and if testing the
    dataset assigned to it
    """

    def __init__(
        self,
        clf: SVC,
        X: np.ndarray,
        y: np.ndarray,
        features: np.array,
        impurity: float,
        title: str,
    ):
        self._clf = clf
        self._title = title
        self._belief = 0.0
        # Only store dataset in Testing
        self._X = X if os.environ.get("TESTING", "NS") != "NS" else None
        self._y = y
        self._down = None
        self._up = None
        self._class = None
        self._feature = None
        self._sample_weight = None
        self._features = features
        self._impurity = impurity

    @classmethod
    def copy(cls, node: "Snode") -> "Snode":
        return cls(
            node._clf,
            node._X,
            node._y,
            node._features,
            node._impurity,
            node._title,
        )

    def set_down(self, son):
        self._down = son

    def set_up(self, son):
        self._up = son

    def is_leaf(self) -> bool:
        return self._up is None and self._down is None

    def get_down(self) -> "Snode":
        return self._down

    def get_up(self) -> "Snode":
        return self._up

    def make_predictor(self):
        """Compute the class of the predictor and its belief based on the
        subdataset of the node only if it is a leaf
        """
        if not self.is_leaf():
            return
        classes, card = np.unique(self._y, return_counts=True)
        if len(classes) > 1:
            max_card = max(card)
            min_card = min(card)
            self._class = classes[card == max_card][0]
            self._belief = max_card / (max_card + min_card)
        else:
            self._belief = 1
            try:
                self._class = classes[0]
            except IndexError:
                self._class = None

    def __str__(self) -> str:
        if self.is_leaf():
            count_values = np.unique(self._y, return_counts=True)
            result = (
                f"{self._title} - Leaf class={self._class} belief="
                f"{self._belief: .6f} impurity={self._impurity:.4f} "
                f"counts={count_values}"
            )
            return result
        else:
            return (
                f"{self._title} feaures={self._features} impurity="
                f"{self._impurity:.4f}"
            )


class Siterator:
    """Stree preorder iterator
    """

    def __init__(self, tree: Snode):
        self._stack = []
        self._push(tree)

    def _push(self, node: Snode):
        if node is not None:
            self._stack.append(node)

    def __next__(self) -> Snode:
        if len(self._stack) == 0:
            raise StopIteration()
        node = self._stack.pop()
        self._push(node.get_up())
        self._push(node.get_down())
        return node


class Splitter:
    def __init__(
        self,
        clf: SVC = None,
        criterion: str = None,
        splitter_type: str = None,
        criteria: str = None,
        min_samples_split: int = None,
        random_state=None,
    ):
        self._clf = clf
        self._random_state = random_state
        if random_state is not None:
            random.seed(random_state)
        self._criterion = criterion
        self._min_samples_split = min_samples_split
        self._criteria = criteria
        self._splitter_type = splitter_type

        if clf is None:
            raise ValueError(f"clf has to be a sklearn estimator, got({clf})")

        if criterion not in ["gini", "entropy"]:
            raise ValueError(
                f"criterion must be gini or entropy got({criterion})"
            )

        if criteria not in ["min_distance", "max_samples", "max_distance"]:
            raise ValueError(
                "split_criteria has to be min_distance "
                f"max_distance or max_samples got ({criteria})"
            )

        if splitter_type not in ["random", "best"]:
            raise ValueError(
                f"splitter must be either random or best got({splitter_type})"
            )
        self.criterion_function = getattr(self, f"_{self._criterion}")
        self.decision_criteria = getattr(self, f"_{self._criteria}")

    def impurity(self, y: np.array) -> np.array:
        return self.criterion_function(y)

    @staticmethod
    def _gini(y: np.array) -> float:
        _, count = np.unique(y, return_counts=True)
        return 1 - np.sum(np.square(count / np.sum(count)))

    @staticmethod
    def _entropy(y: np.array) -> float:
        n_labels = len(y)
        if n_labels <= 1:
            return 0
        counts = np.bincount(y)
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            return 0
        entropy = 0.0
        # Compute standard entropy.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log(prop, n_classes)
        return entropy

    def information_gain(
        self, labels: np.array, labels_up: np.array, labels_dn: np.array
    ) -> float:
        imp_prev = self.criterion_function(labels)
        card_up = card_dn = imp_up = imp_dn = 0
        if labels_up is not None:
            card_up = labels_up.shape[0]
            imp_up = self.criterion_function(labels_up)
        if labels_dn is not None:
            card_dn = labels_dn.shape[0] if labels_dn is not None else 0
            imp_dn = self.criterion_function(labels_dn)
        samples = card_up + card_dn
        if samples == 0:
            return 0
        else:
            result = (
                imp_prev
                - (card_up / samples) * imp_up
                - (card_dn / samples) * imp_dn
            )
            return result

    def _select_best_set(
        self, dataset: np.array, labels: np.array, features_sets: list
    ) -> list:
        max_gain = 0
        selected = None
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        for feature_set in features_sets:
            self._clf.fit(dataset[:, feature_set], labels)
            node = Snode(
                self._clf, dataset, labels, feature_set, 0.0, "subset"
            )
            self.partition(dataset, node)
            y1, y2 = self.part(labels)
            gain = self.information_gain(labels, y1, y2)
            if gain > max_gain:
                max_gain = gain
                selected = feature_set

        return selected if selected is not None else feature_set

    def _get_subspaces_set(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> np.array:
        features = range(dataset.shape[1])
        features_sets = list(combinations(features, max_features))
        if len(features_sets) > 1:
            if self._splitter_type == "random":
                index = random.randint(0, len(features_sets) - 1)
                return features_sets[index]
            else:
                return self._select_best_set(dataset, labels, features_sets)
        else:
            return features_sets[0]

    def get_subspace(
        self, dataset: np.array, labels: np.array, max_features: int
    ) -> list:
        """Return the best subspace to make a split
        """
        indices = self._get_subspaces_set(dataset, labels, max_features)
        return dataset[:, indices], indices

    @staticmethod
    def _min_distance(data: np.array, _) -> np.array:
        """Assign class to min distances

        return a vector of classes so partition can separate class 0 from
        the rest of classes, ie. class 0 goes to one splitted node and the
        rest of classes go to the other
        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param _: enable call compat with other measures
        :type _: None
        :return: vector with the class assigned to each sample
        :rtype: np.array shape (m,)
        """
        return np.argmin(data, axis=1)

    @staticmethod
    def _max_distance(data: np.array, _) -> np.array:
        """Assign class to max distances

        return a vector of classes so partition can separate class 0 from
        the rest of classes, ie. class 0 goes to one splitted node and the
        rest of classes go to the other
        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param _: enable call compat with other measures
        :type _: None
        :return: vector with the class assigned to each sample values
        (can be 0, 1, ...)
        :rtype: np.array shape (m,)
        """
        return np.argmax(data, axis=1)

    @staticmethod
    def _max_samples(data: np.array, y: np.array) -> np.array:
        """return distances of the class with more samples

        :param data: distances to hyper plane of every class
        :type data: np.array (m, n_classes)
        :param y: vector of labels (classes)
        :type y: np.array (m,)
        :return: vector with distances to hyperplane (can be positive or neg.)
        :rtype: np.array shape (m,)
        """
        # select the class with max number of samples
        _, samples = np.unique(y, return_counts=True)
        selected = np.argmax(samples)
        return data[:, selected]

    def partition(self, samples: np.array, node: Snode):
        """Set the criteria to split arrays. Compute the indices of the samples
        that should go to one side of the tree (down)

        """
        data = self._distances(node, samples)
        if data.shape[0] < self._min_samples_split:
            self._down = np.ones((data.shape[0]), dtype=bool)
            return
        if data.ndim > 1:
            # split criteria for multiclass
            data = self.decision_criteria(data, node._y)
        self._down = data > 0

    @staticmethod
    def _distances(node: Snode, data: np.ndarray) -> np.array:
        """Compute distances of the samples to the hyperplane of the node

        :param node: node containing the svm classifier
        :type node: Snode
        :param data: samples to find out distance to hyperplane
        :type data: np.ndarray
        :return: array of shape (m, 1) with the distances of every sample to
        the hyperplane of the node
        :rtype: np.array
        """
        return node._clf.decision_function(data[:, node._features])

    def part(self, origin: np.array) -> list:
        """Split an array in two based on indices (down) and its complement

        :param origin: dataset to split
        :type origin: np.array
        :param down: indices to use to split array
        :type down: np.array
        :return: list with two splits of the array
        :rtype: list
        """
        up = ~self._down
        return [
            origin[up] if any(up) else None,
            origin[self._down] if any(self._down) else None,
        ]


class Stree(BaseEstimator, ClassifierMixin):
    """Estimator that is based on binary trees of svm nodes
    can deal with sample_weights in predict, used in boosting sklearn methods
    inheriting from BaseEstimator implements get_params and set_params methods
    inheriting from ClassifierMixin implement the attribute _estimator_type
    with "classifier" as value
    """

    def __init__(
        self,
        C: float = 1.0,
        kernel: str = "linear",
        max_iter: int = 1000,
        random_state: int = None,
        max_depth: int = None,
        tol: float = 1e-4,
        degree: int = 3,
        gamma="scale",
        split_criteria: str = "max_samples",
        criterion: str = "gini",
        min_samples_split: int = 0,
        max_features=None,
        splitter: str = "random",
    ):
        self.max_iter = max_iter
        self.C = C
        self.kernel = kernel
        self.random_state = random_state
        self.max_depth = max_depth
        self.tol = tol
        self.gamma = gamma
        self.degree = degree
        self.min_samples_split = min_samples_split
        self.split_criteria = split_criteria
        self.max_features = max_features
        self.criterion = criterion
        self.splitter = splitter

    def _more_tags(self) -> dict:
        """Required by sklearn to supply features of the classifier

        :return: the tag required
        :rtype: dict
        """
        return {"requires_y": True}

    def fit(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
    ) -> "Stree":
        """Build the tree based on the dataset of samples and its labels

        :param X: dataset of samples to make predictions
        :type X: np.array
        :param y: samples labels
        :type y: np.array
        :param sample_weight: weights of the samples. Rescale C per sample.
        Hi' weights force the classifier to put more emphasis on these points
        :type sample_weight: np.array optional
        :raises ValueError: if parameters C or max_depth are out of bounds
        :return: itself to be able to chain actions: fit().predict() ...
        :rtype: Stree
        """
        # Check parameters are Ok.
        if self.C < 0:
            raise ValueError(
                f"Penalty term must be positive... got (C={self.C:f})"
            )
        self.__max_depth = (
            np.iinfo(np.int32).max
            if self.max_depth is None
            else self.max_depth
        )
        if self.__max_depth < 1:
            raise ValueError(
                f"Maximum depth has to be greater than 1... got (max_depth=\
                    {self.max_depth})"
            )

        check_classification_targets(y)
        X, y = check_X_y(X, y)
        sample_weight = _check_sample_weight(sample_weight, X)
        check_classification_targets(y)
        # Initialize computed parameters
        self.splitter_ = Splitter(
            clf=self._build_clf(),
            criterion=self.criterion,
            splitter_type=self.splitter,
            criteria=self.split_criteria,
            random_state=self.random_state,
            min_samples_split=self.min_samples_split,
        )
        if self.random_state is not None:
            random.seed(self.random_state)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = self.classes_.shape[0]
        self.n_iter_ = self.max_iter
        self.depth_ = 0
        self.n_features_ = X.shape[1]
        self.n_features_in_ = X.shape[1]
        self.max_features_ = self._initialize_max_features()
        self.tree_ = self.train(X, y, sample_weight, 1, "root")
        self._build_predictor()
        return self

    def train(
        self,
        X: np.ndarray,
        y: np.ndarray,
        sample_weight: np.ndarray,
        depth: int,
        title: str,
    ) -> Snode:
        """Recursive function to split the original dataset into predictor
        nodes (leaves)

        :param X: samples dataset
        :type X: np.ndarray
        :param y: samples labels
        :type y: np.ndarray
        :param sample_weight: weight of samples. Rescale C per sample.
        Hi weights force the classifier to put more emphasis on these points.
        :type sample_weight: np.ndarray
        :param depth: actual depth in the tree
        :type depth: int
        :param title: description of the node
        :type title: str
        :return: binary tree
        :rtype: Snode
        """
        if depth > self.__max_depth:
            return None
        if np.unique(y).shape[0] == 1:
            # only 1 class => pure dataset
            return Snode(
                clf=None,
                X=X,
                y=y,
                features=X.shape[1],
                impurity=0.0,
                title=title + ", <pure>",
            )
        # Train the model
        clf = self._build_clf()
        Xs, features = self.splitter_.get_subspace(X, y, self.max_features_)
        clf.fit(Xs, y, sample_weight=sample_weight)
        impurity = self.splitter_.impurity(y)
        node = Snode(clf, X, y, features, impurity, title)
        self.depth_ = max(depth, self.depth_)
        self.splitter_.partition(X, node)
        X_U, X_D = self.splitter_.part(X)
        y_u, y_d = self.splitter_.part(y)
        sw_u, sw_d = self.splitter_.part(sample_weight)
        if X_U is None or X_D is None:
            # didn't part anything
            return Snode(
                clf,
                X,
                y,
                features=X.shape[1],
                impurity=impurity,
                title=title + ", <cgaf>",
            )
        node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
        node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
        return node

    def _build_predictor(self):
        """Process the leaves to make them predictors
        """

        def run_tree(node: Snode):
            if node.is_leaf():
                node.make_predictor()
                return
            run_tree(node.get_down())
            run_tree(node.get_up())

        run_tree(self.tree_)

    def _build_clf(self):
        """ Build the correct classifier for the node
        """
        return (
            LinearSVC(
                max_iter=self.max_iter,
                random_state=self.random_state,
                C=self.C,
                tol=self.tol,
            )
            if self.kernel == "linear"
            else SVC(
                kernel=self.kernel,
                max_iter=self.max_iter,
                tol=self.tol,
                C=self.C,
                gamma=self.gamma,
                degree=self.degree,
            )
        )

    @staticmethod
    def _reorder_results(y: np.array, indices: np.array) -> np.array:
        """Reorder an array based on the array of indices passed

        :param y: data untidy
        :type y: np.array
        :param indices: indices used to set order
        :type indices: np.array
        :return: array y ordered
        :rtype: np.array
        """
        # return array of same type given in y
        y_ordered = y.copy()
        indices = indices.astype(int)
        for i, index in enumerate(indices):
            y_ordered[index] = y[i]
        return y_ordered

    def predict(self, X: np.array) -> np.array:
        """Predict labels for each sample in dataset passed

        :param X: dataset of samples
        :type X: np.array
        :return: array of labels
        :rtype: np.array
        """

        def predict_class(
            xp: np.array, indices: np.array, node: Snode
        ) -> np.array:
            if xp is None:
                return [], []
            if node.is_leaf():
                # set a class for every sample in dataset
                prediction = np.full((xp.shape[0], 1), node._class)
                return prediction, indices
            self.splitter_.partition(xp, node)
            x_u, x_d = self.splitter_.part(xp)
            i_u, i_d = self.splitter_.part(indices)
            prx_u, prin_u = predict_class(x_u, i_u, node.get_up())
            prx_d, prin_d = predict_class(x_d, i_d, node.get_down())
            return np.append(prx_u, prx_d), np.append(prin_u, prin_d)

        # sklearn check
        check_is_fitted(self, ["tree_"])
        # Input validation
        X = check_array(X)
        if X.shape[1] != self.n_features_:
            raise ValueError(
                f"Expected {self.n_features_} features but got "
                f"({X.shape[1]})"
            )
        # setup prediction & make it happen
        indices = np.arange(X.shape[0])
        result = (
            self._reorder_results(*predict_class(X, indices, self.tree_))
            .astype(int)
            .ravel()
        )
        return self.classes_[result]

    def score(
        self, X: np.array, y: np.array, sample_weight: np.array = None
    ) -> float:
        """Compute accuracy of the prediction

        :param X: dataset of samples to make predictions
        :type X: np.array
        :param y_true: samples labels
        :type y_true: np.array
        :param sample_weight: weights of the samples. Rescale C per sample.
        Hi' weights force the classifier to put more emphasis on these points
        :type sample_weight: np.array optional
        :return: accuracy of the prediction
        :rtype: float
        """
        # sklearn check
        check_is_fitted(self)
        check_classification_targets(y)
        X, y = check_X_y(X, y)
        y_pred = self.predict(X).reshape(y.shape)
        # Compute accuracy for each possible representation
        _, y_true, y_pred = _check_targets(y, y_pred)
        check_consistent_length(y_true, y_pred, sample_weight)
        score = y_true == y_pred
        return _weighted_sum(score, sample_weight, normalize=True)

    def __iter__(self) -> Siterator:
        """Create an iterator to be able to visit the nodes of the tree in
        preorder, can make a list with all the nodes in preorder

        :return: an iterator, can for i in... and list(...)
        :rtype: Siterator
        """
        try:
            tree = self.tree_
        except AttributeError:
            tree = None
        return Siterator(tree)

    def __str__(self) -> str:
        """String representation of the tree

        :return: description of nodes in the tree in preorder
        :rtype: str
        """
        output = ""
        for i in self:
            output += str(i) + "\n"
        return output

    def _initialize_max_features(self) -> int:
        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    "Invalid value for max_features. "
                    "Allowed string values are 'auto', "
                    "'sqrt' or 'log2'."
                )
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(
                    1, int(self.max_features * self.n_features_)
                )
            else:
                raise ValueError(
                    "Invalid value for max_features."
                    "Allowed float must be in range (0, 1] "
                    f"got ({self.max_features})"
                )
        return max_features