diff --git a/notebooks/benchmark.ipynb b/notebooks/benchmark.ipynb index 76901aa..9e19ff0 100644 --- a/notebooks/benchmark.ipynb +++ b/notebooks/benchmark.ipynb @@ -68,9 +68,11 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "2020-05-23 19:42:08\n" + "output_type": "stream", + "text": [ + "2020-06-14 23:45:42\n" + ] } ], "source": [ @@ -102,9 +104,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.173% 492\nValid: 99.827% 284,315\n" + "output_type": "stream", + "text": [ + "Fraud: 0.173% 492\n", + "Valid: 99.827% 284,315\n" + ] } ], "source": [ @@ -130,9 +135,12 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "X shape: (284807, 29)\ny shape: (284807,)\n" + "output_type": "stream", + "text": [ + "X shape: (284807, 29)\n", + "y shape: (284807,)\n" + ] } ], "source": [ @@ -248,9 +256,168 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "************************** Linear Tree **********************\nTrain Model Linear Tree took: 16.99 seconds\n=========== Linear Tree - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Linear Tree - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999578 0.999613 0.999596 85295\n 1 0.772414 0.756757 0.764505 148\n\n accuracy 0.999192 85443\n macro avg 0.885996 0.878185 0.882050 85443\nweighted avg 0.999184 0.999192 0.999188 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85262 33]\n [ 36 112]]\n************************** Random Forest **********************\nTrain Model Random Forest took: 175.7 seconds\n=========== Random Forest - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 1.000000 1.000000 1.000000 199020\n 1 1.000000 1.000000 1.000000 344\n\n accuracy 1.000000 199364\n macro avg 1.000000 1.000000 1.000000 199364\nweighted avg 1.000000 1.000000 1.000000 199364\n\n=========== Random Forest - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999660 0.999965 0.999812 85295\n 1 0.975410 0.804054 0.881481 148\n\n accuracy 0.999625 85443\n macro avg 0.987535 0.902009 0.940647 85443\nweighted avg 0.999618 0.999625 0.999607 85443\n\nConfusion Matrix in Train\n[[199020 0]\n [ 0 344]]\nConfusion Matrix in Test\n[[85292 3]\n [ 29 119]]\n************************** Stree (SVM Tree) **********************\nTrain Model Stree (SVM Tree) took: 39.64 seconds\n=========== Stree (SVM Tree) - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999869 0.999741 199020\n 1 0.911263 0.776163 0.838305 344\n\n accuracy 0.999483 199364\n macro avg 0.955438 0.888016 0.919023 199364\nweighted avg 0.999461 0.999483 0.999463 199364\n\n=========== Stree (SVM Tree) - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999613 0.999883 0.999748 85295\n 1 0.920000 0.777027 0.842491 148\n\n accuracy 0.999497 85443\n macro avg 0.959807 0.888455 0.921119 85443\nweighted avg 0.999475 0.999497 0.999476 85443\n\nConfusion Matrix in Train\n[[198994 26]\n [ 77 267]]\nConfusion Matrix in Test\n[[85285 10]\n [ 33 115]]\n************************** AdaBoost model **********************\nTrain Model AdaBoost model 
took: 48.29 seconds\n=========== AdaBoost model - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999392 0.999678 0.999535 199020\n 1 0.777003 0.648256 0.706815 344\n\n accuracy 0.999072 199364\n macro avg 0.888198 0.823967 0.853175 199364\nweighted avg 0.999008 0.999072 0.999030 199364\n\n=========== AdaBoost model - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.999484 0.999707 0.999596 85295\n 1 0.806202 0.702703 0.750903 148\n\n accuracy 0.999192 85443\n macro avg 0.902843 0.851205 0.875249 85443\nweighted avg 0.999149 0.999192 0.999165 85443\n\nConfusion Matrix in Train\n[[198956 64]\n [ 121 223]]\nConfusion Matrix in Test\n[[85270 25]\n [ 44 104]]\n************************** Gradient Boost. **********************\nTrain Model Gradient Boost. took: 251.6 seconds\n=========== Gradient Boost. - Train 199,364 samples =============\n precision recall f1-score support\n\n 0 0.999096 0.999854 0.999475 199020\n 1 0.849741 0.476744 0.610801 344\n\n accuracy 0.998952 199364\n macro avg 0.924419 0.738299 0.805138 199364\nweighted avg 0.998839 0.998952 0.998804 199364\n\n=========== Gradient Boost. - Test 85,443 samples =============\n precision recall f1-score support\n\n 0 0.998981 0.999730 0.999355 85295\n 1 0.726190 0.412162 0.525862 148\n\n accuracy 0.998713 85443\n macro avg 0.862586 0.705946 0.762609 85443\nweighted avg 0.998508 0.998713 0.998535 85443\n\nConfusion Matrix in Train\n[[198991 29]\n [ 180 164]]\nConfusion Matrix in Test\n[[85272 23]\n [ 87 61]]\n" + "output_type": "stream", + "text": [ + "************************** Linear Tree **********************\n", + "Train Model Linear Tree took: 13.52 seconds\n", + "=========== Linear Tree - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Linear Tree - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999578 0.999613 0.999596 85295\n", + " 1 0.772414 0.756757 0.764505 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.885996 0.878185 0.882050 85443\n", + "weighted avg 0.999184 0.999192 0.999188 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85262 33]\n", + " [ 36 112]]\n", + "************************** Random Forest **********************\n", + "Train Model Random Forest took: 152.5 seconds\n", + "=========== Random Forest - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 1.000000 1.000000 1.000000 199020\n", + " 1 1.000000 1.000000 1.000000 344\n", + "\n", + " accuracy 1.000000 199364\n", + " macro avg 1.000000 1.000000 1.000000 199364\n", + "weighted avg 1.000000 1.000000 1.000000 199364\n", + "\n", + "=========== Random Forest - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999660 0.999965 0.999812 85295\n", + " 1 0.975410 0.804054 0.881481 148\n", + "\n", + " accuracy 0.999625 85443\n", + " macro avg 0.987535 0.902009 0.940647 85443\n", + "weighted avg 0.999618 0.999625 0.999607 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[199020 0]\n", + " [ 0 344]]\n", + "Confusion Matrix in Test\n", + "[[85292 3]\n", + " [ 29 119]]\n", + 
"************************** Stree (SVM Tree) **********************\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n", + "/Users/rmontanana/.virtualenvs/general/lib/python3.7/site-packages/sklearn/svm/_base.py:977: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train Model Stree (SVM Tree) took: 32.55 seconds\n", + "=========== Stree (SVM Tree) - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999623 0.999864 0.999744 199020\n", + " 1 0.908784 0.781977 0.840625 344\n", + "\n", + " accuracy 0.999488 199364\n", + " macro avg 0.954204 0.890921 0.920184 199364\n", + "weighted avg 0.999467 0.999488 0.999469 199364\n", + "\n", + "=========== Stree (SVM Tree) - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999637 0.999918 0.999777 85295\n", + " 1 0.943548 0.790541 0.860294 148\n", + "\n", + " accuracy 0.999555 85443\n", + " macro avg 0.971593 0.895229 0.930036 85443\n", + "weighted avg 0.999540 0.999555 0.999536 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198993 27]\n", + " [ 75 269]]\n", + "Confusion Matrix in Test\n", + "[[85288 7]\n", + " [ 31 117]]\n", + "************************** AdaBoost model **********************\n", + "Train Model AdaBoost model took: 47.34 seconds\n", + "=========== AdaBoost model - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999392 0.999678 0.999535 199020\n", + " 1 0.777003 0.648256 0.706815 344\n", + "\n", + " accuracy 0.999072 199364\n", + " macro avg 0.888198 0.823967 0.853175 199364\n", + "weighted avg 0.999008 0.999072 0.999030 199364\n", + "\n", + "=========== AdaBoost model - Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999484 0.999707 0.999596 85295\n", + " 1 0.806202 0.702703 0.750903 148\n", + "\n", + " accuracy 0.999192 85443\n", + " macro avg 0.902843 0.851205 0.875249 85443\n", + "weighted avg 0.999149 0.999192 0.999165 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198956 64]\n", + " [ 121 223]]\n", + "Confusion Matrix in Test\n", + "[[85270 25]\n", + " [ 44 104]]\n", + "************************** Gradient Boost. **********************\n", + "Train Model Gradient Boost. took: 244.1 seconds\n", + "=========== Gradient Boost. - Train 199,364 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.999096 0.999854 0.999475 199020\n", + " 1 0.849741 0.476744 0.610801 344\n", + "\n", + " accuracy 0.998952 199364\n", + " macro avg 0.924419 0.738299 0.805138 199364\n", + "weighted avg 0.998839 0.998952 0.998804 199364\n", + "\n", + "=========== Gradient Boost. 
- Test 85,443 samples =============\n", + " precision recall f1-score support\n", + "\n", + " 0 0.998981 0.999730 0.999355 85295\n", + " 1 0.726190 0.412162 0.525862 148\n", + "\n", + " accuracy 0.998713 85443\n", + " macro avg 0.862586 0.705946 0.762609 85443\n", + "weighted avg 0.998508 0.998713 0.998535 85443\n", + "\n", + "Confusion Matrix in Train\n", + "[[198991 29]\n", + " [ 180 164]]\n", + "Confusion Matrix in Test\n", + "[[85272 23]\n", + " [ 87 61]]\n" + ] } ], "source": [ @@ -277,9 +444,18 @@ "metadata": {}, "outputs": [ { - "output_type": "stream", "name": "stdout", - "text": "**************************************************************************************************************\n*The best f1 model is Random Forest, with a f1 score: 0.8815 in 175.717 seconds with 0.7 samples in train dataset\n**************************************************************************************************************\nModel: Linear Tree\t Time: 16.99 seconds\t f1: 0.7645\nModel: Random Forest\t Time: 175.72 seconds\t f1: 0.8815\nModel: Stree (SVM Tree)\t Time: 39.64 seconds\t f1: 0.8425\nModel: AdaBoost model\t Time: 48.29 seconds\t f1: 0.7509\nModel: Gradient Boost.\t Time: 251.58 seconds\t f1: 0.5259\n" + "output_type": "stream", + "text": [ + "**************************************************************************************************************\n", + "*The best f1 model is Random Forest, with a f1 score: 0.8815 in 152.54 seconds with 0.7 samples in train dataset\n", + "**************************************************************************************************************\n", + "Model: Linear Tree\t Time: 13.52 seconds\t f1: 0.7645\n", + "Model: Random Forest\t Time: 152.54 seconds\t f1: 0.8815\n", + "Model: Stree (SVM Tree)\t Time: 32.55 seconds\t f1: 0.8603\n", + "Model: AdaBoost model\t Time: 47.34 seconds\t f1: 0.7509\n", + "Model: Gradient Boost.\t Time: 244.12 seconds\t f1: 0.5259\n" + ] } ], "source": [ @@ -325,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6-final" + "version": "3.7.6" }, "toc": { "base_numbering": 1, @@ -379,4 +555,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/stree/Strees.py b/stree/Strees.py index 1a37b4e..e2e33c3 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -9,12 +9,14 @@ Build an oblique tree classifier based on SVM Trees import os import numbers import random +import warnings from itertools import combinations import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import SVC, LinearSVC from sklearn.utils import check_consistent_length from sklearn.utils.multiclass import check_classification_targets +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.validation import ( check_X_y, check_array, @@ -134,6 +136,168 @@ class Siterator: return node +class Splitter: + def __init__( + self, + clf: SVC = None, + criterion: str = None, + splitter_type: str = None, + criteria: str = None, + min_samples_split: int = None, + random_state=None, + ): + self._clf = clf + self._random_state = random_state + if random_state is not None: + random.seed(random_state) + self._criterion = criterion + self._min_samples_split = min_samples_split + self._criteria = criteria + self._splitter_type = splitter_type + + if clf is None: + raise ValueError(f"clf has to be a sklearn estimator, got({clf})") + + if criterion not in ["gini", "entropy"]: + raise ValueError( + f"criterion must be gini or 
entropy got({criterion})" + ) + + if criteria not in ["min_distance", "max_samples"]: + raise ValueError( + f"split_criteria has to be min_distance or \ + max_samples got ({criteria})" + ) + + if splitter_type not in ["random", "best"]: + raise ValueError( + f"splitter must be either random or best got({splitter_type})" + ) + self.criterion_function = getattr(self, f"_{self._criterion}") + self.decision_criteria = getattr(self, f"_{self._criteria}") + + def impurity(self, y: np.array) -> np.array: + return self.criterion_function(y) + + @staticmethod + def _gini(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + return 1 - np.sum(np.square(count / np.sum(count))) + + @staticmethod + def _entropy(y: np.array) -> float: + _, count = np.unique(y, return_counts=True) + proportion = count / np.sum(count) + return -np.sum(proportion * np.log2(proportion)) + + def information_gain( + self, labels_up: np.array, labels_dn: np.array + ) -> float: + card_up = labels_up.shape[0] + card_dn = labels_dn.shape[0] + samples = card_up + card_dn + up = card_up / samples * self.criterion_function(labels_up) + dn = card_dn / samples * self.criterion_function(labels_dn) + return up + dn + + def _select_best_set( + self, dataset: np.array, labels: np.array, features_sets: list + ) -> list: + min_impurity = 1 + selected = None + warnings.filterwarnings("ignore", category=ConvergenceWarning) + for feature_set in features_sets: + self._clf.fit(dataset[:, feature_set], labels) + node = Snode( + self._clf, dataset, labels, feature_set, 0.0, "subset" + ) + self.partition(dataset, node) + y1, y2 = self.part(labels) + impurity = self.information_gain(y1, y2) + if impurity < min_impurity: + min_impurity = impurity + selected = feature_set + return selected + + def _get_subspaces_set( + self, dataset: np.array, labels: np.array, max_features: int + ) -> np.array: + features = range(dataset.shape[1]) + features_sets = list(combinations(features, max_features)) + if len(features_sets) > 1: + if self._splitter_type == "random": + return features_sets[random.randint(0, len(features_sets) - 1)] + else: + return self._select_best_set(dataset, labels, features_sets) + else: + return features_sets[0] + + def get_subspace( + self, dataset: np.array, labels: np.array, max_features: int + ) -> list: + """Return the best subspace to make a split + """ + indices = self._get_subspaces_set(dataset, labels, max_features) + return dataset[:, indices], indices + + @staticmethod + def _min_distance(data: np.array, _) -> np.array: + # chooses the lowest distance of every sample + indices = np.argmin(np.abs(data), axis=1) + return np.array( + [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)] + ) + + @staticmethod + def _max_samples(data: np.array, y: np.array) -> np.array: + # select the class with max number of samples + _, samples = np.unique(y, return_counts=True) + selected = np.argmax(samples) + return data[:, selected] + + def partition(self, samples: np.array, node: Snode): + """Set the criteria to split arrays + + """ + data = self._distances(node, samples) + if data.shape[0] < self._min_samples_split: + self._down = np.ones((data.shape[0]), dtype=bool) + return + if data.ndim > 1: + # split criteria for multiclass + data = self.decision_criteria(data, node._y) + self._down = data > 0 + + def _distances(self, node: Snode, data: np.ndarray) -> np.array: + """Compute distances of the samples to the hyperplane of the node + + :param node: node containing the svm classifier + :type node: Snode + :param 
data: samples to find out distance to hyperplane + :type data: np.ndarray + :return: array of shape (m, 1) with the distances of every sample to + the hyperplane of the node + :rtype: np.array + """ + return node._clf.decision_function(data[:, node._features]) + + def part(self, origin: np.array) -> list: + """Split an array in two based on indices (down) and its complement + + :param origin: dataset to split + :type origin: np.array + :param down: indices to use to split array + :type down: np.array + :return: list with two splits of the array + :rtype: list + """ + up = ~self._down + return [ + origin[up] if any(up) else None, + origin[self._down] if any(self._down) else None, + ] + + class Stree(BaseEstimator, ClassifierMixin): """Estimator that is based on binary trees of svm nodes can deal with sample_weights in predict, used in boosting sklearn methods @@ -156,6 +320,7 @@ class Stree(BaseEstimator, ClassifierMixin): criterion: str = "gini", min_samples_split: int = 0, max_features=None, + splitter: str = "random", ): self.max_iter = max_iter self.C = C @@ -169,6 +334,7 @@ class Stree(BaseEstimator, ClassifierMixin): self.split_criteria = split_criteria self.max_features = max_features self.criterion = criterion + self.splitter = splitter def _more_tags(self) -> dict: """Required by sklearn to supply features of the classifier @@ -178,68 +344,6 @@ class Stree(BaseEstimator, ClassifierMixin): """ return {"requires_y": True} - def _split_array(self, origin: np.array, down: np.array) -> list: - """Split an array in two based on indices (down) and its complement - - :param origin: dataset to split - :type origin: np.array - :param down: indices to use to split array - :type down: np.array - :return: list with two splits of the array - :rtype: list - """ - up = ~down - return [ - origin[up] if any(up) else None, - origin[down] if any(down) else None, - ] - - def _distances(self, node: Snode, data: np.ndarray) -> np.array: - """Compute distances of the samples to the hyperplane of the node - - :param node: node containing the svm classifier - :type node: Snode - :param data: samples to find out distance to hyperplane - :type data: np.ndarray - :return: array of shape (m, 1) with the distances of every sample to - the hyperplane of the node - :rtype: np.array - """ - return node._clf.decision_function(data[:, node._features]) - - def _min_distance(self, data: np.array, _) -> np.array: - # chooses the lowest distance of every sample - indices = np.argmin(np.abs(data), axis=1) - return np.array( - [data[x, y] for x, y in zip(range(len(data[:, 0])), indices)] - ) - - def _max_samples(self, data: np.array, y: np.array) -> np.array: - # select the class with max number of samples - _, samples = np.unique(y, return_counts=True) - selected = np.argmax(samples) - return data[:, selected] - - def _split_criteria(self, data: np.array, node: Snode) -> np.array: - """Set the criteria to split arrays - - :param data: distances of samples to hyperplanes shape (m, nclasses) - if nclasses > 2 else (m,) - :type data: np.array - :param node: node containing the svm classifier - :type node: Snode - :return: array of booleans of samples under or above zero - :rtype: np.array - """ - - if data.shape[0] < self.min_samples_split: - return np.ones((data.shape[0]), dtype=bool) - if data.ndim > 1: - # split criteria for multiclass - data = getattr(self, f"_{self.split_criteria}")(data, node._y) - res = data > 0 - return res - def fit( self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None ) -> "Stree": @@ 
-271,21 +375,20 @@ class Stree(BaseEstimator, ClassifierMixin): f"Maximum depth has to be greater than 1... got (max_depth=\ {self.max_depth})" ) - if self.split_criteria not in ["min_distance", "max_samples"]: - raise ValueError( - f"split_criteria has to be min_distance or \ - max_samples got ({self.split_criteria})" - ) - if self.criterion not in ["gini", "entropy"]: - raise ValueError( - f"criterion must be gini or entropy got({self.criterion})" - ) check_classification_targets(y) X, y = check_X_y(X, y) sample_weight = _check_sample_weight(sample_weight, X) check_classification_targets(y) # Initialize computed parameters + self.splitter_ = Splitter( + clf=self._build_clf(), + criterion=self.criterion, + splitter_type=self.splitter, + criteria=self.split_criteria, + random_state=self.random_state, + min_samples_split=self.min_samples_split, + ) if self.random_state is not None: random.seed(self.random_state) self.classes_, y = np.unique(y, return_inverse=True) @@ -295,7 +398,6 @@ class Stree(BaseEstimator, ClassifierMixin): self.n_features_ = X.shape[1] self.n_features_in_ = X.shape[1] self.max_features_ = self._initialize_max_features() - self.criterion_function_ = getattr(self, f"_{self.criterion}") self.tree_ = self.train(X, y, sample_weight, 1, "root") self._build_predictor() return self @@ -339,15 +441,15 @@ class Stree(BaseEstimator, ClassifierMixin): ) # Train the model clf = self._build_clf() - Xs, features = self._get_subspace(X) + Xs, features = self.splitter_.get_subspace(X, y, self.max_features_) clf.fit(Xs, y, sample_weight=sample_weight) - impurity = self.criterion_function_(y) + impurity = self.splitter_.impurity(y) node = Snode(clf, X, y, features, impurity, title) self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(node, X), node) - X_U, X_D = self._split_array(X, down) - y_u, y_d = self._split_array(y, down) - sw_u, sw_d = self._split_array(sample_weight, down) + self.splitter_.partition(X, node) + X_U, X_D = self.splitter_.part(X) + y_u, y_d = self.splitter_.part(y) + sw_u, sw_d = self.splitter_.part(sample_weight) if X_U is None or X_D is None: # didn't part anything return Snode( @@ -431,9 +533,9 @@ class Stree(BaseEstimator, ClassifierMixin): # set a class for every sample in dataset prediction = np.full((xp.shape[0], 1), node._class) return prediction, indices - down = self._split_criteria(self._distances(node, xp), node) - x_u, x_d = self._split_array(xp, down) - i_u, i_d = self._split_array(indices, down) + self.splitter_.partition(xp, node) + x_u, x_d = self.splitter_.part(xp) + i_u, i_d = self.splitter_.part(indices) prx_u, prin_u = predict_class(x_u, i_u, node.get_up()) prx_d, prin_d = predict_class(x_d, i_d, node.get_down()) return np.append(prx_u, prx_d), np.append(prin_u, prin_d) @@ -536,29 +638,3 @@ class Stree(BaseEstimator, ClassifierMixin): f"got ({self.max_features})" ) return max_features - - @staticmethod - def _gini(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - return 1 - np.sum(np.square(count / np.sum(count))) - - @staticmethod - def _entropy(y: np.array) -> float: - _, count = np.unique(y, return_counts=True) - proportion = count / np.sum(count) - return -np.sum(proportion * np.log2(proportion)) - - def _get_subspace(self, dataset: np.array) -> list: - """Return the best subspace to make a split - """ - - def get_subspaces_set(dataset: np.array) -> np.array: - features = range(dataset.shape[1]) - features_sets = list(combinations(features, self.max_features_)) - if len(features_sets) > 1: 
- return features_sets[random.randint(0, len(features_sets) - 1)] - else: - return features_sets[0] - - indices = get_subspaces_set(dataset) - return dataset[:, indices], indices diff --git a/stree/__init__.py b/stree/__init__.py index 03b8a2c..6768b82 100644 --- a/stree/__init__.py +++ b/stree/__init__.py @@ -1,3 +1,3 @@ -from .Strees import Stree, Snode, Siterator +from .Strees import Stree, Snode, Siterator, Splitter -__all__ = ["Stree", "Snode", "Siterator"] +__all__ = ["Stree", "Snode", "Siterator", "Splitter"] diff --git a/stree/tests/Splitter_test.py b/stree/tests/Splitter_test.py new file mode 100644 index 0000000..b620ce1 --- /dev/null +++ b/stree/tests/Splitter_test.py @@ -0,0 +1,142 @@ +import os +import unittest + +import numpy as np +from sklearn.svm import LinearSVC + +from stree import Splitter +from .utils import load_dataset + + +class Splitter_test(unittest.TestCase): + def __init__(self, *args, **kwargs): + self._random_state = 1 + super().__init__(*args, **kwargs) + + def build( + self, + clf=LinearSVC(), + min_samples_split=0, + splitter_type="random", + criterion="gini", + criteria="min_distance", + random_state=None, + ): + return Splitter( + clf=clf, + min_samples_split=min_samples_split, + splitter_type=splitter_type, + criterion=criterion, + criteria=criteria, + random_state=random_state, + ) + + @classmethod + def setUp(cls): + os.environ["TESTING"] = "1" + + def test_init(self): + with self.assertRaises(ValueError): + self.build(criterion="duck") + with self.assertRaises(ValueError): + self.build(splitter_type="duck") + with self.assertRaises(ValueError): + self.build(criteria="duck") + with self.assertRaises(ValueError): + self.build(clf=None) + for splitter_type in ["best", "random"]: + for criterion in ["gini", "entropy"]: + for criteria in ["min_distance", "max_samples"]: + tcl = self.build( + splitter_type=splitter_type, + criterion=criterion, + criteria=criteria, + ) + self.assertEqual(splitter_type, tcl._splitter_type) + self.assertEqual(criterion, tcl._criterion) + self.assertEqual(criteria, tcl._criteria) + + def test_gini(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.48 + self.assertEqual(expected, Splitter._gini(y)) + tcl = self.build(criterion="gini") + self.assertEqual(expected, tcl.criterion_function(y)) + + def test_entropy(self): + y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] + expected = 0.9709505944546686 + self.assertAlmostEqual(expected, Splitter._entropy(y)) + tcl = self.build(criterion="entropy") + self.assertEqual(expected, tcl.criterion_function(y)) + + def test_information_gain(self): + yu = np.array([0, 1, 1, 1, 1, 1]) + yd = np.array([0, 0, 0, 1]) + values_expected = [ + ("gini", 0.31666666666666665), + ("entropy", 0.7145247027726656), + ] + for criterion, expected in values_expected: + tcl = self.build(criterion=criterion) + computed = tcl.information_gain(yu, yd) + self.assertAlmostEqual(expected, computed) + + def test_max_samples(self): + tcl = self.build(criteria="max_samples") + data = np.array( + [ + [-0.1, 0.2, -0.3], + [0.7, 0.01, -0.1], + [0.7, -0.9, 0.5], + [0.1, 0.2, 0.3], + ] + ) + expected = np.array([0.2, 0.01, -0.9, 0.2]) + y = [1, 2, 1, 0] + computed = tcl._max_samples(data, y) + self.assertEqual((4,), computed.shape) + self.assertListEqual(expected.tolist(), computed.tolist()) + + def test_min_distance(self): + tcl = self.build() + data = np.array( + [ + [-0.1, 0.2, -0.3], + [0.7, 0.01, -0.1], + [0.7, -0.9, 0.5], + [0.1, 0.2, 0.3], + ] + ) + expected = np.array([-0.1, 0.01, 0.5, 0.1]) + computed = 
tcl._min_distance(data, None) + self.assertEqual((4,), computed.shape) + self.assertListEqual(expected.tolist(), computed.tolist()) + + def test_splitter_parameter(self): + expected_values = [ + [1, 7, 9], + [1, 7, 9], + [1, 7, 9], + [1, 7, 9], + [0, 5, 6], + [0, 5, 6], + [0, 5, 6], + [0, 5, 6], + ] + X, y = load_dataset(self._random_state, n_features=12) + for splitter_type in ["best", "random"]: + for criterion in ["gini", "entropy"]: + for criteria in ["min_distance", "max_samples"]: + tcl = self.build( + splitter_type=splitter_type, + criterion=criterion, + criteria=criteria, + random_state=self._random_state, + ) + expected = expected_values.pop(0) + dataset, computed = tcl.get_subspace(X, y, max_features=3) + self.assertListEqual(expected, list(computed)) + self.assertListEqual( + X[:, computed].tolist(), dataset.tolist() + ) diff --git a/stree/tests/Stree_test.py b/stree/tests/Stree_test.py index 371e1d0..0fea9e5 100644 --- a/stree/tests/Stree_test.py +++ b/stree/tests/Stree_test.py @@ -204,13 +204,11 @@ class Stree_test(unittest.TestCase): self.assertEqual(0, len(list(tcl))) def test_min_samples_split(self): - tcl_split = Stree(min_samples_split=3) - tcl_nosplit = Stree(min_samples_split=4) dataset = [[1], [2], [3]], [1, 1, 0] - tcl_split.fit(*dataset) + tcl_split = Stree(min_samples_split=3).fit(*dataset) self.assertIsNotNone(tcl_split.tree_.get_down()) self.assertIsNotNone(tcl_split.tree_.get_up()) - tcl_nosplit.fit(*dataset) + tcl_nosplit = Stree(min_samples_split=4).fit(*dataset) self.assertIsNone(tcl_nosplit.tree_.get_down()) self.assertIsNone(tcl_nosplit.tree_.get_up()) @@ -265,37 +263,6 @@ class Stree_test(unittest.TestCase): outcome = outcomes[name][f"{criteria} {kernel}"] self.assertAlmostEqual(outcome, clf.score(px, py)) - def test_min_distance(self): - clf = Stree() - data = np.array( - [ - [-0.1, 0.2, -0.3], - [0.7, 0.01, -0.1], - [0.7, -0.9, 0.5], - [0.1, 0.2, 0.3], - ] - ) - expected = np.array([-0.1, 0.01, 0.5, 0.1]) - computed = clf._min_distance(data, None) - self.assertEqual((4,), computed.shape) - self.assertListEqual(expected.tolist(), computed.tolist()) - - def test_max_samples(self): - clf = Stree() - data = np.array( - [ - [-0.1, 0.2, -0.3], - [0.7, 0.01, -0.1], - [0.7, -0.9, 0.5], - [0.1, 0.2, 0.3], - ] - ) - expected = np.array([0.2, 0.01, -0.9, 0.2]) - y = [1, 2, 1, 0] - computed = clf._max_samples(data, y) - self.assertEqual((4,), computed.shape) - self.assertListEqual(expected.tolist(), computed.tolist()) - def test_max_features(self): n_features = 16 expected_values = [ @@ -334,7 +301,9 @@ class Stree_test(unittest.TestCase): for max_features, expected in expected_values: clf.set_params(**dict(max_features=max_features)) clf.fit(dataset, y) - computed, indices = clf._get_subspace(dataset) + computed, indices = clf.splitter_.get_subspace( + dataset, y, clf.max_features_ + ) self.assertListEqual( dataset[:, indices].tolist(), computed.tolist() ) @@ -345,22 +314,6 @@ class Stree_test(unittest.TestCase): with self.assertRaises(ValueError): clf.fit(*load_dataset()) - def test_gini(self): - y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] - expected = 0.48 - self.assertEqual(expected, Stree._gini(y)) - clf = Stree(criterion="gini") - clf.fit(*load_dataset()) - self.assertEqual(expected, clf.criterion_function_(y)) - - def test_entropy(self): - y = [0, 1, 1, 1, 1, 1, 0, 0, 0, 1] - expected = 0.9709505944546686 - self.assertAlmostEqual(expected, Stree._entropy(y)) - clf = Stree(criterion="entropy") - clf.fit(*load_dataset()) - self.assertEqual(expected, 
clf.criterion_function_(y)) - def test_predict_feature_dimensions(self): X = np.random.rand(10, 5) y = np.random.randint(0, 2, 10) @@ -374,3 +327,8 @@ class Stree_test(unittest.TestCase): clf = Stree(random_state=self._random_state, max_features=2) clf.fit(X, y) self.assertAlmostEqual(0.9426666666666667, clf.score(X, y)) + + def test_bogus_splitter_parameter(self): + clf = Stree(splitter="duck") + with self.assertRaises(ValueError): + clf.fit(*load_dataset()) diff --git a/stree/tests/__init__.py b/stree/tests/__init__.py index 625eea9..32e7a88 100644 --- a/stree/tests/__init__.py +++ b/stree/tests/__init__.py @@ -1,4 +1,5 @@ from .Stree_test import Stree_test from .Snode_test import Snode_test +from .Splitter_test import Splitter_test -__all__ = ["Stree_test", "Snode_test"] +__all__ = ["Stree_test", "Snode_test", "Splitter_test"] diff --git a/stree/tests/utils.py b/stree/tests/utils.py index a371e88..94b0506 100644 --- a/stree/tests/utils.py +++ b/stree/tests/utils.py @@ -1,10 +1,10 @@ from sklearn.datasets import make_classification -def load_dataset(random_state=0, n_classes=2): +def load_dataset(random_state=0, n_classes=2, n_features=3): X, y = make_classification( n_samples=1500, - n_features=3, + n_features=n_features, n_informative=3, n_redundant=0, n_repeated=0,