From 5e5fea9c6a0dd2f480098c0c7c0dea9547ff3421 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 30 May 2020 23:10:10 +0200 Subject: [PATCH] Document & lint code --- notebooks/test2.ipynb | 20 ++-- setup.py | 4 +- stree/Strees.py | 223 +++++++++++++++++++++++++++---------- stree/tests/Strees_test.py | 100 ++++++++++------- 4 files changed, 242 insertions(+), 105 deletions(-) diff --git a/notebooks/test2.ipynb b/notebooks/test2.ipynb index 3d5a8c6..7c94d2a 100644 --- a/notebooks/test2.ipynb +++ b/notebooks/test2.ipynb @@ -48,7 +48,7 @@ { "output_type": "stream", "name": "stdout", - "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.722% 200\nValid: 83.278% 996\n" + "text": "Fraud: 0.244% 196\nValid: 99.755% 80234\nX.shape (1196, 28) y.shape (1196,)\nFraud: 16.472% 197\nValid: 83.528% 999\n" } ], "source": [ @@ -103,7 +103,7 @@ { "output_type": "stream", "name": "stdout", - "text": "************** C=0.001 ****************************\nClassifier's accuracy (train): 0.9797\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1.0 belief=0.984127 counts=(array([0., 1.]), array([ 2, 124]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up\nroot - Up - Down, - Leaf class=0.0 belief=0.750000 counts=(array([0., 1.]), array([3, 1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up, - Leaf class=0.0 belief=0.980029 counts=(array([0., 1.]), array([687, 14]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([124]))\nroot - Up, - Leaf class=0.0 belief=0.977560 counts=(array([0., 1.]), array([697, 
16]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9869\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([129]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([2]))\nroot - Up, - Leaf class=0.0 belief=0.984419 counts=(array([0., 1.]), array([695, 11]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9833\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n\n**************************************************\n0.2235 secs\n" + "text": "************** C=0.001 ****************************\nClassifier's 
accuracy (train): 0.9737\nClassifier's accuracy (test) : 0.9805\nroot\nroot - Down, - Leaf class=1 belief= 0.945736 counts=(array([0, 1]), array([ 7, 122]))\nroot - Up\nroot - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up, - Leaf class=0 belief= 0.978784 counts=(array([0, 1]), array([692, 15]))\n\n**************************************************\n************** C=0.01 ****************************\nClassifier's accuracy (train): 0.9809\nClassifier's accuracy (test) : 0.9805\nroot\nroot - Down, - Leaf class=1 belief= 0.983871 counts=(array([0, 1]), array([ 2, 122]))\nroot - Up\nroot - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down\nroot - Up - Up - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([2]))\nroot - Up - Up - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up - Up - Up - Up, - Leaf class=0 belief= 0.980170 counts=(array([0, 1]), array([692, 14]))\n\n**************************************************\n************** C=1 ****************************\nClassifier's accuracy (train): 0.9904\nClassifier's accuracy (test) : 0.9777\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([122]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([8]))\nroot - Up - Up, - Leaf class=0 belief= 0.988669 counts=(array([0, 1]), array([698, 8]))\n\n**************************************************\n************** C=5 ****************************\nClassifier's accuracy (train): 0.9916\nClassifier's accuracy (test) : 0.9721\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), 
array([125]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up\nroot - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([5]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.990071 counts=(array([0, 1]), array([698, 7]))\n\n**************************************************\n************** C=17 ****************************\nClassifier's accuracy (train): 0.9940\nClassifier's accuracy (test) : 0.9749\nroot\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n\n**************************************************\n0.2412 secs\n" } ], "source": [ @@ -123,7 +123,13 @@ "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": "[[0.97223657 0.02776343]\n [0.96965421 0.03034579]\n [0.96918057 0.03081943]\n [0.94009975 0.05990025]]\n" + } + ], "source": [ "import numpy as np\n", "from sklearn.preprocessing import StandardScaler\n", @@ -133,7 +139,7 @@ "cclf = CalibratedClassifierCV(base_estimator=LinearSVC(), cv=5)\n", "cclf.fit(Xtrain, ytrain)\n", "res = cclf.predict_proba(Xtest)\n", - "#an array containing probabilities of belonging to the 1st class" + "print(res[:4, :])" ] }, { @@ -144,7 +150,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - 
Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n" + "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n" } ], "source": [ @@ -161,7 +167,7 @@ { "output_type": "stream", "name": "stdout", - "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([131]))\nroot - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([8]))\nroot - Up\nroot - Up - Down, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down\nroot - Up - Up - Down - Down, - Leaf class=1.0 belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Down - Up, - Leaf class=0.0 belief=1.000000 counts=(array([0.]), array([5]))\nroot - Up - Up - Up\nroot - Up - Up - Up - Down, - Leaf class=1.0 
belief=1.000000 counts=(array([1.]), array([1]))\nroot - Up - Up - Up - Up, - Leaf class=0.0 belief=0.989855 counts=(array([0., 1.]), array([683, 7]))\n" + "text": "root\nroot - Down\nroot - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([128]))\nroot - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([2]))\nroot - Up\nroot - Up - Down\nroot - Up - Down - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([4]))\nroot - Up - Down - Up, - Leaf class=0 belief= 1.000000 counts=(array([0]), array([1]))\nroot - Up - Up\nroot - Up - Up - Down, - Leaf class=1 belief= 1.000000 counts=(array([1]), array([1]))\nroot - Up - Up - Up, - Leaf class=0 belief= 0.992867 counts=(array([0, 1]), array([696, 5]))\n" } ], "source": [ @@ -189,7 +195,7 @@ { "output_type": "stream", "name": "stdout", - "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 
functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n" + "text": "1 functools.partial(, 'Stree')\n2 functools.partial(, 'Stree')\n3 functools.partial(, 'Stree')\n4 functools.partial(, 'Stree')\n5 functools.partial(, 'Stree')\n6 functools.partial(, 'Stree')\n7 functools.partial(, 'Stree')\n8 functools.partial(, 'Stree')\n9 functools.partial(, 'Stree')\n10 functools.partial(, 'Stree', readonly_memmap=True)\n11 functools.partial(, 'Stree')\n12 functools.partial(, 'Stree')\n13 functools.partial(, 'Stree')\n14 functools.partial(, 'Stree')\n15 functools.partial(, 'Stree')\n16 functools.partial(, 'Stree')\n17 functools.partial(, 'Stree')\n18 functools.partial(, 'Stree')\n19 functools.partial(, 'Stree')\n20 functools.partial(, 'Stree')\n21 functools.partial(, 'Stree')\n22 functools.partial(, 'Stree')\n23 functools.partial(, 'Stree')\n24 functools.partial(, 'Stree', readonly_memmap=True)\n25 functools.partial(, 'Stree', readonly_memmap=True, X_dtype='float32')\n26 functools.partial(, 'Stree')\n27 functools.partial(, 'Stree')\n28 functools.partial(, 'Stree')\n29 functools.partial(, 'Stree')\n30 functools.partial(, 'Stree')\n31 functools.partial(, 'Stree')\n32 functools.partial(, 'Stree')\n33 functools.partial(, 'Stree')\n34 functools.partial(, 'Stree')\n35 functools.partial(, 'Stree')\n36 functools.partial(, 'Stree')\n37 functools.partial(, 'Stree')\n38 functools.partial(, 'Stree')\n39 functools.partial(, 'Stree')\n40 functools.partial(, 'Stree')\n41 functools.partial(, 'Stree')\n42 functools.partial(, 'Stree')\n43 functools.partial(, 'Stree')\n" } ], "source": [ diff --git a/setup.py b/setup.py index b31585f..95c0906 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,7 @@ import setuptools __version__ = 
"0.9rc3" __author__ = "Ricardo Montañana Gómez" + def readme(): with open('README.md') as f: return f.read() @@ -19,7 +20,8 @@ setuptools.setup( url='https://github.com/doctorado-ml/stree', author=__author__, author_email='ricardo.montanana@alu.uclm.es', - keywords='scikit-learn oblique-classifier oblique-decision-tree decision-tree svm svc', + keywords='scikit-learn oblique-classifier oblique-decision-tree decision-\ + tree svm svc', classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', diff --git a/stree/Strees.py b/stree/Strees.py index 5ed2e03..a910d3c 100644 --- a/stree/Strees.py +++ b/stree/Strees.py @@ -7,23 +7,28 @@ Build an oblique tree classifier based on SVM Trees Uses LinearSVC ''' -import typing import os import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.svm import LinearSVC from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, _check_sample_weight, check_random_state +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted, \ + _check_sample_weight class Snode: - def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, title: str): + """Nodes of the tree that keeps the svm classifier and if testing the + dataset assigned to it + """ + + def __init__(self, clf: LinearSVC, X: np.ndarray, y: np.ndarray, + title: str): self._clf = clf self._vector = None if clf is None else clf.coef_ self._interceptor = 0. if clf is None else clf.intercept_ self._title = title - self._belief = 0. # belief of the prediction in a leaf node based on samples + self._belief = 0. 
# Only store dataset in Testing self._X = X if os.environ.get('TESTING', 'NS') != 'NS' else None self._y = y @@ -51,8 +56,8 @@ class Snode: return self._up def make_predictor(self): - """Compute the class of the predictor and its belief based on the subdataset of the node - only if it is a leaf + """Compute the class of the predictor and its belief based on the + subdataset of the node only if it is a leaf """ if not self.is_leaf(): return @@ -62,7 +67,7 @@ class Snode: min_card = min(card) try: self._belief = max_card / (max_card + min_card) - except: + except ZeroDivisionError: self._belief = 0. self._class = classes[card == max_card][0] else: @@ -71,7 +76,10 @@ class Snode: def __str__(self) -> str: if self.is_leaf(): - return f"{self._title} - Leaf class={self._class} belief={self._belief:.6f} counts={np.unique(self._y, return_counts=True)}" + count_values = np.unique(self._y, return_counts=True) + result = f"{self._title} - Leaf class={self._class} belief="\ + f"{self._belief: .6f} counts={count_values}" + return result else: return f"{self._title}" @@ -101,11 +109,16 @@ class Siterator: class Stree(BaseEstimator, ClassifierMixin): - """ + """Estimator that is based on binary trees of svm nodes + can deal with sample_weights in predict, used in boosting sklearn methods + inheriting from BaseEstimator implements get_params and set_params methods + inheriting from ClassifierMixin implement the attribute _estimator_type + with "classifier" as value """ - def __init__(self, C: float = 1.0, max_iter: int = 1000, random_state: int = None, - max_depth: int=None, tol: float=1e-4, use_predictions: bool = False): + def __init__(self, C: float = 1.0, max_iter: int = 1000, + random_state: int = None, max_depth: int = None, + tol: float = 1e-4, use_predictions: bool = False): self.max_iter = max_iter self.C = C self.random_state = random_state @@ -113,65 +126,100 @@ class Stree(BaseEstimator, ClassifierMixin): self.max_depth = max_depth self.tol = tol - def 
get_params(self, deep: bool=True) -> dict: - """Get dict with hyperparameters and its values to accomplish sklearn rules - """ - return { - 'C': self.C, - 'random_state': self.random_state, - 'max_iter': self.max_iter, - 'use_predictions': self.use_predictions, - 'max_depth': self.max_depth, - 'tol': self.tol - } - - def set_params(self, **parameters: dict): - """Set hyperparmeters as specified by sklearn, needed in Gridsearchs - """ - for parameter, value in parameters.items(): - setattr(self, parameter, value) - return self - - # Added binary_only tag as required by sklearn check_estimator def _more_tags(self) -> dict: - return {'binary_only': True} + """Required by sklearn to tell that this estimator is a binary classifier + + :return: the tag required + :rtype: dict + """ + return {'binary_only': True, 'requires_y': True} def _linear_function(self, data: np.array, node: Snode) -> np.array: + """Compute the distance of set of samples to a hyperplane, in + multiclass classification it should compute the distance to a + hyperplane of each class + + :param data: dataset of samples + :type data: np.array + :param node: the node that contains the hyperplane coefficients + :type node: Snode + :return: array of distances of each sample to the hyperplane + :rtype: np.array + """ coef = node._vector[0, :].reshape(-1, data.shape[1]) return data.dot(coef.T) + node._interceptor[0] def _split_array(self, origin: np.array, down: np.array) -> list: + """Split an array in two based on indices passed as down and its complement + + :param origin: dataset to split + :type origin: np.array + :param down: indices to use to split array + :type down: np.array + :return: list with two splits of the array + :rtype: list + """ up = ~down return origin[up[:, 0]] if any(up) else None, \ origin[down[:, 0]] if any(down) else None def _distances(self, node: Snode, data: np.ndarray) -> np.array: + """Compute distances of the samples to the hyperplane of the node + + :param node: node
containing the svm classifier + :type node: Snode + :param data: samples to find out distance to hyperplane + :type data: np.ndarray + :return: array of shape (m, 1) with the distances of every sample to + the hyperplane of the node + :rtype: np.array + """ if self.use_predictions: res = np.expand_dims(node._clf.decision_function(data), 1) else: - # doesn't work with multiclass as each sample has to do inner product with its own coeficients - # computes positition of every sample is w.r.t. the hyperplane + """doesn't work with multiclass as each sample has to do inner + product with its own coefficients; computes the position of every + sample w.r.t. the hyperplane + """ res = self._linear_function(data, node) return res def _split_criteria(self, data: np.array) -> np.array: + """Set the criteria to split arrays + + :param data: array of values to test (e.g. distances to a hyperplane) + :type data: np.array + :return: boolean array, True where the value is greater than 0 + :rtype: np.array + """ return data > 0 - def fit(self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None) -> 'Stree': + def fit(self, X: np.ndarray, y: np.ndarray, + sample_weight: np.array = None) -> 'Stree': + """Build the tree based on the dataset of samples and its labels + + :raises ValueError: if parameters C or max_depth are out of bounds + :return: itself to be able to chain actions: fit().predict() ... + :rtype: Stree + """ # Check parameters are Ok. if type(y).__name__ == 'np.ndarray': y = y.ravel() if self.C < 0: - raise ValueError(f"Penalty term must be positive... got (C={self.C:f})") - self.__max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + raise ValueError( + f"Penalty term must be positive... got (C={self.C:f})") + self.__max_depth = np.iinfo( + np.int32).max if self.max_depth is None else self.max_depth if self.__max_depth < 1: - raise ValueError(f"Maximum depth has to be greater than 1... got (max_depth={self.max_depth})") + raise ValueError( + f"Maximum depth has to be greater than 1... 
got (max_depth=\ + {self.max_depth})") check_classification_targets(y) X, y = check_X_y(X, y) sample_weight = _check_sample_weight(sample_weight, X) check_classification_targets(y) # Initialize computed parameters - self.classes_ = np.unique(y) + self.classes_, y = np.unique(y, return_inverse=True) self.n_iter_ = self.max_iter self.depth_ = 0 self.n_features_in_ = X.shape[1] @@ -182,7 +230,6 @@ class Stree(BaseEstimator, ClassifierMixin): def _build_predictor(self): """Process the leaves to make them predictors """ - def run_tree(node: Snode): if node.is_leaf(): node.make_predictor() @@ -192,16 +239,32 @@ class Stree(BaseEstimator, ClassifierMixin): run_tree(self.tree_) - def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, depth: int, title: str) -> Snode: - + def train(self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray, + depth: int, title: str) -> Snode: + """Recursive function to split the original dataset into predictor + nodes (leaves) + + :param X: samples dataset + :type X: np.ndarray + :param y: samples labels + :type y: np.ndarray + :param sample_weight: weight of samples (used in boosting) + :type sample_weight: np.ndarray + :param depth: actual depth in the tree + :type depth: int + :param title: description of the node + :type title: str + :return: binary tree + :rtype: Snode + """ if depth > self.__max_depth: return None - if np.unique(y).shape[0] == 1 : + if np.unique(y).shape[0] == 1: # only 1 class => pure dataset return Snode(None, X, y, title + ', ') # Train the model clf = LinearSVC(max_iter=self.max_iter, random_state=self.random_state, - C=self.C) #, sample_weight=sample_weight) + C=self.C) # , sample_weight=sample_weight) clf.fit(X, y, sample_weight=sample_weight) tree = Snode(clf, X, y, title) self.depth_ = max(depth, self.depth_) @@ -217,6 +280,15 @@ class Stree(BaseEstimator, ClassifierMixin): return tree def _reorder_results(self, y: np.array, indices: np.array) -> np.array: + """Reorder an array based on 
the array of indices passed + + :param y: unordered data + :type y: np.array + :param indices: indices used to set order + :type indices: np.array + :return: array y ordered + :rtype: np.array + """ if y.ndim > 1 and y.shape[1] > 1: # if predict_proba return np.array of floats y_ordered = np.zeros(y.shape, dtype=float) @@ -229,7 +301,15 @@ class Stree(BaseEstimator, ClassifierMixin): return y_ordered def predict(self, X: np.array) -> np.array: - def predict_class(xp: np.array, indices: np.array, node: Snode) -> np.array: + """Predict labels for each sample in dataset passed + + :param X: dataset of samples + :type X: np.array + :return: array of labels + :rtype: np.array + """ + def predict_class(xp: np.array, indices: np.array, + node: Snode) -> np.array: if xp is None: return [], [] if node.is_leaf(): @@ -242,29 +322,36 @@ class Stree(BaseEstimator, ClassifierMixin): prx_u, prin_u = predict_class(X_U, i_u, node.get_up()) prx_d, prin_d = predict_class(X_D, i_d, node.get_down()) return np.append(prx_u, prx_d), np.append(prin_u, prin_d) - # sklearn check check_is_fitted(self, ['tree_']) # Input validation X = check_array(X) # setup prediction & make it happen indices = np.arange(X.shape[0]) - return self._reorder_results(*predict_class(X, indices, self.tree_)).ravel() + result = self._reorder_results( + *predict_class(X, indices, self.tree_)).astype(int).ravel() + return self.classes_[result] def predict_proba(self, X: np.array) -> np.array: - """Computes an approximation of the probability of samples belonging to class 0 and 1 + """Computes an approximation of the probability of samples belonging to + class 0 and 1 :param X: dataset :type X: np.array + :return: array of shape (m, num_classes), probability of being + each class + :rtype: np.array """ - - def predict_class(xp: np.array, indices: np.array, dist: np.array, node: Snode) -> np.array: + def predict_class(xp: np.array, indices: np.array, dist: np.array, + node: Snode) -> np.array: """Run the tree to 
compute predictions :param xp: subdataset of samples :type xp: np.array - :param indices: indices of subdataset samples to rebuild original order + :param indices: indices of subdataset samples to rebuild original + order :type indices: np.array - :param dist: distances of every sample to the hyperplane or the father node + :param dist: distances of every sample to the hyperplane or the + father node :type dist: np.array :param node: node of the leaf with the class :type node: Snode @@ -280,7 +367,6 @@ class Stree(BaseEstimator, ClassifierMixin): return np.append(prediction, prediction_proba, axis=1), indices distances = self._distances(node, xp) down = self._split_criteria(distances) - X_U, X_D = self._split_array(xp, down) i_u, i_d = self._split_array(indices, down) di_u, di_d = self._split_array(distances, down) @@ -297,15 +383,24 @@ class Stree(BaseEstimator, ClassifierMixin): empty_dist = np.empty((X.shape[0], 1), dtype=float) result, indices = predict_class(X, indices, empty_dist, self.tree_) result = result.reshape(X.shape[0], 2) - # Turn distances to hyperplane into probabilities based on fitting distances - # of samples to its hyperplane that classified them, to the sigmoid function + # Turn distances to hyperplane into probabilities based on fitting + # distances of samples to its hyperplane that classified them, to the + # sigmoid function # Probability of being 1 result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) - result[:, 0] = 1 - result[:, 1] # Probability of being 0 + # Probability of being 0 + result[:, 0] = 1 - result[:, 1] return self._reorder_results(result, indices) def score(self, X: np.array, y: np.array) -> float: - """Return accuracy + """Compute accuracy of the prediction + + :param X: dataset of samples to make predictions + :type X: np.array + :param y: samples labels + :type y: np.array + :return: accuracy of the prediction + :rtype: float """ # sklearn check check_is_fitted(self) @@ -313,15 +408,25 @@ class Stree(BaseEstimator, 
ClassifierMixin): return np.mean(yp == y) def __iter__(self) -> Siterator: + """Create an iterator to be able to visit the nodes of the tree in preorder, + can make a list with all the nodes in preorder + + :return: an iterator, can for i in... and list(...) + :rtype: Siterator + """ try: tree = self.tree_ - except: + except AttributeError: tree = None return Siterator(tree) def __str__(self) -> str: + """String representation of the tree + + :return: description of nodes in the tree in preorder + :rtype: str + """ output = '' for i in self: output += str(i) + '\n' return output - diff --git a/stree/tests/Strees_test.py b/stree/tests/Strees_test.py index 7e32cfe..ef0b211 100644 --- a/stree/tests/Strees_test.py +++ b/stree/tests/Strees_test.py @@ -1,4 +1,3 @@ -import csv import os import unittest @@ -22,18 +21,22 @@ class Stree_test(unittest.TestCase): def tearDownClass(cls): try: os.environ.pop('TESTING') - except: + except KeyError: pass def _get_Xy(self): - X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, - n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, - class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state) + X, y = make_classification(n_samples=1500, n_features=3, + n_informative=3, n_redundant=0, + n_repeated=0, n_classes=2, + n_clusters_per_class=2, class_sep=1.5, + flip_y=0, weights=[0.5, 0.5], + random_state=self._random_state) return X, y def _check_tree(self, node: Snode): - """Check recursively that the nodes that are not leaves have the correct - number of labels and its sons have the right number of elements in their dataset + """Check recursively that the nodes that are not leaves have the + correct number of labels and its sons have the right number of elements + in their dataset Arguments: node {Snode} -- node to check @@ -53,11 +56,11 @@ class Stree_test(unittest.TestCase): for i in unique_y: try: number_down = count_d[i] - except: + except IndexError: number_down = 0 try: 
number_up = count_u[i] - except: + except IndexError: number_up = 0 self.assertEqual(count_y[i], number_down + number_up) # Is the partition made the same as the prediction? @@ -89,7 +92,8 @@ class Stree_test(unittest.TestCase): fx = np.delete(data, column_y, axis=1) return fx, fy - def _find_out(self, px: np.array, x_original: np.array, y_original) -> list: + def _find_out(self, px: np.array, x_original: np.array, + y_original) -> list: """Find the original values of y for a given array of samples Arguments: @@ -128,16 +132,18 @@ class Stree_test(unittest.TestCase): self.assertGreater(accuracy_score, 0.9) def test_single_predict_proba(self): - """Check that element 28 has a prediction different that the current label + """Check that element 28 has a prediction different that the current + label """ # Element 28 has a different prediction than the truth decimals = 5 prob = 0.29026400766 X, y = self._get_Xy() yp = self._clf.predict_proba(X[28, :].reshape(-1, X.shape[1])) - self.assertEqual(np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)) + self.assertEqual(np.round(1 - prob, decimals), + np.round(yp[0:, 0], decimals)) self.assertEqual(1, y[28]) - + self.assertAlmostEqual( round(prob, decimals), round(yp[0, 1], decimals), @@ -150,11 +156,16 @@ class Stree_test(unittest.TestCase): decimals = 5 X, y = self._get_Xy() yp = self._clf.predict_proba(X[:num, :]) - self.assertListEqual(y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()) - expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, 0.14269291, 0.85193236, - 0.29876058, 0.7282164, 0.85958616, 0.89517877, 0.99745224, 0.18860349, - 0.30756427, 0.8318412, 0.18981198, 0.15564624, 0.25740655, 0.22923355, - 0.87365959, 0.49928689, 0.95574351, 0.28761257, 0.28906333, 0.32643692, + self.assertListEqual( + y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()) + expected_proba = [0.88395641, 0.36746962, 0.84158767, 0.34106833, + 0.14269291, 0.85193236, + 0.29876058, 0.7282164, 0.85958616, 
0.89517877, + 0.99745224, 0.18860349, + 0.30756427, 0.8318412, 0.18981198, 0.15564624, + 0.25740655, 0.22923355, + 0.87365959, 0.49928689, 0.95574351, 0.28761257, + 0.28906333, 0.32643692, 0.29788483, 0.01657364, 0.81149083] expected = np.round(expected_proba, decimals=decimals).tolist() computed = np.round(yp[:, 1], decimals=decimals).tolist() @@ -162,9 +173,10 @@ class Stree_test(unittest.TestCase): self.assertAlmostEqual(expected[i], computed[i], decimals) def build_models(self): - """Build and train two models, model_clf will use the sklearn classifier to - compute predictions and split data. model_computed will use vector of - coefficients to compute both predictions and splitted data + """Build and train two models, model_clf will use the sklearn + classifier to compute predictions and split data. model_computed will + use vector of coefficients to compute both predictions and split + data """ model_clf = Stree(random_state=self._random_state, use_predictions=True) @@ -176,8 +188,9 @@ class Stree_test(unittest.TestCase): return model_clf, model_computed, X, y def test_use_model_predict(self): - """Check that we get the same results wether we use the estimator in nodes - to compute labels or we use the hyperplane and the position of samples wrt to it + """Check that we get the same results whether we use the estimator in + nodes to compute labels or we use the hyperplane and the position of + samples wrt it """ use_clf, use_math, X, _ = self.build_models() self.assertListEqual( @@ -202,14 +215,15 @@ class Stree_test(unittest.TestCase): ) def test_single_vs_multiple_prediction(self): - """Check if predicting sample by sample gives the same result as predicting - all samples at once + """Check if predicting sample by sample gives the same result as + predicting all samples at once """ X, _ = self._get_Xy() # Compute prediction line by line yp_line = np.array([], dtype=int) for xp in X: - yp_line = np.append(yp_line, self._clf.predict(xp.reshape(-1, 
X.shape[1]))) + yp_line = np.append(yp_line, self._clf.predict( + xp.reshape(-1, X.shape[1]))) # Compute prediction at once yp_once = self._clf.predict(X) # @@ -221,11 +235,15 @@ class Stree_test(unittest.TestCase): expected = [ 'root', 'root - Down', - 'root - Down - Down, - Leaf class=1 belief=0.975989 counts=(array([0, 1]), array([ 17, 691]))', + 'root - Down - Down, - Leaf class=1 belief= 0.975989 counts' + '=(array([0, 1]), array([ 17, 691]))', 'root - Down - Up', - 'root - Down - Up - Down, - Leaf class=1 belief=0.750000 counts=(array([0, 1]), array([1, 3]))', - 'root - Down - Up - Up, - Leaf class=0 belief=1.000000 counts=(array([0]), array([7]))', - 'root - Up, - Leaf class=0 belief=0.928297 counts=(array([0, 1]), array([725, 56]))', + 'root - Down - Up - Down, - Leaf class=1 belief= 0.750000 ' + 'counts=(array([0, 1]), array([1, 3]))', + 'root - Down - Up - Up, - Leaf class=0 belief= 1.000000 ' + 'counts=(array([0]), array([7]))', + 'root - Up, - Leaf class=0 belief= 0.928297 counts=(array(' + '[0, 1]), array([725, 56]))', ] computed = [] for node in self._clf: @@ -253,10 +271,10 @@ class Stree_test(unittest.TestCase): with self.assertRaises(ValueError): tcl = Stree(max_depth=-1) tcl.fit(*self._get_Xy()) - + def test_check_max_depth(self): depth = 3 - tcl = Stree(random_state=self._random_state, max_depth=depth) + tcl = Stree(random_state=self._random_state, max_depth=depth) tcl.fit(*self._get_Xy()) self.assertEqual(depth, tcl.depth_) @@ -264,6 +282,7 @@ class Stree_test(unittest.TestCase): tcl = Stree() self.assertEqual(0, len(list(tcl))) + class Snode_test(unittest.TestCase): def __init__(self, *args, **kwargs): @@ -276,19 +295,24 @@ class Snode_test(unittest.TestCase): @classmethod def tearDownClass(cls): + """[summary] + """ try: os.environ.pop('TESTING') - except: + except KeyError: pass def _get_Xy(self): - X, y = make_classification(n_samples=1500, n_features=3, n_informative=3, - n_redundant=0, n_repeated=0, n_classes=2, n_clusters_per_class=2, - 
class_sep=1.5, flip_y=0, weights=[0.5, 0.5], random_state=self._random_state) + X, y = make_classification(n_samples=1500, n_features=3, + n_informative=3, n_redundant=0, n_classes=2, + n_repeated=0, n_clusters_per_class=2, + class_sep=1.5, flip_y=0, weights=[0.5, 0.5], + random_state=self._random_state) return X, y def test_attributes_in_leaves(self): - """Check if the attributes in leaves have correct values so they form a predictor + """Check if the attributes in leaves have correct values so they form a + predictor """ def check_leave(node: Snode): @@ -303,7 +327,7 @@ class Snode_test(unittest.TestCase): if len(classes) > 1: try: belief = max_card / (max_card + min_card) - except: + except ZeroDivisionError: belief = 0. else: belief = 1