From f360a2640c54c9a3c01a01f438faec3c100dbf16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Thu, 11 Jun 2020 13:10:52 +0200
Subject: [PATCH 1/2] #6 - Add multiclass support

Removed predict_proba (for now).
Created a Jupyter notebook.
Added split_criteria parameter with min_distance and max_samples values.
Refactored _distances.
Refactored _split_criteria.
Refactored _reorder_results.
---
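Commentary (below the ---, so ignored by git am): both split criteria start
from the distances matrix that decision_function returns in the multiclass
case, shape (m, nclasses) per the _split_criteria docstring below, and reduce
it to one signed distance per sample; positive distances go to the down
branch, the rest go up. A minimal standalone sketch of the two strategies,
with made-up distances and labels (not taken from the patch); the row-wise
indexing matches the fixed _min_distance below:

    import numpy as np

    # toy decision_function output: 3 samples x 3 one-vs-rest hyperplanes
    distances = np.array([[-0.3, 1.7, 0.2],
                          [0.8, -0.1, -2.0],
                          [0.5, 0.4, -0.9]])
    y = np.array([1, 0, 1])  # toy labels of the samples at this node

    # min_distance: per sample, the signed distance to its closest hyperplane
    idx = np.argmin(np.abs(distances), axis=1)
    print(distances[np.arange(distances.shape[0]), idx])  # [ 0.2 -0.1  0.4]

    # max_samples: the column of the most populated class at this node
    _, counts = np.unique(y, return_counts=True)
    print(distances[:, np.argmax(counts)])                # [ 1.7 -0.1  0.4]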
 main.py                    |  11 --
 stree/Strees.py            | 235 +++++++++++++++------------------
 stree/tests/Strees_test.py | 109 +++++++++--------
 3 files changed, 156 insertions(+), 199 deletions(-)

diff --git a/main.py b/main.py
index b74b0c8..30d36de 100644
--- a/main.py
+++ b/main.py
@@ -75,14 +75,3 @@ print(f"Took {time.time() - now:.2f} seconds to train")
 print(clf)
 print(f"Classifier's accuracy (train): {clf.score(Xtrain, ytrain):.4f}")
 print(f"Classifier's accuracy (test) : {clf.score(Xtest, ytest):.4f}")
-proba = clf.predict_proba(Xtest)
-print(
-    "Checking that we have correct probabilities, these are probabilities of "
-    "sample belonging to class 1"
-)
-res0 = proba[proba[:, 0] == 0]
-res1 = proba[proba[:, 0] == 1]
-print("++++++++++res0 > .8++++++++++++")
-print(res0[res0[:, 1] > 0.8])
-print("**********res1 < .4************")
-print(res1[res1[:, 1] < 0.4])
diff --git a/stree/Strees.py b/stree/Strees.py
index e5ce526..85a073e 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -131,6 +131,7 @@
         tol: float = 1e-4,
         degree: int = 3,
         gamma="scale",
+        split_criteria="max_samples",
         min_samples_split: int = 0,
     ):
         self.max_iter = max_iter
@@ -142,17 +143,18 @@
         self.gamma = gamma
         self.degree = degree
         self.min_samples_split = min_samples_split
+        self.split_criteria = split_criteria

     def _more_tags(self) -> dict:
-        """Required by sklearn to tell that this estimator is a binary classifier
+        """Required by sklearn to supply features of the classifier

         :return: the tag required
         :rtype: dict
         """
-        return {"binary_only": True, "requires_y": True}
+        return {"requires_y": True}

     def _split_array(self, origin: np.array, down: np.array) -> list:
-        """Split an array in two based on indices passed as down and its complement
+        """Split an array in two based on indices (down) and its complement

         :param origin: dataset to split
         :type origin: np.array
@@ -163,8 +165,8 @@
         """
         up = ~down
         return (
-            origin[up[:, 0]] if any(up) else None,
-            origin[down[:, 0]] if any(down) else None,
+            origin[up] if any(up) else None,
+            origin[down] if any(down) else None,
         )

     def _distances(self, node: Snode, data: np.ndarray) -> np.array:
@@ -178,27 +180,38 @@
         the hyperplane of the node
         :rtype: np.array
         """
-        res = node._clf.decision_function(data)
-        if res.ndim == 1:
-            return np.expand_dims(res, 1)
-        elif res.shape[1] > 1:
-            # remove multiclass info
-            res = np.delete(res, slice(1, res.shape[1]), axis=1)
-        return res
+        return node._clf.decision_function(data)

-    def _split_criteria(self, data: np.array) -> np.array:
+    def _min_distance(self, data: np.array, _) -> np.array:
+        # chooses the lowest distance of every sample (row-wise selection,
+        # since np.take would index the flattened array instead)
+        indices = np.argmin(np.abs(data), axis=1)
+        return data[np.arange(data.shape[0]), indices]
+
+    def _max_samples(self, data: np.array, y: np.array) -> np.array:
+        # select the class with max number of samples
+        _, samples = np.unique(y, return_counts=True)
+        selected = np.argmax(samples)
+        return data[:, selected]
+
+    def _split_criteria(self, data: np.array, node: Snode) -> np.array:
         """Set the criteria to split arrays

-        :param data: [description]
+        :param data: distances of samples to hyperplanes, shape (m, nclasses)
+            if nclasses > 2 else (m,)
         :type data: np.array
-        :return: [description]
+        :param node: node containing the svm classifier
+        :type node: Snode
+        :return: boolean array, True for samples whose distance is above zero
         :rtype: np.array
         """
-        return (
-            data > 0
-            if data.shape[0] >= self.min_samples_split
-            else np.ones((data.shape[0], 1), dtype=bool)
-        )
+
+        if data.shape[0] < self.min_samples_split:
+            return np.ones((data.shape[0]), dtype=bool)
+        if data.ndim > 1:
+            # split criteria for multiclass
+            data = getattr(self, f"_{self.split_criteria}")(data, node._y)
+        res = data > 0
+        return res

     def fit(
         self, X: np.ndarray, y: np.ndarray, sample_weight: np.array = None
@@ -231,12 +244,19 @@
                 f"Maximum depth has to be greater than 1... got (max_depth=\
 {self.max_depth})"
             )
+        if self.split_criteria not in ["min_distance", "max_samples"]:
+            raise ValueError(
+                f"split_criteria has to be min_distance or \
+                max_samples, got ({self.split_criteria})"
+            )
+
+        check_classification_targets(y)
         X, y = check_X_y(X, y)
         sample_weight = _check_sample_weight(sample_weight, X)
         check_classification_targets(y)
         # Initialize computed parameters
         self.classes_, y = np.unique(y, return_inverse=True)
+        self.n_classes_ = self.classes_.shape[0]
         self.n_iter_ = self.max_iter
         self.depth_ = 0
         self.n_features_in_ = X.shape[1]
@@ -244,6 +264,52 @@
         self._build_predictor()
         return self

+    def train(
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        sample_weight: np.ndarray,
+        depth: int,
+        title: str,
+    ) -> Snode:
+        """Recursive function to split the original dataset into predictor
+        nodes (leaves)
+
+        :param X: samples dataset
+        :type X: np.ndarray
+        :param y: samples labels
+        :type y: np.ndarray
+        :param sample_weight: weight of samples. Rescale C per sample.
+            High weights force the classifier to put more emphasis on these points.
+        :type sample_weight: np.ndarray
+        :param depth: current depth in the tree
+        :type depth: int
+        :param title: description of the node
+        :type title: str
+        :return: binary tree
+        :rtype: Snode
+        """
+        if depth > self.__max_depth:
+            return None
+        if np.unique(y).shape[0] == 1:
+            # only 1 class => pure dataset
+            return Snode(None, X, y, title + ", <pure>")
+        # Train the model
+        clf = self._build_clf()
+        clf.fit(X, y, sample_weight=sample_weight)
+        node = Snode(clf, X, y, title)
+        self.depth_ = max(depth, self.depth_)
+        down = self._split_criteria(self._distances(node, X), node)
+        X_U, X_D = self._split_array(X, down)
+        y_u, y_d = self._split_array(y, down)
+        sw_u, sw_d = self._split_array(sample_weight, down)
+        if X_U is None or X_D is None:
+            # didn't split anything
+            return Snode(clf, X, y, title + ", <cgaf>")
+        node.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up"))
+        node.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down"))
+        return node
+
     def _build_predictor(self):
         """Process the leaves to make them predictors
         """
@@ -278,52 +344,6 @@
                 )
             )

-    def train(
-        self,
-        X: np.ndarray,
-        y: np.ndarray,
-        sample_weight: np.ndarray,
-        depth: int,
-        title: str,
-    ) -> Snode:
-        """Recursive function to split the original dataset into predictor
-        nodes (leaves)
-
-        :param X: samples dataset
-        :type X: np.ndarray
-        :param y: samples labels
-        :type y: np.ndarray
-        :param sample_weight: weight of samples. Rescale C per sample. 
- Hi weights force the classifier to put more emphasis on these points. - :type sample_weight: np.ndarray - :param depth: actual depth in the tree - :type depth: int - :param title: description of the node - :type title: str - :return: binary tree - :rtype: Snode - """ - if depth > self.__max_depth: - return None - if np.unique(y).shape[0] == 1: - # only 1 class => pure dataset - return Snode(None, X, y, title + ", ") - # Train the model - clf = self._build_clf() - clf.fit(X, y, sample_weight=sample_weight) - tree = Snode(clf, X, y, title) - self.depth_ = max(depth, self.depth_) - down = self._split_criteria(self._distances(tree, X)) - X_U, X_D = self._split_array(X, down) - y_u, y_d = self._split_array(y, down) - sw_u, sw_d = self._split_array(sample_weight, down) - if X_U is None or X_D is None: - # didn't part anything - return Snode(clf, X, y, title + ", ") - tree.set_up(self.train(X_U, y_u, sw_u, depth + 1, title + " - Up")) - tree.set_down(self.train(X_D, y_d, sw_d, depth + 1, title + " - Down")) - return tree - def _reorder_results(self, y: np.array, indices: np.array) -> np.array: """Reorder an array based on the array of indices passed @@ -334,12 +354,8 @@ class Stree(BaseEstimator, ClassifierMixin): :return: array y ordered :rtype: np.array """ - if y.ndim > 1 and y.shape[1] > 1: - # if predict_proba return np.array of floats - y_ordered = np.zeros(y.shape, dtype=float) - else: - # return array of same type given in y - y_ordered = y.copy() + # return array of same type given in y + y_ordered = y.copy() indices = indices.astype(int) for i, index in enumerate(indices): y_ordered[index] = y[i] @@ -363,11 +379,11 @@ class Stree(BaseEstimator, ClassifierMixin): # set a class for every sample in dataset prediction = np.full((xp.shape[0], 1), node._class) return prediction, indices - down = self._split_criteria(self._distances(node, xp)) - X_U, X_D = self._split_array(xp, down) + down = self._split_criteria(self._distances(node, xp), node) + x_u, x_d = self._split_array(xp, down) i_u, i_d = self._split_array(indices, down) - prx_u, prin_u = predict_class(X_U, i_u, node.get_up()) - prx_d, prin_d = predict_class(X_D, i_d, node.get_down()) + prx_u, prin_u = predict_class(x_u, i_u, node.get_up()) + prx_d, prin_d = predict_class(x_d, i_d, node.get_down()) return np.append(prx_u, prx_d), np.append(prin_u, prin_d) # sklearn check @@ -383,68 +399,6 @@ class Stree(BaseEstimator, ClassifierMixin): ) return self.classes_[result] - def predict_proba(self, X: np.array) -> np.array: - """Computes an approximation of the probability of samples belonging to - class 0 and 1 - :param X: dataset - :type X: np.array - :return: array array of shape (m, num_classes), probability of being - each class - :rtype: np.array - """ - - def predict_class( - xp: np.array, indices: np.array, dist: np.array, node: Snode - ) -> np.array: - """Run the tree to compute predictions - - :param xp: subdataset of samples - :type xp: np.array - :param indices: indices of subdataset samples to rebuild original - order - :type indices: np.array - :param dist: distances of every sample to the hyperplane or the - father node - :type dist: np.array - :param node: node of the leaf with the class - :type node: Snode - :return: array of labels and distances, array of indices - :rtype: np.array - """ - if xp is None: - return [], [] - if node.is_leaf(): - # set a class for every sample in dataset - prediction = np.full((xp.shape[0], 1), node._class) - prediction_proba = dist - return np.append(prediction, prediction_proba, axis=1), 
indices - distances = self._distances(node, xp) - down = self._split_criteria(distances) - X_U, X_D = self._split_array(xp, down) - i_u, i_d = self._split_array(indices, down) - di_u, di_d = self._split_array(distances, down) - prx_u, prin_u = predict_class(X_U, i_u, di_u, node.get_up()) - prx_d, prin_d = predict_class(X_D, i_d, di_d, node.get_down()) - return np.append(prx_u, prx_d), np.append(prin_u, prin_d) - - # sklearn check - check_is_fitted(self, ["tree_"]) - # Input validation - X = check_array(X) - # setup prediction & make it happen - indices = np.arange(X.shape[0]) - empty_dist = np.empty((X.shape[0], 1), dtype=float) - result, indices = predict_class(X, indices, empty_dist, self.tree_) - result = result.reshape(X.shape[0], 2) - # Turn distances to hyperplane into probabilities based on fitting - # distances of samples to its hyperplane that classified them, to the - # sigmoid function - # Probability of being 1 - result[:, 1] = 1 / (1 + np.exp(-result[:, 1])) - # Probability of being 0 - result[:, 0] = 1 - result[:, 1] - return self._reorder_results(result, indices) - def score( self, X: np.array, y: np.array, sample_weight: np.array = None ) -> float: @@ -473,12 +427,11 @@ class Stree(BaseEstimator, ClassifierMixin): score = differing_labels == 0 else: score = y_true == y_pred - return _weighted_sum(score, sample_weight, normalize=True) def __iter__(self) -> Siterator: - """Create an iterator to be able to visit the nodes of the tree in preorder, - can make a list with all the nodes in preorder + """Create an iterator to be able to visit the nodes of the tree in + preorder, can make a list with all the nodes in preorder :return: an iterator, can for i in... and list(...) :rtype: Siterator diff --git a/stree/tests/Strees_test.py b/stree/tests/Strees_test.py index 8bda05b..4bb8087 100644 --- a/stree/tests/Strees_test.py +++ b/stree/tests/Strees_test.py @@ -2,23 +2,22 @@ import os import unittest import numpy as np -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, load_iris from stree import Stree, Snode -def get_dataset(random_state=0): +def get_dataset(random_state=0, n_classes=2): X, y = make_classification( n_samples=1500, n_features=3, n_informative=3, n_redundant=0, n_repeated=0, - n_classes=2, + n_classes=n_classes, n_clusters_per_class=2, class_sep=1.5, flip_y=0, - weights=[0.5, 0.5], random_state=random_state, ) return X, y @@ -104,9 +103,8 @@ class Stree_test(unittest.TestCase): return res def test_single_prediction(self): - probs = [0.29026400766, 0.73105613, 0.0307635] X, y = get_dataset(self._random_state) - for kernel, prob in zip(self._kernels, probs): + for kernel in self._kernels: clf = Stree(kernel=kernel, random_state=self._random_state) yp = clf.fit(X, y).predict((X[0, :].reshape(-1, X.shape[1]))) self.assertEqual(yp[0], y[0]) @@ -122,10 +120,12 @@ class Stree_test(unittest.TestCase): def test_score(self): X, y = get_dataset(self._random_state) - for kernel, accuracy_expected in zip( - self._kernels, - [0.9506666666666667, 0.9606666666666667, 0.9433333333333334], - ): + accuracies = [ + 0.9506666666666667, + 0.9606666666666667, + 0.9433333333333334, + ] + for kernel, accuracy_expected in zip(self._kernels, accuracies): clf = Stree(random_state=self._random_state, kernel=kernel,) clf.fit(X, y) accuracy_score = clf.score(X, y) @@ -134,38 +134,6 @@ class Stree_test(unittest.TestCase): self.assertEqual(accuracy_score, accuracy_computed) self.assertAlmostEqual(accuracy_expected, accuracy_score) - def 
test_single_predict_proba(self):
-        """Check the element 28 probability of being 1
-        """
-        decimals = 5
-        element = 28
-        probs = [0.29026400766, 0.73105613, 0.0307635]
-        X, y = get_dataset(self._random_state)
-        self.assertEqual(1, y[element])
-        for kernel, prob in zip(self._kernels, probs):
-            clf = Stree(kernel=kernel, random_state=self._random_state)
-            yp = clf.fit(X, y).predict_proba(
-                X[element, :].reshape(-1, X.shape[1])
-            )
-            self.assertAlmostEqual(
-                np.round(1 - prob, decimals), np.round(yp[0:, 0], decimals)
-            )
-            self.assertAlmostEqual(
-                round(prob, decimals), round(yp[0, 1], decimals), decimals
-            )
-
-    def test_multiple_predict_proba(self):
-        # First 27 elements the predictions are the same as the truth
-        num = 27
-        X, y = get_dataset(self._random_state)
-        for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
-            clf.fit(X, y)
-            yp = clf.predict_proba(X[:num, :])
-            self.assertListEqual(
-                y[:num].tolist(), np.argmax(yp[:num], axis=1).tolist()
-            )
-
     def test_single_vs_multiple_prediction(self):
         """Check if predicting sample by sample gives the same result as
         predicting all samples at once
@@ -225,6 +193,11 @@
         with self.assertRaises(ValueError):
             tclf.fit(*get_dataset(self._random_state))

+    def test_exception_if_bogus_split_criteria(self):
+        tclf = Stree(split_criteria="duck")
+        with self.assertRaises(ValueError):
+            tclf.fit(*get_dataset(self._random_state))
+
     def test_check_max_depth_is_positive_or_None(self):
         tcl = Stree()
         self.assertIsNone(tcl.max_depth)
@@ -256,14 +229,56 @@
         self.assertIsNone(tcl_nosplit.tree_.get_down())
         self.assertIsNone(tcl_nosplit.tree_.get_up())

-    def test_muticlass_dataset(self):
+    def test_simple_multiclass_dataset(self):
         for kernel in self._kernels:
-            clf = Stree(kernel=kernel, random_state=self._random_state)
-            px = [[1, 2], [3, 4], [5, 6]]
-            py = [1, 2, 3]
+            clf = Stree(
+                kernel=kernel,
+                split_criteria="max_samples",
+                random_state=self._random_state,
+            )
+            px = [[1, 2], [5, 6], [9, 10]]
+            py = [0, 1, 2]
             clf.fit(px, py)
             self.assertEqual(1.0, clf.score(px, py))
-            self.assertListEqual([1, 2, 3], clf.predict(px).tolist())
+            self.assertListEqual(py, clf.predict(px).tolist())
+            self.assertListEqual(py, clf.classes_.tolist())
+
+    def test_multiclass_dataset(self):
+        datasets = {
+            "Synt": get_dataset(random_state=self._random_state, n_classes=3),
+            "Iris": load_iris(return_X_y=True),
+        }
+        outcomes = {
+            "Synt": {
+                "max_samples linear": 0.9533333333333334,
+                "max_samples rbf": 0.836,
+                "max_samples poly": 0.9473333333333334,
+                "min_distance linear": 0.9533333333333334,
+                "min_distance rbf": 0.836,
+                "min_distance poly": 0.9473333333333334,
+            },
+            "Iris": {
+                "max_samples linear": 0.98,
+                "max_samples rbf": 1.0,
+                "max_samples poly": 1.0,
+                "min_distance linear": 0.98,
+                "min_distance rbf": 1.0,
+                "min_distance poly": 1.0,
+            },
+        }
+        for name, dataset in datasets.items():
+            px, py = dataset
+            for criteria in ["max_samples", "min_distance"]:
+                for kernel in self._kernels:
+                    clf = Stree(
+                        C=1e4,
+                        max_iter=1e4,
+                        kernel=kernel,
+                        split_criteria=criteria,
+                        random_state=self._random_state,
+                    )
+                    clf.fit(px, py)
+                    outcome = outcomes[name][f"{criteria} {kernel}"]
+                    self.assertAlmostEqual(outcome, clf.score(px, py))

 class Snode_test(unittest.TestCase):

From 1d392d534f4a90aa2eb5a6c69f14ea3d8cfa489e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Thu, 11 Jun 2020 13:45:24 +0200
Subject: [PATCH 2/2] #6 - Update tests and codecov conf

---
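Commentary (below the ---, so ignored by git am): with the multilabel branch
removed below, score() reduces to a plain weighted accuracy: a boolean hit
vector averaged under sample_weight. A quick standalone check of that idea,
using public numpy instead of sklearn's private _weighted_sum helper; the
labels and weights are made up:

    import numpy as np

    y_true = np.array([0, 1, 2, 1])
    y_pred = np.array([0, 1, 1, 1])
    sample_weight = np.array([1.0, 1.0, 2.0, 1.0])

    # unweighted accuracy: 3 hits out of 4 samples
    print((y_true == y_pred).mean())                            # 0.75
    # weighted accuracy: (1 + 1 + 0 + 1) / 5
    print(np.average(y_true == y_pred, weights=sample_weight))  # 0.6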
 codecov.yml                        |  3 ---
 stree/Strees.py                    |  7 +------
 stree/tests/Strees_grapher_test.py | 17 ++++++++++++++++-
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/codecov.yml b/codecov.yml
index 08f7b1b..222249f 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -3,9 +3,6 @@ overage:
   project:
     default:
       target: 90%
-  patch:
-    default:
-      target: 90%
 comment:
   layout: "reach, diff, flags, files"
   behavior: default
diff --git a/stree/Strees.py b/stree/Strees.py
index 85a073e..55ec85e 100644
--- a/stree/Strees.py
+++ b/stree/Strees.py
@@ -19,7 +19,6 @@ from sklearn.utils.validation import (
     check_is_fitted,
     _check_sample_weight,
 )
-from sklearn.utils.sparsefuncs import count_nonzero
 from sklearn.metrics._classification import _weighted_sum, _check_targets


@@ -422,11 +421,7 @@
         # Compute accuracy for each possible representation
         y_type, y_true, y_pred = _check_targets(y, y_pred)
         check_consistent_length(y_true, y_pred, sample_weight)
-        if y_type.startswith("multilabel"):
-            differing_labels = count_nonzero(y_true - y_pred, axis=1)
-            score = differing_labels == 0
-        else:
-            score = y_true == y_pred
+        score = y_true == y_pred
         return _weighted_sum(score, sample_weight, normalize=True)

     def __iter__(self) -> Siterator:
diff --git a/stree/tests/Strees_grapher_test.py b/stree/tests/Strees_grapher_test.py
index c7f6f15..8f39bb7 100644
--- a/stree/tests/Strees_grapher_test.py
+++ b/stree/tests/Strees_grapher_test.py
@@ -68,6 +68,11 @@
         self.assertEqual(accuracy_score, accuracy_computed)
         self.assertGreater(accuracy_score, 0.86)

+    def test_score_4dims(self):
+        X, y = get_dataset(self._random_state, n_features=4)
+        accuracy_score = self._clf.score(X, y)
+        self.assertEqual(accuracy_score, 0.95)
+
     def test_save_all(self):
         folder_name = os.path.join(os.sep, "tmp", "stree")
         if os.path.isdir(folder_name):
@@ -171,11 +176,13 @@

     def test_plot_hyperplane_with_distribution(self):
         plt.close()
+        # select a pure node
+        node = self._clf._tree_gr.get_down().get_up().get_up()
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             matplotlib.use("Agg")
             num_figures_before = plt.gcf().number
-            self._clf._tree_gr.plot_hyperplane(plot_distribution=True)
+            node.plot_hyperplane(plot_distribution=True)
         num_figures_after = plt.gcf().number
         self.assertEqual(1, num_figures_after - num_figures_before)

@@ -209,3 +216,11 @@
         self.assertEqual(x, xx)
         self.assertEqual(y, yy)
         self.assertEqual(z, zz)
+
+    def test_cmap_change(self):
+        node = Snode_graph(Snode(None, None, None, "test"))
+        self.assertEqual("jet", node._get_cmap())
+        # make node pure
+        node._belief = 1.0
+        node._class = 1
+        self.assertEqual("jet_r", node._get_cmap())
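Commentary (after the series, not part of either commit): a minimal usage
sketch of the new split_criteria parameter once both patches are applied. It
assumes the Iris dataset from sklearn, as in the tests; the exact accuracy
printed is not claimed here:

    from sklearn.datasets import load_iris
    from stree import Stree

    X, y = load_iris(return_X_y=True)
    # max_samples is the default; min_distance follows the closest hyperplane
    clf = Stree(split_criteria="min_distance", random_state=0).fit(X, y)
    print(clf.score(X, y))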