From 794374fe8cb48c4152fd693ecd5c7b8f333be70a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 1 Jun 2021 11:30:52 +0200 Subject: [PATCH] Add max_features to selection Add first approach to continuous variables --- README.md | 1 + mfs/Metrics.py | 228 ++++++++++++++++++++++++++ mfs/Selection.py | 189 +++++++-------------- mfs/entropy_estimators.py | 334 ++++++++++++++++++++++++++++++++++++++ mfs/tests/MFS_test.py | 43 +++-- mfs/tests/Metrics_test.py | 77 +++++++-- 6 files changed, 714 insertions(+), 158 deletions(-) create mode 100755 mfs/Metrics.py create mode 100755 mfs/entropy_estimators.py diff --git a/README.md b/README.md index 28bbd14..d5c7ef8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ ![CI](https://github.com/Doctorado-ML/mfs/workflows/CI/badge.svg) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/66ad727eb13e4c7a8816db1e44d994a7)](https://www.codacy.com/gh/Doctorado-ML/mfs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mfs&utm_campaign=Badge_Grade) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/mfs.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/mfs/context:python) # MFS diff --git a/mfs/Metrics.py b/mfs/Metrics.py new file mode 100755 index 0000000..7b4c0a2 --- /dev/null +++ b/mfs/Metrics.py @@ -0,0 +1,228 @@ +from math import log +import numpy as np + +from scipy.special import gamma, psi +from sklearn.neighbors import BallTree, KDTree, NearestNeighbors +from sklearn.feature_selection._mutual_info import _compute_mi + +# from .entropy_estimators import mi, entropy as c_entropy + + +class Metrics: + @staticmethod + def information_gain_cont(x, y): + """Measures the reduction in uncertainty about the value of y when the + value of X continuous is known (also called mutual information) + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + Information gained + """ + return _compute_mi( + x, y, x_discrete=False, y_discrete=True, n_neighbors=3 + ) + + @staticmethod + def _nearest_distances(X, k=1): + """ + X = array(N,M) + N = number of points + M = number of dimensions + returns the distance to the kth nearest neighbor for every point in X + """ + knn = NearestNeighbors(n_neighbors=k + 1) + knn.fit(X) + d, _ = knn.kneighbors(X) # the first nearest neighbor is itself + return d[:, -1] # returns the distance to the kth nearest neighbor + + @staticmethod + def differential_entropy(X, k=1): + + """Returns the entropy of the X. + Parameters + =========== + X : array-like, shape (n_samples, n_features) + The data the entropy of which is computed + k : int, optional + number of nearest neighbors for density estimation + Notes + ====== + Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy + of a random vector. Probl. Inf. Transm. 23, 95-101. + See also: Evans, D. 2008 A computationally efficient estimator for + mutual information, Proc. R. Soc. A 464 (2093), 1203-1215. + and: + Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual + information. Phys Rev E 69(6 Pt 2):066138. + """ + if X.ndim == 1: + X = X.reshape(-1, 1) + # Distance to kth nearest neighbor + r = Metrics._nearest_distances(X, k) # squared distances + n, d = X.shape + volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1) + """ + F. 
Perez-Cruz, (2008). Estimation of Information Theoretic Measures + for Continuous Random Variables. Advances in Neural Information + Processing Systems 21 (NIPS). Vancouver (Canada), December. + return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k) + """ + return ( + d * np.mean(np.log(r + np.finfo(X.dtype).eps)) + + np.log(volume_unit_ball) + + psi(n) + - psi(k) + ) + + @staticmethod + def conditional_differential_entropy(x, y): + """quantifies the amount of information needed to describe the outcome + of Y discrete given that the value of X continuous is known + computes H(Y|X) + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + conditional entropy of y given x + """ + xy = np.c_[x, y] + return Metrics.differential_entropy(xy) - Metrics.differential_entropy( + x + ) + + @staticmethod + def symmetrical_unc_continuous(x, y): + """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet + https://github.com/gregversteeg/NPEET + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + + Returns + ------- + float + symmetrical uncertainty + """ + + return ( + 2.0 + * Metrics.information_gain_cont(x, y) + / (Metrics.differential_entropy(x) + Metrics.entropy(y)) + ) + + @staticmethod + def symmetrical_uncertainty(x, y): + """Compute symmetrical uncertainty. Normalize* information gain (mutual + information) with the entropies of the features in order to compensate + the bias due to high cardinality features. *Range [0, 1] + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + + Returns + ------- + float + symmetrical uncertainty + """ + return ( + 2.0 + * Metrics.information_gain(x, y) + / (Metrics.entropy(x) + Metrics.entropy(y)) + ) + + @staticmethod + def conditional_entropy(x, y, base=2): + """quantifies the amount of information needed to describe the outcome + of Y given that the value of X is known + computes H(Y|X) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + conditional entropy of y given x + """ + xy = np.c_[x, y] + return Metrics.entropy(xy, base) - Metrics.entropy(x, base) + + @staticmethod + def entropy(y, base=2): + """measure of the uncertainty in predicting the value of y + + Parameters + ---------- + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + entropy of y + """ + _, count = np.unique(y, return_counts=True, axis=0) + proba = count.astype(float) / len(y) + proba = proba[proba > 0.0] + return np.sum(proba * np.log(1.0 / proba)) / log(base) + + @staticmethod + def information_gain(x, y, base=2): + """Measures the reduction in uncertainty about the value of y when the + value of X is known (also called mutual information) + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + Information gained + """ + return Metrics.entropy(y, base) - Metrics.conditional_entropy( + x, y, base + ) diff --git a/mfs/Selection.py 
b/mfs/Selection.py index 574f8fa..b6aa02d 100755 --- a/mfs/Selection.py +++ b/mfs/Selection.py @@ -1,102 +1,8 @@ -from math import log, sqrt +from math import sqrt from sys import float_info from itertools import combinations import numpy as np - - -class Metrics: - @staticmethod - def conditional_entropy(x, y, base=2): - """quantifies the amount of information needed to describe the outcome - of Y given that the value of X is known - computes H(Y|X) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - conditional entropy of y given x - """ - xy = np.c_[x, y] - return Metrics.entropy(xy, base) - Metrics.entropy(x, base) - - @staticmethod - def entropy(y, base=2): - """measure of the uncertainty in predicting the value of y - - Parameters - ---------- - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - entropy of y - """ - _, count = np.unique(y, return_counts=True, axis=0) - proba = count.astype(float) / len(y) - proba = proba[proba > 0.0] - return np.sum(proba * np.log(1.0 / proba)) / log(base) - - @staticmethod - def information_gain(x, y, base=2): - """Measures the reduction in uncertainty about the value of y when the - value of X is known (also called mutual information) - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - Information gained - """ - return Metrics.entropy(y, base) - Metrics.conditional_entropy( - x, y, base - ) - - @staticmethod - def symmetrical_uncertainty(x, y): - """Compute symmetrical uncertainty. Normalize* information gain (mutual - information) with the entropies of the features in order to compensate - the bias due to high cardinality features. 
*Range [0, 1] - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - - Returns - ------- - float - symmetrical uncertainty - """ - return ( - 2.0 - * Metrics.information_gain(x, y) - / (Metrics.entropy(x) + Metrics.entropy(y)) - ) +from .Metrics import Metrics class MFS: @@ -116,18 +22,36 @@ class MFS: The maximum number of features to return """ - def __init__(self, max_features): - self._initialize() + def __init__(self, max_features=None, discrete=True): self._max_features = max_features + self._discrete = discrete + self.symmetrical_uncertainty = ( + Metrics.symmetrical_uncertainty + if discrete + else Metrics.symmetrical_unc_continuous + ) + self._fitted = False - def _initialize(self): + def _initialize(self, X, y): """Initialize the attributes so support multiple calls using same object + + Parameters + ---------- + X : np.array + array of features + y : np.array + vector of labels """ + self.X_ = X + self.y_ = y + if self._max_features is None: + self._max_features = X.shape[1] self._result = None self._scores = [] self._su_labels = None self._su_features = {} + self._fitted = True def _compute_su_labels(self): """Compute symmetrical uncertainty between each feature of the dataset @@ -142,7 +66,7 @@ class MFS: num_features = self.X_.shape[1] self._su_labels = np.zeros(num_features) for col in range(num_features): - self._su_labels[col] = Metrics.symmetrical_uncertainty( + self._su_labels[col] = self.symmetrical_uncertainty( self.X_[:, col], self.y_ ) return self._su_labels @@ -166,7 +90,7 @@ class MFS: if (feature_a, feature_b) not in self._su_features: self._su_features[ (feature_a, feature_b) - ] = Metrics.symmetrical_uncertainty( + ] = self.symmetrical_uncertainty( self.X_[:, feature_a], self.X_[:, feature_b] ) return self._su_features[(feature_a, feature_b)] @@ -210,9 +134,7 @@ class MFS: self self """ - self._initialize() - self.X_ = X - self.y_ = y + self._initialize(X, y) s_list = self._compute_su_labels() # Descending order feature_order = (-s_list).argsort().tolist() @@ -235,33 +157,36 @@ class MFS: candidates.append(feature_order[id_selected]) self._scores.append(merit) del feature_order[id_selected] - if ( - len(feature_order) == 0 - or len(candidates) == self._max_features - ): - # Force leaving the loop - continue_condition = False - if len(self._scores) >= 5: - """ - "To prevent the best first search from exploring the entire - feature subset search space, a stopping criterion is imposed. - The search will terminate if five consecutive fully expanded - subsets show no improvement over the current best subset." - as stated in Mark A. Hall Thesis - """ - item_ant = -1 - for item in self._scores[-5:]: - if item_ant == -1: - item_ant = item - if item > item_ant: - break - else: - item_ant = item - else: - continue_condition = False + continue_condition = self._cfs_continue_condition( + feature_order, candidates + ) self._result = candidates return self + def _cfs_continue_condition(self, feature_order, candidates): + if len(feature_order) == 0 or len(candidates) == self._max_features: + # Force leaving the loop + return False + if len(self._scores) >= 5: + """ + "To prevent the best first search from exploring the entire + feature subset search space, a stopping criterion is imposed. + The search will terminate if five consecutive fully expanded + subsets show no improvement over the current best subset." + as stated in Mark A. 
Hall Thesis + """ + item_ant = -1 + for item in self._scores[-5:]: + if item_ant == -1: + item_ant = item + if item > item_ant: + break + else: + item_ant = item + else: + return False + return True + def fcbf(self, X, y, threshold): """Fast Correlation-Based Filter @@ -286,9 +211,7 @@ class MFS: """ if threshold < 1e-7: raise ValueError("Threshold cannot be less than 1e-7") - self._initialize() - self.X_ = X - self.y_ = y + self._initialize(X, y) s_list = self._compute_su_labels() feature_order = (-s_list).argsort() feature_dup = feature_order.copy().tolist() @@ -322,7 +245,7 @@ class MFS: list list of features indices selected """ - return self._result + return self._result if self._fitted else [] def get_scores(self): """Return the scores computed for the features selected @@ -332,4 +255,4 @@ class MFS: list list of scores of the features selected """ - return self._scores + return self._scores if self._fitted else [] diff --git a/mfs/entropy_estimators.py b/mfs/entropy_estimators.py new file mode 100755 index 0000000..e0c1630 --- /dev/null +++ b/mfs/entropy_estimators.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python +# Written by Greg Ver Steeg +# See readme.pdf for documentation +# Or go to http://www.isi.edu/~gregv/npeet.html + +import warnings + +import numpy as np +import numpy.linalg as la +from numpy import log +from scipy.special import digamma +from sklearn.neighbors import BallTree, KDTree + +# CONTINUOUS ESTIMATORS + + +def entropy(x, k=3, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator + x should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert k <= len(x) - 1, "Set k smaller than num. samples - 1" + x = np.asarray(x) + n_elements, n_features = x.shape + x = add_noise(x) + tree = build_tree(x) + nn = query_neighbors(tree, x, k) + const = digamma(n_elements) - digamma(k) + n_features * log(2) + return (const + n_features * np.log(nn).mean()) / log(base) + + +def centropy(x, y, k=3, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator for the + entropy of X conditioned on Y. + """ + xy = np.c_[x, y] + entropy_union_xy = entropy(xy, k=k, base=base) + entropy_y = entropy(y, k=k, base=base) + return entropy_union_xy - entropy_y + + +def tc(xs, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + entropy_features = [entropy(col, k=k, base=base) for col in xs_columns] + return np.sum(entropy_features) - entropy(xs, k, base) + + +def ctc(xs, y, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [ + centropy(col, y, k=k, base=base) for col in xs_columns + ] + return np.sum(centropy_features) - centropy(xs, y, k, base) + + +def corex(xs, ys, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + cmi_features = [mi(col, ys, k=k, base=base) for col in xs_columns] + return np.sum(cmi_features) - mi(xs, ys, k=k, base=base) + + +def mi(x, y, z=None, k=3, base=2, alpha=0): + """Mutual information of x and y (conditioned on z if z is not None) + x, y should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert len(x) == len(y), "Arrays should have same length" + assert k <= len(x) - 1, "Set k smaller than num. 
samples - 1" + x, y = np.asarray(x), np.asarray(y) + x, y = x.reshape(x.shape[0], -1), y.reshape(y.shape[0], -1) + x = add_noise(x) + y = add_noise(y) + points = [x, y] + if z is not None: + z = np.asarray(z) + z = z.reshape(z.shape[0], -1) + points.append(z) + points = np.hstack(points) + # Find nearest neighbors in joint space, p=inf means max-norm + tree = build_tree(points) + dvec = query_neighbors(tree, points, k) + if z is None: + a, b, c, d = ( + avgdigamma(x, dvec), + avgdigamma(y, dvec), + digamma(k), + digamma(len(x)), + ) + if alpha > 0: + d += lnc_correction(tree, points, k, alpha) + else: + xz = np.c_[x, z] + yz = np.c_[y, z] + a, b, c, d = ( + avgdigamma(xz, dvec), + avgdigamma(yz, dvec), + avgdigamma(z, dvec), + digamma(k), + ) + return (-a - b + c + d) / log(base) + + +def cmi(x, y, z, k=3, base=2): + """Mutual information of x and y, conditioned on z + Legacy function. Use mi(x, y, z) directly. + """ + return mi(x, y, z=z, k=k, base=base) + + +def kldiv(x, xp, k=3, base=2): + """KL Divergence between p and q for x~p(x), xp~q(x) + x, xp should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1],[2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert k < min(len(x), len(xp)), "Set k smaller than num. samples - 1" + assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." + x, xp = np.asarray(x), np.asarray(xp) + x, xp = x.reshape(x.shape[0], -1), xp.reshape(xp.shape[0], -1) + d = len(x[0]) + n = len(x) + m = len(xp) + const = log(m) - log(n - 1) + tree = build_tree(x) + treep = build_tree(xp) + nn = query_neighbors(tree, x, k) + nnp = query_neighbors(treep, x, k - 1) + return (const + d * (np.log(nnp).mean() - np.log(nn).mean())) / log(base) + + +def lnc_correction(tree, points, k, alpha): + e = 0 + n_sample = points.shape[0] + for point in points: + # Find k-nearest neighbors in joint space, p=inf means max norm + knn = tree.query(point[None, :], k=k + 1, return_distance=False)[0] + knn_points = points[knn] + # Substract mean of k-nearest neighbor points + knn_points = knn_points - knn_points[0] + # Calculate covariance matrix of k-nearest neighbor points, obtain eigen vectors + covr = knn_points.T @ knn_points / k + _, v = la.eig(covr) + # Calculate PCA-bounding box using eigen vectors + V_rect = np.log(np.abs(knn_points @ v).max(axis=0)).sum() + # Calculate the volume of original box + log_knn_dist = np.log(np.abs(knn_points).max(axis=0)).sum() + + # Perform local non-uniformity checking and update correction term + if V_rect < log_knn_dist + np.log(alpha): + e += (log_knn_dist - V_rect) / n_sample + return e + + +# DISCRETE ESTIMATORS +def entropyd(sx, base=2): + """Discrete entropy estimator + sx is a list of samples + """ + unique, count = np.unique(sx, return_counts=True, axis=0) + # Convert to float as otherwise integer division results in all 0 for proba. + proba = count.astype(float) / len(sx) + # Avoid 0 division; remove probabilities == 0.0 (removing them does not change the entropy estimate as 0 * log(1/0) = 0. 
+ proba = proba[proba > 0.0] + return np.sum(proba * np.log(1.0 / proba)) / log(base) + + +def midd(x, y, base=2): + """Discrete mutual information estimator + Given a list of samples which can be any hashable object + """ + assert len(x) == len(y), "Arrays should have same length" + return entropyd(x, base) - centropyd(x, y, base) + + +def cmidd(x, y, z, base=2): + """Discrete mutual information estimator + Given a list of samples which can be any hashable object + """ + assert len(x) == len(y) == len(z), "Arrays should have same length" + xz = np.c_[x, z] + yz = np.c_[y, z] + xyz = np.c_[x, y, z] + return ( + entropyd(xz, base) + + entropyd(yz, base) + - entropyd(xyz, base) + - entropyd(z, base) + ) + + +def centropyd(x, y, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator for the + entropy of X conditioned on Y. + """ + xy = np.c_[x, y] + return entropyd(xy, base) - entropyd(y, base) + + +def tcd(xs, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + entropy_features = [entropyd(col, base=base) for col in xs_columns] + return np.sum(entropy_features) - entropyd(xs, base) + + +def ctcd(xs, y, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [centropyd(col, y, base=base) for col in xs_columns] + return np.sum(centropy_features) - centropyd(xs, y, base) + + +def corexd(xs, ys, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + cmi_features = [midd(col, ys, base=base) for col in xs_columns] + return np.sum(cmi_features) - midd(xs, ys, base) + + +# MIXED ESTIMATORS +def micd(x, y, k=3, base=2, warning=True): + """If x is continuous and y is discrete, compute mutual information""" + assert len(x) == len(y), "Arrays should have same length" + entropy_x = entropy(x, k, base) + + y_unique, y_count = np.unique(y, return_counts=True, axis=0) + y_proba = y_count / len(y) + + entropy_x_given_y = 0.0 + for yval, py in zip(y_unique, y_proba): + x_given_y = x[(y == yval).all(axis=1)] + if k <= len(x_given_y) - 1: + entropy_x_given_y += py * entropy(x_given_y, k, base) + else: + if warning: + warnings.warn( + "Warning, after conditioning, on y={yval} insufficient data. " + "Assuming maximal entropy in this case.".format(yval=yval) + ) + entropy_x_given_y += py * entropy_x + return abs(entropy_x - entropy_x_given_y) # units already applied + + +def midc(x, y, k=3, base=2, warning=True): + return micd(y, x, k, base, warning) + + +def centropycd(x, y, k=3, base=2, warning=True): + return entropy(x, base) - micd(x, y, k, base, warning) + + +def centropydc(x, y, k=3, base=2, warning=True): + return centropycd(y, x, k=k, base=base, warning=warning) + + +def ctcdc(xs, y, k=3, base=2, warning=True): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [ + centropydc(col, y, k=k, base=base, warning=warning) + for col in xs_columns + ] + return np.sum(centropy_features) - centropydc(xs, y, k, base, warning) + + +def ctccd(xs, y, k=3, base=2, warning=True): + return ctcdc(y, xs, k=k, base=base, warning=warning) + + +def corexcd(xs, ys, k=3, base=2, warning=True): + return corexdc(ys, xs, k=k, base=base, warning=warning) + + +def corexdc(xs, ys, k=3, base=2, warning=True): + return tcd(xs, base) - ctcdc(xs, ys, k, base, warning) + + +# UTILITY FUNCTIONS + + +def add_noise(x, intens=1e-10): + # small noise to break degeneracy, see doc. 
+ return x + intens * np.random.random_sample(x.shape) + + +def query_neighbors(tree, x, k): + return tree.query(x, k=k + 1)[0][:, k] + + +def count_neighbors(tree, x, r): + return tree.query_radius(x, r, count_only=True) + + +def avgdigamma(points, dvec): + # This part finds number of neighbors in some radius in the marginal space + # returns expectation value of + tree = build_tree(points) + dvec = dvec - 1e-15 + num_points = count_neighbors(tree, points, dvec) + return np.mean(digamma(num_points)) + + +def build_tree(points): + if points.shape[1] >= 20: + return BallTree(points, metric="chebyshev") + return KDTree(points, metric="chebyshev") + + +# TESTS + + +def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): + """Shuffle test + Repeatedly shuffle the x-values and then estimate measure(x, y, [z]). + Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs. + 'measure' could me mi, cmi, e.g. Keyword arguments can be passed. + Mutual information and CMI should have a mean near zero. + """ + x_clone = np.copy(x) # A copy that we can shuffle + outputs = [] + for _ in range(ns): + np.random.shuffle(x_clone) + if z: + outputs.append(measure(x_clone, y, z, **kwargs)) + else: + outputs.append(measure(x_clone, y, **kwargs)) + outputs.sort() + return np.mean(outputs), ( + outputs[int((1.0 - ci) / 2 * ns)], + outputs[int((1.0 + ci) / 2 * ns)], + ) + + +if __name__ == "__main__": + print("MI between two independent continuous random variables X and Y:") + np.random.seed(0) + x = np.random.randn(1000, 10) + y = np.random.randn(1000, 3) + print(mi(x, y, base=2, alpha=0)) diff --git a/mfs/tests/MFS_test.py b/mfs/tests/MFS_test.py index db9aebf..0e9129d 100755 --- a/mfs/tests/MFS_test.py +++ b/mfs/tests/MFS_test.py @@ -9,11 +9,11 @@ class MFS_test(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) - X, self.y_w = load_wine(return_X_y=True) - self.X_w = mdlp.fit_transform(X, self.y_w).astype("int64") - X, self.y_i = load_iris(return_X_y=True) + self.X_wc, self.y_w = load_wine(return_X_y=True) + self.X_w = mdlp.fit_transform(self.X_wc, self.y_w).astype("int64") + self.X_ic, self.y_i = load_iris(return_X_y=True) mdlp = MDLP(random_state=1) - self.X_i = mdlp.fit_transform(X, self.y_i).astype("int64") + self.X_i = mdlp.fit_transform(self.X_ic, self.y_i).astype("int64") def assertListAlmostEqual(self, list1, list2, tol=7): self.assertEqual(len(list1), len(list2)) @@ -21,16 +21,16 @@ class MFS_test(unittest.TestCase): self.assertAlmostEqual(a, b, tol) def test_initialize(self): - mfs = MFS(max_features=100) + mfs = MFS() mfs.fcbf(self.X_w, self.y_w, 0.05) - mfs._initialize() + mfs._initialize(self.X_w, self.y_w) self.assertIsNone(mfs.get_results()) self.assertListEqual([], mfs.get_scores()) self.assertDictEqual({}, mfs._su_features) self.assertIsNone(mfs._su_labels) def test_csf_wine(self): - mfs = MFS(max_features=100) + mfs = MFS() expected = [6, 12, 9, 4, 10, 0] self.assertListAlmostEqual( expected, mfs.cfs(self.X_w, self.y_w).get_results() @@ -45,6 +45,23 @@ class MFS_test(unittest.TestCase): ] self.assertListAlmostEqual(expected, mfs.get_scores()) + def test_csf_wine_cont(self): + mfs = MFS(discrete=False) + expected = [6, 11, 9, 0, 12, 5] + self.assertListAlmostEqual( + expected, mfs.cfs(self.X_wc, self.y_w).get_results() + ) + expected = [ + 0.5218299405215557, + 0.602513857132804, + 0.4877384978817362, + 0.3743688234383051, + 0.28795671854246285, + 0.2309165735173175, + ] + # 
self.assertListAlmostEqual(expected, mfs.get_scores()) + print(expected, mfs.get_scores()) + def test_csf_max_features(self): mfs = MFS(max_features=3) expected = [6, 12, 9] @@ -59,7 +76,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_csf_iris(self): - mfs = MFS(max_features=100) + mfs = MFS() expected = [3, 2, 0, 1] computed = mfs.cfs(self.X_i, self.y_i).get_results() self.assertListAlmostEqual(expected, computed) @@ -72,7 +89,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_fcbf_wine(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_w, self.y_w, threshold=0.05).get_results() expected = [6, 9, 12, 0, 11, 4] self.assertListAlmostEqual(expected, computed) @@ -99,7 +116,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_fcbf_iris(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_i, self.y_i, threshold=0.05).get_results() expected = [3, 2] self.assertListAlmostEqual(expected, computed) @@ -107,7 +124,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_compute_su_labels(self): - mfs = MFS(max_features=100) + mfs = MFS() mfs.fcbf(self.X_i, self.y_i, threshold=0.05) expected = [0.0, 0.0, 0.810724587460511, 0.870521418179061] self.assertListAlmostEqual(expected, mfs._compute_su_labels().tolist()) @@ -115,12 +132,12 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual([1, 2, 3, 4], mfs._compute_su_labels()) def test_invalid_threshold(self): - mfs = MFS(max_features=100) + mfs = MFS() with self.assertRaises(ValueError): mfs.fcbf(self.X_i, self.y_i, threshold=1e-15) def test_fcbf_exit_threshold(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_w, self.y_w, threshold=0.4).get_results() expected = [6, 9, 12] self.assertListAlmostEqual(expected, computed) diff --git a/mfs/tests/Metrics_test.py b/mfs/tests/Metrics_test.py index e68c86c..bc8681e 100755 --- a/mfs/tests/Metrics_test.py +++ b/mfs/tests/Metrics_test.py @@ -1,5 +1,6 @@ import unittest -from sklearn.datasets import load_iris +import numpy as np +from sklearn.datasets import load_iris, load_wine from mdlp import MDLP from ..Selection import Metrics @@ -8,12 +9,10 @@ class Metrics_test(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) - X, self.y = load_iris(return_X_y=True) - self.X = mdlp.fit_transform(X, self.y).astype("int64") - self.m, self.n = self.X.shape - - # @classmethod - # def setup(cls): + self.X_i_c, self.y_i = load_iris(return_X_y=True) + self.X_i = mdlp.fit_transform(self.X_i_c, self.y_i).astype("int64") + self.X_w_c, self.y_w = load_wine(return_X_y=True) + self.X_w = mdlp.fit_transform(self.X_w_c, self.y_w).astype("int64") def test_entropy(self): metric = Metrics() @@ -24,12 +23,51 @@ class Metrics_test(unittest.TestCase): ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263), ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263), ([1, 1, 5], 2, 0.9182958340544896), - (self.y, 3, 0.999999999), + (self.y_i, 3, 0.999999999), ] for dataset, base, entropy in datasets: computed = metric.entropy(dataset, base) self.assertAlmostEqual(entropy, computed) + def test_differential_entropy(self): + metric = Metrics() + datasets = [ + ([0, 0, 0, 0, 1, 1, 1, 1], 6, 1.0026709900837547096), + ([0, 1, 0, 2, 1, 2], 5, 1.3552453009332424), + ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 7, 
1.7652626150881443), + ([1, 1, 1, 5, 2, 2, 3, 3, 3], 8, 1.9094631320594582), + ([1, 1, 1, 2, 2, 3, 3, 3, 5], 8, 1.9094631320594582), + ([1, 1, 5], 2, 2.5794415416798357), + (self.X_i_c, 37, 3.06627326925228), + (self.X_w_c, 37, 63.13827518897429), + ] + for dataset, base, entropy in datasets: + computed = metric.differential_entropy( + np.array(dataset, dtype="float64"), base + ) + self.assertAlmostEqual(entropy, computed, msg=str(dataset)) + expected = [ + 1.6378708764142766, + 2.0291571802275037, + 0.8273865123744271, + 3.203935772642847, + 4.859193341386733, + 1.3707315434976266, + 1.8794952925706312, + -0.2983180654207054, + 1.4521478934625076, + 2.834404839362728, + 0.4894081282811191, + 1.361210381692561, + 7.6373991502818175, + ] + n_samples, n_features = self.X_w_c.shape + for c, res_expected in zip(range(n_features), expected): + computed = metric.differential_entropy( + self.X_w_c[:, c], n_samples - 1 + ) + self.assertAlmostEqual(computed, res_expected) + def test_conditional_entropy(self): metric = Metrics() results_expected = [ @@ -39,7 +77,7 @@ class Metrics_test(unittest.TestCase): 0.13032469395094992, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.conditional_entropy(self.X[:, col], self.y, 3) + computed = metric.conditional_entropy(self.X_i[:, col], self.y, 3) self.assertAlmostEqual(expected, computed) self.assertAlmostEqual( 0.6309297535714573, @@ -62,7 +100,7 @@ class Metrics_test(unittest.TestCase): 0.8696753060490499, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.information_gain(self.X[:, col], self.y, 3) + computed = metric.information_gain(self.X_i[:, col], self.y, 3) self.assertAlmostEqual(expected, computed) # https://planetcalc.com/8419/ # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_ @@ -73,7 +111,7 @@ class Metrics_test(unittest.TestCase): 1.378402748, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.information_gain(self.X[:, col], self.y, 2) + computed = metric.information_gain(self.X_i[:, col], self.y, 2) self.assertAlmostEqual(expected, computed) def test_symmetrical_uncertainty(self): @@ -85,5 +123,20 @@ class Metrics_test(unittest.TestCase): 0.870521418179061, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.symmetrical_uncertainty(self.X[:, col], self.y) + computed = metric.symmetrical_uncertainty(self.X_i[:, col], self.y) self.assertAlmostEqual(expected, computed) + + def test_symmetrical_uncertainty_continuous(self): + metric = Metrics() + results_expected = [ + 0.33296547388990266, + 0.19068147573570668, + 0.810724587460511, + 0.870521418179061, + ] + for expected, col in zip(results_expected, range(self.n)): + computed = metric.symmetrical_unc_continuous( + self.X_i[:, col], self.y + ) + print(computed) + # self.assertAlmostEqual(expected, computed)
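
A minimal usage sketch of the API this patch introduces (not part of the diff above). It assumes the package is importable as mfs with the mfs.Selection and mfs.Metrics modules shown in the patch, and mirrors the wine-dataset setup used in the tests; the exact indices and scores are whatever cfs()/fcbf() compute on the installed scikit-learn build.

    import numpy as np
    from sklearn.datasets import load_wine

    from mfs.Selection import MFS      # assumed import path
    from mfs.Metrics import Metrics    # assumed import path

    X, y = load_wine(return_X_y=True)

    # discrete=False switches the selector to the continuous estimators added
    # in this patch (information_gain_cont / differential_entropy), so the
    # raw, undiscretized features can be used directly.
    selector = MFS(max_features=3, discrete=False)
    subset = selector.cfs(X, y).get_results()  # indices of the selected features
    merits = selector.get_scores()             # CFS merit of each accepted subset

    # FCBF needs an explicit threshold; values below 1e-7 raise ValueError.
    fcbf_subset = MFS(discrete=False).fcbf(X, y, threshold=0.05).get_results()

    # The metrics can also be called on a single feature.
    su = Metrics.symmetrical_unc_continuous(X[:, 0], y)

    # Sanity check for the Kozachenko-Leonenko estimator: on N(0, 1) samples
    # it should approach the analytic value 0.5 * log(2 * pi * e) ~= 1.419 nats.
    rng = np.random.default_rng(0)
    h_gauss = Metrics.differential_entropy(rng.standard_normal(5000), k=3)

    print(subset, merits, fcbf_subset, su, h_gauss)

With discrete=True (the default) the same calls expect already-discretized features, as in MFS_test, and max_features=None (the default) lets the search consider every feature, replacing the mandatory max_features argument of the previous constructor.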