From 794374fe8cb48c4152fd693ecd5c7b8f333be70a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 1 Jun 2021 11:30:52 +0200 Subject: [PATCH] Add max_features to selection Add first approach to continuous variables --- README.md | 1 + mfs/Metrics.py | 228 ++++++++++++++++++++++++++ mfs/Selection.py | 189 +++++++-------------- mfs/entropy_estimators.py | 334 ++++++++++++++++++++++++++++++++++++++ mfs/tests/MFS_test.py | 43 +++-- mfs/tests/Metrics_test.py | 77 +++++++-- 6 files changed, 714 insertions(+), 158 deletions(-) create mode 100755 mfs/Metrics.py create mode 100755 mfs/entropy_estimators.py diff --git a/README.md b/README.md index 28bbd14..d5c7ef8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ ![CI](https://github.com/Doctorado-ML/mfs/workflows/CI/badge.svg) +[![Codacy Badge](https://app.codacy.com/project/badge/Grade/66ad727eb13e4c7a8816db1e44d994a7)](https://www.codacy.com/gh/Doctorado-ML/mfs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mfs&utm_campaign=Badge_Grade) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/mfs.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/mfs/context:python) # MFS diff --git a/mfs/Metrics.py b/mfs/Metrics.py new file mode 100755 index 0000000..7b4c0a2 --- /dev/null +++ b/mfs/Metrics.py @@ -0,0 +1,228 @@ +from math import log +import numpy as np + +from scipy.special import gamma, psi +from sklearn.neighbors import BallTree, KDTree, NearestNeighbors +from sklearn.feature_selection._mutual_info import _compute_mi + +# from .entropy_estimators import mi, entropy as c_entropy + + +class Metrics: + @staticmethod + def information_gain_cont(x, y): + """Measures the reduction in uncertainty about the value of y when the + value of X continuous is known (also called mutual information) + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + Information gained + """ + return _compute_mi( + x, y, x_discrete=False, y_discrete=True, n_neighbors=3 + ) + + @staticmethod + def _nearest_distances(X, k=1): + """ + X = array(N,M) + N = number of points + M = number of dimensions + returns the distance to the kth nearest neighbor for every point in X + """ + knn = NearestNeighbors(n_neighbors=k + 1) + knn.fit(X) + d, _ = knn.kneighbors(X) # the first nearest neighbor is itself + return d[:, -1] # returns the distance to the kth nearest neighbor + + @staticmethod + def differential_entropy(X, k=1): + + """Returns the entropy of the X. + Parameters + =========== + X : array-like, shape (n_samples, n_features) + The data the entropy of which is computed + k : int, optional + number of nearest neighbors for density estimation + Notes + ====== + Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy + of a random vector. Probl. Inf. Transm. 23, 95-101. + See also: Evans, D. 2008 A computationally efficient estimator for + mutual information, Proc. R. Soc. A 464 (2093), 1203-1215. + and: + Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual + information. Phys Rev E 69(6 Pt 2):066138. + """ + if X.ndim == 1: + X = X.reshape(-1, 1) + # Distance to kth nearest neighbor + r = Metrics._nearest_distances(X, k) # squared distances + n, d = X.shape + volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1) + """ + F. 
Perez-Cruz, (2008). Estimation of Information Theoretic Measures + for Continuous Random Variables. Advances in Neural Information + Processing Systems 21 (NIPS). Vancouver (Canada), December. + return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k) + """ + return ( + d * np.mean(np.log(r + np.finfo(X.dtype).eps)) + + np.log(volume_unit_ball) + + psi(n) + - psi(k) + ) + + @staticmethod + def conditional_differential_entropy(x, y): + """quantifies the amount of information needed to describe the outcome + of Y discrete given that the value of X continuous is known + computes H(Y|X) + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + conditional entropy of y given x + """ + xy = np.c_[x, y] + return Metrics.differential_entropy(xy) - Metrics.differential_entropy( + x + ) + + @staticmethod + def symmetrical_unc_continuous(x, y): + """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet + https://github.com/gregversteeg/NPEET + + Parameters + ---------- + x : np.array + values of the continuous variable + y : np.array + array of labels + + Returns + ------- + float + symmetrical uncertainty + """ + + return ( + 2.0 + * Metrics.information_gain_cont(x, y) + / (Metrics.differential_entropy(x) + Metrics.entropy(y)) + ) + + @staticmethod + def symmetrical_uncertainty(x, y): + """Compute symmetrical uncertainty. Normalize* information gain (mutual + information) with the entropies of the features in order to compensate + the bias due to high cardinality features. *Range [0, 1] + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + + Returns + ------- + float + symmetrical uncertainty + """ + return ( + 2.0 + * Metrics.information_gain(x, y) + / (Metrics.entropy(x) + Metrics.entropy(y)) + ) + + @staticmethod + def conditional_entropy(x, y, base=2): + """quantifies the amount of information needed to describe the outcome + of Y given that the value of X is known + computes H(Y|X) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + conditional entropy of y given x + """ + xy = np.c_[x, y] + return Metrics.entropy(xy, base) - Metrics.entropy(x, base) + + @staticmethod + def entropy(y, base=2): + """measure of the uncertainty in predicting the value of y + + Parameters + ---------- + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + entropy of y + """ + _, count = np.unique(y, return_counts=True, axis=0) + proba = count.astype(float) / len(y) + proba = proba[proba > 0.0] + return np.sum(proba * np.log(1.0 / proba)) / log(base) + + @staticmethod + def information_gain(x, y, base=2): + """Measures the reduction in uncertainty about the value of y when the + value of X is known (also called mutual information) + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + + Parameters + ---------- + x : np.array + values of the variable + y : np.array + array of labels + base : int, optional + base of the logarithm, by default 2 + + Returns + ------- + float + Information gained + """ + return Metrics.entropy(y, base) - Metrics.conditional_entropy( + x, y, base + ) diff --git a/mfs/Selection.py 
b/mfs/Selection.py index 574f8fa..b6aa02d 100755 --- a/mfs/Selection.py +++ b/mfs/Selection.py @@ -1,102 +1,8 @@ -from math import log, sqrt +from math import sqrt from sys import float_info from itertools import combinations import numpy as np - - -class Metrics: - @staticmethod - def conditional_entropy(x, y, base=2): - """quantifies the amount of information needed to describe the outcome - of Y given that the value of X is known - computes H(Y|X) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - conditional entropy of y given x - """ - xy = np.c_[x, y] - return Metrics.entropy(xy, base) - Metrics.entropy(x, base) - - @staticmethod - def entropy(y, base=2): - """measure of the uncertainty in predicting the value of y - - Parameters - ---------- - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - entropy of y - """ - _, count = np.unique(y, return_counts=True, axis=0) - proba = count.astype(float) / len(y) - proba = proba[proba > 0.0] - return np.sum(proba * np.log(1.0 / proba)) / log(base) - - @staticmethod - def information_gain(x, y, base=2): - """Measures the reduction in uncertainty about the value of y when the - value of X is known (also called mutual information) - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - Information gained - """ - return Metrics.entropy(y, base) - Metrics.conditional_entropy( - x, y, base - ) - - @staticmethod - def symmetrical_uncertainty(x, y): - """Compute symmetrical uncertainty. Normalize* information gain (mutual - information) with the entropies of the features in order to compensate - the bias due to high cardinality features. 
*Range [0, 1] - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - - Returns - ------- - float - symmetrical uncertainty - """ - return ( - 2.0 - * Metrics.information_gain(x, y) - / (Metrics.entropy(x) + Metrics.entropy(y)) - ) +from .Metrics import Metrics class MFS: @@ -116,18 +22,36 @@ class MFS: The maximum number of features to return """ - def __init__(self, max_features): - self._initialize() + def __init__(self, max_features=None, discrete=True): self._max_features = max_features + self._discrete = discrete + self.symmetrical_uncertainty = ( + Metrics.symmetrical_uncertainty + if discrete + else Metrics.symmetrical_unc_continuous + ) + self._fitted = False - def _initialize(self): + def _initialize(self, X, y): """Initialize the attributes so support multiple calls using same object + + Parameters + ---------- + X : np.array + array of features + y : np.array + vector of labels """ + self.X_ = X + self.y_ = y + if self._max_features is None: + self._max_features = X.shape[1] self._result = None self._scores = [] self._su_labels = None self._su_features = {} + self._fitted = True def _compute_su_labels(self): """Compute symmetrical uncertainty between each feature of the dataset @@ -142,7 +66,7 @@ class MFS: num_features = self.X_.shape[1] self._su_labels = np.zeros(num_features) for col in range(num_features): - self._su_labels[col] = Metrics.symmetrical_uncertainty( + self._su_labels[col] = self.symmetrical_uncertainty( self.X_[:, col], self.y_ ) return self._su_labels @@ -166,7 +90,7 @@ class MFS: if (feature_a, feature_b) not in self._su_features: self._su_features[ (feature_a, feature_b) - ] = Metrics.symmetrical_uncertainty( + ] = self.symmetrical_uncertainty( self.X_[:, feature_a], self.X_[:, feature_b] ) return self._su_features[(feature_a, feature_b)] @@ -210,9 +134,7 @@ class MFS: self self """ - self._initialize() - self.X_ = X - self.y_ = y + self._initialize(X, y) s_list = self._compute_su_labels() # Descending order feature_order = (-s_list).argsort().tolist() @@ -235,33 +157,36 @@ class MFS: candidates.append(feature_order[id_selected]) self._scores.append(merit) del feature_order[id_selected] - if ( - len(feature_order) == 0 - or len(candidates) == self._max_features - ): - # Force leaving the loop - continue_condition = False - if len(self._scores) >= 5: - """ - "To prevent the best first search from exploring the entire - feature subset search space, a stopping criterion is imposed. - The search will terminate if five consecutive fully expanded - subsets show no improvement over the current best subset." - as stated in Mark A. Hall Thesis - """ - item_ant = -1 - for item in self._scores[-5:]: - if item_ant == -1: - item_ant = item - if item > item_ant: - break - else: - item_ant = item - else: - continue_condition = False + continue_condition = self._cfs_continue_condition( + feature_order, candidates + ) self._result = candidates return self + def _cfs_continue_condition(self, feature_order, candidates): + if len(feature_order) == 0 or len(candidates) == self._max_features: + # Force leaving the loop + return False + if len(self._scores) >= 5: + """ + "To prevent the best first search from exploring the entire + feature subset search space, a stopping criterion is imposed. + The search will terminate if five consecutive fully expanded + subsets show no improvement over the current best subset." + as stated in Mark A. 
Hall Thesis + """ + item_ant = -1 + for item in self._scores[-5:]: + if item_ant == -1: + item_ant = item + if item > item_ant: + break + else: + item_ant = item + else: + return False + return True + def fcbf(self, X, y, threshold): """Fast Correlation-Based Filter @@ -286,9 +211,7 @@ class MFS: """ if threshold < 1e-7: raise ValueError("Threshold cannot be less than 1e-7") - self._initialize() - self.X_ = X - self.y_ = y + self._initialize(X, y) s_list = self._compute_su_labels() feature_order = (-s_list).argsort() feature_dup = feature_order.copy().tolist() @@ -322,7 +245,7 @@ class MFS: list list of features indices selected """ - return self._result + return self._result if self._fitted else [] def get_scores(self): """Return the scores computed for the features selected @@ -332,4 +255,4 @@ class MFS: list list of scores of the features selected """ - return self._scores + return self._scores if self._fitted else [] diff --git a/mfs/entropy_estimators.py b/mfs/entropy_estimators.py new file mode 100755 index 0000000..e0c1630 --- /dev/null +++ b/mfs/entropy_estimators.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python +# Written by Greg Ver Steeg +# See readme.pdf for documentation +# Or go to http://www.isi.edu/~gregv/npeet.html + +import warnings + +import numpy as np +import numpy.linalg as la +from numpy import log +from scipy.special import digamma +from sklearn.neighbors import BallTree, KDTree + +# CONTINUOUS ESTIMATORS + + +def entropy(x, k=3, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator + x should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert k <= len(x) - 1, "Set k smaller than num. samples - 1" + x = np.asarray(x) + n_elements, n_features = x.shape + x = add_noise(x) + tree = build_tree(x) + nn = query_neighbors(tree, x, k) + const = digamma(n_elements) - digamma(k) + n_features * log(2) + return (const + n_features * np.log(nn).mean()) / log(base) + + +def centropy(x, y, k=3, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator for the + entropy of X conditioned on Y. + """ + xy = np.c_[x, y] + entropy_union_xy = entropy(xy, k=k, base=base) + entropy_y = entropy(y, k=k, base=base) + return entropy_union_xy - entropy_y + + +def tc(xs, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + entropy_features = [entropy(col, k=k, base=base) for col in xs_columns] + return np.sum(entropy_features) - entropy(xs, k, base) + + +def ctc(xs, y, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [ + centropy(col, y, k=k, base=base) for col in xs_columns + ] + return np.sum(centropy_features) - centropy(xs, y, k, base) + + +def corex(xs, ys, k=3, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + cmi_features = [mi(col, ys, k=k, base=base) for col in xs_columns] + return np.sum(cmi_features) - mi(xs, ys, k=k, base=base) + + +def mi(x, y, z=None, k=3, base=2, alpha=0): + """Mutual information of x and y (conditioned on z if z is not None) + x, y should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert len(x) == len(y), "Arrays should have same length" + assert k <= len(x) - 1, "Set k smaller than num. 
samples - 1" + x, y = np.asarray(x), np.asarray(y) + x, y = x.reshape(x.shape[0], -1), y.reshape(y.shape[0], -1) + x = add_noise(x) + y = add_noise(y) + points = [x, y] + if z is not None: + z = np.asarray(z) + z = z.reshape(z.shape[0], -1) + points.append(z) + points = np.hstack(points) + # Find nearest neighbors in joint space, p=inf means max-norm + tree = build_tree(points) + dvec = query_neighbors(tree, points, k) + if z is None: + a, b, c, d = ( + avgdigamma(x, dvec), + avgdigamma(y, dvec), + digamma(k), + digamma(len(x)), + ) + if alpha > 0: + d += lnc_correction(tree, points, k, alpha) + else: + xz = np.c_[x, z] + yz = np.c_[y, z] + a, b, c, d = ( + avgdigamma(xz, dvec), + avgdigamma(yz, dvec), + avgdigamma(z, dvec), + digamma(k), + ) + return (-a - b + c + d) / log(base) + + +def cmi(x, y, z, k=3, base=2): + """Mutual information of x and y, conditioned on z + Legacy function. Use mi(x, y, z) directly. + """ + return mi(x, y, z=z, k=k, base=base) + + +def kldiv(x, xp, k=3, base=2): + """KL Divergence between p and q for x~p(x), xp~q(x) + x, xp should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1],[2.4]] + if x is a one-dimensional scalar and we have four samples + """ + assert k < min(len(x), len(xp)), "Set k smaller than num. samples - 1" + assert len(x[0]) == len(xp[0]), "Two distributions must have same dim." + x, xp = np.asarray(x), np.asarray(xp) + x, xp = x.reshape(x.shape[0], -1), xp.reshape(xp.shape[0], -1) + d = len(x[0]) + n = len(x) + m = len(xp) + const = log(m) - log(n - 1) + tree = build_tree(x) + treep = build_tree(xp) + nn = query_neighbors(tree, x, k) + nnp = query_neighbors(treep, x, k - 1) + return (const + d * (np.log(nnp).mean() - np.log(nn).mean())) / log(base) + + +def lnc_correction(tree, points, k, alpha): + e = 0 + n_sample = points.shape[0] + for point in points: + # Find k-nearest neighbors in joint space, p=inf means max norm + knn = tree.query(point[None, :], k=k + 1, return_distance=False)[0] + knn_points = points[knn] + # Substract mean of k-nearest neighbor points + knn_points = knn_points - knn_points[0] + # Calculate covariance matrix of k-nearest neighbor points, obtain eigen vectors + covr = knn_points.T @ knn_points / k + _, v = la.eig(covr) + # Calculate PCA-bounding box using eigen vectors + V_rect = np.log(np.abs(knn_points @ v).max(axis=0)).sum() + # Calculate the volume of original box + log_knn_dist = np.log(np.abs(knn_points).max(axis=0)).sum() + + # Perform local non-uniformity checking and update correction term + if V_rect < log_knn_dist + np.log(alpha): + e += (log_knn_dist - V_rect) / n_sample + return e + + +# DISCRETE ESTIMATORS +def entropyd(sx, base=2): + """Discrete entropy estimator + sx is a list of samples + """ + unique, count = np.unique(sx, return_counts=True, axis=0) + # Convert to float as otherwise integer division results in all 0 for proba. + proba = count.astype(float) / len(sx) + # Avoid 0 division; remove probabilities == 0.0 (removing them does not change the entropy estimate as 0 * log(1/0) = 0. 
+ proba = proba[proba > 0.0] + return np.sum(proba * np.log(1.0 / proba)) / log(base) + + +def midd(x, y, base=2): + """Discrete mutual information estimator + Given a list of samples which can be any hashable object + """ + assert len(x) == len(y), "Arrays should have same length" + return entropyd(x, base) - centropyd(x, y, base) + + +def cmidd(x, y, z, base=2): + """Discrete mutual information estimator + Given a list of samples which can be any hashable object + """ + assert len(x) == len(y) == len(z), "Arrays should have same length" + xz = np.c_[x, z] + yz = np.c_[y, z] + xyz = np.c_[x, y, z] + return ( + entropyd(xz, base) + + entropyd(yz, base) + - entropyd(xyz, base) + - entropyd(z, base) + ) + + +def centropyd(x, y, base=2): + """The classic K-L k-nearest neighbor continuous entropy estimator for the + entropy of X conditioned on Y. + """ + xy = np.c_[x, y] + return entropyd(xy, base) - entropyd(y, base) + + +def tcd(xs, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + entropy_features = [entropyd(col, base=base) for col in xs_columns] + return np.sum(entropy_features) - entropyd(xs, base) + + +def ctcd(xs, y, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [centropyd(col, y, base=base) for col in xs_columns] + return np.sum(centropy_features) - centropyd(xs, y, base) + + +def corexd(xs, ys, base=2): + xs_columns = np.expand_dims(xs, axis=0).T + cmi_features = [midd(col, ys, base=base) for col in xs_columns] + return np.sum(cmi_features) - midd(xs, ys, base) + + +# MIXED ESTIMATORS +def micd(x, y, k=3, base=2, warning=True): + """If x is continuous and y is discrete, compute mutual information""" + assert len(x) == len(y), "Arrays should have same length" + entropy_x = entropy(x, k, base) + + y_unique, y_count = np.unique(y, return_counts=True, axis=0) + y_proba = y_count / len(y) + + entropy_x_given_y = 0.0 + for yval, py in zip(y_unique, y_proba): + x_given_y = x[(y == yval).all(axis=1)] + if k <= len(x_given_y) - 1: + entropy_x_given_y += py * entropy(x_given_y, k, base) + else: + if warning: + warnings.warn( + "Warning, after conditioning, on y={yval} insufficient data. " + "Assuming maximal entropy in this case.".format(yval=yval) + ) + entropy_x_given_y += py * entropy_x + return abs(entropy_x - entropy_x_given_y) # units already applied + + +def midc(x, y, k=3, base=2, warning=True): + return micd(y, x, k, base, warning) + + +def centropycd(x, y, k=3, base=2, warning=True): + return entropy(x, base) - micd(x, y, k, base, warning) + + +def centropydc(x, y, k=3, base=2, warning=True): + return centropycd(y, x, k=k, base=base, warning=warning) + + +def ctcdc(xs, y, k=3, base=2, warning=True): + xs_columns = np.expand_dims(xs, axis=0).T + centropy_features = [ + centropydc(col, y, k=k, base=base, warning=warning) + for col in xs_columns + ] + return np.sum(centropy_features) - centropydc(xs, y, k, base, warning) + + +def ctccd(xs, y, k=3, base=2, warning=True): + return ctcdc(y, xs, k=k, base=base, warning=warning) + + +def corexcd(xs, ys, k=3, base=2, warning=True): + return corexdc(ys, xs, k=k, base=base, warning=warning) + + +def corexdc(xs, ys, k=3, base=2, warning=True): + return tcd(xs, base) - ctcdc(xs, ys, k, base, warning) + + +# UTILITY FUNCTIONS + + +def add_noise(x, intens=1e-10): + # small noise to break degeneracy, see doc. 
+ return x + intens * np.random.random_sample(x.shape) + + +def query_neighbors(tree, x, k): + return tree.query(x, k=k + 1)[0][:, k] + + +def count_neighbors(tree, x, r): + return tree.query_radius(x, r, count_only=True) + + +def avgdigamma(points, dvec): + # This part finds number of neighbors in some radius in the marginal space + # returns expectation value of + tree = build_tree(points) + dvec = dvec - 1e-15 + num_points = count_neighbors(tree, points, dvec) + return np.mean(digamma(num_points)) + + +def build_tree(points): + if points.shape[1] >= 20: + return BallTree(points, metric="chebyshev") + return KDTree(points, metric="chebyshev") + + +# TESTS + + +def shuffle_test(measure, x, y, z=False, ns=200, ci=0.95, **kwargs): + """Shuffle test + Repeatedly shuffle the x-values and then estimate measure(x, y, [z]). + Returns the mean and conf. interval ('ci=0.95' default) over 'ns' runs. + 'measure' could me mi, cmi, e.g. Keyword arguments can be passed. + Mutual information and CMI should have a mean near zero. + """ + x_clone = np.copy(x) # A copy that we can shuffle + outputs = [] + for _ in range(ns): + np.random.shuffle(x_clone) + if z: + outputs.append(measure(x_clone, y, z, **kwargs)) + else: + outputs.append(measure(x_clone, y, **kwargs)) + outputs.sort() + return np.mean(outputs), ( + outputs[int((1.0 - ci) / 2 * ns)], + outputs[int((1.0 + ci) / 2 * ns)], + ) + + +if __name__ == "__main__": + print("MI between two independent continuous random variables X and Y:") + np.random.seed(0) + x = np.random.randn(1000, 10) + y = np.random.randn(1000, 3) + print(mi(x, y, base=2, alpha=0)) diff --git a/mfs/tests/MFS_test.py b/mfs/tests/MFS_test.py index db9aebf..0e9129d 100755 --- a/mfs/tests/MFS_test.py +++ b/mfs/tests/MFS_test.py @@ -9,11 +9,11 @@ class MFS_test(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) - X, self.y_w = load_wine(return_X_y=True) - self.X_w = mdlp.fit_transform(X, self.y_w).astype("int64") - X, self.y_i = load_iris(return_X_y=True) + self.X_wc, self.y_w = load_wine(return_X_y=True) + self.X_w = mdlp.fit_transform(self.X_wc, self.y_w).astype("int64") + self.X_ic, self.y_i = load_iris(return_X_y=True) mdlp = MDLP(random_state=1) - self.X_i = mdlp.fit_transform(X, self.y_i).astype("int64") + self.X_i = mdlp.fit_transform(self.X_ic, self.y_i).astype("int64") def assertListAlmostEqual(self, list1, list2, tol=7): self.assertEqual(len(list1), len(list2)) @@ -21,16 +21,16 @@ class MFS_test(unittest.TestCase): self.assertAlmostEqual(a, b, tol) def test_initialize(self): - mfs = MFS(max_features=100) + mfs = MFS() mfs.fcbf(self.X_w, self.y_w, 0.05) - mfs._initialize() + mfs._initialize(self.X_w, self.y_w) self.assertIsNone(mfs.get_results()) self.assertListEqual([], mfs.get_scores()) self.assertDictEqual({}, mfs._su_features) self.assertIsNone(mfs._su_labels) def test_csf_wine(self): - mfs = MFS(max_features=100) + mfs = MFS() expected = [6, 12, 9, 4, 10, 0] self.assertListAlmostEqual( expected, mfs.cfs(self.X_w, self.y_w).get_results() @@ -45,6 +45,23 @@ class MFS_test(unittest.TestCase): ] self.assertListAlmostEqual(expected, mfs.get_scores()) + def test_csf_wine_cont(self): + mfs = MFS(discrete=False) + expected = [6, 11, 9, 0, 12, 5] + self.assertListAlmostEqual( + expected, mfs.cfs(self.X_wc, self.y_w).get_results() + ) + expected = [ + 0.5218299405215557, + 0.602513857132804, + 0.4877384978817362, + 0.3743688234383051, + 0.28795671854246285, + 0.2309165735173175, + ] + # 
self.assertListAlmostEqual(expected, mfs.get_scores()) + print(expected, mfs.get_scores()) + def test_csf_max_features(self): mfs = MFS(max_features=3) expected = [6, 12, 9] @@ -59,7 +76,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_csf_iris(self): - mfs = MFS(max_features=100) + mfs = MFS() expected = [3, 2, 0, 1] computed = mfs.cfs(self.X_i, self.y_i).get_results() self.assertListAlmostEqual(expected, computed) @@ -72,7 +89,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_fcbf_wine(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_w, self.y_w, threshold=0.05).get_results() expected = [6, 9, 12, 0, 11, 4] self.assertListAlmostEqual(expected, computed) @@ -99,7 +116,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_fcbf_iris(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_i, self.y_i, threshold=0.05).get_results() expected = [3, 2] self.assertListAlmostEqual(expected, computed) @@ -107,7 +124,7 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual(expected, mfs.get_scores()) def test_compute_su_labels(self): - mfs = MFS(max_features=100) + mfs = MFS() mfs.fcbf(self.X_i, self.y_i, threshold=0.05) expected = [0.0, 0.0, 0.810724587460511, 0.870521418179061] self.assertListAlmostEqual(expected, mfs._compute_su_labels().tolist()) @@ -115,12 +132,12 @@ class MFS_test(unittest.TestCase): self.assertListAlmostEqual([1, 2, 3, 4], mfs._compute_su_labels()) def test_invalid_threshold(self): - mfs = MFS(max_features=100) + mfs = MFS() with self.assertRaises(ValueError): mfs.fcbf(self.X_i, self.y_i, threshold=1e-15) def test_fcbf_exit_threshold(self): - mfs = MFS(max_features=100) + mfs = MFS() computed = mfs.fcbf(self.X_w, self.y_w, threshold=0.4).get_results() expected = [6, 9, 12] self.assertListAlmostEqual(expected, computed) diff --git a/mfs/tests/Metrics_test.py b/mfs/tests/Metrics_test.py index e68c86c..bc8681e 100755 --- a/mfs/tests/Metrics_test.py +++ b/mfs/tests/Metrics_test.py @@ -1,5 +1,6 @@ import unittest -from sklearn.datasets import load_iris +import numpy as np +from sklearn.datasets import load_iris, load_wine from mdlp import MDLP from ..Selection import Metrics @@ -8,12 +9,10 @@ class Metrics_test(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) - X, self.y = load_iris(return_X_y=True) - self.X = mdlp.fit_transform(X, self.y).astype("int64") - self.m, self.n = self.X.shape - - # @classmethod - # def setup(cls): + self.X_i_c, self.y_i = load_iris(return_X_y=True) + self.X_i = mdlp.fit_transform(self.X_i_c, self.y_i).astype("int64") + self.X_w_c, self.y_w = load_wine(return_X_y=True) + self.X_w = mdlp.fit_transform(self.X_w_c, self.y_w).astype("int64") def test_entropy(self): metric = Metrics() @@ -24,12 +23,51 @@ class Metrics_test(unittest.TestCase): ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263), ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263), ([1, 1, 5], 2, 0.9182958340544896), - (self.y, 3, 0.999999999), + (self.y_i, 3, 0.999999999), ] for dataset, base, entropy in datasets: computed = metric.entropy(dataset, base) self.assertAlmostEqual(entropy, computed) + def test_differential_entropy(self): + metric = Metrics() + datasets = [ + ([0, 0, 0, 0, 1, 1, 1, 1], 6, 1.0026709900837547096), + ([0, 1, 0, 2, 1, 2], 5, 1.3552453009332424), + ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 7, 
1.7652626150881443), + ([1, 1, 1, 5, 2, 2, 3, 3, 3], 8, 1.9094631320594582), + ([1, 1, 1, 2, 2, 3, 3, 3, 5], 8, 1.9094631320594582), + ([1, 1, 5], 2, 2.5794415416798357), + (self.X_i_c, 37, 3.06627326925228), + (self.X_w_c, 37, 63.13827518897429), + ] + for dataset, base, entropy in datasets: + computed = metric.differential_entropy( + np.array(dataset, dtype="float64"), base + ) + self.assertAlmostEqual(entropy, computed, msg=str(dataset)) + expected = [ + 1.6378708764142766, + 2.0291571802275037, + 0.8273865123744271, + 3.203935772642847, + 4.859193341386733, + 1.3707315434976266, + 1.8794952925706312, + -0.2983180654207054, + 1.4521478934625076, + 2.834404839362728, + 0.4894081282811191, + 1.361210381692561, + 7.6373991502818175, + ] + n_samples, n_features = self.X_w_c.shape + for c, res_expected in zip(range(n_features), expected): + computed = metric.differential_entropy( + self.X_w_c[:, c], n_samples - 1 + ) + self.assertAlmostEqual(computed, res_expected) + def test_conditional_entropy(self): metric = Metrics() results_expected = [ @@ -39,7 +77,7 @@ class Metrics_test(unittest.TestCase): 0.13032469395094992, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.conditional_entropy(self.X[:, col], self.y, 3) + computed = metric.conditional_entropy(self.X_i[:, col], self.y, 3) self.assertAlmostEqual(expected, computed) self.assertAlmostEqual( 0.6309297535714573, @@ -62,7 +100,7 @@ class Metrics_test(unittest.TestCase): 0.8696753060490499, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.information_gain(self.X[:, col], self.y, 3) + computed = metric.information_gain(self.X_i[:, col], self.y, 3) self.assertAlmostEqual(expected, computed) # https://planetcalc.com/8419/ # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_ @@ -73,7 +111,7 @@ class Metrics_test(unittest.TestCase): 1.378402748, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.information_gain(self.X[:, col], self.y, 2) + computed = metric.information_gain(self.X_i[:, col], self.y, 2) self.assertAlmostEqual(expected, computed) def test_symmetrical_uncertainty(self): @@ -85,5 +123,20 @@ class Metrics_test(unittest.TestCase): 0.870521418179061, ] for expected, col in zip(results_expected, range(self.n)): - computed = metric.symmetrical_uncertainty(self.X[:, col], self.y) + computed = metric.symmetrical_uncertainty(self.X_i[:, col], self.y) self.assertAlmostEqual(expected, computed) + + def test_symmetrical_uncertainty_continuous(self): + metric = Metrics() + results_expected = [ + 0.33296547388990266, + 0.19068147573570668, + 0.810724587460511, + 0.870521418179061, + ] + for expected, col in zip(results_expected, range(self.n)): + computed = metric.symmetrical_unc_continuous( + self.X_i[:, col], self.y + ) + print(computed) + # self.assertAlmostEqual(expected, computed)
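
A minimal usage sketch of the API this patch introduces (not part of the diff above). It assumes the package is importable as mfs with the mfs.Selection and mfs.Metrics modules shown in the patch, and mirrors the wine-dataset setup used in the tests; the exact indices and scores are whatever cfs()/fcbf() compute on the installed scikit-learn build.

    import numpy as np
    from sklearn.datasets import load_wine

    from mfs.Selection import MFS      # assumed import path
    from mfs.Metrics import Metrics    # assumed import path

    X, y = load_wine(return_X_y=True)

    # discrete=False switches the selector to the continuous estimators added
    # in this patch (information_gain_cont / differential_entropy), so the
    # raw, undiscretized features can be used directly.
    selector = MFS(max_features=3, discrete=False)
    subset = selector.cfs(X, y).get_results()  # indices of the selected features
    merits = selector.get_scores()             # CFS merit of each accepted subset

    # FCBF needs an explicit threshold; values below 1e-7 raise ValueError.
    fcbf_subset = MFS(discrete=False).fcbf(X, y, threshold=0.05).get_results()

    # The metrics can also be called on a single feature.
    su = Metrics.symmetrical_unc_continuous(X[:, 0], y)

    # Sanity check for the Kozachenko-Leonenko estimator: on N(0, 1) samples
    # it should approach the analytic value 0.5 * log(2 * pi * e) ~= 1.419 nats.
    rng = np.random.default_rng(0)
    h_gauss = Metrics.differential_entropy(rng.standard_normal(5000), k=3)

    print(subset, merits, fcbf_subset, su, h_gauss)

With discrete=True (the default) the same calls expect already-discretized features, as in MFS_test, and max_features=None (the default) lets the search consider every feature, replacing the mandatory max_features argument of the previous constructor.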