first try with sklearn mi

2021-06-02 01:46:21 +02:00
parent eb00e1516a
commit 96098e9fe1
2 changed files with 95 additions and 7 deletions

@@ -1,9 +1,11 @@
 from math import log
 import numpy as np
-from scipy.special import gamma, psi
+from scipy.special import digamma, gamma, psi
+from sklearn.neighbors import BallTree, KDTree
+from sklearn.neighbors import NearestNeighbors
-from sklearn.feature_selection._mutual_info import _compute_mi
+# from sklearn.feature_selection._mutual_info import _compute_mi
 class Metrics:
@@ -19,17 +21,95 @@ class Metrics:
             values of the continuous variable
         y : np.array
             array of labels
         base : int, optional
             base of the logarithm, by default 2
 
         Returns
         -------
         float
             Information gained
         """
-        return _compute_mi(
-            x, y, x_discrete=False, y_discrete=True, n_neighbors=3
-        )
+        # return _compute_mi(
+        #     x, y, x_discrete=False, y_discrete=True, n_neighbors=3
+        # )
+        return Metrics._compute_mi_cd(x, y, n_neighbors=3)
+
+    @staticmethod
+    def _compute_mi_cd(c, d, n_neighbors):
+        """Compute mutual information between continuous and discrete variables.
+
+        Parameters
+        ----------
+        c : ndarray, shape (n_samples,)
+            Samples of a continuous random variable.
+        d : ndarray, shape (n_samples,)
+            Samples of a discrete random variable.
+        n_neighbors : int
+            Number of nearest neighbors to search for each point, see [1]_.
+
+        Returns
+        -------
+        mi : float
+            Estimated mutual information. If it turns out to be negative, it
+            is replaced by 0.
+
+        Notes
+        -----
+        True mutual information can't be negative. If its estimate by a
+        numerical method is negative, it means (provided the method is
+        adequate) that the mutual information is close to 0, and replacing
+        it by 0 is a reasonable strategy.
+
+        References
+        ----------
+        .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
+           Data Sets". PLoS ONE 9(2), 2014.
+        """
+        n_samples = c.shape[0]
+        if c.ndim == 1:
+            c = c.reshape((-1, 1))
+
+        radius = np.empty(n_samples)
+        label_counts = np.empty(n_samples)
+        k_all = np.empty(n_samples)
+        nn = NearestNeighbors()
+        for label in np.unique(d):
+            mask = d == label
+            count = np.sum(mask)
+            if count > 1:
+                k = min(n_neighbors, count - 1)
+                nn.set_params(n_neighbors=k)
+                nn.fit(c[mask])
+                r = nn.kneighbors()[0]
+                # Shrink the radius just below the k-th neighbor distance so
+                # the radius query below does not count the k-th neighbor
+                # itself.
+                radius[mask] = np.nextafter(r[:, -1], 0)
+                k_all[mask] = k
+            label_counts[mask] = count
+
+        # Ignore points with unique labels.
+        mask = label_counts > 1
+        n_samples = np.sum(mask)
+        label_counts = label_counts[mask]
+        k_all = k_all[mask]
+        c = c[mask]
+        radius = radius[mask]
+
+        # kd = KDTree(c)
+        # BallTree for larger samples, KDTree for small ones; both support
+        # the chebyshev (max-coordinate) metric the estimator relies on.
+        kd = (
+            BallTree(c, metric="chebyshev")
+            if n_samples >= 20
+            else KDTree(c, metric="chebyshev")
+        )
+        # m: for each point, how many other points fall within its
+        # label-conditional k-NN radius in the full sample.
+        m_all = kd.query_radius(
+            c, radius, count_only=True, return_distance=False
+        )
+        m_all = np.array(m_all) - 1.0
+
+        # Ross (2014) digamma estimator, clamped at zero (see Notes).
+        mi = (
+            digamma(n_samples)
+            + np.mean(digamma(k_all))
+            - np.mean(digamma(label_counts))
+            - np.mean(digamma(m_all + 1))
+        )
+        return max(0, mi)
 
     @staticmethod
     def _nearest_distances(X, k=1):
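For reference, the digamma expression at the end of _compute_mi_cd is the Ross estimator written exactly as the code computes it, with N the retained samples, N_x the number of points sharing a point's label, k the per-point neighbor count, and m the number of other points inside the label-conditional k-NN radius; the max(0, mi) clamp matches the Notes above:

    I(X;Y) \approx \psi(N) + \langle \psi(k) \rangle
                   - \langle \psi(N_x) \rangle - \langle \psi(m + 1) \rangle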

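Since the hunk vendors sklearn's private estimator, one quick validation is to compare it against the public mutual_info_classif, which applies the same Ross method to continuous features. A minimal sketch, where the iris dataset and the Metrics import are illustrative assumptions (the public function jitters continuous features, so expect close but not identical values):

# Hedged sanity check; dataset choice and the Metrics import are assumptions.
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif
# from metrics import Metrics  # assumed: the class defined in the file above

X, y = load_iris(return_X_y=True)
ours = [Metrics.information_gain_cont(X[:, i], y) for i in range(X.shape[1])]
theirs = mutual_info_classif(X, y, discrete_features=False,
                             n_neighbors=3, random_state=0)
print(ours)    # vendored estimator, per feature
print(theirs)  # sklearn's public estimator; values should be close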
@@ -17,6 +17,14 @@ print(Metrics.information_gain(X, y))
 for i in range(n):
     print(i, Metrics.information_gain(X[:, i], y))
 print("- Continuous features")
-# print(Metrics.information_gain_cont(X, y))
+print(Metrics.information_gain_cont(X, y))
 for i in range(n):
     print(i, Metrics.information_gain_cont(X[:, i], y))
+print("CFS Discrete")
+print(mfsd.cfs(X, y).get_results())
+print("CFS continuous")
+print(mfsc.cfs(X, y).get_results())
+print("FCBF Discrete")
+print(mfsd.fcbf(X, y, 1e-7).get_results())
+print("FCBF continuous")
+print(mfsc.fcbf(X, y, 1e-7).get_results())
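The driver relies on names bound earlier in the script (X, y, n, mfsd, mfsc) that fall outside this hunk. A purely hypothetical preamble consistent with how they are used, every name here being an assumption:

# Hypothetical preamble; the real definitions sit before this hunk.
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)  # assumed dataset
n = X.shape[1]                     # matches the per-feature loops above
# mfsd / mfsc are this project's feature-selector objects: both receive the
# same X and y, so the discrete/continuous distinction must live inside the
# objects themselves; their construction is not shown in the hunk.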