update and complete sample code

2025-08-17 16:45:53 +00:00 · 2021-07-29 20:35:29 +02:00
parent 59c86b8e53
commit bd709131e4
8 changed files with 391 additions and 1343 deletions
--- a/mfs/tests/MFS_test.py
+++ b/mfs/tests/MFS_test.py
@@ -47,17 +47,17 @@ class MFS_test(unittest.TestCase):

    def test_csf_wine_cont(self):
        mfs = MFS(discrete=False)
-        expected = [10, 6, 0, 2, 9, 7]
+        expected = [10, 6, 0, 2, 11, 9]
        self.assertListEqual(
            expected, mfs.cfs(self.X_wc, self.y_w).get_results()
        )
        expected = [
            0.735264150416997,
-            0.8279580852902876,
-            0.7828768186880067,
-            0.7279815238718462,
-            0.6287944059925545,
-            0.5416637958201808,
+            0.8321684551546848,
+            0.7439915858469107,
+            0.6238883340158233,
+            0.513637402071709,
+            0.41596400981378984,
        ]
        self.assertListAlmostEqual(expected, mfs.get_scores())

--- a/mfs/tests/Metrics_test.py
+++ b/mfs/tests/Metrics_test.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from sklearn.datasets import load_iris, load_wine
+from sklearn.utils import check_random_state
 from mdlp import MDLP
 from ..Selection import Metrics

@@ -173,3 +174,72 @@ class Metrics_test(unittest.TestCase):
                self.X_w_c[:, col], self.y_w
            )
            self.assertAlmostEqual(expected, computed)
+
+    def test_compute_mi_cd_wine(self):
+        metric = Metrics()
+        mi = metric._compute_mi_cd(self.X_w_c, self.y_w, 5)
+        self.assertAlmostEqual(mi, 0.27887866726386035)
+
+    def test_compute_mi_cd_no_mi(self):
+        metric = Metrics()
+        synth_y = list(range(0, self.y_w.shape[0]))
+        mi = metric._compute_mi_cd(self.X_w_c, synth_y, 1)
+        self.assertAlmostEqual(mi, 0.0)
+
+    def test_compute_mi_cd(self):
+        # code taken from sklearn.feature_selection.tests.test_mutual_info
+        # To test define a joint distribution as follows:
+        # p(x, y) = p(x) p(y | x)
+        # X ~ Bernoulli(p)
+        # (Y | x = 0) ~ Uniform(-1, 1)
+        # (Y | x = 1) ~ Uniform(0, 2)
+
+        # Use the following formula for mutual information:
+        # I(X; Y) = H(Y) - H(Y | X)
+        # Two entropies can be computed by hand:
+        # H(Y) = -(1-p)/2 * ln((1-p)/2) - p/2*log(p/2) - 1/2*log(1/2)
+        # H(Y | X) = ln(2)
+
+        # Now we need to implement sampling from out distribution, which is
+        # done easily using conditional distribution logic.
+
+        metric = Metrics()
+        n_samples = 1000
+        rng = check_random_state(0)
+
+        for p in [0.3, 0.5, 0.7]:
+            x = rng.uniform(size=n_samples) > p
+
+            y = np.empty(n_samples)
+            mask = x == 0
+            y[mask] = rng.uniform(-1, 1, size=np.sum(mask))
+            y[~mask] = rng.uniform(0, 2, size=np.sum(~mask))
+            I_theory = -0.5 * (
+                (1 - p) * np.log(0.5 * (1 - p))
+                + p * np.log(0.5 * p)
+                + np.log(0.5)
+            ) - np.log(2)
+
+            # Assert the same tolerance.
+            for n_neighbors in [3, 5, 7]:
+                I_computed = metric._compute_mi_cd(y, x, n_neighbors)
+                self.assertAlmostEqual(I_computed, I_theory, 1)
+
+    def test_compute_mi_cd_unique_label(self):
+        # code taken from sklearn.feature_selection.tests.test_mutual_info
+        # Test that adding unique label doesn't change MI.
+        metric = Metrics()
+        n_samples = 100
+        x = np.random.uniform(size=n_samples) > 0.5
+
+        y = np.empty(n_samples)
+        mask = x == 0
+        y[mask] = np.random.uniform(-1, 1, size=np.sum(mask))
+        y[~mask] = np.random.uniform(0, 2, size=np.sum(~mask))
+
+        mi_1 = metric._compute_mi_cd(y, x, 3)
+
+        x = np.hstack((x, 2))
+        y = np.hstack((y, 10))
+        mi_2 = metric._compute_mi_cd(y, x, 3)
+        self.assertAlmostEqual(mi_1, mi_2, 1)