Fix some tests

2025-08-17 08:35:52 +00:00 · 2021-06-01 23:14:22 +02:00
parent b15a059b1d
commit eb00e1516a
4 changed files with 68 additions and 12 deletions
--- a/mfs/Metrics.py
+++ b/mfs/Metrics.py
@@ -2,11 +2,9 @@ from math import log
 import numpy as np

 from scipy.special import gamma, psi
-from sklearn.neighbors import BallTree, KDTree, NearestNeighbors
+from sklearn.neighbors import NearestNeighbors
 from sklearn.feature_selection._mutual_info import _compute_mi

-# from .entropy_estimators import mi, entropy as c_entropy
-

 class Metrics:
    @staticmethod
@@ -65,6 +63,10 @@ class Metrics:
        and:
        Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
        information. Phys Rev E 69(6 Pt 2):066138.
+
+        Differential entropy can be negative
+        https://stats.stackexchange.com/questions/73881/
+        when-is-the-differential-entropy-negative
        """
        if x.ndim == 1:
            x = x.reshape(-1, 1)
@@ -131,7 +133,10 @@ class Metrics:
        return (
            2.0
            * Metrics.information_gain_cont(x, y)
-            / (Metrics.differential_entropy(x) + Metrics.entropy(y))
+            / (
+                Metrics.differential_entropy(x, k=len(x) - 1)
+                + Metrics.entropy(y)
+            )
        )

    @staticmethod
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -20,6 +20,9 @@ class MFS:
    ----------
    max_features: int
        The maximum number of features to return
+    discrete: boolean
+        Whether the features are discrete (True) or continuous (False).
+        Labels are always assumed to be discrete.
    """

    def __init__(self, max_features=None, discrete=True):
--- a/mfs/k.py
+++ b/mfs/k.py
@@ -0,0 +1,22 @@
+from sklearn.datasets import load_wine
+from mfs import MFS
+from mfs.Metrics import Metrics
+
+mfsc = MFS(discrete=False)
+mfsd = MFS(discrete=True)
+X, y = load_wine(return_X_y=True)
+m, n = X.shape
+
+print("* Differential entropy in X")
+for i in range(n):
+    print(i, Metrics.differential_entropy(X[:, i], k=10))
+
+print("* Information Gain")
+print("- Discrete features")
+print(Metrics.information_gain(X, y))
+for i in range(n):
+    print(i, Metrics.information_gain(X[:, i], y))
+print("- Continuous features")
+# print(Metrics.information_gain_cont(X, y))
+for i in range(n):
+    print(i, Metrics.information_gain_cont(X[:, i], y))
--- a/mfs/tests/Metrics_test.py
+++ b/mfs/tests/Metrics_test.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from sklearn.datasets import load_iris, load_wine
+from ..entropy_estimators import entropy
 from mdlp import MDLP
 from ..Selection import Metrics

@@ -25,9 +26,9 @@ class Metrics_test(unittest.TestCase):
            ([1, 1, 5], 2, 0.9182958340544896),
            (self.y_i, 3, 0.999999999),
        ]
-        for dataset, base, entropy in datasets:
+        for dataset, base, entropy_expected in datasets:
            computed = metric.entropy(dataset, base)
-            self.assertAlmostEqual(entropy, computed)
+            self.assertAlmostEqual(entropy_expected, computed)

    def test_differential_entropy(self):
        metric = Metrics()
@@ -41,11 +42,13 @@ class Metrics_test(unittest.TestCase):
            (self.X_i_c, 37, 3.06627326925228),
            (self.X_w_c, 37, 63.13827518897429),
        ]
-        for dataset, base, entropy in datasets:
+        for dataset, base, entropy_expected in datasets:
            computed = metric.differential_entropy(
                np.array(dataset, dtype="float64"), base
            )
-            self.assertAlmostEqual(entropy, computed, msg=str(dataset))
+            self.assertAlmostEqual(
+                entropy_expected, computed, msg=str(dataset)
+            )
        expected = [
            1.6378708764142766,
            2.0291571802275037,
@@ -68,6 +71,29 @@ class Metrics_test(unittest.TestCase):
            )
            self.assertAlmostEqual(computed, res_expected)

+    def test_dif_ent(self):
+        expected = [
+            1.6378708764142766,
+            2.0291571802275037,
+            0.8273865123744271,
+            3.203935772642847,
+            4.859193341386733,
+            1.3707315434976266,
+            1.8794952925706312,
+            -0.2983180654207054,
+            1.4521478934625076,
+            2.834404839362728,
+            0.4894081282811191,
+            1.361210381692561,
+            7.6373991502818175,
+        ]
+        n_samples, n_features = self.X_w_c.shape
+        for c, res_expected in enumerate(expected):
+            computed = entropy(
+                self.X_w_c[:, c].reshape(-1, 1), k=n_samples - 2
+            )
+            print("-*-", computed)
+
    def test_conditional_entropy(self):
        metric = Metrics()
        results_expected = [
@@ -133,10 +159,10 @@ class Metrics_test(unittest.TestCase):
    def test_symmetrical_uncertainty_continuous(self):
        metric = Metrics()
        results_expected = [
-            -0.08368315199022527,
-            -0.08539330663499867,
-            -0.026524185532893957,
-            -0.016238166071083728,
+            0.3116626663552704,
+            0.22524988105092494,
+            0.24511182026415218,
+            0.07114329389542708,
        ]
        for expected, col in zip(results_expected, range(self.X_w.shape[1])):
            computed = metric.symmetrical_unc_continuous(