Fix mistake in between-features merit

This commit is contained in:
2021-06-04 01:48:15 +02:00
parent ff80fe6172
commit 7677aaf94b
3 changed files with 132 additions and 17 deletions

26
k.py
View File

@@ -21,13 +21,15 @@ mfsd = MFS(discrete=True)
# X = data[:, -1:]
# y = data[:, -1]
data, meta = arff.loadarff(
"/Users/rmontanana/Code/stree_datasets/data/tanveer/balance-scale/balance-scale.arff"
filename = (
"/Users/rmontanana/Code/stree_datasets/data/tanveer/conn-bench-sonar-min"
"es-rocks/conn-bench-sonar-mines-rocks.arff"
)
train = np.array([data["f1"], data["f2"], data["f3"], data["f4"]])
y = data["clase"]
data, meta = arff.loadarff(filename)
train = np.array([data[i] for i in meta])
X = train.T
X = X[:, :-1].astype("float64")
y = data["clase"]
for c in range(X.shape[1]):
@@ -68,10 +70,10 @@ clf = Stree(random_state=0)
subf = fcfb_f
print("fcfb", clf.fit(X[:, subf], y).score(X[:, subf], y))
for c in range(X.shape[1]):
for k in range(X.shape[1]):
ac = 0
for v in range(X[:, c].shape[0]):
if X[v, c] == X[v, k]:
ac += 1
print(f"{c} {k} {ac}")
# for c in range(X.shape[1]):
# for k in range(X.shape[1]):
# ac = 0
# for v in range(X[:, c].shape[0]):
# if X[v, c] == X[v, k]:
# ac += 1
# print(f"{c} {k} {ac}")

View File

@@ -27,6 +27,90 @@ class Metrics:
"""
return Metrics._compute_mi_cd(x, y, n_neighbors=3)
@staticmethod
def information_gain_cont_features(xa, xb):
    """Estimate the mutual information between two continuous variables,
    i.e. how much knowing the value of one reduces uncertainty about the
    other. Delegates to the k-nearest-neighbor estimator in
    ``_compute_mi_cc`` with a fixed ``n_neighbors=3``.
    (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

    Parameters
    ----------
    xa : np.array
        samples of the first continuous variable
    xb : np.array
        samples of the second continuous variable; must have the same
        shape as ``xa``

    Returns
    -------
    float
        estimated mutual information (non-negative)
    """
    return Metrics._compute_mi_cc(xa, xb, n_neighbors=3)
@staticmethod
def _compute_mi_cc(x, y, n_neighbors):
    """Estimate mutual information between two continuous variables.

    Implements the k-nearest-neighbor estimator of Kraskov et al. [1]_:
    the Chebyshev distance to the k-th neighbor in the joint (x, y) space
    defines, for every sample, a radius; the number of marginal neighbors
    of each variable inside that radius feeds the digamma-based formula.

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables, must have an
        identical shape.
    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. A negative estimate is replaced
        by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a
    numerical method is negative, it means (providing the method is
    adequate) that the mutual information is close to 0 and replacing it
    by 0 is a reasonable strategy.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating
       mutual information". Phys. Rev. E 69, 2004.
    """
    samples = x.size
    col_x = x.reshape((-1, 1))
    col_y = y.reshape((-1, 1))
    joint = np.hstack((col_x, col_y))
    # NearestNeighbors picks the fastest search algorithm on its own.
    knn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
    knn.fit(joint)
    distances = knn.kneighbors()[0]
    # Shrink the k-th neighbor distance by one ulp so points exactly on
    # the boundary are excluded from the radius queries below.
    eps = np.nextafter(distances[:, -1], 0)
    # KDTree is fit explicitly per marginal so we can count how many
    # neighbors fall within each sample's radius.
    marginal_counts = []
    for column in (col_x, col_y):
        tree = KDTree(column, metric="chebyshev")
        within = tree.query_radius(
            column, eps, count_only=True, return_distance=False
        )
        # Subtract 1: the query point itself is always inside its radius.
        marginal_counts.append(np.array(within) - 1.0)
    n_x, n_y = marginal_counts
    estimate = digamma(samples) + digamma(n_neighbors)
    estimate -= np.mean(digamma(n_x + 1)) + np.mean(digamma(n_y + 1))
    return max(0, estimate)
@staticmethod
def _compute_mi_cd(c, d, n_neighbors):
"""Compute mutual information between continuous and discrete
@@ -189,6 +273,33 @@ class Metrics:
)
)
@staticmethod
def symmetrical_unc_continuous_features(x, y):
    """Compute symmetrical uncertainty between two continuous features:
    twice their mutual information divided by the sum of their
    entropies. Mutual information uses Greg Ver Steeg's npeet approach
    https://github.com/gregversteeg/NPEET

    NOTE(review): the denominator mixes the differential entropy of ``x``
    with the (discrete) ``entropy`` of ``y``, although both arguments are
    continuous features here — presumably ``y`` should also use
    ``differential_entropy``; confirm against the merit computation in
    the caller.

    Parameters
    ----------
    x : np.array
        values of the first continuous feature
    y : np.array
        values of the second continuous feature

    Returns
    -------
    float
        symmetrical uncertainty
    """
    return (
        2.0
        * Metrics.information_gain_cont_features(x, y)
        / (
            Metrics.differential_entropy(x, k=len(x) - 1)
            + Metrics.entropy(y)
        )
    )
@staticmethod
def symmetrical_uncertainty(x, y):
"""Compute symmetrical uncertainty. Normalize* information gain (mutual

View File

@@ -33,6 +33,11 @@ class MFS:
if discrete
else Metrics.symmetrical_unc_continuous
)
self.symmetrical_uncertainty_features = (
Metrics.symmetrical_uncertainty
if discrete
else Metrics.symmetrical_unc_continuous_features
)
self._fitted = False
def _initialize(self, X, y):
@@ -93,7 +98,7 @@ class MFS:
if (feature_a, feature_b) not in self._su_features:
self._su_features[
(feature_a, feature_b)
] = self.symmetrical_uncertainty(
] = self.symmetrical_uncertainty_features(
self.X_[:, feature_a], self.X_[:, feature_b]
)
return self._su_features[(feature_a, feature_b)]
@@ -148,7 +153,7 @@ class MFS:
candidates.append(first_candidate)
self._scores.append(s_list[first_candidate])
while continue_condition:
merit = float_info.min
merit = -float_info.min
id_selected = None
for idx, feature in enumerate(feature_order):
candidates.append(feature)
@@ -157,9 +162,6 @@ class MFS:
id_selected = idx
merit = merit_new
candidates.pop()
if id_selected is None:
# Every merit computed is 0
break
candidates.append(feature_order[id_selected])
self._scores.append(merit)
del feature_order[id_selected]