Rename Project and first working version

2025-08-16 16:15:56 +00:00 · 2021-05-25 02:10:04 +02:00
parent a19f2cc12a
commit 70560506f1
12 changed files with 299 additions and 155 deletions
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -0,0 +1,229 @@
+from math import log
+from sys import float_info
+from itertools import combinations
+import numpy as np
+
+
+class Metrics:
+    """Information-theoretic measures for discrete variables.
+
+    All methods are static. Probabilities are estimated from the relative
+    frequency of each distinct value (or of each distinct row when the
+    input is 2-D).
+    """
+
+    @staticmethod
+    def conditional_entropy(x, y, base=2):
+        """quantifies the amount of information needed to describe the outcome
+        of Y given that the value of X is known
+        computes H(Y|X)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            conditional entropy of y given x
+        """
+        # Chain rule: H(Y|X) = H(X, Y) - H(X). Stacking x and y column-wise
+        # turns every row into one joint observation (x_i, y_i).
+        joint = np.c_[x, y]
+        return Metrics.entropy(joint, base) - Metrics.entropy(x, base)
+
+    @staticmethod
+    def entropy(y, base=2):
+        """measure of the uncertainty in predicting the value of y
+
+        Parameters
+        ----------
+        y : np.array
+            array of labels; a 2-D array is read row by row, each distinct
+            row being one outcome (this yields the joint entropy)
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            entropy of y
+        """
+        # axis=0 counts distinct rows, so 1-D and 2-D inputs share this path
+        _, counts = np.unique(y, return_counts=True, axis=0)
+        proba = counts.astype(float) / len(y)
+        # Guard the logarithm below against zero probabilities
+        proba = proba[proba > 0.0]
+        # Natural logarithm first, then change of base
+        return np.sum(proba * np.log(1.0 / proba)) / log(base)
+
+    @staticmethod
+    def information_gain(x, y, base=2):
+        """Measures the reduction in uncertainty about the value of y when the
+        value of X is known (also called mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            Information gained
+        """
+        # IG(Y; X) = H(Y) - H(Y|X)
+        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
+            x, y, base
+        )
+
+    @staticmethod
+    def symmetrical_uncertainty(x, y):
+        """Compute symmetrical uncertainty. Normalize* information gain (mutual
+        information) with the entropies of the features in order to compensate
+        the bias due to high cardinality features. *Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+
+        Returns
+        -------
+        float
+            symmetrical uncertainty; 0.0 when both x and y have zero
+            entropy (e.g. both are constant), since neither variable
+            carries information about the other
+        """
+        normalizer = Metrics.entropy(x) + Metrics.entropy(y)
+        if normalizer == 0.0:
+            # 0 / 0 would produce nan (and a RuntimeWarning)
+            return 0.0
+        return 2.0 * Metrics.information_gain(x, y) / normalizer
+
+
+class MFS:
+    """Compute Fast Correlation Based Filter
+    Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
+    Correlation Based Filter Solution,Proc. 20th Intl. Conf. Mach. Learn.
+    (ICML-2003)
+
+    and
+
+    Correlated Feature Selection as in "Correlation-based Feature Selection for
+    Machine Learning" by Mark A. Hall
+
+    Both selectors rank columns with symmetrical uncertainty, store the
+    selected column indices and their scores on the instance and return
+    ``self`` so that calls can be chained with ``get_results()`` and
+    ``get_scores()``.
+    """
+
+    def __init__(self):
+        self._initialize()
+
+    def _initialize(self):
+        """Reset the results and the symmetrical uncertainty caches."""
+        self._result = None
+        self._scores = []
+        self._su_labels = None
+        self._su_features = {}
+
+    def _compute_su_labels(self):
+        """Return the symmetrical uncertainty of every column of X_ with
+        the labels, computing it only on the first call."""
+        if self._su_labels is None:
+            num_features = self.X_.shape[1]
+            self._su_labels = np.zeros(num_features)
+            for col in range(num_features):
+                self._su_labels[col] = Metrics.symmetrical_uncertainty(
+                    self.X_[:, col], self.y_
+                )
+        return self._su_labels
+
+    def _compute_su_features(self, feature_a, feature_b):
+        """Return the symmetrical uncertainty between two columns of X_.
+
+        Values are cached under the ordered pair (feature_a, feature_b).
+        """
+        key = (feature_a, feature_b)
+        if key not in self._su_features:
+            self._su_features[key] = Metrics.symmetrical_uncertainty(
+                self.X_[:, feature_a], self.X_[:, feature_b]
+            )
+        return self._su_features[key]
+
+    def _compute_merit(self, features):
+        """Score a subset of features from its feature-label and
+        feature-feature symmetrical uncertainties.
+
+        Parameters
+        ----------
+        features : list
+            column indices of the candidate subset
+
+        Returns
+        -------
+        float
+            merit of the subset
+        """
+        rcf = self._su_labels[features].sum()
+        rff = 0.0
+        k = len(features)
+        for pair in combinations(features, 2):
+            rff += self._compute_su_features(*pair)
+        # NOTE(review): Hall defines the merit as
+        # k * mean(rcf) / sqrt(k + k * (k - 1) * mean(rff)). The expression
+        # below has no square root and divides by zero when the pairwise
+        # sum is 0 - confirm the intended formula.
+        return rcf / ((k ** 2 - k) * rff)
+
+    def cfs(self, X, y):
+        """CFS forward greedy search
+
+        Starts from the feature with the highest symmetrical uncertainty
+        with the labels and keeps adding the feature that raises the merit
+        the most, until no remaining feature improves it.
+
+        Parameters
+        ----------
+        X : np.array
+            array of features
+        y : np.array
+            vector of labels
+
+        Returns
+        -------
+        MFS
+            self, with the selection available through get_results() and
+            get_scores()
+        """
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        s_list = self._compute_su_labels()
+        # Descending order of symmetrical uncertainty with the labels
+        feature_order = (-s_list).argsort().tolist()
+        self._result = []
+        if not feature_order:
+            # X has no columns: nothing to select
+            return self
+        # start with the best feature (max symmetrical uncertainty wrt label)
+        first_candidate = feature_order.pop(0)
+        candidates = [first_candidate]
+        self._scores.append(s_list[first_candidate])
+        merit = float_info.min
+        while feature_order:
+            id_selected = None
+            for idx, feature in enumerate(feature_order):
+                merit_new = self._compute_merit(candidates + [feature])
+                if merit_new > merit:
+                    id_selected = idx
+                    merit = merit_new
+            if id_selected is None:
+                # No remaining feature improves the merit. Scanning again
+                # would evaluate exactly the same subsets, so stop here.
+                break
+            candidates.append(feature_order.pop(id_selected))
+            # Store the merit of the subset just chosen, not the merit of
+            # the last candidate evaluated in the scan.
+            self._scores.append(merit)
+        self._result = candidates
+        return self
+
+    def fcbs(self, X, y, threshold):
+        """Fast Correlation Based Filter selection
+
+        Walks the features in descending order of symmetrical uncertainty
+        with the labels. Every kept feature discards the lower ranked
+        features that are at least as correlated with it as they are with
+        the labels.
+
+        Parameters
+        ----------
+        X : np.array
+            array of features
+        y : np.array
+            vector of labels
+        threshold : float
+            minimum symmetrical uncertainty with the labels that a feature
+            needs in order to be selected
+
+        Returns
+        -------
+        MFS
+            self, with the selection available through get_results() and
+            get_scores()
+
+        Raises
+        ------
+        ValueError
+            if threshold is less than 1e-4
+        """
+        if threshold < 1e-4:
+            raise ValueError("Threshold cannot be less than 1e-4")
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        # Work on a copy: entries are zeroed below to mark removed features
+        # and the cached label scores must stay intact.
+        s_list = self._compute_su_labels().copy()
+        feature_order = (-s_list).argsort().tolist()
+        self._result = []
+        for position, index_p in enumerate(feature_order):
+            if s_list[index_p] == 0.0:
+                # the feature has been removed from the list
+                continue
+            if s_list[index_p] < threshold:
+                # Features are sorted, the remaining ones are lower still
+                break
+            # Remove redundant features among the lower ranked ones
+            for index_q in feature_order[position + 1:]:
+                if s_list[index_q] == 0.0:
+                    # already removed, nothing left to decide
+                    continue
+                su_pq = self._compute_su_features(index_p, index_q)
+                if su_pq >= s_list[index_q]:
+                    # remove feature from list
+                    s_list[index_q] = 0.0
+            self._result.append(index_p)
+            self._scores.append(s_list[index_p])
+        return self
+
+    def get_results(self):
+        """Return the selected column indices (None before any selection)."""
+        return self._result
+
+    def get_scores(self):
+        """Return the scores recorded for the selected features."""
+        return self._scores
--- a/mfs/__init__.py
+++ b/mfs/__init__.py
@@ -0,0 +1,9 @@
+# Package entry point: re-exports MFS and declares the package metadata.
+from .Selection import MFS
+
+__version__ = "0.1"
+__author__ = "Ricardo Montañana Gómez"
+__author_email__ = "Ricardo.Montanana@alu.uclm.es"
+__copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
+__license__ = "MIT License"
+
+# Names exported by ``from <package> import *``
+__all__ = ["MFS"]
--- a/mfs/tests/MFS_test.py
+++ b/mfs/tests/MFS_test.py
@@ -0,0 +1,49 @@
+import unittest
+from mdlp import MDLP
+from sklearn.datasets import load_wine
+
+from ..Selection import MFS
+
+
+class MFS_test(unittest.TestCase):
+    """Check MFS.cfs and MFS.fcbs against pinned results on the wine data."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Discretize the dataset once for the whole class. Building the
+        # fixture in __init__ repeated the work for every test method and
+        # raised any failure while the tests were being loaded.
+        mdlp = MDLP(random_state=1)
+        X, cls.y = load_wine(return_X_y=True)
+        cls.X = mdlp.fit_transform(X, cls.y).astype("int64")
+        cls.m, cls.n = cls.X.shape
+
+    def _assert_scores(self, expected, computed):
+        # Scores are floats: compare them one by one with a tolerance
+        # instead of requiring bit-identical values.
+        self.assertEqual(len(expected), len(computed))
+        for expected_score, computed_score in zip(expected, computed):
+            self.assertAlmostEqual(expected_score, computed_score)
+
+    def test_initialize(self):
+        # _initialize must wipe the results and both caches
+        mfs = MFS()
+        mfs.fcbs(self.X, self.y, 0.05)
+        mfs._initialize()
+        self.assertIsNone(mfs.get_results())
+        self.assertListEqual([], mfs.get_scores())
+        self.assertDictEqual({}, mfs._su_features)
+        self.assertIsNone(mfs._su_labels)
+
+    def test_csf(self):
+        mfs = MFS()
+        expected = [6, 4]
+        self.assertListEqual(expected, mfs.cfs(self.X, self.y).get_results())
+        expected = [0.5218299405215557, 2.4168234005280964]
+        self._assert_scores(expected, mfs.get_scores())
+
+    def test_fcbs(self):
+        mfs = MFS()
+        computed = mfs.fcbs(self.X, self.y, threshold=0.05).get_results()
+        expected = [6, 9, 12, 0, 11, 4]
+        self.assertListEqual(expected, computed)
+        expected = [
+            0.5218299405215557,
+            0.46224298637417455,
+            0.44518278979085646,
+            0.38942355544213786,
+            0.3790082191220976,
+            0.24972405134844652,
+        ]
+        self._assert_scores(expected, mfs.get_scores())
--- a/mfs/tests/Metrics_test.py
+++ b/mfs/tests/Metrics_test.py
@@ -0,0 +1,89 @@
+import unittest
+from sklearn.datasets import load_iris
+from mdlp import MDLP
+from ..Selection import Metrics
+
+
+class Metrics_test(unittest.TestCase):
+    """Check the Metrics functions against pinned values on the iris data."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Discretize the dataset once for the whole class. Building the
+        # fixture in __init__ repeated the work for every test method and
+        # raised any failure while the tests were being loaded.
+        mdlp = MDLP(random_state=1)
+        X, cls.y = load_iris(return_X_y=True)
+        cls.X = mdlp.fit_transform(X, cls.y).astype("int64")
+        cls.m, cls.n = cls.X.shape
+
+    def test_entropy(self):
+        # (values, logarithm base, expected entropy)
+        datasets = [
+            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
+            ([0, 1, 0, 2, 1, 2], 3, 1.0),
+            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
+            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
+            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
+            ([1, 1, 5], 2, 0.9182958340544896),
+            (self.y, 3, 0.999999999),
+        ]
+        for position, (dataset, base, entropy) in enumerate(datasets):
+            with self.subTest(dataset=position):
+                computed = Metrics.entropy(dataset, base)
+                self.assertAlmostEqual(entropy, computed)
+
+    def test_conditional_entropy(self):
+        # One expected value per column of the discretized dataset
+        results_expected = [
+            0.490953458537736,
+            0.7110077966379169,
+            0.15663362014829718,
+            0.13032469395094992,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            with self.subTest(col=col):
+                computed = Metrics.conditional_entropy(
+                    self.X[:, col], self.y, 3
+                )
+                self.assertAlmostEqual(expected, computed)
+        self.assertAlmostEqual(
+            0.6309297535714573,
+            Metrics.conditional_entropy(
+                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
+            ),
+        )
+        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
+        self.assertAlmostEqual(
+            0.5509775004326938,
+            Metrics.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
+        )
+
+    def test_information_gain(self):
+        # Base 3 logarithm
+        results_expected = [
+            0.5090465414622638,
+            0.28899220336208287,
+            0.8433663798517026,
+            0.8696753060490499,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            with self.subTest(base=3, col=col):
+                computed = Metrics.information_gain(self.X[:, col], self.y, 3)
+                self.assertAlmostEqual(expected, computed)
+        # https://planetcalc.com/8419/
+        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
+        results_expected = [
+            0.806819679,
+            0.458041805,
+            1.336704086,
+            1.378402748,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            with self.subTest(base=2, col=col):
+                computed = Metrics.information_gain(self.X[:, col], self.y, 2)
+                self.assertAlmostEqual(expected, computed)
+
+    def test_symmetrical_uncertainty(self):
+        results_expected = [
+            0.33296547388990266,
+            0.19068147573570668,
+            0.810724587460511,
+            0.870521418179061,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            with self.subTest(col=col):
+                computed = Metrics.symmetrical_uncertainty(
+                    self.X[:, col], self.y
+                )
+                self.assertAlmostEqual(expected, computed)
--- a/mfs/tests/__init__.py
+++ b/mfs/tests/__init__.py
@@ -0,0 +1,4 @@
+# Re-export the test cases so they can be imported from this package.
+from .MFS_test import MFS_test
+from .Metrics_test import Metrics_test
+
+__all__ = ["MFS_test", "Metrics_test"]