Rename Project and first working version

2025-08-16 08:05:56 +00:00 · 2021-05-25 02:10:04 +02:00
parent a19f2cc12a
commit 70560506f1
12 changed files with 299 additions and 155 deletions
--- a/4
+++ b/4
@@ -4,7 +4,7 @@ SHELL := /bin/bash

 coverage:  ## Run tests with coverage
 	coverage erase
-	coverage run -m unittest -v cfs.tests
+	coverage run -m unittest -v mfs.tests
 	coverage report -m

 deps:  ## Install dependencies
@@ -19,7 +19,7 @@ push:  ## Push code with tags
 	git push && git push --tags

 test:  ## Run tests
-	python -m unittest -v cfs.tests
+	python -m unittest -v mfs.tests

 doc:  ## Update documentation
 	make -C docs --makefile=Makefile html
--- a/README.md
+++ b/README.md
@@ -1,5 +1,13 @@
-# CFS
+# MFS

-## Correlation-based Feature Selection
+## Multi Feature Selection

-Based on the work of Mark Andrew Hall
+Compute Fast Correlation Based Filter
+Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
+Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
+(ICML-2003)
+
+and
+
+Correlated Feature Selection as in "Correlation-based Feature Selection for
+Machine Learning" by Mark Andrew Hall
--- a/cfs/Entropy.py
+++ b/cfs/Entropy.py
@@ -1,39 +0,0 @@
-##Entropy
-def entropy(Y):
-    """
-    Also known as Shanon Entropy
-    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
-    """
-    unique, count = np.unique(Y, return_counts=True, axis=0)
-    prob = count / len(Y)
-    en = -np.sum(prob * np.log2(prob))
-    return en
-
-
-# Joint Entropy
-def jEntropy(Y, X):
-    """
-    H(Y;X)
-    Reference: https://en.wikipedia.org/wiki/Joint_entropy
-    """
-    YX = np.c_[Y, X]
-    return entropy(YX)
-
-
-# Conditional Entropy
-def cEntropy(Y, X):
-    """
-    conditional entropy = Joint Entropy - Entropy of X
-    H(Y|X) = H(Y;X) - H(X)
-    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
-    """
-    return jEntropy(Y, X) - entropy(X)
-
-
-# Information Gain
-def gain(Y, X):
-    """
-    Information Gain, I(Y;X) = H(Y) - H(Y|X)
-    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
-    """
-    return entropy(Y) - cEntropy(Y, X)
--- a/cfs/Selection.py
+++ b/cfs/Selection.py
@@ -1,86 +0,0 @@
-from math import log
-import numpy as np
-
-
-class Metrics:
-    @staticmethod
-    def conditional_entropy(x, y, base=2):
-        """quantifies the amount of information needed to describe the outcome
-        of Y given that the value of X is known
-        computes H(Y|X)
-
-        Parameters
-        ----------
-        x : np.array
-            values of the variable
-        y : np.array
-            array of labels
-        base : int, optional
-            base of the logarithm, by default 2
-
-        Returns
-        -------
-        float
-            conditional entropy of y given x
-        """
-        xy = np.c_[x, y]
-        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
-
-    @staticmethod
-    def entropy(y, base=2):
-        """measure of the uncertainty in predicting the value of y
-
-        Parameters
-        ----------
-        y : np.array
-            array of labels
-        base : int, optional
-            base of the logarithm, by default 2
-
-        Returns
-        -------
-        float
-            entropy of y
-        """
-        _, count = np.unique(y, return_counts=True, axis=0)
-        proba = count.astype(float) / len(y)
-        proba = proba[proba > 0.0]
-        return np.sum(proba * np.log(1.0 / proba)) / log(base)
-
-    @staticmethod
-    def information_gain(x, y, base=2):
-        """Measures the reduction in uncertainty about the value of y when the
-        value of X is known (also called mutual information)
-        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
-
-        Parameters
-        ----------
-        x : np.array
-            values of the variable
-        y : np.array
-            array of labels
-        base : int, optional
-            base of the logarithm, by default 2
-
-        Returns
-        -------
-        float
-            Information gained
-        """
-        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
-            x, y, base
-        )
-
-    @staticmethod
-    def symmetrical_uncertainty(x, y):
-
-        return (
-            2.0
-            * Metrics.information_gain(x, y)
-            / (Metrics.entropy(x) + Metrics.entropy(y))
-        )
-
-
-class CFS:
-    def __init__(self, a):
-        self.a = a
--- a/cfs/tests/CFS_test.py
+++ b/cfs/tests/CFS_test.py
@@ -1,16 +0,0 @@
-import unittest
-
-from ..Selection import CFS
-
-
-class CFS_test(unittest.TestCase):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    # @classmethod
-    # def setup(cls):
-    #     pass
-
-    def test_initial(self):
-        cfs = CFS(a=1)
-        self.assertEqual(cfs.a, 1)
--- a/cfs/tests/init.py
+++ b/cfs/tests/init.py
@@ -1,4 +0,0 @@
-from .CFS_test import CFS_test
-from .Metrics_test import Metrics_test
-
-__all__ = ["CFS_test", "Metrics_test"]
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -0,0 +1,229 @@
+from math import log
+from sys import float_info
+from itertools import combinations
+import numpy as np
+
+
+class Metrics:
+    @staticmethod
+    def conditional_entropy(x, y, base=2):
+        """quantifies the amount of information needed to describe the outcome
+        of Y given that the value of X is known
+        computes H(Y|X)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            conditional entropy of y given x
+        """
+        xy = np.c_[x, y]
+        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
+
+    @staticmethod
+    def entropy(y, base=2):
+        """measure of the uncertainty in predicting the value of y
+
+        Parameters
+        ----------
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            entropy of y
+        """
+        _, count = np.unique(y, return_counts=True, axis=0)
+        proba = count.astype(float) / len(y)
+        proba = proba[proba > 0.0]
+        return np.sum(proba * np.log(1.0 / proba)) / log(base)
+
+    @staticmethod
+    def information_gain(x, y, base=2):
+        """Measures the reduction in uncertainty about the value of y when the
+        value of X is known (also called mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            Information gained
+        """
+        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
+            x, y, base
+        )
+
+    @staticmethod
+    def symmetrical_uncertainty(x, y):
+        """Compute symmetrical uncertainty. Normalize* information gain (mutual
+        information) with the entropies of the features in order to compensate
+        the bias due to high cardinality features. *Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+
+        Returns
+        -------
+        float
+            symmetrical uncertainty
+        """
+        return (
+            2.0
+            * Metrics.information_gain(x, y)
+            / (Metrics.entropy(x) + Metrics.entropy(y))
+        )
+
+
+class MFS:
+    """Compute Fast Correlation Based Filter
+    Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
+    Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
+    (ICML-2003)
+
+    and
+
+    Correlated Feature Selection as in "Correlation-based Feature Selection for
+    Machine Learning" by Mark A. Hall
+    """
+
+    def __init__(self):
+        self._initialize()
+
+    def _initialize(self):
+        self._result = None
+        self._scores = []
+        self._su_labels = None
+        self._su_features = {}
+
+    def _compute_su_labels(self):
+        if self._su_labels is None:
+            num_features = self.X_.shape[1]
+            self._su_labels = np.zeros(num_features)
+            for col in range(num_features):
+                self._su_labels[col] = Metrics.symmetrical_uncertainty(
+                    self.X_[:, col], self.y_
+                )
+        return self._su_labels
+
+    def _compute_su_features(self, feature_a, feature_b):
+        if (feature_a, feature_b) not in self._su_features:
+            self._su_features[
+                (feature_a, feature_b)
+            ] = Metrics.symmetrical_uncertainty(
+                self.X_[:, feature_a], self.X_[:, feature_b]
+            )
+        return self._su_features[(feature_a, feature_b)]
+
+    def _compute_merit(self, features):
+        rcf = self._su_labels[features].sum()
+        rff = 0.0
+        k = len(features)
+        for pair in list(combinations(features, 2)):
+            rff += self._compute_su_features(*pair)
+        return rcf / ((k ** 2 - k) * rff)
+
+    def cfs(self, X, y):
+        """CFS forward best first heuristic search
+
+        Parameters
+        ----------
+        X : np.array
+            array of features
+        y : np.array
+            vector of labels
+        """
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        s_list = self._compute_su_labels()
+        # Descending orders
+        feature_order = (-s_list).argsort().tolist()
+        merit = float_info.min
+        exit_condition = 0
+        candidates = []
+        # start with the best feature (max symmetrical uncertainty wrt label)
+        first_candidate = feature_order.pop(0)
+        candidates.append(first_candidate)
+        self._scores.append(s_list[first_candidate])
+        while exit_condition < 5:  # as proposed in the original algorithm
+            id_selected = -1
+            for idx, feature in enumerate(feature_order):
+                candidates.append(feature)
+                merit_new = self._compute_merit(candidates)
+                if merit_new > merit:
+                    id_selected = idx
+                    merit = merit_new
+                    exit_condition = 0
+                candidates.pop()
+            if id_selected == -1:
+                exit_condition += 1
+            else:
+                candidates.append(feature_order[id_selected])
+                self._scores.append(merit_new)
+                del feature_order[id_selected]
+            if len(feature_order) == 0:
+                # Force leaving the loop
+                exit_condition = 5
+        self._result = candidates
+        return self
+
+    def fcbs(self, X, y, threshold):
+        if threshold < 1e-4:
+            raise ValueError("Threshold cannot be less than 1e4")
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        s_list = self._compute_su_labels()
+        feature_order = (-s_list).argsort()
+        feature_dup = feature_order.copy().tolist()
+        self._result = []
+        for index_p in feature_order:
+            # Don't self compare
+            feature_dup.pop(0)
+            # Skip features already discarded as redundant
+            if s_list[index_p] == 0.0:
+                # the feature has been removed from the list
+                continue
+            if s_list[index_p] < threshold:
+                break
+            # Remove redundant features
+            for index_q in feature_dup:
+                # q is redundant when SU(p, q) >= SU(q, label)
+                su_pq = self._compute_su_features(index_p, index_q)
+                if su_pq >= s_list[index_q]:
+                    # remove feature from list
+                    s_list[index_q] = 0.0
+            self._result.append(index_p)
+            self._scores.append(s_list[index_p])
+        return self
+
+    def get_results(self):
+        return self._result
+
+    def get_scores(self):
+        return self._scores
--- a/mfs/init.py
+++ b/mfs/init.py
@@ -1,4 +1,4 @@
-from .Selection import CFS
+from .Selection import MFS

 __version__ = "0.1"
 __author__ = "Ricardo Montañana Gómez"
@@ -6,4 +6,4 @@ __author_email__ = "Ricardo.Montanana@alu.uclm.es"
 __copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
 __license__ = "MIT License"

-__all__ = ["CFS"]
+__all__ = ["MFS"]
--- a/mfs/tests/MFS_test.py
+++ b/mfs/tests/MFS_test.py
@@ -0,0 +1,49 @@
+import unittest
+from mdlp import MDLP
+from sklearn.datasets import load_wine
+
+from ..Selection import MFS
+
+
+class MFS_test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        mdlp = MDLP(random_state=1)
+        X, self.y = load_wine(return_X_y=True)
+        self.X = mdlp.fit_transform(X, self.y).astype("int64")
+        self.m, self.n = self.X.shape
+
+    # @classmethod
+    # def setup(cls):
+    #     pass
+
+    def test_initialize(self):
+        mfs = MFS()
+        mfs.fcbs(self.X, self.y, 0.05)
+        mfs._initialize()
+        self.assertIsNone(mfs.get_results())
+        self.assertListEqual([], mfs.get_scores())
+        self.assertDictEqual({}, mfs._su_features)
+        self.assertIsNone(mfs._su_labels)
+
+    def test_csf(self):
+        mfs = MFS()
+        expected = [6, 4]
+        self.assertListEqual(expected, mfs.cfs(self.X, self.y).get_results())
+        expected = [0.5218299405215557, 2.4168234005280964]
+        self.assertListEqual(expected, mfs.get_scores())
+
+    def test_fcbs(self):
+        mfs = MFS()
+        computed = mfs.fcbs(self.X, self.y, threshold=0.05).get_results()
+        expected = [6, 9, 12, 0, 11, 4]
+        self.assertListEqual(expected, computed)
+        expected = [
+            0.5218299405215557,
+            0.46224298637417455,
+            0.44518278979085646,
+            0.38942355544213786,
+            0.3790082191220976,
+            0.24972405134844652,
+        ]
+        self.assertListEqual(expected, mfs.get_scores())
--- a/mfs/tests/Metrics_test.py
+++ b/mfs/tests/Metrics_test.py
@@ -1,7 +1,6 @@
 import unittest
 from sklearn.datasets import load_iris
 from mdlp import MDLP
-import numpy as np
 from ..Selection import Metrics


--- a/mfs/tests/init.py
+++ b/mfs/tests/init.py
@@ -0,0 +1,4 @@
+from .MFS_test import MFS_test
+from .Metrics_test import Metrics_test
+
+__all__ = ["MFS_test", "Metrics_test"]
--- a/setup.py
+++ b/setup.py
@@ -20,10 +20,10 @@ def get_data(field: str):


 setuptools.setup(
-    name="CFS",
+    name="MFS",
    version=get_data("version"),
    license=get_data("license"),
-    description="Correlation-based Feature Selection",
+    description="Multi Feature Selection",
    long_description=readme(),
    long_description_content_type="text/markdown",
    packages=setuptools.find_packages(),