From 70560506f1d82b9cc425f2079d546b9f5796c97d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Tue, 25 May 2021 02:10:04 +0200
Subject: [PATCH] Rename Project and first working version

---
 Makefile | 4 +-
 README.md | 14 +-
 cfs/Entropy.py | 39 -----
 cfs/Selection.py | 86 -----------
 cfs/tests/CFS_test.py | 16 --
 cfs/tests/__init__.py | 4 -
 mfs/Selection.py | 229 +++++++++++++++++++++++++++++
 {cfs => mfs}/__init__.py | 4 +-
 mfs/tests/MFS_test.py | 49 ++++++
 {cfs => mfs}/tests/Metrics_test.py | 1 -
 mfs/tests/__init__.py | 4 +
 setup.py | 4 +-
 12 files changed, 299 insertions(+), 155 deletions(-)
 delete mode 100644 cfs/Entropy.py
 delete mode 100644 cfs/Selection.py
 delete mode 100644 cfs/tests/CFS_test.py
 delete mode 100644 cfs/tests/__init__.py
 create mode 100755 mfs/Selection.py
 rename {cfs => mfs}/__init__.py (81%)
 create mode 100755 mfs/tests/MFS_test.py
 rename {cfs => mfs}/tests/Metrics_test.py (99%)
 mode change 100644 => 100755
 create mode 100644 mfs/tests/__init__.py

diff --git a/Makefile b/Makefile
index 9af4987..100c82b 100644
--- a/Makefile
+++ b/Makefile
@@ -4,7 +4,7 @@ SHELL := /bin/bash
 
 coverage: ## Run tests with coverage
 	coverage erase
-	coverage run -m unittest -v cfs.tests
+	coverage run -m unittest -v mfs.tests
 	coverage report -m
 
 deps: ## Install dependencies
@@ -19,7 +19,7 @@ push: ## Push code with tags
 	git push && git push --tags
 
 test: ## Run tests
-	python -m unittest -v cfs.tests
+	python -m unittest -v mfs.tests
 
 doc: ## Update documentation
 	make -C docs --makefile=Makefile html
diff --git a/README.md b/README.md
index 24d415e..3384f17 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,13 @@
-# CFS
+# MFS
 
-## Correlation-based Feature Selection
+## Multi Feature Selection
 
-Based on the work of Mark Andrew Hall
+Compute Fast Correlation Based Filter
+Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
+Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
+(ICML-2003) + +and + +Correlated Feature Selection as in "Correlation-based Feature Selection for +Machine Learning" by Mark Andrew Hall diff --git a/cfs/Entropy.py b/cfs/Entropy.py deleted file mode 100644 index 3f51471..0000000 --- a/cfs/Entropy.py +++ /dev/null @@ -1,39 +0,0 @@ -##Entropy -def entropy(Y): - """ - Also known as Shanon Entropy - Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory) - """ - unique, count = np.unique(Y, return_counts=True, axis=0) - prob = count / len(Y) - en = -np.sum(prob * np.log2(prob)) - return en - - -# Joint Entropy -def jEntropy(Y, X): - """ - H(Y;X) - Reference: https://en.wikipedia.org/wiki/Joint_entropy - """ - YX = np.c_[Y, X] - return entropy(YX) - - -# Conditional Entropy -def cEntropy(Y, X): - """ - conditional entropy = Joint Entropy - Entropy of X - H(Y|X) = H(Y;X) - H(X) - Reference: https://en.wikipedia.org/wiki/Conditional_entropy - """ - return jEntropy(Y, X) - entropy(X) - - -# Information Gain -def gain(Y, X): - """ - Information Gain, I(Y;X) = H(Y) - H(Y|X) - Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition - """ - return entropy(Y) - cEntropy(Y, X) \ No newline at end of file diff --git a/cfs/Selection.py b/cfs/Selection.py deleted file mode 100644 index 3d655be..0000000 --- a/cfs/Selection.py +++ /dev/null @@ -1,86 +0,0 @@ -from math import log -import numpy as np - - -class Metrics: - @staticmethod - def conditional_entropy(x, y, base=2): - """quantifies the amount of information needed to describe the outcome - of Y given that the value of X is known - computes H(Y|X) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - conditional entropy of y given x - """ - xy = np.c_[x, y] - return Metrics.entropy(xy, base) - Metrics.entropy(x, base) - - @staticmethod - def entropy(y, base=2): - """measure of the uncertainty in predicting the value of y - - Parameters - ---------- - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - entropy of y - """ - _, count = np.unique(y, return_counts=True, axis=0) - proba = count.astype(float) / len(y) - proba = proba[proba > 0.0] - return np.sum(proba * np.log(1.0 / proba)) / log(base) - - @staticmethod - def information_gain(x, y, base=2): - """Measures the reduction in uncertainty about the value of y when the - value of X is known (also called mutual information) - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - - Parameters - ---------- - x : np.array - values of the variable - y : np.array - array of labels - base : int, optional - base of the logarithm, by default 2 - - Returns - ------- - float - Information gained - """ - return Metrics.entropy(y, base) - Metrics.conditional_entropy( - x, y, base - ) - - @staticmethod - def symmetrical_uncertainty(x, y): - - return ( - 2.0 - * Metrics.information_gain(x, y) - / (Metrics.entropy(x) + Metrics.entropy(y)) - ) - - -class CFS: - def __init__(self, a): - self.a = a diff --git a/cfs/tests/CFS_test.py b/cfs/tests/CFS_test.py deleted file mode 100644 index e76aaff..0000000 --- a/cfs/tests/CFS_test.py +++ /dev/null @@ -1,16 +0,0 @@ -import unittest - -from ..Selection import CFS - - -class CFS_test(unittest.TestCase): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # @classmethod - # def setup(cls): - # pass - - def 
test_initial(self):
-        cfs = CFS(a=1)
-        self.assertEqual(cfs.a, 1)
diff --git a/cfs/tests/__init__.py b/cfs/tests/__init__.py
deleted file mode 100644
index f1f5c1b..0000000
--- a/cfs/tests/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from .CFS_test import CFS_test
-from .Metrics_test import Metrics_test
-
-__all__ = ["CFS_test", "Metrics_test"]
diff --git a/mfs/Selection.py b/mfs/Selection.py
new file mode 100755
index 0000000..d13e02f
--- /dev/null
+++ b/mfs/Selection.py
@@ -0,0 +1,229 @@
+from math import log
+from sys import float_info
+from itertools import combinations
+import numpy as np
+
+
+class Metrics:
+    @staticmethod
+    def conditional_entropy(x, y, base=2):
+        """quantifies the amount of information needed to describe the outcome
+        of Y given that the value of X is known
+        computes H(Y|X)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            conditional entropy of y given x
+        """
+        xy = np.c_[x, y]
+        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
+
+    @staticmethod
+    def entropy(y, base=2):
+        """measure of the uncertainty in predicting the value of y
+
+        Parameters
+        ----------
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            entropy of y
+        """
+        _, count = np.unique(y, return_counts=True, axis=0)
+        proba = count.astype(float) / len(y)
+        proba = proba[proba > 0.0]
+        return np.sum(proba * np.log(1.0 / proba)) / log(base)
+
+    @staticmethod
+    def information_gain(x, y, base=2):
+        """Measures the reduction in uncertainty about the value of y when the
+        value of X is known (also called mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            Information gained
+        """
+        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
+            x, y, base
+        )
+
+    @staticmethod
+    def symmetrical_uncertainty(x, y):
+        """Compute symmetrical uncertainty. Normalize* information gain (mutual
+        information) with the entropies of the features in order to compensate
+        for the bias due to high cardinality features. *Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+
+        Returns
+        -------
+        float
+            symmetrical uncertainty
+        """
+        return (
+            2.0
+            * Metrics.information_gain(x, y)
+            / (Metrics.entropy(x) + Metrics.entropy(y))
+        )
+
+
+class MFS:
+    """Compute Fast Correlation Based Filter
+    Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
+    Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
+    (ICML-2003)
+
+    and
+
+    Correlated Feature Selection as in "Correlation-based Feature Selection for
+    Machine Learning" by Mark A. Hall
+    """
+
+    def __init__(self):
+        self._initialize()
+
+    def _initialize(self):
+        self._result = None
+        self._scores = []
+        self._su_labels = None
+        self._su_features = {}
+
+    def _compute_su_labels(self):
+        if self._su_labels is None:
+            num_features = self.X_.shape[1]
+            self._su_labels = np.zeros(num_features)
+            for col in range(num_features):
+                self._su_labels[col] = Metrics.symmetrical_uncertainty(
+                    self.X_[:, col], self.y_
+                )
+        return self._su_labels
+
+    def _compute_su_features(self, feature_a, feature_b):
+        if (feature_a, feature_b) not in self._su_features:
+            self._su_features[
+                (feature_a, feature_b)
+            ] = Metrics.symmetrical_uncertainty(
+                self.X_[:, feature_a], self.X_[:, feature_b]
+            )
+        return self._su_features[(feature_a, feature_b)]
+
+    def _compute_merit(self, features):
+        rcf = self._su_labels[features].sum()
+        rff = 0.0
+        k = len(features)
+        for pair in list(combinations(features, 2)):
+            rff += self._compute_su_features(*pair)
+        return rcf / ((k ** 2 - k) * rff)
+
+    def cfs(self, X, y):
+        """CFS forward best first heuristic search
+
+        Parameters
+        ----------
+        X : np.array
+            array of features
+        y : np.array
+            vector of labels
+        """
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        s_list = self._compute_su_labels()
+        # Descending order
+        feature_order = (-s_list).argsort().tolist()
+        merit = float_info.min
+        exit_condition = 0
+        candidates = []
+        # start with the best feature (max symmetrical uncertainty wrt label)
+        first_candidate = feature_order.pop(0)
+        candidates.append(first_candidate)
+        self._scores.append(s_list[first_candidate])
+        while exit_condition < 5:  # as proposed in the original algorithm
+            id_selected = -1
+            for idx, feature in enumerate(feature_order):
+                candidates.append(feature)
+                merit_new = self._compute_merit(candidates)
+                if merit_new > merit:
+                    id_selected = idx
+                    merit = merit_new
+                    exit_condition = 0
+                candidates.pop()
+            if id_selected == -1:
+                exit_condition += 1
+            else:
+                candidates.append(feature_order[id_selected])
+                self._scores.append(merit_new)
+                del feature_order[id_selected]
+            if len(feature_order) == 0:
+                # Force leaving the loop
+                exit_condition = 5
+        self._result = candidates
+        return self
+
+    def fcbs(self, X, y, threshold):
+        if threshold < 1e-4:
+            raise ValueError("Threshold cannot be less than 1e-4")
+        self._initialize()
+        self.X_ = X
+        self.y_ = y
+        s_list = self._compute_su_labels()
+        feature_order = (-s_list).argsort()
+        feature_dup = feature_order.copy().tolist()
+        self._result = []
+        for index_p in feature_order:
+            # Don't self compare
+            feature_dup.pop(0)
+            # Skip features already marked as redundant
+            if s_list[index_p] == 0.0:
+                # the feature has been removed from the list
+                continue
+            if s_list[index_p] < threshold:
+                break
+            # Remove redundant features
+            for index_q in feature_dup:
+                # test if su(feature_p, feature_q) >= su(feature_q, label)
+                su_pq = self._compute_su_features(index_p, index_q)
+                if su_pq >= s_list[index_q]:
+                    # remove feature from list
+                    s_list[index_q] = 0.0
+            self._result.append(index_p)
+            self._scores.append(s_list[index_p])
+        return self
+
+    def get_results(self):
+        return self._result
+
+    def get_scores(self):
+        return self._scores
diff --git a/cfs/__init__.py b/mfs/__init__.py
similarity index 81%
rename from cfs/__init__.py
rename to mfs/__init__.py
index f5d4c56..ac8815a 100644
--- a/cfs/__init__.py
+++ b/mfs/__init__.py
@@ -1,4 +1,4 @@
-from .Selection import CFS
+from .Selection import MFS
 
 __version__ = "0.1"
 __author__ = "Ricardo Montañana Gómez"
@@ -6,4 +6,4 @@ __author_email__ = 
"Ricardo.Montanana@alu.uclm.es" __copyright__ = "Copyright 2021, Ricardo Montañana Gómez" __license__ = "MIT License" -__all__ = ["CFS"] +__all__ = ["MFS"] diff --git a/mfs/tests/MFS_test.py b/mfs/tests/MFS_test.py new file mode 100755 index 0000000..2991ad2 --- /dev/null +++ b/mfs/tests/MFS_test.py @@ -0,0 +1,49 @@ +import unittest +from mdlp import MDLP +from sklearn.datasets import load_wine + +from ..Selection import MFS + + +class MFS_test(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + mdlp = MDLP(random_state=1) + X, self.y = load_wine(return_X_y=True) + self.X = mdlp.fit_transform(X, self.y).astype("int64") + self.m, self.n = self.X.shape + + # @classmethod + # def setup(cls): + # pass + + def test_initialize(self): + mfs = MFS() + mfs.fcbs(self.X, self.y, 0.05) + mfs._initialize() + self.assertIsNone(mfs.get_results()) + self.assertListEqual([], mfs.get_scores()) + self.assertDictEqual({}, mfs._su_features) + self.assertIsNone(mfs._su_labels) + + def test_csf(self): + mfs = MFS() + expected = [6, 4] + self.assertListEqual(expected, mfs.cfs(self.X, self.y).get_results()) + expected = [0.5218299405215557, 2.4168234005280964] + self.assertListEqual(expected, mfs.get_scores()) + + def test_fcbs(self): + mfs = MFS() + computed = mfs.fcbs(self.X, self.y, threshold=0.05).get_results() + expected = [6, 9, 12, 0, 11, 4] + self.assertListEqual(expected, computed) + expected = [ + 0.5218299405215557, + 0.46224298637417455, + 0.44518278979085646, + 0.38942355544213786, + 0.3790082191220976, + 0.24972405134844652, + ] + self.assertListEqual(expected, mfs.get_scores()) diff --git a/cfs/tests/Metrics_test.py b/mfs/tests/Metrics_test.py old mode 100644 new mode 100755 similarity index 99% rename from cfs/tests/Metrics_test.py rename to mfs/tests/Metrics_test.py index be148c2..e68c86c --- a/cfs/tests/Metrics_test.py +++ b/mfs/tests/Metrics_test.py @@ -1,7 +1,6 @@ import unittest from sklearn.datasets import load_iris from mdlp import MDLP -import numpy as np from ..Selection import Metrics diff --git a/mfs/tests/__init__.py b/mfs/tests/__init__.py new file mode 100644 index 0000000..1246aa8 --- /dev/null +++ b/mfs/tests/__init__.py @@ -0,0 +1,4 @@ +from .MFS_test import MFS_test +from .Metrics_test import Metrics_test + +__all__ = ["MFS_test", "Metrics_test"] diff --git a/setup.py b/setup.py index 7471f0b..59e6c1e 100644 --- a/setup.py +++ b/setup.py @@ -20,10 +20,10 @@ def get_data(field: str): setuptools.setup( - name="CFS", + name="MFS", version=get_data("version"), license=get_data("license"), - description="Correlation-based Feature Selection", + description="Multi Feature Selection", long_description=readme(), long_description_content_type="text/markdown", packages=setuptools.find_packages(),