mirror of
https://github.com/Doctorado-ML/mufs.git
synced 2025-08-15 23:55:56 +00:00
Rename Project and first working version
This commit is contained in:
4
Makefile
4
Makefile
@@ -4,7 +4,7 @@ SHELL := /bin/bash
|
|||||||
|
|
||||||
coverage: ## Run tests with coverage
|
coverage: ## Run tests with coverage
|
||||||
coverage erase
|
coverage erase
|
||||||
coverage run -m unittest -v cfs.tests
|
coverage run -m unittest -v mfs.tests
|
||||||
coverage report -m
|
coverage report -m
|
||||||
|
|
||||||
deps: ## Install dependencies
|
deps: ## Install dependencies
|
||||||
@@ -19,7 +19,7 @@ push: ## Push code with tags
|
|||||||
git push && git push --tags
|
git push && git push --tags
|
||||||
|
|
||||||
test: ## Run tests
|
test: ## Run tests
|
||||||
python -m unittest -v cfs.tests
|
python -m unittest -v mfs.tests
|
||||||
|
|
||||||
doc: ## Update documentation
|
doc: ## Update documentation
|
||||||
make -C docs --makefile=Makefile html
|
make -C docs --makefile=Makefile html
|
||||||
|
14
README.md
14
README.md
@@ -1,5 +1,13 @@
|
|||||||
# CFS
|
# MFS
|
||||||
|
|
||||||
## Correlation-based Feature Selection
|
## Multi Feature Selection
|
||||||
|
|
||||||
Based on the work of Mark Andrew Hall
|
Compute Fast Correlation Based Filter (FCBF)
|
||||||
|
Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
|
||||||
|
Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
|
||||||
|
(ICML-2003)
|
||||||
|
|
||||||
|
and
|
||||||
|
|
||||||
|
Correlation-based Feature Selection (CFS) as in "Correlation-based Feature Selection for
|
||||||
|
Machine Learning" by Mark Andrew Hall
|
||||||
|
@@ -1,39 +0,0 @@
|
|||||||
##Entropy
|
|
||||||
def entropy(Y):
    """
    Shannon entropy (in bits) of the values in Y.

    Rows are treated as the observed outcomes (``axis=0``), so a 2-D array
    yields the entropy of its distinct rows.
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    # Only the number of occurrences of each outcome matters, not the outcome
    _, occurrences = np.unique(Y, return_counts=True, axis=0)
    frequencies = occurrences / len(Y)
    return -(frequencies * np.log2(frequencies)).sum()
|
|
||||||
|
|
||||||
|
|
||||||
# Joint Entropy
|
|
||||||
def jEntropy(Y, X):
    """
    Joint entropy H(Y;X): entropy of the observations obtained by placing
    Y and X side by side as columns.
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    return entropy(np.c_[Y, X])
|
|
||||||
|
|
||||||
|
|
||||||
# Conditional Entropy
|
|
||||||
def cEntropy(Y, X):
    """
    Conditional entropy of Y given X, computed as the joint entropy minus
    the entropy of X:
    H(Y|X) = H(Y;X) - H(X)
    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    joint = jEntropy(Y, X)
    marginal = entropy(X)
    return joint - marginal
|
|
||||||
|
|
||||||
|
|
||||||
# Information Gain
|
|
||||||
def gain(Y, X):
    """
    Information gain of Y obtained by knowing X:
    I(Y;X) = H(Y) - H(Y|X)
    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    prior = entropy(Y)
    remaining = cEntropy(Y, X)
    return prior - remaining
|
|
@@ -1,86 +0,0 @@
|
|||||||
from math import log
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class Metrics:
    """Information-theory measures for discrete variables."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        # H(Y|X) = H(X, Y) - H(X)
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        # axis=0 makes every row of a 2-D array count as one observation
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty: information gain (mutual
        information) normalized with the entropies of both variables.

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty; 0.0 when both entropies are zero
        """
        denominator = Metrics.entropy(x) + Metrics.entropy(y)
        if denominator == 0.0:
            # Both variables are constant: nothing can be gained and the
            # ratio would be 0/0 (nan), so report no association instead
            return 0.0
        return 2.0 * Metrics.information_gain(x, y) / denominator
|
|
||||||
|
|
||||||
|
|
||||||
class CFS:
    """Placeholder selector that only stores the value it is built with."""

    def __init__(self, a):
        # Expose the constructor argument as a public attribute
        self.a = a
|
|
@@ -1,16 +0,0 @@
|
|||||||
import unittest
|
|
||||||
|
|
||||||
from ..Selection import CFS
|
|
||||||
|
|
||||||
|
|
||||||
class CFS_test(unittest.TestCase):
    """Smoke test for the CFS class."""

    def test_initial(self):
        # The constructor argument must be stored unchanged in ``a``
        selector = CFS(a=1)
        self.assertEqual(selector.a, 1)
|
|
@@ -1,4 +0,0 @@
|
|||||||
from .CFS_test import CFS_test
|
|
||||||
from .Metrics_test import Metrics_test
|
|
||||||
|
|
||||||
__all__ = ["CFS_test", "Metrics_test"]
|
|
229
mfs/Selection.py
Executable file
229
mfs/Selection.py
Executable file
@@ -0,0 +1,229 @@
|
|||||||
|
from math import log
|
||||||
|
from sys import float_info
|
||||||
|
from itertools import combinations
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class Metrics:
    """Information-theory measures for discrete variables."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        # H(Y|X) = H(X, Y) - H(X)
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        # axis=0 makes every row of a 2-D array count as one observation
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty. Normalize* information gain (mutual
        information) with the entropies of the features in order to compensate
        the bias due to high cardinality features. *Range [0, 1]
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty; 0.0 when both entropies are zero
        """
        denominator = Metrics.entropy(x) + Metrics.entropy(y)
        if denominator == 0.0:
            # Both variables are constant: nothing can be gained and the
            # ratio would be 0/0 (nan), so report no association instead
            return 0.0
        return 2.0 * Metrics.information_gain(x, y) / denominator
|
||||||
|
|
||||||
|
|
||||||
|
class MFS:
    """Multi Feature Selection.

    Feature selection based on symmetrical uncertainty (SU), offering:

    Fast Correlation Based Filter
    Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
    Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
    (ICML-2003)

    and

    Correlation-based Feature Selection as in "Correlation-based Feature
    Selection for Machine Learning" by Mark A. Hall
    """

    def __init__(self):
        self._initialize()

    def _initialize(self):
        """Reset the results, the scores and the cached SU values."""
        self._result = None
        self._scores = []
        self._su_labels = None
        self._su_features = {}

    def _compute_su_labels(self):
        """Compute (once) the SU of every column of X_ with the labels y_.

        Returns
        -------
        np.array
            vector with one SU value per feature
        """
        if self._su_labels is None:
            num_features = self.X_.shape[1]
            self._su_labels = np.zeros(num_features)
            for col in range(num_features):
                self._su_labels[col] = Metrics.symmetrical_uncertainty(
                    self.X_[:, col], self.y_
                )
        return self._su_labels

    def _compute_su_features(self, feature_a, feature_b):
        """Return the SU between two columns of X_, caching the value.

        Parameters
        ----------
        feature_a : int
            index of the first feature
        feature_b : int
            index of the second feature

        Returns
        -------
        float
            symmetrical uncertainty between both features
        """
        key = (feature_a, feature_b)
        if key not in self._su_features:
            self._su_features[key] = Metrics.symmetrical_uncertainty(
                self.X_[:, feature_a], self.X_[:, feature_b]
            )
        return self._su_features[key]

    def _compute_merit(self, features):
        """Compute the merit of a subset of features.

        Parameters
        ----------
        features : list
            indices of the features in the subset (at least two)

        Returns
        -------
        float
            sum of the feature-label SU divided by (k^2 - k) times the sum
            of the pairwise feature-feature SU
        """
        # NOTE(review): Hall's merit is k * mean(r_cf) /
        # sqrt(k + k * (k - 1) * mean(r_ff)); this expression has neither the
        # square root nor the "+ k" term. It is kept unchanged because fixing
        # it alters which features are selected - TODO confirm and correct.
        rcf = self._su_labels[features].sum()
        rff = 0.0
        k = len(features)
        for pair in combinations(features, 2):
            rff += self._compute_su_features(*pair)
        return rcf / ((k ** 2 - k) * rff)

    def cfs(self, X, y):
        """CFS forward best first heuristic search

        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels

        Returns
        -------
        MFS
            self, query the outcome with get_results() and get_scores()
        """
        self._initialize()
        self.X_ = X
        self.y_ = y
        s_list = self._compute_su_labels()
        # Descending order
        feature_order = (-s_list).argsort().tolist()
        # NOTE(review): float_info.min is the smallest positive float, not
        # the most negative one, so candidates whose merit is <= 0 are never
        # selected - TODO confirm this is intended
        merit = float_info.min
        exit_condition = 0
        candidates = []
        # start with the best feature (max symmetrical uncertainty wrt label)
        first_candidate = feature_order.pop(0)
        candidates.append(first_candidate)
        self._scores.append(s_list[first_candidate])
        while exit_condition < 5:  # as proposed in the original algorithm
            id_selected = -1
            for idx, feature in enumerate(feature_order):
                # Tentatively add the feature to evaluate the subset merit
                candidates.append(feature)
                merit_new = self._compute_merit(candidates)
                if merit_new > merit:
                    id_selected = idx
                    merit = merit_new
                    exit_condition = 0
                candidates.pop()
            if id_selected == -1:
                exit_condition += 1
            else:
                candidates.append(feature_order[id_selected])
                # Record the merit of the selected feature, not the merit of
                # the last feature tried in the loop above
                self._scores.append(merit)
                del feature_order[id_selected]
            if len(feature_order) == 0:
                # Force leaving the loop
                exit_condition = 5
        self._result = candidates
        return self

    def fcbs(self, X, y, threshold):
        """Fast Correlation Based Filter

        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels
        threshold : float
            minimum SU with the labels for a feature to be considered

        Returns
        -------
        MFS
            self, query the outcome with get_results() and get_scores()

        Raises
        ------
        ValueError
            if the threshold is less than 1e-4
        """
        if threshold < 1e-4:
            raise ValueError("Threshold cannot be less than 1e-4")
        self._initialize()
        self.X_ = X
        self.y_ = y
        s_list = self._compute_su_labels()
        # Descending order, as plain ints like the results of cfs
        feature_order = (-s_list).argsort().tolist()
        feature_dup = feature_order.copy()
        self._result = []
        for index_p in feature_order:
            # Don't self compare
            feature_dup.pop(0)
            if s_list[index_p] == 0.0:
                # the feature has been removed from the list
                continue
            if s_list[index_p] < threshold:
                break
            # Remove redundant features
            for index_q in feature_dup:
                # feature(index_q) is redundant if its su with
                # feature(index_p) is at least its su with the labels
                su_pq = self._compute_su_features(index_p, index_q)
                if su_pq >= s_list[index_q]:
                    # remove feature from list
                    s_list[index_q] = 0.0
            self._result.append(index_p)
            self._scores.append(s_list[index_p])
        return self

    def get_results(self):
        """Return the indices of the selected features (None if not run)."""
        return self._result

    def get_scores(self):
        """Return the scores recorded for the selected features."""
        return self._scores
|
@@ -1,4 +1,4 @@
|
|||||||
from .Selection import CFS
|
from .Selection import MFS
|
||||||
|
|
||||||
__version__ = "0.1"
|
__version__ = "0.1"
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
@@ -6,4 +6,4 @@ __author_email__ = "Ricardo.Montanana@alu.uclm.es"
|
|||||||
__copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
|
__copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
|
||||||
__license__ = "MIT License"
|
__license__ = "MIT License"
|
||||||
|
|
||||||
__all__ = ["CFS"]
|
__all__ = ["MFS"]
|
49
mfs/tests/MFS_test.py
Executable file
49
mfs/tests/MFS_test.py
Executable file
@@ -0,0 +1,49 @@
|
|||||||
|
import unittest
|
||||||
|
from mdlp import MDLP
|
||||||
|
from sklearn.datasets import load_wine
|
||||||
|
|
||||||
|
from ..Selection import MFS
|
||||||
|
|
||||||
|
|
||||||
|
class MFS_test(unittest.TestCase):
    """Tests of MFS using the wine dataset discretized with MDLP."""

    @classmethod
    def setUpClass(cls):
        # Build the fixture once per class instead of in __init__, which
        # unittest runs for every test method instance
        mdlp = MDLP(random_state=1)
        X, cls.y = load_wine(return_X_y=True)
        cls.X = mdlp.fit_transform(X, cls.y).astype("int64")
        cls.m, cls.n = cls.X.shape

    def _assert_scores(self, expected, computed):
        """Compare two lists of floats element by element with tolerance."""
        self.assertEqual(len(expected), len(computed))
        for value_expected, value_computed in zip(expected, computed):
            self.assertAlmostEqual(value_expected, value_computed)

    def test_initialize(self):
        mfs = MFS()
        mfs.fcbs(self.X, self.y, 0.05)
        # _initialize has to wipe everything left by the previous run
        mfs._initialize()
        self.assertIsNone(mfs.get_results())
        self.assertListEqual([], mfs.get_scores())
        self.assertDictEqual({}, mfs._su_features)
        self.assertIsNone(mfs._su_labels)

    def test_cfs(self):
        mfs = MFS()
        expected = [6, 4]
        self.assertListEqual(expected, mfs.cfs(self.X, self.y).get_results())
        expected = [0.5218299405215557, 2.4168234005280964]
        self._assert_scores(expected, mfs.get_scores())

    def test_fcbs(self):
        mfs = MFS()
        computed = mfs.fcbs(self.X, self.y, threshold=0.05).get_results()
        expected = [6, 9, 12, 0, 11, 4]
        self.assertListEqual(expected, computed)
        expected = [
            0.5218299405215557,
            0.46224298637417455,
            0.44518278979085646,
            0.38942355544213786,
            0.3790082191220976,
            0.24972405134844652,
        ]
        self._assert_scores(expected, mfs.get_scores())
|
1
cfs/tests/Metrics_test.py → mfs/tests/Metrics_test.py
Normal file → Executable file
1
cfs/tests/Metrics_test.py → mfs/tests/Metrics_test.py
Normal file → Executable file
@@ -1,7 +1,6 @@
|
|||||||
import unittest
|
import unittest
|
||||||
from sklearn.datasets import load_iris
|
from sklearn.datasets import load_iris
|
||||||
from mdlp import MDLP
|
from mdlp import MDLP
|
||||||
import numpy as np
|
|
||||||
from ..Selection import Metrics
|
from ..Selection import Metrics
|
||||||
|
|
||||||
|
|
4
mfs/tests/__init__.py
Normal file
4
mfs/tests/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
from .MFS_test import MFS_test
|
||||||
|
from .Metrics_test import Metrics_test
|
||||||
|
|
||||||
|
__all__ = ["MFS_test", "Metrics_test"]
|
4
setup.py
4
setup.py
@@ -20,10 +20,10 @@ def get_data(field: str):
|
|||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="CFS",
|
name="MFS",
|
||||||
version=get_data("version"),
|
version=get_data("version"),
|
||||||
license=get_data("license"),
|
license=get_data("license"),
|
||||||
description="Correlation-based Feature Selection",
|
description="Multi Feature Selection",
|
||||||
long_description=readme(),
|
long_description=readme(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
packages=setuptools.find_packages(),
|
packages=setuptools.find_packages(),
|
||||||
|
Reference in New Issue
Block a user