Rename Project and first working version

This commit is contained in:
2021-05-25 02:10:04 +02:00
parent a19f2cc12a
commit 70560506f1
12 changed files with 299 additions and 155 deletions

View File

@@ -4,7 +4,7 @@ SHELL := /bin/bash
coverage: ## Run tests with coverage coverage: ## Run tests with coverage
coverage erase coverage erase
coverage run -m unittest -v cfs.tests coverage run -m unittest -v mfs.tests
coverage report -m coverage report -m
deps: ## Install dependencies deps: ## Install dependencies
@@ -19,7 +19,7 @@ push: ## Push code with tags
git push && git push --tags git push && git push --tags
test: ## Run tests test: ## Run tests
python -m unittest -v cfs.tests python -m unittest -v mfs.tests
doc: ## Update documentation doc: ## Update documentation
make -C docs --makefile=Makefile html make -C docs --makefile=Makefile html

View File

@@ -1,5 +1,13 @@
# CFS # MFS
## Correlation-based Feature Selection ## Multi Feature Selection
Based on the work of Mark Andrew Hall Compute Fast Correlation Based Filter
Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
(ICML-2003)
and
Correlated Feature Selection as in "Correlation-based Feature Selection for
Machine Learning" by Mark Andrew Hall

View File

@@ -1,39 +0,0 @@
##Entropy
def entropy(Y):
    """Return the Shannon entropy (in bits) of the rows/values of ``Y``.

    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    # Distinct values (rows, for 2-D input) and how often each one occurs.
    _, occurrences = np.unique(Y, axis=0, return_counts=True)
    frequencies = occurrences / len(Y)
    return -np.sum(frequencies * np.log2(frequencies))
# Joint Entropy
def jEntropy(Y, X):
    """Return the joint entropy H(Y;X).

    The two inputs are placed side by side as columns and the entropy of the
    resulting rows is computed.
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    return entropy(np.c_[Y, X])
# Conditional Entropy
def cEntropy(Y, X):
    """Return the conditional entropy H(Y|X) = H(Y;X) - H(X).

    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    joint = jEntropy(Y, X)
    marginal = entropy(X)
    return joint - marginal
# Information Gain
def gain(Y, X):
    """Return the information gain I(Y;X) = H(Y) - H(Y|X).

    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    prior = entropy(Y)
    remaining = cEntropy(Y, X)
    return prior - remaining

View File

@@ -1,86 +0,0 @@
from math import log
import numpy as np
class Metrics:
    """Entropy-based association measures for discrete variables."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        # H(Y|X) = H(X, Y) - H(X); the joint variable is built column-wise.
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        # axis=0 counts distinct rows, so 2-D input is treated as a joint
        # variable.
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty: information gain normalized by
        the sum of the entropies of both variables.

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty; 0.0 when both variables have zero
            entropy (e.g. both are constant)
        """
        total_entropy = Metrics.entropy(x) + Metrics.entropy(y)
        if total_entropy == 0.0:
            # Two constant variables share no information; without this
            # guard the expression below evaluates 0/0 and yields NaN.
            return 0.0
        return 2.0 * Metrics.information_gain(x, y) / total_entropy
class CFS:
    """Minimal class that stores its constructor argument as ``a``."""

    def __init__(self, a):
        # The value is kept unchanged on the public attribute ``a``.
        self.a = a

View File

@@ -1,16 +0,0 @@
import unittest
from ..Selection import CFS
class CFS_test(unittest.TestCase):
    """Unit tests for the CFS class."""

    def __init__(self, *args, **kwargs):
        # No fixtures are prepared here; construction is left to TestCase.
        super().__init__(*args, **kwargs)

    def test_initial(self):
        """The constructor argument is readable back from attribute ``a``."""
        instance = CFS(a=1)
        self.assertEqual(instance.a, 1)

View File

@@ -1,4 +0,0 @@
from .CFS_test import CFS_test
from .Metrics_test import Metrics_test
__all__ = ["CFS_test", "Metrics_test"]

229
mfs/Selection.py Executable file
View File

@@ -0,0 +1,229 @@
from math import log
from sys import float_info
from itertools import combinations
import numpy as np
class Metrics:
    """Entropy-based association measures for discrete variables."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        # H(Y|X) = H(X, Y) - H(X); the joint variable is built column-wise.
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        # axis=0 counts distinct rows, so 2-D input is treated as a joint
        # variable.
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty. Normalize* information gain (mutual
        information) with the entropies of the features in order to compensate
        the bias due to high cardinality features. *Range [0, 1]
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty; 0.0 when both variables have zero
            entropy (e.g. both are constant)
        """
        total_entropy = Metrics.entropy(x) + Metrics.entropy(y)
        if total_entropy == 0.0:
            # Two constant variables share no information; without this
            # guard the expression below evaluates 0/0 and yields NaN.
            return 0.0
        return 2.0 * Metrics.information_gain(x, y) / total_entropy
class MFS:
    """Compute Fast Correlation Based Filter
    Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
    Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
    (ICML-2003)
    and
    Correlated Feature Selection as in "Correlation-based Feature Selection for
    Machine Learning" by Mark A. Hall

    Both selectors rank features with symmetrical uncertainty (SU) and store
    their outcome on the instance; read it with ``get_results`` and
    ``get_scores``.
    """

    def __init__(self):
        self._initialize()

    def _initialize(self):
        """Reset the selection outcome and the cached SU values."""
        self._result = None
        self._scores = []
        self._su_labels = None
        self._su_features = {}

    def _compute_su_labels(self):
        """Return the SU of every column of ``X_`` with respect to ``y_``.

        The vector is computed once and cached in ``_su_labels``; the cached
        array itself (not a copy) is returned.
        """
        if self._su_labels is None:
            num_features = self.X_.shape[1]
            self._su_labels = np.zeros(num_features)
            for col in range(num_features):
                self._su_labels[col] = Metrics.symmetrical_uncertainty(
                    self.X_[:, col], self.y_
                )
        return self._su_labels

    def _compute_su_features(self, feature_a, feature_b):
        """Return the SU between two columns of ``X_``.

        Values are cached in ``_su_features`` under the ordered pair
        ``(feature_a, feature_b)``.
        """
        key = (feature_a, feature_b)
        if key not in self._su_features:
            self._su_features[key] = Metrics.symmetrical_uncertainty(
                self.X_[:, feature_a], self.X_[:, feature_b]
            )
        return self._su_features[key]

    def _compute_merit(self, features):
        """Compute Hall's CFS merit of a feature subset.

        Merit = k * mean(r_cf) / sqrt(k + k * (k - 1) * mean(r_ff))

        With ``rcf`` the sum of the k feature-label SU values and ``rff``
        the sum of SU over the k * (k - 1) / 2 unordered feature pairs this
        is ``rcf / sqrt(k + 2 * rff)``. The denominator is at least
        ``sqrt(k)``, so a single feature (merit equal to its own SU) or
        fully uncorrelated features no longer divide by zero.

        Parameters
        ----------
        features : list
            indices of the columns forming the subset

        Returns
        -------
        float
            merit of the subset
        """
        # NOTE(review): the previous expression was
        # rcf / ((k ** 2 - k) * rff); regression values pinned against it
        # (e.g. the cfs scores in the unit tests) must be regenerated.
        rcf = self._su_labels[features].sum()
        rff = sum(
            (
                self._compute_su_features(*pair)
                for pair in combinations(features, 2)
            ),
            0.0,
        )
        k = len(features)
        return rcf / np.sqrt(k + 2.0 * rff)

    def cfs(self, X, y):
        """CFS greedy forward search driven by the subset merit.

        Starts with the feature that has the highest SU with the labels and
        keeps adding the remaining feature that yields the best merit, as
        long as that merit improves on the best one seen so far.

        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels

        Returns
        -------
        MFS
            self; selected column indices are available through
            ``get_results`` and their scores through ``get_scores``
        """
        self._initialize()
        self.X_ = X
        self.y_ = y
        s_list = self._compute_su_labels()
        # Descending order of SU with the labels
        feature_order = (-s_list).argsort().tolist()
        candidates = []
        if not feature_order:
            # No columns at all: nothing can be selected.
            self._result = candidates
            return self
        # start with the best feature (max symmetrical uncertainty wrt label)
        first_candidate = feature_order.pop(0)
        candidates.append(first_candidate)
        self._scores.append(s_list[first_candidate])
        # Smallest positive float: any positive merit beats it, so a second
        # feature is always tried.
        merit = float_info.min
        while feature_order:
            id_selected = -1
            for idx, feature in enumerate(feature_order):
                merit_new = self._compute_merit(candidates + [feature])
                if merit_new > merit:
                    id_selected = idx
                    merit = merit_new
            if id_selected == -1:
                # Nothing improves the merit. Candidates and merit are
                # unchanged, so repeating the scan (the former five
                # retries) would give the same outcome.
                break
            candidates.append(feature_order.pop(id_selected))
            # Record the merit of the chosen subset, not the merit of the
            # last candidate that happened to be evaluated.
            self._scores.append(merit)
        self._result = candidates
        return self

    def fcbs(self, X, y, threshold):
        """Fast Correlation Based Filter selection.

        Features are visited in descending order of SU with the labels. A
        visited feature is kept, and every later feature whose SU with it is
        at least that later feature's SU with the labels is discarded as
        redundant. The scan stops at the first feature below ``threshold``.

        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels
        threshold : float
            minimum SU with the labels for a feature to be considered

        Returns
        -------
        MFS
            self; selected column indices are available through
            ``get_results`` and their SU values through ``get_scores``

        Raises
        ------
        ValueError
            if ``threshold`` is lower than 1e-4
        """
        if threshold < 1e-4:
            raise ValueError("Threshold cannot be less than 1e-4")
        self._initialize()
        self.X_ = X
        self.y_ = y
        # s_list is the cached array itself; a discarded feature is marked
        # by zeroing its entry.
        s_list = self._compute_su_labels()
        feature_order = (-s_list).argsort()
        feature_dup = feature_order.copy().tolist()
        self._result = []
        for index_p in feature_order:
            # Don't self compare
            feature_dup.pop(0)
            if s_list[index_p] == 0.0:
                # the feature has been removed from the list
                continue
            if s_list[index_p] < threshold:
                break
            # Remove the features made redundant by feature(index_p)
            for index_q in feature_dup:
                if s_list[index_q] == 0.0:
                    # already discarded: no need to compute its SU again
                    continue
                # feature(index_q) is redundant when its SU with
                # feature(index_p) reaches its SU with the labels
                su_pq = self._compute_su_features(index_p, index_q)
                if su_pq >= s_list[index_q]:
                    # remove feature from list
                    s_list[index_q] = 0.0
            self._result.append(index_p)
            self._scores.append(s_list[index_p])
        return self

    def get_results(self):
        """Return the selected column indices (None before any selection)."""
        return self._result

    def get_scores(self):
        """Return the scores recorded for the selected features."""
        return self._scores

View File

@@ -1,4 +1,4 @@
from .Selection import CFS from .Selection import MFS
__version__ = "0.1" __version__ = "0.1"
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
@@ -6,4 +6,4 @@ __author_email__ = "Ricardo.Montanana@alu.uclm.es"
__copyright__ = "Copyright 2021, Ricardo Montañana Gómez" __copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__all__ = ["CFS"] __all__ = ["MFS"]

49
mfs/tests/MFS_test.py Executable file
View File

@@ -0,0 +1,49 @@
import unittest
from mdlp import MDLP
from sklearn.datasets import load_wine
from ..Selection import MFS
class MFS_test(unittest.TestCase):
    """Tests for MFS using the wine dataset discretized with MDLP."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        features, labels = load_wine(return_X_y=True)
        self.y = labels
        discretizer = MDLP(random_state=1)
        # The discretized features are cast to int64 before use.
        self.X = discretizer.fit_transform(features, labels).astype("int64")
        self.m, self.n = self.X.shape

    def test_initialize(self):
        """_initialize() wipes the outcome and caches of a previous run."""
        selector = MFS()
        selector.fcbs(self.X, self.y, 0.05)
        selector._initialize()
        self.assertIsNone(selector.get_results())
        self.assertListEqual([], selector.get_scores())
        self.assertDictEqual({}, selector._su_features)
        self.assertIsNone(selector._su_labels)

    def test_csf(self):
        """cfs() yields the pinned features and scores."""
        selector = MFS()
        selected = selector.cfs(self.X, self.y).get_results()
        self.assertListEqual([6, 4], selected)
        expected_scores = [0.5218299405215557, 2.4168234005280964]
        self.assertListEqual(expected_scores, selector.get_scores())

    def test_fcbs(self):
        """fcbs() with threshold 0.05 yields the pinned features and scores."""
        selector = MFS()
        selected = selector.fcbs(self.X, self.y, threshold=0.05).get_results()
        self.assertListEqual([6, 9, 12, 0, 11, 4], selected)
        expected_scores = [
            0.5218299405215557,
            0.46224298637417455,
            0.44518278979085646,
            0.38942355544213786,
            0.3790082191220976,
            0.24972405134844652,
        ]
        self.assertListEqual(expected_scores, selector.get_scores())

View File

@@ -1,7 +1,6 @@
import unittest import unittest
from sklearn.datasets import load_iris from sklearn.datasets import load_iris
from mdlp import MDLP from mdlp import MDLP
import numpy as np
from ..Selection import Metrics from ..Selection import Metrics

4
mfs/tests/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .MFS_test import MFS_test
from .Metrics_test import Metrics_test
__all__ = ["MFS_test", "Metrics_test"]

View File

@@ -20,10 +20,10 @@ def get_data(field: str):
setuptools.setup( setuptools.setup(
name="CFS", name="MFS",
version=get_data("version"), version=get_data("version"),
license=get_data("license"), license=get_data("license"),
description="Correlation-based Feature Selection", description="Multi Feature Selection",
long_description=readme(), long_description=readme(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
packages=setuptools.find_packages(), packages=setuptools.find_packages(),