mirror of
https://github.com/Doctorado-ML/mufs.git
synced 2025-08-18 17:15:52 +00:00
Update comments and CI workflow
This commit is contained in:
10
.github/workflows/main.yml
vendored
10
.github/workflows/main.yml
vendored
@@ -28,9 +28,9 @@ jobs:
|
|||||||
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
||||||
- name: Lint
|
- name: Lint
|
||||||
run: |
|
run: |
|
||||||
black --check --diff stree
|
black --check --diff mfs
|
||||||
flake8 --count stree
|
flake8 --count mfs
|
||||||
- name: Tests
|
- name: Tests & coverage
|
||||||
run: |
|
run: |
|
||||||
coverage run -m unittest -v stree.tests
|
coverage run -m unittest -v mfs.tests
|
||||||
coverage xml
|
coverage report -m --fail-under=100
|
||||||
|
16
README.md
16
README.md
@@ -1,13 +1,15 @@
|
|||||||
|

|
||||||
|
|
||||||
# MFS
|
# MFS
|
||||||
|
|
||||||
## Multi Feature Selection
|
## Multi Feature Selection
|
||||||
|
|
||||||
Compute Fast Fast Correlation Based Filter
|
### Fast Correlation-Based Filter
|
||||||
Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
|
|
||||||
Correlation Based Filter Solution,Proc. 20th Intl. Conf. Mach. Learn.
|
|
||||||
(ICML-2003)
|
|
||||||
|
|
||||||
and
|
Feature Selection for High-Dimensional Data : A Fast Correlation-Based Filter Solution. / Yu, Lei; Liu, Huan.
|
||||||
|
|
||||||
Correlated Feature Selection as in "Correlation-based Feature Selection for
|
Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fawcett; N. Mishra. 2003. p. 856-863 (Proceedings, Twentieth International Conference on Machine Learning; Vol. 2).
|
||||||
Machine Learning" by Mark Andrew Hall
|
|
||||||
|
### Correlation-based Feature Selection
|
||||||
|
|
||||||
|
Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
|
||||||
|
@@ -115,12 +115,23 @@ class MFS:
|
|||||||
self._initialize()
|
self._initialize()
|
||||||
|
|
||||||
def _initialize(self):
|
def _initialize(self):
|
||||||
|
"""Initialize the attributes so support multiple calls using same
|
||||||
|
object
|
||||||
|
"""
|
||||||
self._result = None
|
self._result = None
|
||||||
self._scores = []
|
self._scores = []
|
||||||
self._su_labels = None
|
self._su_labels = None
|
||||||
self._su_features = {}
|
self._su_features = {}
|
||||||
|
|
||||||
def _compute_su_labels(self):
|
def _compute_su_labels(self):
|
||||||
|
"""Compute symmetrical uncertainty between each feature of the dataset
|
||||||
|
and the labels and store it to use in future calls
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list
|
||||||
|
vector with the symmetrical uncertainty between each feature and the labels
|
||||||
|
"""
|
||||||
if self._su_labels is None:
|
if self._su_labels is None:
|
||||||
num_features = self.X_.shape[1]
|
num_features = self.X_.shape[1]
|
||||||
self._su_labels = np.zeros(num_features)
|
self._su_labels = np.zeros(num_features)
|
||||||
@@ -131,6 +142,21 @@ class MFS:
|
|||||||
return self._su_labels
|
return self._su_labels
|
||||||
|
|
||||||
def _compute_su_features(self, feature_a, feature_b):
|
def _compute_su_features(self, feature_a, feature_b):
|
||||||
|
"""Compute symmetrical uncertainty between two features and stores it
|
||||||
|
to use in future calls
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
feature_a : int
|
||||||
|
index of the first feature
|
||||||
|
feature_b : int
|
||||||
|
index of the second feature
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
The symmetrical uncertainty of the two features
|
||||||
|
"""
|
||||||
if (feature_a, feature_b) not in self._su_features:
|
if (feature_a, feature_b) not in self._su_features:
|
||||||
self._su_features[
|
self._su_features[
|
||||||
(feature_a, feature_b)
|
(feature_a, feature_b)
|
||||||
@@ -140,6 +166,18 @@ class MFS:
|
|||||||
return self._su_features[(feature_a, feature_b)]
|
return self._su_features[(feature_a, feature_b)]
|
||||||
|
|
||||||
def _compute_merit(self, features):
|
def _compute_merit(self, features):
|
||||||
|
"""Compute the merit function for cfs algorithms
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
features : list
|
||||||
|
list of features to include in the computation
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
float
|
||||||
|
The merit of the feature set passed
|
||||||
|
"""
|
||||||
rcf = self._su_labels[features].sum()
|
rcf = self._su_labels[features].sum()
|
||||||
rff = 0.0
|
rff = 0.0
|
||||||
k = len(features)
|
k = len(features)
|
||||||
@@ -148,7 +186,8 @@ class MFS:
|
|||||||
return rcf / sqrt(k + (k ** 2 - k) * rff)
|
return rcf / sqrt(k + (k ** 2 - k) * rff)
|
||||||
|
|
||||||
def cfs(self, X, y):
|
def cfs(self, X, y):
|
||||||
"""CFS forward best first heuristic search
|
"""Correlation-based Feature Selection
|
||||||
|
with a forward best first heuristic search
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -156,6 +195,11 @@ class MFS:
|
|||||||
array of features
|
array of features
|
||||||
y : np.array
|
y : np.array
|
||||||
vector of labels
|
vector of labels
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
self
|
||||||
|
self
|
||||||
"""
|
"""
|
||||||
self._initialize()
|
self._initialize()
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
@@ -186,6 +230,13 @@ class MFS:
|
|||||||
# Force leaving the loop
|
# Force leaving the loop
|
||||||
continue_condition = False
|
continue_condition = False
|
||||||
if len(self._scores) >= 5:
|
if len(self._scores) >= 5:
|
||||||
|
"""
|
||||||
|
"To prevent the best first search from exploring the entire
|
||||||
|
feature subset search space, a stopping criterion is imposed.
|
||||||
|
The search will terminate if five consecutive fully expanded
|
||||||
|
subsets show no improvement over the current best subset."
|
||||||
|
as stated in Mark A. Hall's thesis
|
||||||
|
"""
|
||||||
item_ant = -1
|
item_ant = -1
|
||||||
for item in self._scores[-5:]:
|
for item in self._scores[-5:]:
|
||||||
if item_ant == -1:
|
if item_ant == -1:
|
||||||
@@ -200,8 +251,29 @@ class MFS:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def fcbs(self, X, y, threshold):
|
def fcbs(self, X, y, threshold):
|
||||||
|
"""Fast Correlation-Based Filter
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
X : np.array
|
||||||
|
array of features
|
||||||
|
y : np.array
|
||||||
|
vector of labels
|
||||||
|
threshold : float
|
||||||
|
threshold to select relevant features
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
self
|
||||||
|
self
|
||||||
|
|
||||||
|
Raises
|
||||||
|
------
|
||||||
|
ValueError
|
||||||
|
if the threshold is less than a selected value of 1e-4
|
||||||
|
"""
|
||||||
if threshold < 1e-4:
|
if threshold < 1e-4:
|
||||||
raise ValueError("Threshold cannot be less than 1e4")
|
raise ValueError("Threshold cannot be less than 1e-4")
|
||||||
self._initialize()
|
self._initialize()
|
||||||
self.X_ = X
|
self.X_ = X
|
||||||
self.y_ = y
|
self.y_ = y
|
||||||
@@ -229,7 +301,21 @@ class MFS:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def get_results(self):
|
def get_results(self):
|
||||||
|
"""Return the results of the algorithm applied if any
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list
|
||||||
|
list of features indices selected
|
||||||
|
"""
|
||||||
return self._result
|
return self._result
|
||||||
|
|
||||||
def get_scores(self):
|
def get_scores(self):
|
||||||
|
"""Return the scores computed for the features selected
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
list
|
||||||
|
list of scores of the features selected
|
||||||
|
"""
|
||||||
return self._scores
|
return self._scores
|
||||||
|
Reference in New Issue
Block a user