mirror of
https://github.com/Doctorado-ML/mufs.git
synced 2025-08-18 17:15:52 +00:00
Compare commits
8 Commits
v0.1.1
...
fix_merit_
Author | SHA1 | Date | |
---|---|---|---|
d0f1cc5979
|
|||
b958bccef6
|
|||
a0f172ac13
|
|||
|
cfb37d2f6c | ||
5d1720c9ae
|
|||
1c5f1977e5
|
|||
27f8a370c5
|
|||
|
9d74bc8a70 |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -27,7 +27,7 @@ jobs:
|
||||
pip install -q cython
|
||||
pip install -q numpy
|
||||
pip install -q git+git://github.com/doctorado-ml/mdlp
|
||||
pip install -q -r requirements.txt
|
||||
pip install -q -r requirements/dev.txt
|
||||
pip install -q --upgrade codecov coverage black flake8 codacy-coverage
|
||||
- name: Lint
|
||||
run: |
|
||||
|
@@ -1,6 +1,6 @@
|
||||
repos:
|
||||
- repo: https://github.com/ambv/black
|
||||
rev: 20.8b1
|
||||
rev: 22.1.0
|
||||
hooks:
|
||||
- id: black
|
||||
exclude: ".virtual_documents"
|
||||
|
@@ -1,6 +1,8 @@
|
||||

|
||||
[](https://www.codacy.com/gh/Doctorado-ML/mufs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mufs&utm_campaign=Badge_Grade)
|
||||
[](https://lgtm.com/projects/g/Doctorado-ML/mufs/context:python)
|
||||
[](https://badge.fury.io/py/MUFS)
|
||||

|
||||
|
||||
# MUFS
|
||||
|
||||
@@ -15,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
|
||||
### Correlation-based Feature Selection
|
||||
|
||||
Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
|
||||
|
||||
### IWSS
|
||||
|
||||
Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.
|
||||
|
@@ -26,7 +26,7 @@ class MUFS:
|
||||
"""
|
||||
|
||||
def __init__(self, max_features=None, discrete=True):
|
||||
self._max_features = max_features
|
||||
self.max_features = max_features
|
||||
self._discrete = discrete
|
||||
self.symmetrical_uncertainty = (
|
||||
Metrics.symmetrical_uncertainty
|
||||
@@ -53,8 +53,10 @@ class MUFS:
|
||||
"""
|
||||
self.X_ = X
|
||||
self.y_ = y
|
||||
if self._max_features is None:
|
||||
if self.max_features is None:
|
||||
self._max_features = X.shape[1]
|
||||
else:
|
||||
self._max_features = self.max_features
|
||||
self._result = None
|
||||
self._scores = []
|
||||
self._su_labels = None
|
||||
@@ -105,7 +107,9 @@ class MUFS:
|
||||
|
||||
def _compute_merit(self, features):
|
||||
"""Compute the merit function for cfs algorithms
|
||||
|
||||
"Good feature subsets contain features highly correlated with
|
||||
(predictive of) the class, yet uncorrelated with (not predictive of)
|
||||
each other"
|
||||
Parameters
|
||||
----------
|
||||
features : list
|
||||
@@ -124,7 +128,7 @@ class MUFS:
|
||||
k = len(features)
|
||||
for pair in list(combinations(features, 2)):
|
||||
rff += self._compute_su_features(*pair)
|
||||
return rcf / sqrt(k + (k ** 2 - k) * rff)
|
||||
return k * rcf / sqrt(k + (k**2 - k) * rff)
|
||||
|
||||
def cfs(self, X, y):
|
||||
"""Correlation-based Feature Selection
|
||||
@@ -264,3 +268,58 @@ class MUFS:
|
||||
list of scores of the features selected
|
||||
"""
|
||||
return self._scores if self._fitted else []
|
||||
|
||||
def iwss(self, X, y, threshold):
    """Incremental Wrapper Subset Selection

    Rank the features in descending order of the scores returned by
    ``_compute_su_labels`` and grow the selected subset one feature at a
    time. The two best ranked features are always selected (as far as the
    data and ``max_features`` allow). Every further candidate is kept
    while the merit of the enlarged subset improves, or while its relative
    change with respect to the best merit found so far stays below
    ``threshold``; the first candidate failing both conditions is
    discarded and the search stops.

    Parameters
    ----------
    X : np.array
        array of features
    y : np.array
        vector of labels
    threshold : float
        maximum relative change of the merit tolerated when a candidate
        does not improve it, must be in the interval [0, 0.5]

    Returns
    -------
    self
        the selected features are stored in ``_result`` and their scores
        in ``_scores``: the label score of the first feature followed by
        the merit of every accepted subset

    Raises
    ------
    ValueError
        if the threshold is less than 0 or greater than 0.5
    """
    if threshold < 0 or threshold > 0.5:
        raise ValueError(
            "Threshold cannot be less than 0 or greater than 0.5"
        )
    self._initialize(X, y)
    s_list = self._compute_su_labels()
    # Feature indices sorted from the highest to the lowest label score
    features = (-s_list).argsort().tolist()
    # The best ranked feature is always part of the result
    first_feature = features.pop(0)
    candidates = [first_feature]
    self._scores.append(s_list[first_feature])
    # Guard against single-feature data and max_features == 1
    if features and len(candidates) < self._max_features:
        # The second best ranked feature is accepted unconditionally and
        # provides the reference merit
        candidates.append(features.pop(0))
        merit = self._compute_merit(candidates)
        self._scores.append(merit)
        for feature in features:
            # Check the limit before evaluating a new candidate so that
            # limits lower than 3 are honoured too
            if len(candidates) >= self._max_features:
                break
            candidates.append(feature)
            merit_new = self._compute_merit(candidates)
            # A zero reference merit yields delta 0, so the candidate is
            # always accepted in that case
            delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0
            if not (merit_new > merit or delta < threshold):
                candidates.pop()
                break
            self._scores.append(merit_new)
            # The reference merit only moves upwards
            if merit_new > merit:
                merit = merit_new
    self._result = candidates
    return self
|
||||
|
@@ -1,6 +1,6 @@
|
||||
from .Selection import MUFS
|
||||
|
||||
__version__ = "0.1.1"
|
||||
__version__ = "0.1.2"
|
||||
__author__ = "Ricardo Montañana Gómez"
|
||||
__author_email__ = "Ricardo.Montanana@alu.uclm.es"
|
||||
__copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
|
||||
|
@@ -1,11 +1,14 @@
|
||||
import unittest
|
||||
import os
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from mdlp import MDLP
|
||||
from sklearn.datasets import load_wine, load_iris
|
||||
|
||||
from ..Selection import MUFS
|
||||
|
||||
|
||||
class MUFS_test(unittest.TestCase):
|
||||
class MUFSTest(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
mdlp = MDLP(random_state=1)
|
||||
@@ -31,33 +34,38 @@ class MUFS_test(unittest.TestCase):
|
||||
|
||||
def test_csf_wine(self):
|
||||
mufs = MUFS()
|
||||
expected = [6, 12, 9, 4, 10, 0]
|
||||
self.assertListAlmostEqual(
|
||||
expected = [6, 12, 9, 4, 10, 0, 7, 8]
|
||||
self.assertListEqual(
|
||||
expected, mufs.cfs(self.X_w, self.y_w).get_results()
|
||||
)
|
||||
expected = [
|
||||
0.5218299405215557,
|
||||
0.602513857132804,
|
||||
0.4877384978817362,
|
||||
0.3743688234383051,
|
||||
0.28795671854246285,
|
||||
0.2309165735173175,
|
||||
1.205027714265608,
|
||||
1.4632154936452084,
|
||||
1.4974752937532203,
|
||||
1.4397835927123144,
|
||||
1.385499441103905,
|
||||
1.340618857006277,
|
||||
1.2989177695790775,
|
||||
]
|
||||
self.assertListAlmostEqual(expected, mufs.get_scores())
|
||||
|
||||
def test_csf_wine_cont(self):
|
||||
mufs = MUFS(discrete=False)
|
||||
expected = [10, 6, 0, 2, 11, 9]
|
||||
expected = [10, 6, 0, 2, 11, 9, 8, 1, 5]
|
||||
self.assertListEqual(
|
||||
expected, mufs.cfs(self.X_wc, self.y_w).get_results()
|
||||
)
|
||||
expected = [
|
||||
0.735264150416997,
|
||||
0.8321684551546848,
|
||||
0.7439915858469107,
|
||||
0.6238883340158233,
|
||||
0.513637402071709,
|
||||
0.41596400981378984,
|
||||
1.6643369103093697,
|
||||
2.231974757540732,
|
||||
2.4955533360632933,
|
||||
2.568187010358545,
|
||||
2.495784058882739,
|
||||
2.4409992149141915,
|
||||
2.3665143407182456,
|
||||
2.280111788845658,
|
||||
]
|
||||
self.assertListAlmostEqual(expected, mufs.get_scores())
|
||||
|
||||
@@ -67,23 +75,19 @@ class MUFS_test(unittest.TestCase):
|
||||
self.assertListAlmostEqual(
|
||||
expected, mufs.cfs(self.X_w, self.y_w).get_results()
|
||||
)
|
||||
expected = [
|
||||
0.5218299405215557,
|
||||
0.602513857132804,
|
||||
0.4877384978817362,
|
||||
]
|
||||
expected = [0.5218299405215557, 1.205027714265608, 1.4632154936452084]
|
||||
self.assertListAlmostEqual(expected, mufs.get_scores())
|
||||
|
||||
def test_csf_iris(self):
|
||||
mufs = MUFS()
|
||||
expected = [3, 2, 0, 1]
|
||||
computed = mufs.cfs(self.X_i, self.y_i).get_results()
|
||||
self.assertListAlmostEqual(expected, computed)
|
||||
self.assertListEqual(expected, computed)
|
||||
expected = [
|
||||
0.870521418179061,
|
||||
0.8968651482682227,
|
||||
0.5908278453318913,
|
||||
0.40371971570693366,
|
||||
1.7937302965364454,
|
||||
1.7724835359956739,
|
||||
1.6148788628277346,
|
||||
]
|
||||
self.assertListAlmostEqual(expected, mufs.get_scores())
|
||||
|
||||
@@ -148,3 +152,54 @@ class MUFS_test(unittest.TestCase):
|
||||
0.44518278979085646,
|
||||
]
|
||||
self.assertListAlmostEqual(expected, mufs.get_scores())
|
||||
|
||||
def test_iwss_wine(self):
    """Check the features and scores IWSS selects on ``self.X_w``.

    Runs ``iwss`` with a threshold of 0.2 on a default ``MUFS`` instance.
    """
    selector = MUFS()
    selected = selector.iwss(self.X_w, self.y_w, 0.2).get_results()
    self.assertListEqual([6, 9, 12, 0, 11, 10, 5], selected)
    expected_scores = [
        0.5218299405215557,
        1.189564575222017,
        1.4632154936452084,
        1.428626297656075,
        1.3384248731269246,
        1.2869213430115078,
        1.1949414936926785,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
|
||||
|
||||
def test_iwss_wine_max_features(self):
    """Check that IWSS stops at three features when ``max_features=3``.

    Uses ``self.X_w`` / ``self.y_w`` with a threshold of 0.4.
    """
    selector = MUFS(max_features=3)
    selected = selector.iwss(self.X_w, self.y_w, 0.4).get_results()
    self.assertListEqual([6, 9, 12], selected)
    expected_scores = [
        0.5218299405215557,
        1.189564575222017,
        1.4632154936452084,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
|
||||
|
||||
def test_iwss_exception(self):
    """``iwss`` must raise ValueError for thresholds 0.51 and -0.01."""
    selector = MUFS()
    for bad_threshold in (0.51, -0.01):
        with self.assertRaises(ValueError):
            selector.iwss(self.X_w, self.y_w, bad_threshold)
|
||||
|
||||
def test_iwss_better_merit_condition(self):
    """Check the IWSS feature order on the bundled balloons data.

    The data is read from ``balloons_R.dat`` located next to this test
    module; the ``clase`` column holds the labels and the remaining
    columns are used as features. The threshold is 0.3.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    frame = pd.read_csv(
        os.path.join(here, "balloons_R.dat"), sep="\t", index_col=0
    )
    samples = frame.drop("clase", axis=1).to_numpy()
    labels = frame["clase"].to_numpy()
    selector = MUFS()
    selected = selector.iwss(samples, labels, 0.3).get_results()
    self.assertListEqual([0, 2, 3, 1], selected)
|
||||
|
||||
def test_iwss_empty(self):
    """Run IWSS on ``self.X_i`` with its first two columns removed.

    The expected selection is ``[1, 0]``, using a threshold of 0.3.
    """
    reduced = np.delete(self.X_i, [0, 1], axis=1)
    selector = MUFS()
    selected = selector.iwss(reduced, self.y_i, 0.3).get_results()
    self.assertListEqual(selected, [1, 0])
|
||||
|
@@ -6,7 +6,7 @@ from mdlp import MDLP
|
||||
from ..Selection import Metrics
|
||||
|
||||
|
||||
class Metrics_test(unittest.TestCase):
|
||||
class MetricsTest(unittest.TestCase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
mdlp = MDLP(random_state=1)
|
||||
|
@@ -1,4 +1,4 @@
|
||||
from .MUFS_test import MUFS_test
|
||||
from .Metrics_test import Metrics_test
|
||||
from .MUFS_test import MUFSTest
|
||||
from .Metrics_test import MetricsTest
|
||||
|
||||
__all__ = ["MUFS_test", "Metrics_test"]
|
||||
__all__ = ["MUFSTest", "MetricsTest"]
|
||||
|
17
mufs/tests/balloons_R.dat
Executable file
17
mufs/tests/balloons_R.dat
Executable file
@@ -0,0 +1,17 @@
|
||||
f1 f2 f3 f4 clase
|
||||
1 0.968246 -0.968246 0.968246 0.968246 1
|
||||
2 0.968246 -0.968246 0.968246 -0.968246 1
|
||||
3 0.968246 -0.968246 -0.968246 0.968246 1
|
||||
4 0.968246 -0.968246 -0.968246 -0.968246 1
|
||||
5 0.968246 0.968246 0.968246 0.968246 1
|
||||
6 0.968246 0.968246 0.968246 -0.968246 0
|
||||
7 0.968246 0.968246 -0.968246 0.968246 0
|
||||
8 0.968246 0.968246 -0.968246 -0.968246 0
|
||||
9 -0.968246 -0.968246 0.968246 0.968246 1
|
||||
10 -0.968246 -0.968246 0.968246 -0.968246 0
|
||||
11 -0.968246 -0.968246 -0.968246 0.968246 0
|
||||
12 -0.968246 -0.968246 -0.968246 -0.968246 0
|
||||
13 -0.968246 0.968246 0.968246 0.968246 1
|
||||
14 -0.968246 0.968246 0.968246 -0.968246 0
|
||||
15 -0.968246 0.968246 -0.968246 0.968246 0
|
||||
16 -0.968246 0.968246 -0.968246 -0.968246 0
|
3
requirements/dev.txt
Normal file
3
requirements/dev.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
-r production.txt
|
||||
mdlp
|
||||
pandas
|
@@ -1,2 +1 @@
|
||||
scikit-learn>0.24
|
||||
mdlp
|
21
sample.py
21
sample.py
@@ -1,4 +1,5 @@
|
||||
import warnings
|
||||
import time
|
||||
from mufs import MUFS
|
||||
from mufs.Metrics import Metrics
|
||||
from stree import Stree
|
||||
@@ -26,16 +27,26 @@ for i in range(n):
|
||||
# Classification
|
||||
warnings.filterwarnings("ignore")
|
||||
print("CFS")
|
||||
now = time.time()
|
||||
cfs_f = mufsc.cfs(X, y).get_results()
|
||||
print(cfs_f)
|
||||
time_cfs = time.time() - now
|
||||
print(cfs_f, "items: ", len(cfs_f), f"time: {time_cfs:.3f} seconds")
|
||||
print("FCBF")
|
||||
fcfb_f = mufsc.fcbf(X, y, 5e-2).get_results()
|
||||
print(fcfb_f, len(fcfb_f))
|
||||
now = time.time()
|
||||
fcbf_f = mufsc.fcbf(X, y, 0.07).get_results()
|
||||
time_fcbf = time.time() - now
|
||||
print(fcbf_f, "items: ", len(fcbf_f), f"time: {time_fcbf:.3f} seconds")
|
||||
now = time.time()
|
||||
print("IWSS")
|
||||
iwss_f = mufsc.iwss(X, y, 0.5).get_results()
|
||||
time_iwss = time.time() - now
|
||||
print(iwss_f, "items: ", len(iwss_f), f"time: {time_iwss:.3f} seconds")
|
||||
print("X.shape=", X.shape)
|
||||
clf = Stree(random_state=0)
|
||||
print("Accuracy whole dataset", clf.fit(X, y).score(X, y))
|
||||
clf = Stree(random_state=0)
|
||||
print("Accuracy cfs", clf.fit(X[:, cfs_f], y).score(X[:, cfs_f], y))
|
||||
clf = Stree(random_state=0)
|
||||
subf = fcfb_f
|
||||
print("Accuracy fcfb", clf.fit(X[:, subf], y).score(X[:, subf], y))
|
||||
print("Accuracy fcfb", clf.fit(X[:, fcbf_f], y).score(X[:, fcbf_f], y))
|
||||
clf = Stree(random_state=0)
|
||||
print("Accuracy iwss", clf.fit(X[:, iwss_f], y).score(X[:, iwss_f], y))
|
||||
|
Reference in New Issue
Block a user