Begin IWSS implementation

Update requirements
Create requirements for dev
This commit is contained in:
2021-10-10 19:06:57 +02:00
parent 9d74bc8a70
commit 27f8a370c5
6 changed files with 128 additions and 6 deletions

View File

@@ -17,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
### Correlation-based Feature Selection ### Correlation-based Feature Selection
Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'. Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
### IWSS
Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.

View File

@@ -26,7 +26,7 @@ class MUFS:
""" """
def __init__(self, max_features=None, discrete=True): def __init__(self, max_features=None, discrete=True):
self._max_features = max_features self.max_features = max_features
self._discrete = discrete self._discrete = discrete
self.symmetrical_uncertainty = ( self.symmetrical_uncertainty = (
Metrics.symmetrical_uncertainty Metrics.symmetrical_uncertainty
@@ -53,8 +53,10 @@ class MUFS:
""" """
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
if self._max_features is None: if self.max_features is None:
self._max_features = X.shape[1] self._max_features = X.shape[1]
else:
self._max_features = self.max_features
self._result = None self._result = None
self._scores = [] self._scores = []
self._su_labels = None self._su_labels = None
@@ -105,7 +107,9 @@ class MUFS:
def _compute_merit(self, features): def _compute_merit(self, features):
"""Compute the merit function for cfs algorithms """Compute the merit function for cfs algorithms
"Good feature subsets contain features highly correlated with
(predictive of) the class, yet uncorrelated with (not predictive of)
each other"
Parameters Parameters
---------- ----------
features : list features : list
@@ -264,3 +268,57 @@ class MUFS:
list of scores of the features selected list of scores of the features selected
""" """
return self._scores if self._fitted else [] return self._scores if self._fitted else []
def iwss(self, X, y, threshold):
    """Incremental Wrapper Subset Selection

    Features are ranked by their symmetrical uncertainty with the labels
    (highest first) and visited once in that order. The two best ranked
    features seed the subset; every following feature is tentatively
    added and kept when the merit of the subset improves, or when the
    relative difference ``abs(merit - merit_new) / merit`` stays below
    ``threshold``. The search stops as soon as the subset holds
    ``max_features`` features.

    Parameters
    ----------
    X : np.array
        array of features
    y : np.array
        vector of labels
    threshold : float
        maximum relative difference of merit tolerated to keep a feature
        that does not improve the merit, must be in [0, 0.5]

    Returns
    -------
    self
        self

    Raises
    ------
    ValueError
        if the threshold is less than 0 or greater than .5
    """
    if threshold < 0 or threshold > 0.5:
        raise ValueError(
            "Threshold cannot be less than 0 or greater than 0.5"
        )
    self._initialize(X, y)
    s_list = self._compute_su_labels()
    # Indices of the features sorted by decreasing relevance
    features = (-s_list).argsort().tolist()
    limit = self._max_features
    candidates = []
    # The best ranked feature is scored with its own relevance, as a
    # merit cannot be compared against anything yet
    if features and limit >= 1:
        first_feature = features.pop(0)
        candidates.append(first_feature)
        self._scores.append(s_list[first_feature])
    # The second feature is only taken if the dataset has one and the
    # limit of features allows it
    if candidates and features and limit >= 2:
        candidates.append(features.pop(0))
        merit = self._compute_merit(candidates)
        self._scores.append(merit)
        for feature in features:
            # Checked before growing the subset so that the limit can
            # never be exceeded, not even when it is already reached
            # by the two seed features
            if len(candidates) >= limit:
                break
            candidates.append(feature)
            merit_new = self._compute_merit(candidates)
            delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0
            if merit_new > merit or delta < threshold:
                # The reference merit only moves when it is improved
                if merit_new > merit:
                    merit = merit_new
                self._scores.append(merit_new)
            else:
                candidates.pop()
    self._result = candidates
    # get_scores only exposes the scores once the selector is fitted
    self._fitted = True
    return self

View File

@@ -32,7 +32,7 @@ class MUFS_test(unittest.TestCase):
def test_csf_wine(self): def test_csf_wine(self):
mufs = MUFS() mufs = MUFS()
expected = [6, 12, 9, 4, 10, 0] expected = [6, 12, 9, 4, 10, 0]
self.assertListAlmostEqual( self.assertListEqual(
expected, mufs.cfs(self.X_w, self.y_w).get_results() expected, mufs.cfs(self.X_w, self.y_w).get_results()
) )
expected = [ expected = [
@@ -78,7 +78,7 @@ class MUFS_test(unittest.TestCase):
mufs = MUFS() mufs = MUFS()
expected = [3, 2, 0, 1] expected = [3, 2, 0, 1]
computed = mufs.cfs(self.X_i, self.y_i).get_results() computed = mufs.cfs(self.X_i, self.y_i).get_results()
self.assertListAlmostEqual(expected, computed) self.assertListEqual(expected, computed)
expected = [ expected = [
0.870521418179061, 0.870521418179061,
0.8968651482682227, 0.8968651482682227,
@@ -148,3 +148,44 @@ class MUFS_test(unittest.TestCase):
0.44518278979085646, 0.44518278979085646,
] ]
self.assertListAlmostEqual(expected, mufs.get_scores()) self.assertListAlmostEqual(expected, mufs.get_scores())
def test_iwss_wine(self):
    """Check features and scores selected by iwss on the wine dataset
    with a threshold of 0.2 and no limit of features
    """
    selector = MUFS()
    selected = selector.iwss(self.X_w, self.y_w, 0.2).get_results()
    self.assertListEqual([6, 9, 12], selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_wine_max_features(self):
    """Check features and scores selected by iwss on the wine dataset
    with a threshold of 0.4 and a limit of three features
    """
    selector = MUFS(max_features=3)
    selected = selector.iwss(self.X_w, self.y_w, 0.4).get_results()
    self.assertListEqual([6, 9, 12], selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_exception(self):
    """iwss has to raise ValueError for a threshold above 0.5 and for a
    negative threshold
    """
    selector = MUFS()
    for invalid_threshold in (0.51, -0.01):
        with self.assertRaises(ValueError):
            selector.iwss(self.X_w, self.y_w, invalid_threshold)
def test_iwss_better_merit_condition(self):
    """Check the features selected by iwss, with a threshold of 0.3, on
    the balloons dataset stored next to this test module
    """
    import os

    import pandas as pd

    here = os.path.dirname(os.path.abspath(__file__))
    dataset_path = os.path.join(here, "balloons_R.dat")
    # Tab separated file whose first column is the sample index
    frame = pd.read_csv(dataset_path, sep="\t", index_col=0)
    labels = frame["clase"].to_numpy()
    samples = frame.drop("clase", axis=1).to_numpy()
    selected = MUFS().iwss(samples, labels, 0.3).get_results()
    self.assertListEqual([0, 2, 3, 1], selected)

17
mufs/tests/balloons_R.dat Executable file
View File

@@ -0,0 +1,17 @@
f1 f2 f3 f4 clase
1 0.968246 -0.968246 0.968246 0.968246 1
2 0.968246 -0.968246 0.968246 -0.968246 1
3 0.968246 -0.968246 -0.968246 0.968246 1
4 0.968246 -0.968246 -0.968246 -0.968246 1
5 0.968246 0.968246 0.968246 0.968246 1
6 0.968246 0.968246 0.968246 -0.968246 0
7 0.968246 0.968246 -0.968246 0.968246 0
8 0.968246 0.968246 -0.968246 -0.968246 0
9 -0.968246 -0.968246 0.968246 0.968246 1
10 -0.968246 -0.968246 0.968246 -0.968246 0
11 -0.968246 -0.968246 -0.968246 0.968246 0
12 -0.968246 -0.968246 -0.968246 -0.968246 0
13 -0.968246 0.968246 0.968246 0.968246 1
14 -0.968246 0.968246 0.968246 -0.968246 0
15 -0.968246 0.968246 -0.968246 0.968246 0
16 -0.968246 0.968246 -0.968246 -0.968246 0

3
requirements/dev.txt Normal file
View File

@@ -0,0 +1,3 @@
-r production.txt
mdlp
pandas

View File

@@ -1,2 +1 @@
scikit-learn>0.24 scikit-learn>0.24
mdlp