From 27f8a370c5f74a03f181f9de7b4c1759c1b6d9ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 10 Oct 2021 19:06:57 +0200 Subject: [PATCH] Begin IWSS implementation Update requirements Create requirements for dev --- README.md | 4 ++ mufs/Selection.py | 64 ++++++++++++++++++- mufs/tests/MUFS_test.py | 45 ++++++++++++- mufs/tests/balloons_R.dat | 17 +++++ requirements/dev.txt | 3 + .../production.txt | 1 - 6 files changed, 128 insertions(+), 6 deletions(-) create mode 100755 mufs/tests/balloons_R.dat create mode 100644 requirements/dev.txt rename requirements.txt => requirements/production.txt (81%) diff --git a/README.md b/README.md index 6fccbd7..8d8cdb1 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa ### Correlation-based Feature Selection Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'. + +### IWSS + +Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673. 
diff --git a/mufs/Selection.py b/mufs/Selection.py index 7783450..7f96003 100755 --- a/mufs/Selection.py +++ b/mufs/Selection.py @@ -26,7 +26,7 @@ class MUFS: """ def __init__(self, max_features=None, discrete=True): - self._max_features = max_features + self.max_features = max_features self._discrete = discrete self.symmetrical_uncertainty = ( Metrics.symmetrical_uncertainty @@ -53,8 +53,10 @@ class MUFS: """ self.X_ = X self.y_ = y - if self._max_features is None: + if self.max_features is None: self._max_features = X.shape[1] + else: + self._max_features = self.max_features self._result = None self._scores = [] self._su_labels = None @@ -105,7 +107,9 @@ class MUFS: def _compute_merit(self, features): """Compute the merit function for cfs algorithms - + "Good feature subsets contain features highly correlated with + (predictive of) the class, yet uncorrelated with (not predictive of) + each other" Parameters ---------- features : list @@ -264,3 +268,57 @@ class MUFS: list of scores of the features selected """ return self._scores if self._fitted else [] + + def iwss(self, X, y, threshold): + """Incremental Wrapper Subset Selection + + Parameters + ---------- + X : np.array + array of features + y : np.array + vector of labels + threshold : float + threshold to select relevant features + + Returns + ------- + self + self + Raises + ------ + ValueError + if the threshold is less than 0 + or greater than 0.5 + + """ + if threshold < 0 or threshold > 0.5: + raise ValueError( + "Threshold cannot be less than 0 or greater than 0.5" + ) + self._initialize(X, y) + s_list = self._compute_su_labels() + feature_order = (-s_list).argsort() + features = feature_order.copy().tolist() + candidates = [] + # Add first and second features to result + first_feature = features.pop(0) + candidates.append(first_feature) + self._scores.append(s_list[first_feature]) + candidates.append(features.pop(0)) + merit = self._compute_merit(candidates) + 
self._scores.append(merit) + for feature in features: + candidates.append(feature) + merit_new = self._compute_merit(candidates) + delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0 + if merit_new > merit or delta < threshold: + if merit_new > merit: + merit = merit_new + self._scores.append(merit_new) + else: + candidates.pop() + if len(candidates) == self._max_features: + break + self._result = candidates + return self diff --git a/mufs/tests/MUFS_test.py b/mufs/tests/MUFS_test.py index 82a0dbd..312620d 100755 --- a/mufs/tests/MUFS_test.py +++ b/mufs/tests/MUFS_test.py @@ -32,7 +32,7 @@ class MUFS_test(unittest.TestCase): def test_csf_wine(self): mufs = MUFS() expected = [6, 12, 9, 4, 10, 0] - self.assertListAlmostEqual( + self.assertListEqual( expected, mufs.cfs(self.X_w, self.y_w).get_results() ) expected = [ @@ -78,7 +78,7 @@ class MUFS_test(unittest.TestCase): mufs = MUFS() expected = [3, 2, 0, 1] computed = mufs.cfs(self.X_i, self.y_i).get_results() - self.assertListAlmostEqual(expected, computed) + self.assertListEqual(expected, computed) expected = [ 0.870521418179061, 0.8968651482682227, @@ -148,3 +148,44 @@ class MUFS_test(unittest.TestCase): 0.44518278979085646, ] self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_wine(self): + mufs = MUFS() + expected = [6, 9, 12] + self.assertListEqual( + expected, mufs.iwss(self.X_w, self.y_w, 0.2).get_results() + ) + expected = [0.5218299405215557, 0.5947822876110085, 0.4877384978817362] + self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_wine_max_features(self): + mufs = MUFS(max_features=3) + expected = [6, 9, 12] + self.assertListEqual( + expected, mufs.iwss(self.X_w, self.y_w, 0.4).get_results() + ) + expected = [0.5218299405215557, 0.5947822876110085, 0.4877384978817362] + self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_exception(self): + mufs = MUFS() + with self.assertRaises(ValueError): + mufs.iwss(self.X_w, self.y_w, 0.51) 
+ with self.assertRaises(ValueError): + mufs.iwss(self.X_w, self.y_w, -0.01) + + def test_iwss_better_merit_condition(self): + import pandas as pd + import os + + folder = os.path.dirname(os.path.abspath(__file__)) + data = pd.read_csv( + os.path.join(folder, "balloons_R.dat"), + sep="\t", + index_col=0, + ) + X = data.drop("clase", axis=1).to_numpy() + y = data["clase"].to_numpy() + mufs = MUFS() + expected = [0, 2, 3, 1] + self.assertListEqual(expected, mufs.iwss(X, y, 0.3).get_results()) diff --git a/mufs/tests/balloons_R.dat b/mufs/tests/balloons_R.dat new file mode 100755 index 0000000..1579461 --- /dev/null +++ b/mufs/tests/balloons_R.dat @@ -0,0 +1,17 @@ + f1 f2 f3 f4 clase +1 0.968246 -0.968246 0.968246 0.968246 1 +2 0.968246 -0.968246 0.968246 -0.968246 1 +3 0.968246 -0.968246 -0.968246 0.968246 1 +4 0.968246 -0.968246 -0.968246 -0.968246 1 +5 0.968246 0.968246 0.968246 0.968246 1 +6 0.968246 0.968246 0.968246 -0.968246 0 +7 0.968246 0.968246 -0.968246 0.968246 0 +8 0.968246 0.968246 -0.968246 -0.968246 0 +9 -0.968246 -0.968246 0.968246 0.968246 1 +10 -0.968246 -0.968246 0.968246 -0.968246 0 +11 -0.968246 -0.968246 -0.968246 0.968246 0 +12 -0.968246 -0.968246 -0.968246 -0.968246 0 +13 -0.968246 0.968246 0.968246 0.968246 1 +14 -0.968246 0.968246 0.968246 -0.968246 0 +15 -0.968246 0.968246 -0.968246 0.968246 0 +16 -0.968246 0.968246 -0.968246 -0.968246 0 diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 0000000..ed21e03 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,3 @@ +-r production.txt +mdlp +pandas diff --git a/requirements.txt b/requirements/production.txt similarity index 81% rename from requirements.txt rename to requirements/production.txt index 30eef1a..163f1bc 100644 --- a/requirements.txt +++ b/requirements/production.txt @@ -1,2 +1 @@ scikit-learn>0.24 -mdlp \ No newline at end of file