From 27f8a370c5f74a03f181f9de7b4c1759c1b6d9ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 10 Oct 2021 19:06:57 +0200 Subject: [PATCH 1/3] Begin IWSS implementation Update requirements Create requirements for dev --- README.md | 4 ++ mufs/Selection.py | 64 ++++++++++++++++++- mufs/tests/MUFS_test.py | 45 ++++++++++++- mufs/tests/balloons_R.dat | 17 +++++ requirements/dev.txt | 3 + .../production.txt | 1 - 6 files changed, 128 insertions(+), 6 deletions(-) create mode 100755 mufs/tests/balloons_R.dat create mode 100644 requirements/dev.txt rename requirements.txt => requirements/production.txt (81%) diff --git a/README.md b/README.md index 6fccbd7..8d8cdb1 100644 --- a/README.md +++ b/README.md @@ -17,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa ### Correlation-based Feature Selection Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'. + +### IWSS + +Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673. 
diff --git a/mufs/Selection.py b/mufs/Selection.py index 7783450..7f96003 100755 --- a/mufs/Selection.py +++ b/mufs/Selection.py @@ -26,7 +26,7 @@ class MUFS: """ def __init__(self, max_features=None, discrete=True): - self._max_features = max_features + self.max_features = max_features self._discrete = discrete self.symmetrical_uncertainty = ( Metrics.symmetrical_uncertainty @@ -53,8 +53,10 @@ class MUFS: """ self.X_ = X self.y_ = y - if self._max_features is None: + if self.max_features is None: self._max_features = X.shape[1] + else: + self._max_features = self.max_features self._result = None self._scores = [] self._su_labels = None @@ -105,7 +107,9 @@ class MUFS: def _compute_merit(self, features): """Compute the merit function for cfs algorithms - + "Good feature subsets contain features highly correlated with + (predictive of) the class, yet uncorrelated with (not predictive of) + each other" Parameters ---------- features : list @@ -264,3 +268,57 @@ class MUFS: list of scores of the features selected """ return self._scores if self._fitted else [] + + def iwss(self, X, y, threshold): + """Incremental Wrapper Subset Selection + + Parameters + ---------- + X : np.array + array of features + y : np.array + vector of labels + threshold : float + threshold to select relevant features + + Returns + ------- + self + self + Raises + ------ + ValueError + if the threshold is less than 0 + or greater than 0.5 + + """ + if threshold < 0 or threshold > 0.5: + raise ValueError( + "Threshold cannot be less than 0 or greater than 0.5" + ) + self._initialize(X, y) + s_list = self._compute_su_labels() + feature_order = (-s_list).argsort() + features = feature_order.copy().tolist() + candidates = [] + # Add first and second features to result + first_feature = features.pop(0) + candidates.append(first_feature) + self._scores.append(s_list[first_feature]) + candidates.append(features.pop(0)) + merit = self._compute_merit(candidates) + 
self._scores.append(merit) + for feature in features: + candidates.append(feature) + merit_new = self._compute_merit(candidates) + delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0 + if merit_new > merit or delta < threshold: + if merit_new > merit: + merit = merit_new + self._scores.append(merit_new) + else: + candidates.pop() + if len(candidates) == self._max_features: + break + self._result = candidates + return self diff --git a/mufs/tests/MUFS_test.py b/mufs/tests/MUFS_test.py index 82a0dbd..312620d 100755 --- a/mufs/tests/MUFS_test.py +++ b/mufs/tests/MUFS_test.py @@ -32,7 +32,7 @@ class MUFS_test(unittest.TestCase): def test_csf_wine(self): mufs = MUFS() expected = [6, 12, 9, 4, 10, 0] - self.assertListAlmostEqual( + self.assertListEqual( expected, mufs.cfs(self.X_w, self.y_w).get_results() ) expected = [ @@ -78,7 +78,7 @@ class MUFS_test(unittest.TestCase): mufs = MUFS() expected = [3, 2, 0, 1] computed = mufs.cfs(self.X_i, self.y_i).get_results() - self.assertListAlmostEqual(expected, computed) + self.assertListEqual(expected, computed) expected = [ 0.870521418179061, 0.8968651482682227, @@ -148,3 +148,44 @@ class MUFS_test(unittest.TestCase): 0.44518278979085646, ] self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_wine(self): + mufs = MUFS() + expected = [6, 9, 12] + self.assertListEqual( + expected, mufs.iwss(self.X_w, self.y_w, 0.2).get_results() + ) + expected = [0.5218299405215557, 0.5947822876110085, 0.4877384978817362] + self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_wine_max_features(self): + mufs = MUFS(max_features=3) + expected = [6, 9, 12] + self.assertListEqual( + expected, mufs.iwss(self.X_w, self.y_w, 0.4).get_results() + ) + expected = [0.5218299405215557, 0.5947822876110085, 0.4877384978817362] + self.assertListAlmostEqual(expected, mufs.get_scores()) + + def test_iwss_exception(self): + mufs = MUFS() + with self.assertRaises(ValueError): + mufs.iwss(self.X_w, self.y_w, 0.51) 
+ with self.assertRaises(ValueError): + mufs.iwss(self.X_w, self.y_w, -0.01) + + def test_iwss_better_merit_condition(self): + import pandas as pd + import os + + folder = os.path.dirname(os.path.abspath(__file__)) + data = pd.read_csv( + os.path.join(folder, "balloons_R.dat"), + sep="\t", + index_col=0, + ) + X = data.drop("clase", axis=1).to_numpy() + y = data["clase"].to_numpy() + mufs = MUFS() + expected = [0, 2, 3, 1] + self.assertListEqual(expected, mufs.iwss(X, y, 0.3).get_results()) diff --git a/mufs/tests/balloons_R.dat b/mufs/tests/balloons_R.dat new file mode 100755 index 0000000..1579461 --- /dev/null +++ b/mufs/tests/balloons_R.dat @@ -0,0 +1,17 @@ + f1 f2 f3 f4 clase +1 0.968246 -0.968246 0.968246 0.968246 1 +2 0.968246 -0.968246 0.968246 -0.968246 1 +3 0.968246 -0.968246 -0.968246 0.968246 1 +4 0.968246 -0.968246 -0.968246 -0.968246 1 +5 0.968246 0.968246 0.968246 0.968246 1 +6 0.968246 0.968246 0.968246 -0.968246 0 +7 0.968246 0.968246 -0.968246 0.968246 0 +8 0.968246 0.968246 -0.968246 -0.968246 0 +9 -0.968246 -0.968246 0.968246 0.968246 1 +10 -0.968246 -0.968246 0.968246 -0.968246 0 +11 -0.968246 -0.968246 -0.968246 0.968246 0 +12 -0.968246 -0.968246 -0.968246 -0.968246 0 +13 -0.968246 0.968246 0.968246 0.968246 1 +14 -0.968246 0.968246 0.968246 -0.968246 0 +15 -0.968246 0.968246 -0.968246 0.968246 0 +16 -0.968246 0.968246 -0.968246 -0.968246 0 diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 0000000..ed21e03 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,3 @@ +-r production.txt +mdlp +pandas diff --git a/requirements.txt b/requirements/production.txt similarity index 81% rename from requirements.txt rename to requirements/production.txt index 30eef1a..163f1bc 100644 --- a/requirements.txt +++ b/requirements/production.txt @@ -1,2 +1 @@ scikit-learn>0.24 -mdlp \ No newline at end of file From 1c5f1977e5db13fbfebc2db444aeea5a472dea13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: 
Thu, 28 Oct 2021 11:55:40 +0200 Subject: [PATCH 2/3] Complete iwss based implementation (#2) --- mufs/Selection.py | 1 + mufs/tests/MUFS_test.py | 13 +++++++++---- mufs/tests/Metrics_test.py | 2 +- mufs/tests/__init__.py | 6 +++--- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/mufs/Selection.py b/mufs/Selection.py index 7f96003..eb328e6 100755 --- a/mufs/Selection.py +++ b/mufs/Selection.py @@ -318,6 +318,7 @@ class MUFS: self._scores.append(merit_new) else: candidates.pop() + break if len(candidates) == self._max_features: break self._result = candidates diff --git a/mufs/tests/MUFS_test.py b/mufs/tests/MUFS_test.py index 312620d..1b60d61 100755 --- a/mufs/tests/MUFS_test.py +++ b/mufs/tests/MUFS_test.py @@ -1,11 +1,14 @@ import unittest +import os +import pandas as pd +import numpy as np from mdlp import MDLP from sklearn.datasets import load_wine, load_iris from ..Selection import MUFS -class MUFS_test(unittest.TestCase): +class MUFSTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) @@ -175,9 +178,6 @@ class MUFS_test(unittest.TestCase): mufs.iwss(self.X_w, self.y_w, -0.01) def test_iwss_better_merit_condition(self): - import pandas as pd - import os - folder = os.path.dirname(os.path.abspath(__file__)) data = pd.read_csv( os.path.join(folder, "balloons_R.dat"), @@ -189,3 +189,8 @@ class MUFS_test(unittest.TestCase): mufs = MUFS() expected = [0, 2, 3, 1] self.assertListEqual(expected, mufs.iwss(X, y, 0.3).get_results()) + + def test_iwss_empty(self): + mufs = MUFS() + X = np.delete(self.X_i, [0, 1], 1) + self.assertListEqual(mufs.iwss(X, self.y_i, 0.3).get_results(), [1, 0]) diff --git a/mufs/tests/Metrics_test.py b/mufs/tests/Metrics_test.py index 18ac46a..3a2a270 100755 --- a/mufs/tests/Metrics_test.py +++ b/mufs/tests/Metrics_test.py @@ -6,7 +6,7 @@ from mdlp import MDLP from ..Selection import Metrics -class Metrics_test(unittest.TestCase): +class 
MetricsTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) mdlp = MDLP(random_state=1) diff --git a/mufs/tests/__init__.py b/mufs/tests/__init__.py index e914937..466683c 100644 --- a/mufs/tests/__init__.py +++ b/mufs/tests/__init__.py @@ -1,4 +1,4 @@ -from .MUFS_test import MUFS_test -from .Metrics_test import Metrics_test +from .MUFS_test import MUFSTest +from .Metrics_test import MetricsTest -__all__ = ["MUFS_test", "Metrics_test"] +__all__ = ["MUFSTest", "MetricsTest"] From 5d1720c9ae393adf18f602b2e07a88591bf59a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Thu, 28 Oct 2021 12:22:21 +0200 Subject: [PATCH 3/3] Update ci file --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 69e4cab..7d9d9de 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -27,7 +27,7 @@ jobs: pip install -q cython pip install -q numpy pip install -q git+git://github.com/doctorado-ml/mdlp - pip install -q -r requirements.txt + pip install -q -r requirements/dev.txt pip install -q --upgrade codecov coverage black flake8 codacy-coverage - name: Lint run: |