Merge pull request #3 from Doctorado-ML/Add-IWSS-(#2)

Add iwss (#2)
2025-08-16 16:15:56 +00:00 · 2021-10-28 12:39:57 +02:00
parent 9d74bc8a70 5d1720c9ae
commit cfb37d2f6c
9 changed files with 140 additions and 12 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -27,7 +27,7 @@ jobs:
          pip install -q cython
          pip install -q numpy
          pip install -q git+git://github.com/doctorado-ml/mdlp
-          pip install -q -r requirements.txt
+          pip install -q -r requirements/dev.txt
          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
      - name: Lint
        run: |
--- a/README.md
+++ b/README.md
@@ -17,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
 ### Correlation-based Feature Selection
 Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
 ### IWSS
 Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.
--- a/mufs/Selection.py
+++ b/mufs/Selection.py
@@ -26,7 +26,7 @@ class MUFS:
    """
    def __init__(self, max_features=None, discrete=True):
-        self._max_features = max_features
+        self.max_features = max_features
        self._discrete = discrete
        self.symmetrical_uncertainty = (
            Metrics.symmetrical_uncertainty
@@ -53,8 +53,10 @@ class MUFS:
        """
        self.X_ = X
        self.y_ = y
-        if self._max_features is None:
+        if self.max_features is None:
            self._max_features = X.shape[1]
        else:
            self._max_features = self.max_features
        self._result = None
        self._scores = []
        self._su_labels = None
@@ -105,7 +107,9 @@ class MUFS:
    def _compute_merit(self, features):
        """Compute the merit function for cfs algorithms
-
+           "Good feature subsets contain features highly correlated with
           (predictive of) the class, yet uncorrelated with (not predictive of)
           each other"
        Parameters
        ----------
        features : list
@@ -264,3 +268,58 @@ class MUFS:
            list of scores of the features selected
        """
        return self._scores if self._fitted else []
    def iwss(self, X, y, threshold):
        """Incremental Wrapper Subset Selection

        Features are visited in descending order of the values returned by
        ``_compute_su_labels``. The best ranked feature is always selected;
        every following feature is kept while the merit of the enlarged
        subset improves, or worsens by a relative amount smaller than
        ``threshold``. The search stops at the first rejected feature, when
        the features are exhausted or as soon as ``max_features`` features
        have been selected.

        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels
        threshold : float
            maximum relative loss of merit tolerated when adding a feature,
            must be in the range [0, 0.5]

        Returns
        -------
        self
            self

        Raises
        ------
        ValueError
            if the threshold is less than 0 or greater than .5
        """
        if threshold < 0 or threshold > 0.5:
            raise ValueError(
                "Threshold cannot be less than 0 or greater than 0.5"
            )
        self._initialize(X, y)
        s_list = self._compute_su_labels()
        # tolist() already builds a new list, no intermediate copy needed
        features = (-s_list).argsort().tolist()
        candidates = []
        if features:
            # The best ranked feature is scored with its own ranking value
            first_feature = features.pop(0)
            candidates.append(first_feature)
            self._scores.append(s_list[first_feature])
        # Only go on if there are features left and room for them, so that
        # max_features < 3 and datasets with a single feature are honoured
        if features and len(candidates) < self._max_features:
            candidates.append(features.pop(0))
            merit = self._compute_merit(candidates)
            self._scores.append(merit)
            for feature in features:
                # Check the limit before trying a new feature
                if len(candidates) >= self._max_features:
                    break
                candidates.append(feature)
                merit_new = self._compute_merit(candidates)
                delta = (
                    abs(merit - merit_new) / merit if merit != 0.0 else 0.0
                )
                if not (merit_new > merit or delta < threshold):
                    # Relevant loss of merit: discard the feature and stop
                    candidates.pop()
                    break
                # The reference merit is only raised, never lowered
                if merit_new > merit:
                    merit = merit_new
                self._scores.append(merit_new)
        self._result = candidates
        return self
--- a/mufs/tests/MUFS_test.py
+++ b/mufs/tests/MUFS_test.py
@@ -1,11 +1,14 @@
 import unittest
 import os
 import pandas as pd
 import numpy as np
 from mdlp import MDLP
 from sklearn.datasets import load_wine, load_iris
 from ..Selection import MUFS
-class MUFS_test(unittest.TestCase):
+class MUFSTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
@@ -32,7 +35,7 @@ class MUFS_test(unittest.TestCase):
    def test_csf_wine(self):
        mufs = MUFS()
        expected = [6, 12, 9, 4, 10, 0]
-        self.assertListAlmostEqual(
+        self.assertListEqual(
            expected, mufs.cfs(self.X_w, self.y_w).get_results()
        )
        expected = [
@@ -78,7 +81,7 @@ class MUFS_test(unittest.TestCase):
        mufs = MUFS()
        expected = [3, 2, 0, 1]
        computed = mufs.cfs(self.X_i, self.y_i).get_results()
-        self.assertListAlmostEqual(expected, computed)
+        self.assertListEqual(expected, computed)
        expected = [
            0.870521418179061,
            0.8968651482682227,
@@ -148,3 +151,46 @@ class MUFS_test(unittest.TestCase):
            0.44518278979085646,
        ]
        self.assertListAlmostEqual(expected, mufs.get_scores())
    def test_iwss_wine(self):
        """Check features and scores selected by iwss on X_w with 0.2"""
        selector = MUFS()
        expected_features = [6, 9, 12]
        computed = selector.iwss(self.X_w, self.y_w, 0.2).get_results()
        self.assertListEqual(expected_features, computed)
        expected_scores = [
            0.5218299405215557,
            0.5947822876110085,
            0.4877384978817362,
        ]
        self.assertListAlmostEqual(expected_scores, selector.get_scores())
    def test_iwss_wine_max_features(self):
        """Check iwss on X_w with max_features=3 and threshold 0.4"""
        selector = MUFS(max_features=3)
        expected_features = [6, 9, 12]
        computed = selector.iwss(self.X_w, self.y_w, 0.4).get_results()
        self.assertListEqual(expected_features, computed)
        expected_scores = [
            0.5218299405215557,
            0.5947822876110085,
            0.4877384978817362,
        ]
        self.assertListAlmostEqual(expected_scores, selector.get_scores())
    def test_iwss_exception(self):
        """Thresholds above 0.5 or below 0 must raise ValueError"""
        selector = MUFS()
        # Same instance is reused for both out of range thresholds
        for invalid_threshold in (0.51, -0.01):
            with self.assertRaises(ValueError):
                selector.iwss(self.X_w, self.y_w, invalid_threshold)
    def test_iwss_better_merit_condition(self):
        """Check the features selected by iwss on the balloons_R.dat file"""
        # The dataset is stored next to this test module
        here = os.path.dirname(os.path.abspath(__file__))
        file_name = os.path.join(here, "balloons_R.dat")
        dataset = pd.read_csv(file_name, sep="\t", index_col=0)
        # Every column but "clase" is a feature, "clase" holds the labels
        features = dataset.drop("clase", axis=1).to_numpy()
        labels = dataset["clase"].to_numpy()
        selector = MUFS()
        computed = selector.iwss(features, labels, 0.3).get_results()
        self.assertListEqual([0, 2, 3, 1], computed)
    def test_iwss_empty(self):
        """Check iwss when only two columns of X_i are left to choose from"""
        selector = MUFS()
        # Drop columns 0 and 1 so no feature remains after the first two
        reduced = np.delete(self.X_i, [0, 1], 1)
        computed = selector.iwss(reduced, self.y_i, 0.3).get_results()
        self.assertListEqual(computed, [1, 0])
--- a/mufs/tests/Metrics_test.py
+++ b/mufs/tests/Metrics_test.py
@@ -6,7 +6,7 @@ from mdlp import MDLP
 from ..Selection import Metrics
-class Metrics_test(unittest.TestCase):
+class MetricsTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
--- a/mufs/tests/init.py
+++ b/mufs/tests/init.py
@@ -1,4 +1,4 @@
-from .MUFS_test import MUFS_test
+from .MUFS_test import MUFSTest
-from .Metrics_test import Metrics_test
+from .Metrics_test import MetricsTest
-__all__ = ["MUFS_test", "Metrics_test"]
+__all__ = ["MUFSTest", "MetricsTest"]
--- a/mufs/tests/balloons_R.dat
+++ b/mufs/tests/balloons_R.dat
@@ -0,0 +1,17 @@
 	f1	f2	f3	f4	clase
 1	0.968246	-0.968246	0.968246	0.968246	1
 2	0.968246	-0.968246	0.968246	-0.968246	1
 3	0.968246	-0.968246	-0.968246	0.968246	1
 4	0.968246	-0.968246	-0.968246	-0.968246	1
 5	0.968246	0.968246	0.968246	0.968246	1
 6	0.968246	0.968246	0.968246	-0.968246	0
 7	0.968246	0.968246	-0.968246	0.968246	0
 8	0.968246	0.968246	-0.968246	-0.968246	0
 9	-0.968246	-0.968246	0.968246	0.968246	1
 10	-0.968246	-0.968246	0.968246	-0.968246	0
 11	-0.968246	-0.968246	-0.968246	0.968246	0
 12	-0.968246	-0.968246	-0.968246	-0.968246	0
 13	-0.968246	0.968246	0.968246	0.968246	1
 14	-0.968246	0.968246	0.968246	-0.968246	0
 15	-0.968246	0.968246	-0.968246	0.968246	0
 16	-0.968246	0.968246	-0.968246	-0.968246	0
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -0,0 +1,3 @@
 -r production.txt
 mdlp
 pandas
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -1,2 +1 @@
 scikit-learn>0.24
 mdlp