fix format issue

Fix cfs merit formula
Update version number and sample
2025-08-18 17:15:52 +00:00 · 2022-03-10 14:32:33 +01:00 · 2022-03-10 12:56:47 +01:00 · 2021-10-28 14:30:28 +02:00 · 2021-10-28 12:39:57 +02:00 · 2021-10-28 12:22:21 +02:00
12 changed files with 190 additions and 40 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -27,7 +27,7 @@ jobs:
          pip install -q cython
          pip install -q numpy
          pip install -q git+git://github.com/doctorado-ml/mdlp
-          pip install -q -r requirements.txt
+          pip install -q -r requirements/dev.txt
          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
      - name: Lint
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
  - repo: https://github.com/ambv/black
-    rev: 20.8b1
+    rev: 22.1.0
    hooks:
      - id: black
        exclude: ".virtual_documents"
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 ![CI](https://github.com/Doctorado-ML/mufs/workflows/CI/badge.svg)
 [![Codacy Badge](https://app.codacy.com/project/badge/Grade/66ad727eb13e4c7a8816db1e44d994a7)](https://www.codacy.com/gh/Doctorado-ML/mufs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mufs&utm_campaign=Badge_Grade)
 [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/mufs.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/mufs/context:python)
+[![PyPI version](https://badge.fury.io/py/MUFS.svg)](https://badge.fury.io/py/MUFS)
+![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-brightgreen)

 # MUFS

@@ -15,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
 ### Correlation-based Feature Selection

 Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
+
+### IWSS
+
+Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.
--- a/mufs/Selection.py
+++ b/mufs/Selection.py
@@ -26,7 +26,7 @@ class MUFS:
    """

    def __init__(self, max_features=None, discrete=True):
-        self._max_features = max_features
+        self.max_features = max_features
        self._discrete = discrete
        self.symmetrical_uncertainty = (
            Metrics.symmetrical_uncertainty
@@ -53,8 +53,10 @@ class MUFS:
        """
        self.X_ = X
        self.y_ = y
-        if self._max_features is None:
+        if self.max_features is None:
            self._max_features = X.shape[1]
+        else:
+            self._max_features = self.max_features
        self._result = None
        self._scores = []
        self._su_labels = None
@@ -105,7 +107,9 @@ class MUFS:

    def _compute_merit(self, features):
        """Compute the merit function for cfs algorithms
-
+           "Good feature subsets contain features highly correlated with
+           (predictive of) the class, yet uncorrelated with (not predictive of)
+           each other"
        Parameters
        ----------
        features : list
@@ -124,7 +128,7 @@ class MUFS:
        k = len(features)
        for pair in list(combinations(features, 2)):
            rff += self._compute_su_features(*pair)
-        return rcf / sqrt(k + (k ** 2 - k) * rff)
+        return k * rcf / sqrt(k + (k**2 - k) * rff)

    def cfs(self, X, y):
        """Correlation-based Feature Selection
@@ -264,3 +268,58 @@ class MUFS:
            list of scores of the features selected
        """
        return self._scores if self._fitted else []
+
+    def iwss(self, X, y, threshold):
+        """Incremental Wrapper Subset Selection
+
+        Parameters
+        ----------
+        X : np.array
+            array of features
+        y : np.array
+            vector of labels
+        threshold : float
+            threshold to select relevant features
+
+        Returns
+        -------
+        self
+            self
+        Raises
+        ------
+        ValueError
+            if the threshold is less than 0
+            or greater than 0.5
+
+        """
+        if threshold < 0 or threshold > 0.5:
+            raise ValueError(
+                "Threshold cannot be less than 0 or greater than 0.5"
+            )
+        self._initialize(X, y)
+        s_list = self._compute_su_labels()
+        feature_order = (-s_list).argsort()
+        features = feature_order.copy().tolist()
+        candidates = []
+        # Add first and second features to result
+        first_feature = features.pop(0)
+        candidates.append(first_feature)
+        self._scores.append(s_list[first_feature])
+        candidates.append(features.pop(0))
+        merit = self._compute_merit(candidates)
+        self._scores.append(merit)
+        for feature in features:
+            candidates.append(feature)
+            merit_new = self._compute_merit(candidates)
+            delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0
+            if merit_new > merit or delta < threshold:
+                if merit_new > merit:
+                    merit = merit_new
+                self._scores.append(merit_new)
+            else:
+                candidates.pop()
+                break
+            if len(candidates) == self._max_features:
+                break
+        self._result = candidates
+        return self
--- a/mufs/__init__.py
+++ b/mufs/__init__.py
@@ -1,6 +1,6 @@
 from .Selection import MUFS

-__version__ = "0.1.1"
+__version__ = "0.1.2"
 __author__ = "Ricardo Montañana Gómez"
 __author_email__ = "Ricardo.Montanana@alu.uclm.es"
 __copyright__ = "Copyright 2021, Ricardo Montañana Gómez"
--- a/mufs/tests/MUFS_test.py
+++ b/mufs/tests/MUFS_test.py
@@ -1,11 +1,14 @@
 import unittest
+import os
+import pandas as pd
+import numpy as np
 from mdlp import MDLP
 from sklearn.datasets import load_wine, load_iris

 from ..Selection import MUFS


-class MUFS_test(unittest.TestCase):
+class MUFSTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
@@ -31,33 +34,38 @@ class MUFS_test(unittest.TestCase):

    def test_csf_wine(self):
        mufs = MUFS()
-        expected = [6, 12, 9, 4, 10, 0]
-        self.assertListAlmostEqual(
+        expected = [6, 12, 9, 4, 10, 0, 7, 8]
+        self.assertListEqual(
            expected, mufs.cfs(self.X_w, self.y_w).get_results()
        )
        expected = [
            0.5218299405215557,
-            0.602513857132804,
-            0.4877384978817362,
-            0.3743688234383051,
-            0.28795671854246285,
-            0.2309165735173175,
+            1.205027714265608,
+            1.4632154936452084,
+            1.4974752937532203,
+            1.4397835927123144,
+            1.385499441103905,
+            1.340618857006277,
+            1.2989177695790775,
        ]
        self.assertListAlmostEqual(expected, mufs.get_scores())

    def test_csf_wine_cont(self):
        mufs = MUFS(discrete=False)
-        expected = [10, 6, 0, 2, 11, 9]
+        expected = [10, 6, 0, 2, 11, 9, 8, 1, 5]
        self.assertListEqual(
            expected, mufs.cfs(self.X_wc, self.y_w).get_results()
        )
        expected = [
            0.735264150416997,
-            0.8321684551546848,
-            0.7439915858469107,
-            0.6238883340158233,
-            0.513637402071709,
-            0.41596400981378984,
+            1.6643369103093697,
+            2.231974757540732,
+            2.4955533360632933,
+            2.568187010358545,
+            2.495784058882739,
+            2.4409992149141915,
+            2.3665143407182456,
+            2.280111788845658,
        ]
        self.assertListAlmostEqual(expected, mufs.get_scores())

@@ -67,23 +75,19 @@ class MUFS_test(unittest.TestCase):
        self.assertListAlmostEqual(
            expected, mufs.cfs(self.X_w, self.y_w).get_results()
        )
-        expected = [
-            0.5218299405215557,
-            0.602513857132804,
-            0.4877384978817362,
-        ]
+        expected = [0.5218299405215557, 1.205027714265608, 1.4632154936452084]
        self.assertListAlmostEqual(expected, mufs.get_scores())

    def test_csf_iris(self):
        mufs = MUFS()
        expected = [3, 2, 0, 1]
        computed = mufs.cfs(self.X_i, self.y_i).get_results()
-        self.assertListAlmostEqual(expected, computed)
+        self.assertListEqual(expected, computed)
        expected = [
            0.870521418179061,
-            0.8968651482682227,
-            0.5908278453318913,
-            0.40371971570693366,
+            1.7937302965364454,
+            1.7724835359956739,
+            1.6148788628277346,
        ]
        self.assertListAlmostEqual(expected, mufs.get_scores())

@@ -148,3 +152,54 @@ class MUFS_test(unittest.TestCase):
            0.44518278979085646,
        ]
        self.assertListAlmostEqual(expected, mufs.get_scores())
+
+    def test_iwss_wine(self):
+        mufs = MUFS()
+        expected = [6, 9, 12, 0, 11, 10, 5]
+        self.assertListEqual(
+            expected, mufs.iwss(self.X_w, self.y_w, 0.2).get_results()
+        )
+        expected = [
+            0.5218299405215557,
+            1.189564575222017,
+            1.4632154936452084,
+            1.428626297656075,
+            1.3384248731269246,
+            1.2869213430115078,
+            1.1949414936926785,
+        ]
+        self.assertListAlmostEqual(expected, mufs.get_scores())
+
+    def test_iwss_wine_max_features(self):
+        mufs = MUFS(max_features=3)
+        expected = [6, 9, 12]
+        self.assertListEqual(
+            expected, mufs.iwss(self.X_w, self.y_w, 0.4).get_results()
+        )
+        expected = [0.5218299405215557, 1.189564575222017, 1.4632154936452084]
+        self.assertListAlmostEqual(expected, mufs.get_scores())
+
+    def test_iwss_exception(self):
+        mufs = MUFS()
+        with self.assertRaises(ValueError):
+            mufs.iwss(self.X_w, self.y_w, 0.51)
+        with self.assertRaises(ValueError):
+            mufs.iwss(self.X_w, self.y_w, -0.01)
+
+    def test_iwss_better_merit_condition(self):
+        folder = os.path.dirname(os.path.abspath(__file__))
+        data = pd.read_csv(
+            os.path.join(folder, "balloons_R.dat"),
+            sep="\t",
+            index_col=0,
+        )
+        X = data.drop("clase", axis=1).to_numpy()
+        y = data["clase"].to_numpy()
+        mufs = MUFS()
+        expected = [0, 2, 3, 1]
+        self.assertListEqual(expected, mufs.iwss(X, y, 0.3).get_results())
+
+    def test_iwss_empty(self):
+        mufs = MUFS()
+        X = np.delete(self.X_i, [0, 1], 1)
+        self.assertListEqual(mufs.iwss(X, self.y_i, 0.3).get_results(), [1, 0])
--- a/mufs/tests/Metrics_test.py
+++ b/mufs/tests/Metrics_test.py
@@ -6,7 +6,7 @@ from mdlp import MDLP
 from ..Selection import Metrics


-class Metrics_test(unittest.TestCase):
+class MetricsTest(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
--- a/mufs/tests/__init__.py
+++ b/mufs/tests/__init__.py
@@ -1,4 +1,4 @@
-from .MUFS_test import MUFS_test
-from .Metrics_test import Metrics_test
+from .MUFS_test import MUFSTest
+from .Metrics_test import MetricsTest

-__all__ = ["MUFS_test", "Metrics_test"]
+__all__ = ["MUFSTest", "MetricsTest"]
--- a/mufs/tests/balloons_R.dat
+++ b/mufs/tests/balloons_R.dat
@@ -0,0 +1,17 @@
+	f1	f2	f3	f4	clase
+1	0.968246	-0.968246	0.968246	0.968246	1
+2	0.968246	-0.968246	0.968246	-0.968246	1
+3	0.968246	-0.968246	-0.968246	0.968246	1
+4	0.968246	-0.968246	-0.968246	-0.968246	1
+5	0.968246	0.968246	0.968246	0.968246	1
+6	0.968246	0.968246	0.968246	-0.968246	0
+7	0.968246	0.968246	-0.968246	0.968246	0
+8	0.968246	0.968246	-0.968246	-0.968246	0
+9	-0.968246	-0.968246	0.968246	0.968246	1
+10	-0.968246	-0.968246	0.968246	-0.968246	0
+11	-0.968246	-0.968246	-0.968246	0.968246	0
+12	-0.968246	-0.968246	-0.968246	-0.968246	0
+13	-0.968246	0.968246	0.968246	0.968246	1
+14	-0.968246	0.968246	0.968246	-0.968246	0
+15	-0.968246	0.968246	-0.968246	0.968246	0
+16	-0.968246	0.968246	-0.968246	-0.968246	0
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -0,0 +1,3 @@
+-r production.txt
+mdlp
+pandas
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -1,2 +1 @@
 scikit-learn>0.24
-mdlp
--- a/sample.py
+++ b/sample.py
@@ -1,4 +1,5 @@
 import warnings
+import time
 from mufs import MUFS
 from mufs.Metrics import Metrics
 from stree import Stree
@@ -26,16 +27,26 @@ for i in range(n):
 # Classification
 warnings.filterwarnings("ignore")
 print("CFS")
+now = time.time()
 cfs_f = mufsc.cfs(X, y).get_results()
-print(cfs_f)
+time_cfs = time.time() - now
+print(cfs_f, "items: ", len(cfs_f), f"time: {time_cfs:.3f} seconds")
 print("FCBF")
-fcfb_f = mufsc.fcbf(X, y, 5e-2).get_results()
-print(fcfb_f, len(fcfb_f))
+now = time.time()
+fcbf_f = mufsc.fcbf(X, y, 0.07).get_results()
+time_fcbf = time.time() - now
+print(fcbf_f, "items: ", len(fcbf_f), f"time: {time_fcbf:.3f} seconds")
+now = time.time()
+print("IWSS")
+iwss_f = mufsc.iwss(X, y, 0.5).get_results()
+time_iwss = time.time() - now
+print(iwss_f, "items: ", len(iwss_f), f"time: {time_iwss:.3f} seconds")
 print("X.shape=", X.shape)
 clf = Stree(random_state=0)
 print("Accuracy whole dataset", clf.fit(X, y).score(X, y))
 clf = Stree(random_state=0)
 print("Accuracy cfs", clf.fit(X[:, cfs_f], y).score(X[:, cfs_f], y))
 clf = Stree(random_state=0)
-subf = fcfb_f
-print("Accuracy fcfb", clf.fit(X[:, subf], y).score(X[:, subf], y))
+print("Accuracy fcfb", clf.fit(X[:, fcbf_f], y).score(X[:, fcbf_f], y))
+clf = Stree(random_state=0)
+print("Accuracy iwss", clf.fit(X[:, iwss_f], y).score(X[:, iwss_f], y))
Author	SHA1	Message	Date
Ricardo Montañana	d0f1cc5979	fix format issue	2022-03-10 14:32:33 +01:00
Ricardo Montañana	b958bccef6	Fix cfs merit formula	2022-03-10 12:56:47 +01:00
Ricardo Montañana	a0f172ac13	Update version number and sample	2021-10-28 14:30:28 +02:00
Ricardo Montañana Gómez	cfb37d2f6c	Merge pull request #3 from Doctorado-ML/Add-IWSS-(#2 ) Add iwss (#2)	2021-10-28 12:39:57 +02:00
Ricardo Montañana	5d1720c9ae	Update ci file	2021-10-28 12:22:21 +02:00
Ricardo Montañana	1c5f1977e5	Complete iwss based implementation (#2 )	2021-10-28 11:55:40 +02:00
Ricardo Montañana	27f8a370c5	Begin IWSS implementation Update requirements Create requirements for dev	2021-10-10 19:06:57 +02:00
Ricardo Montañana Gómez	9d74bc8a70	Add package version badge to README	2021-08-17 12:02:15 +02:00