Update comments and CI workflow

2025-08-18 17:15:52 +00:00 · 2021-05-25 13:56:21 +02:00
parent 17d44080f5
commit fdfdf6fd00
3 changed files with 102 additions and 14 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,9 +28,9 @@ jobs:
          pip install -q --upgrade codecov coverage black flake8 codacy-coverage
      - name: Lint
        run: |
-          black --check --diff stree
+          black --check --diff mfs
-          flake8 --count stree
+          flake8 --count mfs
-      - name: Tests
+      - name: Tests & coverage
        run: |
-          coverage run -m unittest -v stree.tests
+          coverage run -m unittest -v mfs.tests
-          coverage xml
+          coverage report -m --fail-under=100
--- a/README.md
+++ b/README.md
@@ -1,13 +1,15 @@
 ![CI](https://github.com/Doctorado-ML/mfs/workflows/CI/badge.svg)
 # MFS
 ## Multi Feature Selection
-Compute Fast Fast Correlation Based Filter
+### Fast Correlation-Based Filter
 Yu, L. and Liu, H.; Feature Selection for High-Dimensional Data: A Fast
 Correlation Based Filter Solution, Proc. 20th Intl. Conf. Mach. Learn.
 (ICML-2003)
-and
+Feature Selection for High-Dimensional Data : A Fast Correlation-Based Filter Solution. / Yu, Lei; Liu, Huan.
-Correlated Feature Selection as in "Correlation-based Feature Selection for
+Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fawcett; N. Mishra. 2003. p. 856-863 (Proceedings, Twentieth International Conference on Machine Learning; Vol. 2).
-Machine Learning" by Mark Andrew Hall
+
 ### Correlation-based Feature Selection
 Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -115,12 +115,23 @@ class MFS:
        self._initialize()
    def _initialize(self):
        """Reset the internal state to support multiple calls using the same
        object
        """
        # value handed back by get_results()
        self._result = None
        # value handed back by get_scores()
        self._scores = []
        # None tells _compute_su_labels that the vector has to be computed
        self._su_labels = None
        # cache used by _compute_su_features, keyed by (feature_a, feature_b)
        self._su_features = {}
    def _compute_su_labels(self):
        """Compute symmetrical uncertainty between each feature of the dataset
        and the labels and store it to use in future calls
        Returns
        -------
        list
            vector with sym. un. of every feature and the labels
        """
        if self._su_labels is None:
            num_features = self.X_.shape[1]
            self._su_labels = np.zeros(num_features)
@@ -131,6 +142,21 @@ class MFS:
        return self._su_labels
    def _compute_su_features(self, feature_a, feature_b):
        """Compute symmetrical uncertainty between two features and stores it
        to use in future calls
        Parameters
        ----------
        feature_a : int
            index of the first feature
        feature_b : int
            index of the second feature
        Returns
        -------
        float
            The symmetrical uncertainty of the two features
        """
        if (feature_a, feature_b) not in self._su_features:
            self._su_features[
                (feature_a, feature_b)
@@ -140,6 +166,18 @@ class MFS:
        return self._su_features[(feature_a, feature_b)]
    def _compute_merit(self, features):
        """Compute the merit function for cfs algorithms
        Parameters
        ----------
        features : list
            list of features to include in the computation
        Returns
        -------
        float
            The merit of the feature set passed
        """
        rcf = self._su_labels[features].sum()
        rff = 0.0
        k = len(features)
@@ -148,7 +186,8 @@ class MFS:
        return rcf / sqrt(k + (k ** 2 - k) * rff)
    def cfs(self, X, y):
-        """CFS forward best first heuristic search
+        """Correlation-based Feature Selection
        with a forward best first heuristic search
        Parameters
        ----------
@@ -156,6 +195,11 @@ class MFS:
            array of features
        y : np.array
            vector of labels
        Returns
        -------
        self
            self
        """
        self._initialize()
        self.X_ = X
@@ -186,6 +230,13 @@ class MFS:
                # Force leaving the loop
                continue_condition = False
            if len(self._scores) >= 5:
                """
                "To prevent the best first search from exploring the entire
                feature subset search space, a stopping criterion is imposed.
                The search will terminate if five consecutive fully expanded
                subsets show no improvement over the current best subset."
                as stated in Mark A. Hall Thesis
                """
                item_ant = -1
                for item in self._scores[-5:]:
                    if item_ant == -1:
@@ -200,8 +251,29 @@ class MFS:
        return self
    def fcbs(self, X, y, threshold):
        """Fast Correlation-Based Filter
        Parameters
        ----------
        X : np.array
            array of features
        y : np.array
            vector of labels
        threshold : float
            threshold to select relevant features
        Returns
        -------
        self
            self
        Raises
        ------
        ValueError
            if the threshold is less than a selected value of 1e-4
        """
        if threshold < 1e-4:
-            raise ValueError("Threshold cannot be less than 1e4")
+            raise ValueError("Threshold cannot be less than 1e-4")
        self._initialize()
        self.X_ = X
        self.y_ = y
@@ -229,7 +301,21 @@ class MFS:
        return self
    def get_results(self):
        """Return the results of the algorithm applied if any
        Returns
        -------
        list
            list of features indices selected, or None while the stored
            result is still the value set by _initialize
        """
        return self._result
    def get_scores(self):
        """Return the scores computed for the features selected
        Returns
        -------
        list
            list of scores of the features selected; this is the internal
            list itself (not a copy) and it is empty right after
            _initialize
        """
        return self._scores