Merge pull request #3 from Doctorado-ML/Add-IWSS-(#2)

Add iwss (#2)
This commit is contained in:
Ricardo Montañana Gómez
2021-10-28 12:39:57 +02:00
committed by GitHub
9 changed files with 140 additions and 12 deletions

View File

@@ -27,7 +27,7 @@ jobs:
pip install -q cython pip install -q cython
pip install -q numpy pip install -q numpy
pip install -q git+git://github.com/doctorado-ml/mdlp pip install -q git+git://github.com/doctorado-ml/mdlp
pip install -q -r requirements.txt pip install -q -r requirements/dev.txt
pip install -q --upgrade codecov coverage black flake8 codacy-coverage pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Lint - name: Lint
run: | run: |

View File

@@ -17,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
### Correlation-based Feature Selection ### Correlation-based Feature Selection
Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'. Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
### IWSS
Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.

View File

@@ -26,7 +26,7 @@ class MUFS:
""" """
def __init__(self, max_features=None, discrete=True): def __init__(self, max_features=None, discrete=True):
self._max_features = max_features self.max_features = max_features
self._discrete = discrete self._discrete = discrete
self.symmetrical_uncertainty = ( self.symmetrical_uncertainty = (
Metrics.symmetrical_uncertainty Metrics.symmetrical_uncertainty
@@ -53,8 +53,10 @@ class MUFS:
""" """
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
if self._max_features is None: if self.max_features is None:
self._max_features = X.shape[1] self._max_features = X.shape[1]
else:
self._max_features = self.max_features
self._result = None self._result = None
self._scores = [] self._scores = []
self._su_labels = None self._su_labels = None
@@ -105,7 +107,9 @@ class MUFS:
def _compute_merit(self, features): def _compute_merit(self, features):
"""Compute the merit function for cfs algorithms """Compute the merit function for cfs algorithms
"Good feature subsets contain features highly correlated with
(predictive of) the class, yet uncorrelated with (not predictive of)
each other"
Parameters Parameters
---------- ----------
features : list features : list
@@ -264,3 +268,58 @@ class MUFS:
list of scores of the features selected list of scores of the features selected
""" """
return self._scores if self._fitted else [] return self._scores if self._fitted else []
def iwss(self, X, y, threshold):
    """Incremental Wrapper Subset Selection

    Features are ranked by decreasing score against the labels (as
    returned by ``_compute_su_labels``) and added one at a time. A feature
    is kept when it improves the merit of the subset or when the relative
    change in merit stays below ``threshold``; the first feature that
    fails both conditions stops the search.

    Parameters
    ----------
    X : np.array
        array of features
    y : np.array
        vector of labels
    threshold : float
        maximum relative change in merit (between 0 and 0.5) tolerated
        when a candidate feature does not improve the merit

    Returns
    -------
    self
        self

    Raises
    ------
    ValueError
        if the threshold is less than 0 or greater than 0.5
    """
    if threshold < 0 or threshold > 0.5:
        raise ValueError(
            "Threshold cannot be less than 0 or greater than 0.5"
        )
    self._initialize(X, y)
    s_list = self._compute_su_labels()
    # Best-scored feature first
    features = (-s_list).argsort().tolist()
    limit = self._max_features
    candidates = []
    if features and limit >= 1:
        # The first feature is scored by its own relation with the labels
        first_feature = features.pop(0)
        candidates.append(first_feature)
        self._scores.append(s_list[first_feature])
    # A second feature is only added when one exists and the limit allows
    # it, so single-feature data and max_features < 2 are honoured
    if features and limit >= 2:
        candidates.append(features.pop(0))
        merit = self._compute_merit(candidates)
        self._scores.append(merit)
        for feature in features:
            if len(candidates) >= limit:
                break
            candidates.append(feature)
            merit_new = self._compute_merit(candidates)
            delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0
            if merit_new > merit or delta < threshold:
                # The reference merit only moves when it is improved
                if merit_new > merit:
                    merit = merit_new
                self._scores.append(merit_new)
            else:
                # Reject the feature and stop the incremental search
                candidates.pop()
                break
    self._result = candidates
    return self

View File

@@ -1,11 +1,14 @@
import unittest import unittest
import os
import pandas as pd
import numpy as np
from mdlp import MDLP from mdlp import MDLP
from sklearn.datasets import load_wine, load_iris from sklearn.datasets import load_wine, load_iris
from ..Selection import MUFS from ..Selection import MUFS
class MUFS_test(unittest.TestCase): class MUFSTest(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
mdlp = MDLP(random_state=1) mdlp = MDLP(random_state=1)
@@ -32,7 +35,7 @@ class MUFS_test(unittest.TestCase):
def test_csf_wine(self): def test_csf_wine(self):
mufs = MUFS() mufs = MUFS()
expected = [6, 12, 9, 4, 10, 0] expected = [6, 12, 9, 4, 10, 0]
self.assertListAlmostEqual( self.assertListEqual(
expected, mufs.cfs(self.X_w, self.y_w).get_results() expected, mufs.cfs(self.X_w, self.y_w).get_results()
) )
expected = [ expected = [
@@ -78,7 +81,7 @@ class MUFS_test(unittest.TestCase):
mufs = MUFS() mufs = MUFS()
expected = [3, 2, 0, 1] expected = [3, 2, 0, 1]
computed = mufs.cfs(self.X_i, self.y_i).get_results() computed = mufs.cfs(self.X_i, self.y_i).get_results()
self.assertListAlmostEqual(expected, computed) self.assertListEqual(expected, computed)
expected = [ expected = [
0.870521418179061, 0.870521418179061,
0.8968651482682227, 0.8968651482682227,
@@ -148,3 +151,46 @@ class MUFS_test(unittest.TestCase):
0.44518278979085646, 0.44518278979085646,
] ]
self.assertListAlmostEqual(expected, mufs.get_scores()) self.assertListAlmostEqual(expected, mufs.get_scores())
def test_iwss_wine(self):
    """Check the features and scores IWSS returns for X_w/y_w at 0.2."""
    selector = MUFS()
    selected = selector.iwss(self.X_w, self.y_w, 0.2).get_results()
    self.assertListEqual([6, 9, 12], selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_wine_max_features(self):
    """Check IWSS results on X_w/y_w with max_features=3, threshold 0.4."""
    selector = MUFS(max_features=3)
    selected = selector.iwss(self.X_w, self.y_w, 0.4).get_results()
    self.assertListEqual([6, 9, 12], selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_exception(self):
    """Thresholds above 0.5 or below 0 must raise ValueError."""
    selector = MUFS()
    # One value just above the upper bound, one just below the lower bound
    for bad_threshold in (0.51, -0.01):
        with self.assertRaises(ValueError):
            selector.iwss(self.X_w, self.y_w, bad_threshold)
def test_iwss_better_merit_condition(self):
    """Check the IWSS feature order on the balloons_R.dat dataset."""
    # The dataset sits next to this test module
    here = os.path.dirname(os.path.abspath(__file__))
    dataset_path = os.path.join(here, "balloons_R.dat")
    frame = pd.read_csv(dataset_path, sep="\t", index_col=0)
    # "clase" holds the labels; the remaining columns are the features
    X = frame.drop("clase", axis=1).to_numpy()
    y = frame["clase"].to_numpy()
    selected = MUFS().iwss(X, y, 0.3).get_results()
    self.assertListEqual([0, 2, 3, 1], selected)
def test_iwss_empty(self):
    """Check IWSS on X_i after dropping its first two columns."""
    # Remove columns 0 and 1 so the loop after the first two picks
    # presumably has nothing left to iterate over
    reduced = np.delete(self.X_i, [0, 1], axis=1)
    selector = MUFS()
    selected = selector.iwss(reduced, self.y_i, 0.3).get_results()
    self.assertListEqual(selected, [1, 0])

View File

@@ -6,7 +6,7 @@ from mdlp import MDLP
from ..Selection import Metrics from ..Selection import Metrics
class Metrics_test(unittest.TestCase): class MetricsTest(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
mdlp = MDLP(random_state=1) mdlp = MDLP(random_state=1)

View File

@@ -1,4 +1,4 @@
from .MUFS_test import MUFS_test from .MUFS_test import MUFSTest
from .Metrics_test import Metrics_test from .Metrics_test import MetricsTest
__all__ = ["MUFS_test", "Metrics_test"] __all__ = ["MUFSTest", "MetricsTest"]

17
mufs/tests/balloons_R.dat Executable file
View File

@@ -0,0 +1,17 @@
f1 f2 f3 f4 clase
1 0.968246 -0.968246 0.968246 0.968246 1
2 0.968246 -0.968246 0.968246 -0.968246 1
3 0.968246 -0.968246 -0.968246 0.968246 1
4 0.968246 -0.968246 -0.968246 -0.968246 1
5 0.968246 0.968246 0.968246 0.968246 1
6 0.968246 0.968246 0.968246 -0.968246 0
7 0.968246 0.968246 -0.968246 0.968246 0
8 0.968246 0.968246 -0.968246 -0.968246 0
9 -0.968246 -0.968246 0.968246 0.968246 1
10 -0.968246 -0.968246 0.968246 -0.968246 0
11 -0.968246 -0.968246 -0.968246 0.968246 0
12 -0.968246 -0.968246 -0.968246 -0.968246 0
13 -0.968246 0.968246 0.968246 0.968246 1
14 -0.968246 0.968246 0.968246 -0.968246 0
15 -0.968246 0.968246 -0.968246 0.968246 0
16 -0.968246 0.968246 -0.968246 -0.968246 0

3
requirements/dev.txt Normal file
View File

@@ -0,0 +1,3 @@
-r production.txt
mdlp
pandas

View File

@@ -1,2 +1 @@
scikit-learn>0.24 scikit-learn>0.24
mdlp