7 Commits

Author SHA1 Message Date
a0f172ac13 Update version number and sample 2021-10-28 14:30:28 +02:00
Ricardo Montañana Gómez
cfb37d2f6c Merge pull request #3 from Doctorado-ML/Add-IWSS-(#2)
Add iwss (#2)
2021-10-28 12:39:57 +02:00
5d1720c9ae Update ci file 2021-10-28 12:22:21 +02:00
1c5f1977e5 Complete iwss based implementation (#2) 2021-10-28 11:55:40 +02:00
27f8a370c5 Begin IWSS implementation
Update requirements
Create requirements for dev
2021-10-10 19:06:57 +02:00
Ricardo Montañana Gómez
9d74bc8a70 Add package version badge to README 2021-08-17 12:02:15 +02:00
Ricardo Montañana Gómez
ba7dc3eeb3 Merge pull request #1 from Doctorado-ML/updateCI
Update ci file
2021-08-02 18:46:25 +02:00
11 changed files with 159 additions and 18 deletions

View File

@@ -27,7 +27,7 @@ jobs:
pip install -q cython pip install -q cython
pip install -q numpy pip install -q numpy
pip install -q git+git://github.com/doctorado-ml/mdlp pip install -q git+git://github.com/doctorado-ml/mdlp
pip install -q -r requirements.txt pip install -q -r requirements/dev.txt
pip install -q --upgrade codecov coverage black flake8 codacy-coverage pip install -q --upgrade codecov coverage black flake8 codacy-coverage
- name: Lint - name: Lint
run: | run: |

View File

@@ -1,6 +1,8 @@
![CI](https://github.com/Doctorado-ML/mufs/workflows/CI/badge.svg) ![CI](https://github.com/Doctorado-ML/mufs/workflows/CI/badge.svg)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/66ad727eb13e4c7a8816db1e44d994a7)](https://www.codacy.com/gh/Doctorado-ML/mufs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mufs&utm_campaign=Badge_Grade) [![Codacy Badge](https://app.codacy.com/project/badge/Grade/66ad727eb13e4c7a8816db1e44d994a7)](https://www.codacy.com/gh/Doctorado-ML/mufs/dashboard?utm_source=github.com&utm_medium=referral&utm_content=Doctorado-ML/mufs&utm_campaign=Badge_Grade)
[![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/mufs.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/mufs/context:python) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/Doctorado-ML/mufs.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/Doctorado-ML/mufs/context:python)
[![PyPI version](https://badge.fury.io/py/MUFS.svg)](https://badge.fury.io/py/MUFS)
![https://img.shields.io/badge/python-3.8%2B-blue](https://img.shields.io/badge/python-3.8%2B-brightgreen)
# MUFS # MUFS
@@ -15,3 +17,7 @@ Proceedings, Twentieth International Conference on Machine Learning. ed. / T. Fa
### Correlation-based Feature Selection ### Correlation-based Feature Selection
Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'. Hall, M. A. (1999), 'Correlation-based Feature Selection for Machine Learning'.
### IWSS
Based on: P. Bermejo, J. A. Gamez and J. M. Puerta, "Incremental Wrapper-based subset Selection with replacement: An advantageous alternative to sequential forward selection," 2009 IEEE Symposium on Computational Intelligence and Data Mining, 2009, pp. 367-374, doi: 10.1109/CIDM.2009.4938673.

View File

@@ -26,7 +26,7 @@ class MUFS:
""" """
def __init__(self, max_features=None, discrete=True): def __init__(self, max_features=None, discrete=True):
self._max_features = max_features self.max_features = max_features
self._discrete = discrete self._discrete = discrete
self.symmetrical_uncertainty = ( self.symmetrical_uncertainty = (
Metrics.symmetrical_uncertainty Metrics.symmetrical_uncertainty
@@ -53,8 +53,10 @@ class MUFS:
""" """
self.X_ = X self.X_ = X
self.y_ = y self.y_ = y
if self._max_features is None: if self.max_features is None:
self._max_features = X.shape[1] self._max_features = X.shape[1]
else:
self._max_features = self.max_features
self._result = None self._result = None
self._scores = [] self._scores = []
self._su_labels = None self._su_labels = None
@@ -105,7 +107,9 @@ class MUFS:
def _compute_merit(self, features): def _compute_merit(self, features):
"""Compute the merit function for cfs algorithms """Compute the merit function for cfs algorithms
"Good feature subsets contain features highly correlated with
(predictive of) the class, yet uncorrelated with (not predictive of)
each other"
Parameters Parameters
---------- ----------
features : list features : list
@@ -264,3 +268,58 @@ class MUFS:
list of scores of the features selected list of scores of the features selected
""" """
return self._scores if self._fitted else [] return self._scores if self._fitted else []
def iwss(self, X, y, threshold):
    """Incremental Wrapper Subset Selection

    Features are visited in descending order of the values returned by
    ``_compute_su_labels``. The best ranked feature is always selected;
    every following feature is kept if adding it improves the merit of
    the subset or if the relative change of the merit is smaller than
    ``threshold``. The search stops at the first rejected feature or as
    soon as ``max_features`` features have been selected.

    Parameters
    ----------
    X : np.array
        array of features
    y : np.array
        vector of labels
    threshold : float
        maximum relative change of the merit tolerated to keep a feature
        that does not improve the merit, must be in the range [0, 0.5]

    Returns
    -------
    self
        self

    Raises
    ------
    ValueError
        if the threshold is less than 0 or greater than 0.5
    """
    if threshold < 0 or threshold > 0.5:
        raise ValueError(
            "Threshold cannot be less than 0 or greater than 0.5"
        )
    self._initialize(X, y)
    s_list = self._compute_su_labels()
    # Negate the scores so that argsort yields the best feature first
    features = (-s_list).argsort().tolist()
    candidates = []
    # The best ranked feature is scored with its own ranking value
    # (guarded so that an input without features yields an empty result)
    if features:
        first_feature = features.pop(0)
        candidates.append(first_feature)
        self._scores.append(s_list[first_feature])
    # A second feature is needed to have a reference merit to compare
    # with, but only if the limit of features allows it
    if features and len(candidates) < self._max_features:
        candidates.append(features.pop(0))
        merit = self._compute_merit(candidates)
        self._scores.append(merit)
        for feature in features:
            # Check the limit before adding so that small values of
            # max_features (1 or 2) are honoured as well
            if len(candidates) >= self._max_features:
                break
            candidates.append(feature)
            merit_new = self._compute_merit(candidates)
            delta = abs(merit - merit_new) / merit if merit != 0.0 else 0.0
            if merit_new > merit or delta < threshold:
                # The reference merit only moves when it is improved
                if merit_new > merit:
                    merit = merit_new
                self._scores.append(merit_new)
            else:
                # First rejected feature ends the search
                candidates.pop()
                break
    self._result = candidates
    return self

View File

@@ -1,6 +1,6 @@
from .Selection import MUFS from .Selection import MUFS
__version__ = "0.1.1" __version__ = "0.1.2"
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
__author_email__ = "Ricardo.Montanana@alu.uclm.es" __author_email__ = "Ricardo.Montanana@alu.uclm.es"
__copyright__ = "Copyright 2021, Ricardo Montañana Gómez" __copyright__ = "Copyright 2021, Ricardo Montañana Gómez"

View File

@@ -1,11 +1,14 @@
import unittest import unittest
import os
import pandas as pd
import numpy as np
from mdlp import MDLP from mdlp import MDLP
from sklearn.datasets import load_wine, load_iris from sklearn.datasets import load_wine, load_iris
from ..Selection import MUFS from ..Selection import MUFS
class MUFS_test(unittest.TestCase): class MUFSTest(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
mdlp = MDLP(random_state=1) mdlp = MDLP(random_state=1)
@@ -32,7 +35,7 @@ class MUFS_test(unittest.TestCase):
def test_csf_wine(self): def test_csf_wine(self):
mufs = MUFS() mufs = MUFS()
expected = [6, 12, 9, 4, 10, 0] expected = [6, 12, 9, 4, 10, 0]
self.assertListAlmostEqual( self.assertListEqual(
expected, mufs.cfs(self.X_w, self.y_w).get_results() expected, mufs.cfs(self.X_w, self.y_w).get_results()
) )
expected = [ expected = [
@@ -78,7 +81,7 @@ class MUFS_test(unittest.TestCase):
mufs = MUFS() mufs = MUFS()
expected = [3, 2, 0, 1] expected = [3, 2, 0, 1]
computed = mufs.cfs(self.X_i, self.y_i).get_results() computed = mufs.cfs(self.X_i, self.y_i).get_results()
self.assertListAlmostEqual(expected, computed) self.assertListEqual(expected, computed)
expected = [ expected = [
0.870521418179061, 0.870521418179061,
0.8968651482682227, 0.8968651482682227,
@@ -148,3 +151,46 @@ class MUFS_test(unittest.TestCase):
0.44518278979085646, 0.44518278979085646,
] ]
self.assertListAlmostEqual(expected, mufs.get_scores()) self.assertListAlmostEqual(expected, mufs.get_scores())
def test_iwss_wine(self):
    # IWSS with threshold 0.2 on the X_w / y_w fixture: check the selected
    # feature indices first, then the score recorded for each of them.
    selector = MUFS()
    expected_features = [6, 9, 12]
    selected = selector.iwss(self.X_w, self.y_w, 0.2).get_results()
    self.assertListEqual(expected_features, selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_wine_max_features(self):
    # Same fixture as the plain wine test but with the selector limited to
    # three features and a looser threshold (0.4).
    selector = MUFS(max_features=3)
    expected_features = [6, 9, 12]
    selected = selector.iwss(self.X_w, self.y_w, 0.4).get_results()
    self.assertListEqual(expected_features, selected)
    expected_scores = [
        0.5218299405215557,
        0.5947822876110085,
        0.4877384978817362,
    ]
    self.assertListAlmostEqual(expected_scores, selector.get_scores())
def test_iwss_exception(self):
    # One value above the upper bound and one below the lower bound; both
    # calls are made on the same selector and must raise ValueError.
    selector = MUFS()
    for invalid_threshold in (0.51, -0.01):
        with self.assertRaises(ValueError):
            selector.iwss(self.X_w, self.y_w, invalid_threshold)
def test_iwss_better_merit_condition(self):
    # The dataset file lives next to this test module; it is tab separated
    # and its first column is used as the index.
    here = os.path.dirname(os.path.abspath(__file__))
    dataset_path = os.path.join(here, "balloons_R.dat")
    frame = pd.read_csv(dataset_path, sep="\t", index_col=0)
    # "clase" holds the labels, every other column is a feature
    features = frame.drop("clase", axis=1).to_numpy()
    labels = frame["clase"].to_numpy()
    selector = MUFS()
    expected = [0, 2, 3, 1]
    computed = selector.iwss(features, labels, 0.3).get_results()
    self.assertListEqual(expected, computed)
def test_iwss_empty(self):
    # Drop columns 0 and 1 of X_i so that only two features remain; the
    # loop over the remaining features then has nothing left to visit.
    selector = MUFS()
    reduced = np.delete(self.X_i, [0, 1], axis=1)
    computed = selector.iwss(reduced, self.y_i, 0.3).get_results()
    self.assertListEqual(computed, [1, 0])

View File

@@ -6,7 +6,7 @@ from mdlp import MDLP
from ..Selection import Metrics from ..Selection import Metrics
class Metrics_test(unittest.TestCase): class MetricsTest(unittest.TestCase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
mdlp = MDLP(random_state=1) mdlp = MDLP(random_state=1)

View File

@@ -1,4 +1,4 @@
from .MUFS_test import MUFS_test from .MUFS_test import MUFSTest
from .Metrics_test import Metrics_test from .Metrics_test import MetricsTest
__all__ = ["MUFS_test", "Metrics_test"] __all__ = ["MUFSTest", "MetricsTest"]

17
mufs/tests/balloons_R.dat Executable file
View File

@@ -0,0 +1,17 @@
f1 f2 f3 f4 clase
1 0.968246 -0.968246 0.968246 0.968246 1
2 0.968246 -0.968246 0.968246 -0.968246 1
3 0.968246 -0.968246 -0.968246 0.968246 1
4 0.968246 -0.968246 -0.968246 -0.968246 1
5 0.968246 0.968246 0.968246 0.968246 1
6 0.968246 0.968246 0.968246 -0.968246 0
7 0.968246 0.968246 -0.968246 0.968246 0
8 0.968246 0.968246 -0.968246 -0.968246 0
9 -0.968246 -0.968246 0.968246 0.968246 1
10 -0.968246 -0.968246 0.968246 -0.968246 0
11 -0.968246 -0.968246 -0.968246 0.968246 0
12 -0.968246 -0.968246 -0.968246 -0.968246 0
13 -0.968246 0.968246 0.968246 0.968246 1
14 -0.968246 0.968246 0.968246 -0.968246 0
15 -0.968246 0.968246 -0.968246 0.968246 0
16 -0.968246 0.968246 -0.968246 -0.968246 0

3
requirements/dev.txt Normal file
View File

@@ -0,0 +1,3 @@
-r production.txt
mdlp
pandas

View File

@@ -1,2 +1 @@
scikit-learn>0.24 scikit-learn>0.24
mdlp

View File

@@ -1,4 +1,5 @@
import warnings import warnings
import time
from mufs import MUFS from mufs import MUFS
from mufs.Metrics import Metrics from mufs.Metrics import Metrics
from stree import Stree from stree import Stree
@@ -26,16 +27,26 @@ for i in range(n):
# Classification # Classification
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
print("CFS") print("CFS")
now = time.time()
cfs_f = mufsc.cfs(X, y).get_results() cfs_f = mufsc.cfs(X, y).get_results()
print(cfs_f) time_cfs = time.time() - now
print(cfs_f, "items: ", len(cfs_f), f"time: {time_cfs:.3f} seconds")
print("FCBF") print("FCBF")
fcfb_f = mufsc.fcbf(X, y, 5e-2).get_results() now = time.time()
print(fcfb_f, len(fcfb_f)) fcbf_f = mufsc.fcbf(X, y, 0.07).get_results()
time_fcbf = time.time() - now
print(fcbf_f, "items: ", len(fcbf_f), f"time: {time_fcbf:.3f} seconds")
now = time.time()
print("IWSS")
iwss_f = mufsc.iwss(X, y, 0.5).get_results()
time_iwss = time.time() - now
print(iwss_f, "items: ", len(iwss_f), f"time: {time_iwss:.3f} seconds")
print("X.shape=", X.shape) print("X.shape=", X.shape)
clf = Stree(random_state=0) clf = Stree(random_state=0)
print("Accuracy whole dataset", clf.fit(X, y).score(X, y)) print("Accuracy whole dataset", clf.fit(X, y).score(X, y))
clf = Stree(random_state=0) clf = Stree(random_state=0)
print("Accuracy cfs", clf.fit(X[:, cfs_f], y).score(X[:, cfs_f], y)) print("Accuracy cfs", clf.fit(X[:, cfs_f], y).score(X[:, cfs_f], y))
clf = Stree(random_state=0) clf = Stree(random_state=0)
subf = fcfb_f print("Accuracy fcfb", clf.fit(X[:, fcbf_f], y).score(X[:, fcbf_f], y))
print("Accuracy fcfb", clf.fit(X[:, subf], y).score(X[:, subf], y)) clf = Stree(random_state=0)
print("Accuracy iwss", clf.fit(X[:, iwss_f], y).score(X[:, iwss_f], y))