diff --git a/mfs/Metrics.py b/mfs/Metrics.py
index 778ac30..39850d9 100755
--- a/mfs/Metrics.py
+++ b/mfs/Metrics.py
@@ -2,11 +2,9 @@
 from math import log
 import numpy as np
 from scipy.special import gamma, psi
-from sklearn.neighbors import BallTree, KDTree, NearestNeighbors
+from sklearn.neighbors import NearestNeighbors
 from sklearn.feature_selection._mutual_info import _compute_mi
 
-# from .entropy_estimators import mi, entropy as c_entropy
-
 
 class Metrics:
     @staticmethod
@@ -65,6 +63,10 @@ class Metrics:
         and:
         Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
         information. Phys Rev E 69(6 Pt 2):066138.
+
+        Differential entropy can be negative
+        https://stats.stackexchange.com/questions/73881/
+        when-is-the-differential-entropy-negative
         """
         if x.ndim == 1:
             x = x.reshape(-1, 1)
@@ -131,7 +133,10 @@ class Metrics:
         return (
             2.0
             * Metrics.information_gain_cont(x, y)
-            / (Metrics.differential_entropy(x) + Metrics.entropy(y))
+            / (
+                Metrics.differential_entropy(x, k=len(x) - 1)
+                + Metrics.entropy(y)
+            )
         )
 
     @staticmethod
diff --git a/mfs/Selection.py b/mfs/Selection.py
index b6aa02d..0733684 100755
--- a/mfs/Selection.py
+++ b/mfs/Selection.py
@@ -20,6 +20,9 @@ class MFS:
     ----------
     max_features: int
         The maximum number of features to return
+    discrete: boolean
+        Whether the features are continuous or discrete. Labels are
+        always assumed to be discrete.
     """
 
     def __init__(self, max_features=None, discrete=True):
diff --git a/mfs/k.py b/mfs/k.py
new file mode 100644
index 0000000..8d262d1
--- /dev/null
+++ b/mfs/k.py
@@ -0,0 +1,22 @@
+from sklearn.datasets import load_wine
+from mfs import MFS
+from mfs.Metrics import Metrics
+
+mfsc = MFS(discrete=False)
+mfsd = MFS(discrete=True)
+X, y = load_wine(return_X_y=True)
+m, n = X.shape
+
+print("* Differential entropy in X")
+for i in range(n):
+    print(i, Metrics.differential_entropy(X[:, i], k=10))
+
+print("* Information Gain")
+print("- Discrete features")
+print(Metrics.information_gain(X, y))
+for i in range(n):
+    print(i, Metrics.information_gain(X[:, i], y))
+print("- Continuous features")
+# print(Metrics.information_gain_cont(X, y))
+for i in range(n):
+    print(i, Metrics.information_gain_cont(X[:, i], y))
diff --git a/mfs/tests/Metrics_test.py b/mfs/tests/Metrics_test.py
index bf8d187..e1bd543 100755
--- a/mfs/tests/Metrics_test.py
+++ b/mfs/tests/Metrics_test.py
@@ -1,6 +1,7 @@
 import unittest
 import numpy as np
 from sklearn.datasets import load_iris, load_wine
+from ..entropy_estimators import entropy
 from mdlp import MDLP
 
 from ..Selection import Metrics
@@ -25,9 +26,9 @@ class Metrics_test(unittest.TestCase):
             ([1, 1, 5], 2, 0.9182958340544896),
             (self.y_i, 3, 0.999999999),
         ]
-        for dataset, base, entropy in datasets:
+        for dataset, base, entropy_expected in datasets:
             computed = metric.entropy(dataset, base)
-            self.assertAlmostEqual(entropy, computed)
+            self.assertAlmostEqual(entropy_expected, computed)
 
     def test_differential_entropy(self):
         metric = Metrics()
@@ -41,11 +42,13 @@ class Metrics_test(unittest.TestCase):
             (self.X_i_c, 37, 3.06627326925228),
             (self.X_w_c, 37, 63.13827518897429),
         ]
-        for dataset, base, entropy in datasets:
+        for dataset, base, entropy_expected in datasets:
             computed = metric.differential_entropy(
                 np.array(dataset, dtype="float64"), base
             )
-            self.assertAlmostEqual(entropy, computed, msg=str(dataset))
+            self.assertAlmostEqual(
+                entropy_expected, computed, msg=str(dataset)
+            )
         expected = [
             1.6378708764142766,
             2.0291571802275037,
@@ -68,6 +71,29 @@ class Metrics_test(unittest.TestCase):
             )
             self.assertAlmostEqual(computed, res_expected)
 
+    def test_dif_ent(self):
+        expected = [
+            1.6378708764142766,
+            2.0291571802275037,
+            0.8273865123744271,
+            3.203935772642847,
+            4.859193341386733,
+            1.3707315434976266,
+            1.8794952925706312,
+            -0.2983180654207054,
+            1.4521478934625076,
+            2.834404839362728,
+            0.4894081282811191,
+            1.361210381692561,
+            7.6373991502818175,
+        ]
+        n_samples, n_features = self.X_w_c.shape
+        for c, res_expected in enumerate(expected):
+            computed = entropy(
+                self.X_w_c[:, c].reshape(-1, 1), k=n_samples - 2
+            )
+            print("-*-", computed)
+
     def test_conditional_entropy(self):
         metric = Metrics()
         results_expected = [
@@ -133,10 +159,10 @@ class Metrics_test(unittest.TestCase):
     def test_symmetrical_uncertainty_continuous(self):
         metric = Metrics()
         results_expected = [
-            -0.08368315199022527,
-            -0.08539330663499867,
-            -0.026524185532893957,
-            -0.016238166071083728,
+            0.3116626663552704,
+            0.22524988105092494,
+            0.24511182026415218,
+            0.07114329389542708,
         ]
         for expected, col in zip(results_expected, range(self.X_w.shape[1])):
             computed = metric.symmetrical_unc_continuous(
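
Note: the docstring added to Metrics.differential_entropy points out that differential entropy, unlike discrete Shannon entropy, can be negative. Below is a minimal standalone sketch of that fact; it is not part of the patch, does not use the mfs estimator, and assumes only numpy and SciPy >= 1.6 (for scipy.stats.differential_entropy) are installed.

# Standalone sketch (hypothetical script, not part of the mfs package).
# The differential entropy of Uniform(0, a) is log(a), which is negative
# whenever a < 1, so a sample-based estimate should also come out negative.
import numpy as np
from scipy.stats import differential_entropy

rng = np.random.default_rng(0)
a = 0.1
samples = rng.uniform(0.0, a, size=5000)

print("analytic h(Uniform(0, 0.1)) =", np.log(a))                      # about -2.303
print("estimated                   =", differential_entropy(samples))  # close to -2.3

Because h(x) can be negative, the denominator Metrics.differential_entropy(x, ...) + Metrics.entropy(y) used by symmetrical_unc_continuous is not guaranteed to be positive, so the resulting ratio is not automatically confined to [0, 1].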