first try with sklearn mi

2021-06-02 01:46:21 +02:00
parent eb00e1516a
commit 96098e9fe1
2 changed files with 95 additions and 7 deletions


@@ -1,9 +1,11 @@
 from math import log
 import numpy as np
-from scipy.special import gamma, psi
+from scipy.special import digamma, gamma, psi
+from sklearn.neighbors import BallTree, KDTree
 from sklearn.neighbors import NearestNeighbors
-from sklearn.feature_selection._mutual_info import _compute_mi
+# from sklearn.feature_selection._mutual_info import _compute_mi
 class Metrics:
@@ -19,17 +21,95 @@ class Metrics:
             values of the continuous variable
         y : np.array
             array of labels
-        base : int, optional
-            base of the logarithm, by default 2

         Returns
         -------
         float
             Information gained
         """
-        return _compute_mi(
-            x, y, x_discrete=False, y_discrete=True, n_neighbors=3
-        )
+        # return _compute_mi(
+        #     x, y, x_discrete=False, y_discrete=True, n_neighbors=3
+        # )
+        return Metrics._compute_mi_cd(x, y, n_neighbors=3)
+
+    @staticmethod
+    def _compute_mi_cd(c, d, n_neighbors):
+        """Compute mutual information between continuous and discrete variables.
+
+        Parameters
+        ----------
+        c : ndarray, shape (n_samples,)
+            Samples of a continuous random variable.
+        d : ndarray, shape (n_samples,)
+            Samples of a discrete random variable.
+        n_neighbors : int
+            Number of nearest neighbors to search for each point, see [1]_.
+
+        Returns
+        -------
+        mi : float
+            Estimated mutual information. If the estimate turns out to be
+            negative, it is replaced by 0.
+
+        Notes
+        -----
+        True mutual information can't be negative. If its estimate by a
+        numerical method is negative, it means (provided the method is
+        adequate) that the mutual information is close to 0, so replacing
+        it by 0 is a reasonable strategy.
+
+        References
+        ----------
+        .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
+           Data Sets". PLoS ONE 9(2), 2014.
+        """
+        n_samples = c.shape[0]
+        if c.ndim == 1:
+            c = c.reshape((-1, 1))
+
+        radius = np.empty(n_samples)
+        label_counts = np.empty(n_samples)
+        k_all = np.empty(n_samples)
+        nn = NearestNeighbors()
+        for label in np.unique(d):
+            mask = d == label
+            count = np.sum(mask)
+            if count > 1:
+                k = min(n_neighbors, count - 1)
+                nn.set_params(n_neighbors=k)
+                nn.fit(c[mask])
+                r = nn.kneighbors()[0]
+                radius[mask] = np.nextafter(r[:, -1], 0)
+                k_all[mask] = k
+            label_counts[mask] = count
+
+        # Ignore points with unique labels.
+        mask = label_counts > 1
+        n_samples = np.sum(mask)
+        label_counts = label_counts[mask]
+        k_all = k_all[mask]
+        c = c[mask]
+        radius = radius[mask]
+
+        # kd = KDTree(c)
+        kd = (
+            BallTree(c, metric="chebyshev")
+            if n_samples >= 20
+            else KDTree(c, metric="chebyshev")
+        )
+        m_all = kd.query_radius(
+            c, radius, count_only=True, return_distance=False
+        )
+        m_all = np.array(m_all) - 1.0
+
+        mi = (
+            digamma(n_samples)
+            + np.mean(digamma(k_all))
+            - np.mean(digamma(label_counts))
+            - np.mean(digamma(m_all + 1))
+        )
+        return max(0, mi)

     @staticmethod
     def _nearest_distances(X, k=1):
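
The new `_compute_mi_cd` is adapted from scikit-learn's private implementation (note the commented-out `_compute_mi` import above) of the Ross estimator, I(X;Y) ≈ ψ(N) + ⟨ψ(k)⟩ − ⟨ψ(N_x)⟩ − ⟨ψ(m)⟩, where N_x is the size of each point's label group and m counts the neighbors inside that point's k-NN radius. A minimal sketch of the neighbor-counting step it relies on (illustrative only, not part of the commit):

# Illustrative sketch, not part of the commit: the m_i counts that feed the
# digamma terms. query_radius with count_only=True returns, for each query
# point, how many samples of the whole set lie within that point's radius
# (Chebyshev/L-inf metric, matching the k-NN search above); the query point
# itself is always counted, hence the "- 1.0" in _compute_mi_cd.
import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.default_rng(0)
c = rng.normal(size=(10, 1))
kd = KDTree(c, metric="chebyshev")
m_all = kd.query_radius(c, np.full(10, 0.5), count_only=True) - 1.0
print(m_all)  # neighbors of each point (excluding itself) within radius 0.5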


@@ -17,6 +17,14 @@ print(Metrics.information_gain(X, y))
 for i in range(n):
     print(i, Metrics.information_gain(X[:, i], y))
 print("- Continuous features")
-# print(Metrics.information_gain_cont(X, y))
+print(Metrics.information_gain_cont(X, y))
 for i in range(n):
     print(i, Metrics.information_gain_cont(X[:, i], y))
+print("CFS Discrete")
+print(mfsd.cfs(X, y).get_results())
+print("CFS continuous")
+print(mfsc.cfs(X, y).get_results())
+print("FCBF Discrete")
+print(mfsd.fcbf(X, y, 1e-7).get_results())
+print("FCBF continuous")
+print(mfsc.fcbf(X, y, 1e-7).get_results())
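
A possible sanity check for the test script (an assumption, not part of the commit): compare `information_gain_cont` against scikit-learn's public `mutual_info_classif`, which wraps the same Ross estimator. The values should be close but not identical, because `mutual_info_classif` rescales continuous features and adds a small noise jitter before estimating. The snippet assumes `Metrics` is in scope, as in the test script above.

# Hypothetical check, not part of the commit: agreement with sklearn's public
# wrapper of the same estimator (expect small differences from its internal
# scaling and noise jitter).
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif

X, y = load_iris(return_X_y=True)
ours = np.array(
    [Metrics.information_gain_cont(X[:, i], y) for i in range(X.shape[1])]
)
ref = mutual_info_classif(
    X, y, discrete_features=False, n_neighbors=3, random_state=0
)
print(np.round(ours, 3))  # class-level copy, n_neighbors=3
print(np.round(ref, 3))   # sklearn public API, same k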