first try with sklearn mi

2021-06-02 01:46:21 +02:00
parent eb00e1516a
commit 96098e9fe1
2 changed files with 95 additions and 7 deletions

@@ -1,9 +1,11 @@
 from math import log
 import numpy as np
-from scipy.special import gamma, psi
+from scipy.special import digamma, gamma, psi
+from sklearn.neighbors import BallTree, KDTree
+from sklearn.neighbors import NearestNeighbors
-from sklearn.feature_selection._mutual_info import _compute_mi
+# from sklearn.feature_selection._mutual_info import _compute_mi
 class Metrics:
@@ -19,17 +21,95 @@ class Metrics:
             values of the continuous variable
         y : np.array
             array of labels
         base : int, optional
             base of the logarithm, by default 2
 
         Returns
         -------
         float
             Information gained
         """
-        return _compute_mi(
-            x, y, x_discrete=False, y_discrete=True, n_neighbors=3
-        )
+        # return _compute_mi(
+        #     x, y, x_discrete=False, y_discrete=True, n_neighbors=3
+        # )
+        return Metrics._compute_mi_cd(x, y, n_neighbors=3)
+
+    @staticmethod
+    def _compute_mi_cd(c, d, n_neighbors):
+        """Compute mutual information between continuous and discrete variables.
+
+        Parameters
+        ----------
+        c : ndarray, shape (n_samples,)
+            Samples of a continuous random variable.
+        d : ndarray, shape (n_samples,)
+            Samples of a discrete random variable.
+        n_neighbors : int
+            Number of nearest neighbors to search for each point, see [1]_.
+
+        Returns
+        -------
+        mi : float
+            Estimated mutual information. If it turns out to be negative, it
+            is replaced by 0.
+
+        Notes
+        -----
+        True mutual information can't be negative. If its estimate by a
+        numerical method is negative, it means (provided the method is
+        adequate) that the mutual information is close to 0, and replacing
+        it by 0 is a reasonable strategy.
+
+        References
+        ----------
+        .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
+           Data Sets". PLoS ONE 9(2), 2014.
+        """
+        n_samples = c.shape[0]
+        if c.ndim == 1:
+            c = c.reshape((-1, 1))
+
+        radius = np.empty(n_samples)
+        label_counts = np.empty(n_samples)
+        k_all = np.empty(n_samples)
+        nn = NearestNeighbors()
+        for label in np.unique(d):
+            mask = d == label
+            count = np.sum(mask)
+            if count > 1:
+                k = min(n_neighbors, count - 1)
+                nn.set_params(n_neighbors=k)
+                nn.fit(c[mask])
+                r = nn.kneighbors()[0]
+                # Shrink the radius just below the k-th neighbor distance so
+                # the radius query below does not count the k-th neighbor
+                # itself.
+                radius[mask] = np.nextafter(r[:, -1], 0)
+                k_all[mask] = k
+            label_counts[mask] = count
+
+        # Ignore points with unique labels.
+        mask = label_counts > 1
+        n_samples = np.sum(mask)
+        label_counts = label_counts[mask]
+        k_all = k_all[mask]
+        c = c[mask]
+        radius = radius[mask]
+
+        # kd = KDTree(c)
+        # BallTree for larger samples, KDTree for small ones; both support
+        # the chebyshev (max-coordinate) metric the estimator relies on.
+        kd = (
+            BallTree(c, metric="chebyshev")
+            if n_samples >= 20
+            else KDTree(c, metric="chebyshev")
+        )
+        # m: for each point, how many other points fall within its
+        # label-conditional k-NN radius in the full sample.
+        m_all = kd.query_radius(
+            c, radius, count_only=True, return_distance=False
+        )
+        m_all = np.array(m_all) - 1.0
+
+        # Ross (2014) digamma estimator, clamped at zero (see Notes).
+        mi = (
+            digamma(n_samples)
+            + np.mean(digamma(k_all))
+            - np.mean(digamma(label_counts))
+            - np.mean(digamma(m_all + 1))
+        )
+        return max(0, mi)
 
     @staticmethod
     def _nearest_distances(X, k=1):
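For reference, the digamma expression at the end of _compute_mi_cd is the Ross estimator written exactly as the code computes it, with N the retained samples, N_x the number of points sharing a point's label, k the per-point neighbor count, and m the number of other points inside the label-conditional k-NN radius; the max(0, mi) clamp matches the Notes above:

    I(X;Y) \approx \psi(N) + \langle \psi(k) \rangle
                   - \langle \psi(N_x) \rangle - \langle \psi(m + 1) \rangle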

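Since the hunk vendors sklearn's private estimator, one quick validation is to compare it against the public mutual_info_classif, which applies the same Ross method to continuous features. A minimal sketch, where the iris dataset and the Metrics import are illustrative assumptions (the public function jitters continuous features, so expect close but not identical values):

# Hedged sanity check; dataset choice and the Metrics import are assumptions.
from sklearn.datasets import load_iris
from sklearn.feature_selection import mutual_info_classif
# from metrics import Metrics  # assumed: the class defined in the file above

X, y = load_iris(return_X_y=True)
ours = [Metrics.information_gain_cont(X[:, i], y) for i in range(X.shape[1])]
theirs = mutual_info_classif(X, y, discrete_features=False,
                             n_neighbors=3, random_state=0)
print(ours)    # vendored estimator, per feature
print(theirs)  # sklearn's public estimator; values should be close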
@@ -17,6 +17,14 @@ print(Metrics.information_gain(X, y))
 for i in range(n):
     print(i, Metrics.information_gain(X[:, i], y))
 print("- Continuous features")
-# print(Metrics.information_gain_cont(X, y))
+print(Metrics.information_gain_cont(X, y))
 for i in range(n):
     print(i, Metrics.information_gain_cont(X[:, i], y))
+print("CFS Discrete")
+print(mfsd.cfs(X, y).get_results())
+print("CFS continuous")
+print(mfsc.cfs(X, y).get_results())
+print("FCBF Discrete")
+print(mfsd.fcbf(X, y, 1e-7).get_results())
+print("FCBF continuous")
+print(mfsc.fcbf(X, y, 1e-7).get_results())
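The driver relies on names bound earlier in the script (X, y, n, mfsd, mfsc) that fall outside this hunk. A purely hypothetical preamble consistent with how they are used, every name here being an assumption:

# Hypothetical preamble; the real definitions sit before this hunk.
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)  # assumed dataset
n = X.shape[1]                     # matches the per-feature loops above
# mfsd / mfsc are this project's feature-selector objects: both receive the
# same X and y, so the discrete/continuous distinction must live inside the
# objects themselves; their construction is not shown in the hunk.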