Mirror of https://github.com/Doctorado-ML/mufs.git (synced 2025-08-17 16:45:53 +00:00)

Commit: first try with sklearn mi
@@ -1,9 +1,11 @@
 from math import log
 import numpy as np
 
-from scipy.special import gamma, psi
+from scipy.special import digamma, gamma, psi
+from sklearn.neighbors import BallTree, KDTree
 from sklearn.neighbors import NearestNeighbors
-from sklearn.feature_selection._mutual_info import _compute_mi
+
+# from sklearn.feature_selection._mutual_info import _compute_mi
 
 
 class Metrics:
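
Side note, not part of the commit: sklearn.feature_selection._mutual_info is a private module whose contents can move between scikit-learn releases, which is presumably why the import is commented out and _compute_mi_cd is vendored below. The same continuous-feature/discrete-target estimator is reachable through the public API; a minimal cross-check sketch (the synthetic data is illustrative, and values will differ slightly from the vendored copy because the public wrapper perturbs continuous features with tiny noise before estimating):

    from sklearn.datasets import make_classification
    from sklearn.feature_selection import mutual_info_classif

    # Synthetic continuous features with a discrete target.
    X, y = make_classification(
        n_samples=500, n_features=4, n_informative=2, random_state=0
    )

    # Public wrapper around the same k-NN estimator; returns one MI
    # estimate (in nats) per column of X.
    print(mutual_info_classif(X, y, discrete_features=False, n_neighbors=3))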
@@ -19,17 +21,95 @@ class Metrics:
             values of the continuous variable
         y : np.array
             array of labels
-        base : int, optional
-            base of the logarithm, by default 2
 
         Returns
         -------
         float
             Information gained
         """
-        return _compute_mi(
-            x, y, x_discrete=False, y_discrete=True, n_neighbors=3
-        )
+        # return _compute_mi(
+        #     x, y, x_discrete=False, y_discrete=True, n_neighbors=3
+        # )
+        return Metrics._compute_mi_cd(x, y, n_neighbors=3)
+
+    def _compute_mi_cd(c, d, n_neighbors):
+        """Compute mutual information between continuous and discrete variables.
+
+        Parameters
+        ----------
+        c : ndarray, shape (n_samples,)
+            Samples of a continuous random variable.
+
+        d : ndarray, shape (n_samples,)
+            Samples of a discrete random variable.
+
+        n_neighbors : int
+            Number of nearest neighbors to search for each point, see [1]_.
+
+        Returns
+        -------
+        mi : float
+            Estimated mutual information. If it turned out to be negative,
+            it is replaced by 0.
+
+        Notes
+        -----
+        True mutual information can't be negative. If its estimate by a
+        numerical method is negative, it means (provided the method is
+        adequate) that the mutual information is close to 0, and replacing
+        it by 0 is a reasonable strategy.
+
+        References
+        ----------
+        .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
+           Data Sets". PLoS ONE 9(2), 2014.
+        """
+        n_samples = c.shape[0]
+        if c.ndim == 1:
+            c = c.reshape((-1, 1))
+
+        radius = np.empty(n_samples)
+        label_counts = np.empty(n_samples)
+        k_all = np.empty(n_samples)
+        nn = NearestNeighbors()
+        for label in np.unique(d):
+            mask = d == label
+            count = np.sum(mask)
+            if count > 1:
+                k = min(n_neighbors, count - 1)
+                nn.set_params(n_neighbors=k)
+                nn.fit(c[mask])
+                r = nn.kneighbors()[0]
+                radius[mask] = np.nextafter(r[:, -1], 0)
+                k_all[mask] = k
+            label_counts[mask] = count
+
+        # Ignore points with unique labels.
+        mask = label_counts > 1
+        n_samples = np.sum(mask)
+        label_counts = label_counts[mask]
+        k_all = k_all[mask]
+        c = c[mask]
+        radius = radius[mask]
+
+        # kd = KDTree(c)
+        kd = (
+            BallTree(c, metric="chebyshev")
+            if n_samples >= 20
+            else KDTree(c, metric="chebyshev")
+        )
+        m_all = kd.query_radius(
+            c, radius, count_only=True, return_distance=False
+        )
+        m_all = np.array(m_all) - 1.0
+
+        mi = (
+            digamma(n_samples)
+            + np.mean(digamma(k_all))
+            - np.mean(digamma(label_counts))
+            - np.mean(digamma(m_all + 1))
+        )
+        return max(0, mi)
 
     @staticmethod
     def _nearest_distances(X, k=1):
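
For reference, the mi expression at the end of the new method is the Ross (2014) estimator cited in its docstring. With N the number of samples kept after dropping single-member labels (n_samples), k_i each point's neighbor count (k_all), N_{x_i} the size of its label group (label_counts), and m_i its within-radius neighbor count (m_all), the code computes

    I(X;Y) \approx \psi(N) + \frac{1}{N}\sum_i \psi(k_i)
                  - \frac{1}{N}\sum_i \psi(N_{x_i})
                  - \frac{1}{N}\sum_i \psi(m_i + 1)

where \psi is the digamma function; max(0, mi) then clamps small negative estimates to zero, as the Notes section explains. The np.nextafter(r[:, -1], 0) call shrinks each k-th-neighbor distance by one ulp, so the later query_radius count excludes points lying exactly at that distance.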
mfs/k.py (10 lines changed)
@@ -17,6 +17,14 @@ print(Metrics.information_gain(X, y))
 for i in range(n):
     print(i, Metrics.information_gain(X[:, i], y))
 print("- Continuous features")
-# print(Metrics.information_gain_cont(X, y))
+print(Metrics.information_gain_cont(X, y))
 for i in range(n):
     print(i, Metrics.information_gain_cont(X[:, i], y))
+print("CFS Discrete")
+print(mfsd.cfs(X, y).get_results())
+print("CFS continuous")
+print(mfsc.cfs(X, y).get_results())
+print("FCBF Discrete")
+print(mfsd.fcbf(X, y, 1e-7).get_results())
+print("FCBF continuous")
+print(mfsc.fcbf(X, y, 1e-7).get_results())
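
The 1e-7 passed to both fcbf calls is presumably the FCBF relevance threshold of Yu & Liu (2003): features whose symmetrical uncertainty with the class falls below it are discarded before the redundancy pass. A self-contained sketch of that quantity for discrete variables (the helper name is illustrative, not from this repository):

    import numpy as np
    from sklearn.metrics import mutual_info_score

    def symmetrical_uncertainty(x, y):
        # SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y)), in [0, 1];
        # entropies and MI are both taken in nats so the units cancel.
        def entropy(v):
            _, counts = np.unique(v, return_counts=True)
            p = counts / counts.sum()
            return -np.sum(p * np.log(p))
        h = entropy(x) + entropy(y)
        return 2.0 * mutual_info_score(x, y) / h if h > 0 else 0.0

    x = np.array([0, 0, 1, 1, 2, 2])
    print(symmetrical_uncertainty(x, x))                     # identical -> 1.0
    print(symmetrical_uncertainty(x, np.array([0, 1] * 3)))  # independent -> 0.0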