from math import log

import numpy as np
from scipy.special import digamma, gamma, psi
from sklearn.neighbors import BallTree, KDTree
from sklearn.neighbors import NearestNeighbors


class Metrics:
    @staticmethod
    def information_gain_cont(x, y):
        """Measure the reduction in uncertainty about the value of y when the
        value of the continuous variable x is known (also called mutual
        information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            Information gained
        """
        return Metrics._compute_mi_cd(x, y, n_neighbors=3)
    @staticmethod
    def information_gain_cont_features(xa, xb):
        """Measure the reduction in uncertainty about the value of xb when the
        value of the continuous variable xa is known (also called mutual
        information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        xa : np.array
            values of the first continuous variable
        xb : np.array
            values of the second continuous variable

        Returns
        -------
        float
            Information gained
        """
        return Metrics._compute_mi_cc(xa, xb, n_neighbors=3)
    @staticmethod
    def _compute_mi_cc(x, y, n_neighbors):
        """Compute mutual information between two continuous variables.

        # Author: Nikolay Mayorov <n59_ru@hotmail.com>
        # License: 3-clause BSD

        Parameters
        ----------
        x, y : ndarray, shape (n_samples,)
            Samples of two continuous random variables, must have an identical
            shape.

        n_neighbors : int
            Number of nearest neighbors to search for each point, see [1]_.

        Returns
        -------
        mi : float
            Estimated mutual information. If the estimate turns out to be
            negative, it is replaced by 0.

        Notes
        -----
        True mutual information can't be negative. If its estimate by a
        numerical method is negative, it means (provided the method is
        adequate) that the mutual information is close to 0, and replacing it
        by 0 is a reasonable strategy.

        References
        ----------
        .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
               information". Phys. Rev. E 69, 2004.
        """
        n_samples = x.size

        x = x.reshape((-1, 1))
        y = y.reshape((-1, 1))
        xy = np.hstack((x, y))

        # Here we rely on NearestNeighbors to select the fastest algorithm.
        nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
        nn.fit(xy)
        radius = nn.kneighbors()[0]
        radius = np.nextafter(radius[:, -1], 0)

        # KDTree is explicitly fit to allow for the querying of number of
        # neighbors within a specified radius
        kd = KDTree(x, metric="chebyshev")
        nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
        nx = np.array(nx) - 1.0

        kd = KDTree(y, metric="chebyshev")
        ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
        ny = np.array(ny) - 1.0

        mi = (
            digamma(n_samples)
            + digamma(n_neighbors)
            - np.mean(digamma(nx + 1))
            - np.mean(digamma(ny + 1))
        )
        return max(0, mi)
    @staticmethod
    def _compute_mi_cd(c, d, n_neighbors):
        """Compute mutual information between a continuous and a discrete
        variable.

        # Author: Nikolay Mayorov <n59_ru@hotmail.com>
        # License: 3-clause BSD

        Parameters
        ----------
        c : ndarray, shape (n_samples,)
            Samples of a continuous random variable.

        d : ndarray, shape (n_samples,)
            Samples of a discrete random variable.

        n_neighbors : int
            Number of nearest neighbors to search for each point, see [1]_.

        Returns
        -------
        mi : float
            Estimated mutual information. If the estimate turns out to be
            negative, it is replaced by 0.

        Notes
        -----
        True mutual information can't be negative. If its estimate by a
        numerical method is negative, it means (provided the method is
        adequate) that the mutual information is close to 0, and replacing it
        by 0 is a reasonable strategy.

        References
        ----------
        .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
               Data Sets". PLoS ONE 9(2), 2014.
        """
        n_samples = c.shape[0]
        if c.ndim == 1:
            c = c.reshape((-1, 1))
        radius = np.empty(n_samples)
        label_counts = np.empty(n_samples)
        k_all = np.empty(n_samples)
        nn = NearestNeighbors()
        for label in np.unique(d):
            mask = d == label
            count = np.sum(mask)
            if count > 1:
                k = min(n_neighbors, count - 1)
                nn.set_params(n_neighbors=k)
                nn.fit(c[mask])
                r = nn.kneighbors()[0]
                radius[mask] = np.nextafter(r[:, -1], 0)
                k_all[mask] = k
            label_counts[mask] = count

        # Ignore points with unique labels.
        mask = label_counts > 1
        n_samples = np.sum(mask)
        label_counts = label_counts[mask]
        k_all = k_all[mask]
        c = c[mask]
        radius = radius[mask]

        if n_samples == 0:
            return 0.0

        kd = (
            BallTree(c, metric="chebyshev")
            if n_samples >= 20
            else KDTree(c, metric="chebyshev")
        )
        m_all = kd.query_radius(
            c, radius, count_only=True, return_distance=False
        )
        m_all = np.array(m_all) - 1.0

        mi = (
            digamma(n_samples)
            + np.mean(digamma(k_all))
            - np.mean(digamma(label_counts))
            - np.mean(digamma(m_all + 1))
        )
        return max(0.0, mi)
    @staticmethod
    def _nearest_distances(X, k=1):
        """Return the distance to the kth nearest neighbor for every point
        in X.

        X : array, shape (N, M)
            N = number of points
            M = number of dimensions
        """
        knn = NearestNeighbors(n_neighbors=k + 1)
        knn.fit(X)
        d, _ = knn.kneighbors(X)  # the first nearest neighbor is itself
        return d[:, -1]  # distance to the kth nearest neighbor
    @staticmethod
    def differential_entropy(x, k=1):
        """Return the differential entropy of x.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
            The data the entropy of which is computed
        k : int, optional
            number of nearest neighbors for density estimation

        Notes
        -----
        Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
        of a random vector. Probl. Inf. Transm. 23, 95-101.
        See also: Evans, D. 2008 A computationally efficient estimator for
        mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
        and:
        Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
        information. Phys Rev E 69(6 Pt 2):066138.

        Differential entropy can be negative
        https://stats.stackexchange.com/questions/73881/
        when-is-the-differential-entropy-negative
        """
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        # Distance to the kth nearest neighbor of each point
        r = Metrics._nearest_distances(x, k)
        n, d = x.shape
        volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1)
        """
        F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
        for Continuous Random Variables. Advances in Neural Information
        Processing Systems 21 (NIPS). Vancouver (Canada), December.
        return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
        """
        return (
            d * np.mean(np.log(r + np.finfo(x.dtype).eps))
            + np.log(volume_unit_ball)
            + psi(n)
            - psi(k)
        )
    @staticmethod
    def symmetrical_unc_continuous(x, y):
        """Compute symmetrical uncertainty between a continuous variable and
        a set of labels, using Greg Ver Steeg's NPEET
        https://github.com/gregversteeg/NPEET

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain_cont(x, y)
            / (
                Metrics.differential_entropy(x, k=len(x) - 1)
                + Metrics.entropy(y)
            )
        )
    @staticmethod
    def symmetrical_unc_continuous_features(x, y):
        """Compute symmetrical uncertainty between two continuous variables,
        using Greg Ver Steeg's NPEET
        https://github.com/gregversteeg/NPEET

        Parameters
        ----------
        x : np.array
            values of the first continuous variable
        y : np.array
            values of the second continuous variable

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain_cont_features(x, y)
            / (
                Metrics.differential_entropy(x, k=len(x) - 1)
                + Metrics.entropy(y)
            )
        )
    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty. Normalizes the information gain
        (mutual information) with the entropies of the features to compensate
        for the bias toward high-cardinality features. Range: [0, 1]
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )
    @staticmethod
    def conditional_entropy(x, y, base=2):
        """Quantify the amount of information needed to describe the outcome
        of y given that the value of x is known, i.e. compute H(y|x)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
    @staticmethod
    def entropy(y, base=2):
        """Measure the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)
    @staticmethod
    def information_gain(x, y, base=2):
        """Measure the reduction in uncertainty about the value of y when the
        value of x is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )
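

# Minimal usage sketch (illustrative only): exercises the estimators above on
# synthetic data. The synthetic arrays, their sizes, and the parameter choices
# below are assumptions made for the demo, not part of the library itself.
if __name__ == "__main__":
    rng = np.random.default_rng(0)

    # A continuous feature and a binary label that partly depends on it.
    x_cont = rng.normal(size=200)
    labels = (x_cont + rng.normal(scale=0.5, size=200) > 0).astype(int)
    # A second continuous feature correlated with the first.
    x_cont2 = x_cont + rng.normal(scale=0.5, size=200)
    # A noisy discrete feature derived from the labels.
    x_disc = labels ^ (rng.random(200) < 0.1)

    # Mutual information between a continuous feature and discrete labels.
    print("MI (continuous vs labels):",
          Metrics.information_gain_cont(x_cont, labels))
    # Mutual information between two continuous features.
    print("MI (continuous vs continuous):",
          Metrics.information_gain_cont_features(x_cont, x_cont2))
    # Discrete information gain and its normalized counterpart.
    print("Information gain (discrete):",
          Metrics.information_gain(x_disc, labels))
    print("Symmetrical uncertainty (discrete):",
          Metrics.symmetrical_uncertainty(x_disc, labels))
    # Differential entropy of the continuous feature (can be negative).
    print("Differential entropy:",
          Metrics.differential_entropy(x_cont, k=3))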