# mufs/mfs/Metrics.py
from math import log
import numpy as np
from scipy.special import digamma, gamma, psi
from sklearn.neighbors import BallTree, KDTree
from sklearn.neighbors import NearestNeighbors
# from sklearn.feature_selection._mutual_info import _compute_mi
class Metrics:
@staticmethod
def information_gain_cont(x, y):
"""Measures the reduction in uncertainty about the value of y when the
value of X continuous is known (also called mutual information)
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
Parameters
----------
x : np.array
values of the continuous variable
y : np.array
array of labels
Returns
-------
float
Information gained
"""
# return _compute_mi(
# x, y, x_discrete=False, y_discrete=True, n_neighbors=3
# )
return Metrics._compute_mi_cd(x, y, n_neighbors=3)
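# Illustrative usage (not part of the original module): with a continuous
# feature that separates the labels, the estimate should be clearly
# positive, while a feature independent of y should give a value near 0.
#
#   rng = np.random.default_rng(0)
#   y = rng.integers(0, 2, size=500)
#   x = y + rng.normal(scale=0.5, size=500)
#   Metrics.information_gain_cont(x, y)                      # noticeably > 0
#   Metrics.information_gain_cont(rng.normal(size=500), y)   # close to 0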
@staticmethod
def _compute_mi_cd(c, d, n_neighbors):
"""Compute mutual information between continuous and discrete variables.
Parameters
----------
c : ndarray, shape (n_samples,)
Samples of a continuous random variable.
d : ndarray, shape (n_samples,)
Samples of a discrete random variable.
n_neighbors : int
Number of nearest neighbors to search for each point, see [1]_.
Returns
-------
mi : float
Estimated mutual information. If the estimate turns out to be negative,
it is replaced by 0.
Notes
-----
True mutual information cannot be negative. If a numerical estimate of it
is negative, it means (provided the method is adequate) that the mutual
information is close to 0, and replacing it by 0 is a reasonable
strategy.
References
----------
.. [1] B. C. Ross "Mutual Information between Discrete and Continuous
Data Sets". PLoS ONE 9(2), 2014.
"""
n_samples = c.shape[0]
if c.ndim == 1:
c = c.reshape((-1, 1))
radius = np.empty(n_samples)
label_counts = np.empty(n_samples)
k_all = np.empty(n_samples)
nn = NearestNeighbors()
for label in np.unique(d):
mask = d == label
count = np.sum(mask)
if count > 1:
k = min(n_neighbors, count - 1)
nn.set_params(n_neighbors=k)
nn.fit(c[mask])
r = nn.kneighbors()[0]
radius[mask] = np.nextafter(r[:, -1], 0)
k_all[mask] = k
label_counts[mask] = count
# Ignore points with unique labels.
mask = label_counts > 1
n_samples = np.sum(mask)
label_counts = label_counts[mask]
k_all = k_all[mask]
c = c[mask]
radius = radius[mask]
# kd = KDTree(c)
kd = (
BallTree(c, metric="chebyshev")
if n_samples >= 20
else KDTree(c, metric="chebyshev")
)
m_all = kd.query_radius(
c, radius, count_only=True, return_distance=False
)
m_all = np.array(m_all) - 1.0
mi = (
digamma(n_samples)
+ np.mean(digamma(k_all))
- np.mean(digamma(label_counts))
- np.mean(digamma(m_all + 1))
)
return max(0, mi)
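# Sketch of the estimator above (Ross 2014), stated with the variables used
# in the code: mi = psi(n_samples) + mean(psi(k_all)) - mean(psi(label_counts))
# - mean(psi(m_all + 1)), where m_all counts, for each point, the neighbours
# of any label lying within its k-th same-label neighbour distance.
# A direct call (hypothetical input, for illustration only):
#
#   Metrics._compute_mi_cd(np.array([0.1, 0.2, 0.9, 1.1]),
#                          np.array([0, 0, 1, 1]), n_neighbors=1)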
@staticmethod
def _nearest_distances(X, k=1):
"""
X = array(N,M)
N = number of points
M = number of dimensions
returns the distance to the kth nearest neighbor for every point in X
"""
knn = NearestNeighbors(n_neighbors=k + 1)
knn.fit(X)
d, _ = knn.kneighbors(X) # the first nearest neighbor is itself
return d[:, -1] # returns the distance to the kth nearest neighbor
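# Example (illustrative): for four equally spaced points on a line, the
# distance to the first nearest neighbour (excluding the point itself)
# is 1.0 for every point.
#
#   Metrics._nearest_distances(np.array([[0.0], [1.0], [2.0], [3.0]]), k=1)
#   # -> array([1., 1., 1., 1.])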
@staticmethod
def differential_entropy(x, k=1):
"""Returns the entropy of the X.
Parameters
===========
x : array-like, shape (n_samples, n_features)
The data the entropy of which is computed
k : int, optional
number of nearest neighbors for density estimation
Notes
======
Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
of a random vector. Probl. Inf. Transm. 23, 95-101.
See also: Evans, D. 2008 A computationally efficient estimator for
mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
and:
Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
information. Phys Rev E 69(6 Pt 2):066138.
Differential entropy can be negative
https://stats.stackexchange.com/questions/73881/
when-is-the-differential-entropy-negative
"""
if x.ndim == 1:
x = x.reshape(-1, 1)
# Distance to kth nearest neighbor
r = Metrics._nearest_distances(x, k)  # distances (not squared) to the kth neighbor
n, d = x.shape
volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1)
"""
F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
for Continuous Random Variables. Advances in Neural Information
Processing Systems 21 (NIPS). Vancouver (Canada), December.
return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
"""
return (
d * np.mean(np.log(r + np.finfo(x.dtype).eps))
+ np.log(volume_unit_ball)
+ psi(n)
- psi(k)
)
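# Rough check (illustrative, not a guaranteed value): for a large sample
# from a standard normal, the Kozachenko-Leonenko estimate should be close
# to the analytic differential entropy 0.5 * ln(2 * pi * e) ~ 1.42 nats.
#
#   rng = np.random.default_rng(0)
#   Metrics.differential_entropy(rng.normal(size=10_000), k=3)   # ~1.42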
@staticmethod
def conditional_differential_entropy(x, y):
"""quantifies the amount of information needed to describe the outcome
of Y discrete given that the value of X continuous is known
computes H(Y|X)
Parameters
----------
x : np.array
values of the continuous variable
y : np.array
array of labels
base : int, optional
base of the logarithm, by default 2
Returns
-------
float
conditional entropy of y given x
"""
xy = np.c_[x, y]
return Metrics.differential_entropy(xy) - Metrics.differential_entropy(
x
)
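# Usage sketch (illustrative): the conditional value is obtained as
# h(x, y) - h(x) using the k-NN differential entropy estimator above.
#
#   rng = np.random.default_rng(0)
#   x = rng.normal(size=1000)
#   y = (x > 0).astype(int)
#   Metrics.conditional_differential_entropy(x, y)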
@staticmethod
def symmetrical_unc_continuous(x, y):
"""Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet
https://github.com/gregversteeg/NPEET
Parameters
----------
x : np.array
values of the continuous variable
y : np.array
array of labels
Returns
-------
float
symmetrical uncertainty
"""
return (
2.0
* Metrics.information_gain_cont(x, y)
/ (
Metrics.differential_entropy(x, k=len(x) - 1)
+ Metrics.entropy(y)
)
)
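# Usage sketch (illustrative): combines the continuous mutual-information
# estimate with the differential entropy of x and the discrete entropy of y.
#
#   rng = np.random.default_rng(0)
#   y = rng.integers(0, 2, size=300)
#   x = y + rng.normal(scale=0.5, size=300)
#   Metrics.symmetrical_unc_continuous(x, y)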
@staticmethod
def symmetrical_uncertainty(x, y):
"""Compute symmetrical uncertainty. Normalize* information gain (mutual
information) with the entropies of the features in order to compensate
the bias due to high cardinality features. *Range [0, 1]
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
Parameters
----------
x : np.array
values of the variable
y : np.array
array of labels
Returns
-------
float
symmetrical uncertainty
"""
return (
2.0
* Metrics.information_gain(x, y)
/ (Metrics.entropy(x) + Metrics.entropy(y))
)
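# Worked example (follows from the definitions above): if x and y are
# identical, information_gain(x, y) = entropy(y) = entropy(x), so the
# symmetrical uncertainty is 2 * H(y) / (H(x) + H(y)) = 1.0.
#
#   x = np.array([0, 0, 1, 1])
#   Metrics.symmetrical_uncertainty(x, x)   # 1.0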
@staticmethod
def conditional_entropy(x, y, base=2):
"""quantifies the amount of information needed to describe the outcome
of Y given that the value of X is known
computes H(Y|X)
Parameters
----------
x : np.array
values of the variable
y : np.array
array of labels
base : int, optional
base of the logarithm, by default 2
Returns
-------
float
conditional entropy of y given x
"""
xy = np.c_[x, y]
return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
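# Worked example: when y is completely determined by x, H(x, y) = H(x),
# so the conditional entropy H(y|x) is 0.
#
#   x = np.array([0, 0, 1, 1])
#   y = np.array([1, 1, 0, 0])
#   Metrics.conditional_entropy(x, y)   # 0.0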
@staticmethod
def entropy(y, base=2):
"""measure of the uncertainty in predicting the value of y
Parameters
----------
y : np.array
array of labels
base : int, optional
base of the logarithm, by default 2
Returns
-------
float
entropy of y
"""
_, count = np.unique(y, return_counts=True, axis=0)
proba = count.astype(float) / len(y)
proba = proba[proba > 0.0]
return np.sum(proba * np.log(1.0 / proba)) / log(base)
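# Worked example: a balanced binary variable carries one bit of entropy
# in base 2.
#
#   Metrics.entropy(np.array([0, 1, 0, 1]), base=2)   # 1.0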
@staticmethod
def information_gain(x, y, base=2):
"""Measures the reduction in uncertainty about the value of y when the
value of X is known (also called mutual information)
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
Parameters
----------
x : np.array
values of the variable
y : np.array
array of labels
base : int, optional
base of the logarithm, by default 2
Returns
-------
float
Information gained
"""
return Metrics.entropy(y, base) - Metrics.conditional_entropy(
x, y, base
)
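# Worked example: if x determines y completely, the information gained is
# the full entropy of y, since H(y|x) = 0.
#
#   x = np.array([0, 0, 1, 1])
#   y = np.array([1, 1, 0, 0])
#   Metrics.information_gain(x, y)   # 1.0 (one bit in base 2)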