Mirror of https://github.com/Doctorado-ML/mufs.git, synced 2025-08-17 16:45:53 +00:00
Add max_features to selection
Add first approach to continuous variables
mfs/Metrics.py (executable file, 228 lines added)
@@ -0,0 +1,228 @@
from math import log
import numpy as np

from scipy.special import gamma, psi
from sklearn.neighbors import BallTree, KDTree, NearestNeighbors
from sklearn.feature_selection._mutual_info import _compute_mi

# from .entropy_estimators import mi, entropy as c_entropy


class Metrics:
    @staticmethod
    def information_gain_cont(x, y):
        """Measures the reduction in uncertainty about the value of y when
        the value of the continuous variable X is known (also called mutual
        information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            Information gained
        """
        return _compute_mi(
            x, y, x_discrete=False, y_discrete=True, n_neighbors=3
        )
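
    # Illustrative note on information_gain_cont: it delegates to
    # scikit-learn's private k-NN mutual-information estimator with x
    # treated as continuous and y as discrete (k = 3 neighbors), so the
    # result is expressed in nats. A minimal sketch of the expected
    # behaviour, using assumed arrays that are not part of this module:
    #
    #   rng = np.random.default_rng(0)
    #   x_cont = rng.normal(size=500)
    #   y_dep = (x_cont > 0).astype(int)       # fully determined by x
    #   y_rand = rng.integers(0, 2, size=500)  # independent of x
    #   # Metrics.information_gain_cont(x_cont, y_dep) should be clearly
    #   # positive, while the value for y_rand should be close to 0.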

    @staticmethod
    def _nearest_distances(X, k=1):
        """
        X = array(N,M)
        N = number of points
        M = number of dimensions
        returns the distance to the kth nearest neighbor for every point in X
        """
        knn = NearestNeighbors(n_neighbors=k + 1)
        knn.fit(X)
        d, _ = knn.kneighbors(X)  # the first nearest neighbor is itself
        return d[:, -1]  # returns the distance to the kth nearest neighbor

    @staticmethod
    def differential_entropy(X, k=1):
        """Returns the differential entropy of X.
        Parameters
        ===========
        X : array-like, shape (n_samples, n_features)
            The data the entropy of which is computed
        k : int, optional
            number of nearest neighbors for density estimation
        Notes
        ======
        Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
        of a random vector. Probl. Inf. Transm. 23, 95-101.
        See also: Evans, D. 2008 A computationally efficient estimator for
        mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
        and:
        Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
        information. Phys Rev E 69(6 Pt 2):066138.
        """
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        # Distance to kth nearest neighbor
        r = Metrics._nearest_distances(X, k)  # euclidean distances
        n, d = X.shape
        volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1)
        """
        F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
        for Continuous Random Variables. Advances in Neural Information
        Processing Systems 21 (NIPS). Vancouver (Canada), December.
        return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
        """
        return (
            d * np.mean(np.log(r + np.finfo(X.dtype).eps))
            + np.log(volume_unit_ball)
            + psi(n)
            - psi(k)
        )
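
    # The return expression above is the Kozachenko-Leonenko k-NN estimator
    #
    #   h(X) ~ d * mean(log r_k) + log(V_d) + psi(n) - psi(k)
    #
    # where r_k is each sample's distance to its kth nearest neighbor,
    # V_d = pi^(d/2) / Gamma(d/2 + 1) is the volume of the d-dimensional
    # unit ball and psi is the digamma function; the result is in nats.
    # A hedged sanity check with an assumed array (illustrative only): for a
    # large 1-D sample from a standard normal, the estimate should approach
    # the analytic value 0.5 * log(2 * pi * e) ~ 1.42 nats, e.g.
    #
    #   x_norm = np.random.default_rng(0).normal(size=5000)
    #   Metrics.differential_entropy(x_norm, k=3)   # roughly 1.42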

    @staticmethod
    def conditional_differential_entropy(x, y):
        """quantifies the amount of information needed to describe the
        outcome of the discrete variable Y given that the value of the
        continuous variable X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            conditional entropy of y given x
        """
        xy = np.c_[x, y]
        return Metrics.differential_entropy(xy) - Metrics.differential_entropy(
            x
        )
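
    # Like conditional_entropy below, this relies on the chain rule
    # h(Y | X) = h(X, Y) - h(X), here with both terms computed by the
    # k-NN differential-entropy estimator on the column-stacked data.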

    @staticmethod
    def symmetrical_unc_continuous(x, y):
        """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet
        https://github.com/gregversteeg/NPEET

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """

        return (
            2.0
            * Metrics.information_gain_cont(x, y)
            / (Metrics.differential_entropy(x) + Metrics.entropy(y))
        )
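
    # Both symmetrical-uncertainty variants implement
    #
    #   SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y))
    #
    # In the continuous variant above, H(X) is replaced by the differential
    # entropy, which can be negative, so unlike the discrete version the
    # result is not guaranteed to fall inside [0, 1].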

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty. Normalizes the information gain
        (mutual information) by the entropies of the features to compensate
        for the bias toward high cardinality features. Result is in the
        range [0, 1]
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )
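
    # Worked example for the discrete case (values are exact, arrays are
    # illustrative only):
    #
    #   x = np.array([0, 0, 1, 1])
    #   Metrics.symmetrical_uncertainty(x, x)  # 2 * 1 / (1 + 1) = 1.0
    #   y = np.array([0, 1, 0, 1])             # independent of x
    #   Metrics.symmetrical_uncertainty(x, y)  # 2 * 0 / (1 + 1) = 0.0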

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the
        outcome of Y given that the value of X is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
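
    # This is the chain rule H(Y | X) = H(X, Y) - H(X) evaluated on the
    # column-stacked pair. Quick check with assumed arrays: for
    # x = y = np.array([0, 0, 1, 1]) the result is 1 - 1 = 0 bits, while
    # for the independent y = np.array([0, 1, 0, 1]) it is 2 - 1 = 1 bit.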

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)
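
    # Shannon entropy of the empirical label distribution, e.g. (with
    # assumed arrays) Metrics.entropy(np.array([0, 0, 1, 1])) == 1.0 bit,
    # and Metrics.entropy(np.array([0, 0, 0, 1])) is about 0.811 bits,
    # since 0.75 * log2(1 / 0.75) + 0.25 * log2(1 / 0.25) ~ 0.811.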

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when
        the value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )
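
# Minimal usage sketch of the class as a whole (illustrative only; the
# variable names and the import path `mfs.Metrics` are assumptions based on
# the file location shown above):
#
#   import numpy as np
#   from mfs.Metrics import Metrics
#
#   rng = np.random.default_rng(0)
#   x_cont = rng.normal(size=200)       # continuous feature
#   y = (x_cont > 0).astype(int)        # discrete labels derived from x
#
#   Metrics.information_gain_cont(x_cont, y)       # mutual information, nats
#   Metrics.symmetrical_unc_continuous(x_cont, y)  # continuous SU
#   Metrics.symmetrical_uncertainty(y, y)          # 1.0 for identical arrays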