from math import log

import numpy as np
from scipy.special import digamma, gamma, psi
from sklearn.neighbors import BallTree, KDTree, NearestNeighbors


class Metrics:
    @staticmethod
    def information_gain_cont(x, y):
        """Measures the reduction in uncertainty about the value of y when
        the value of the continuous variable x is known (also called mutual
        information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            Information gained
        """
        return Metrics._compute_mi_cd(x, y, n_neighbors=3)

    @staticmethod
    def information_gain_cont_features(xa, xb):
        """Measures the reduction in uncertainty about the value of xb when
        the value of the continuous variable xa is known (also called mutual
        information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        xa : np.array
            values of the continuous variable
        xb : np.array
            values of the continuous variable

        Returns
        -------
        float
            Information gained
        """
        return Metrics._compute_mi_cc(xa, xb, n_neighbors=3)

    @staticmethod
    def _compute_mi_cc(x, y, n_neighbors):
        """Compute mutual information between two continuous variables.

        # Author: Nikolay Mayorov
        # License: 3-clause BSD

        Parameters
        ----------
        x, y : ndarray, shape (n_samples,)
            Samples of two continuous random variables, must have an
            identical shape.
        n_neighbors : int
            Number of nearest neighbors to search for each point, see [1]_.

        Returns
        -------
        mi : float
            Estimated mutual information. If it turned out to be negative it
            is replaced by 0.

        Notes
        -----
        True mutual information can't be negative. If its estimate by a
        numerical method is negative, it means (providing the method is
        adequate) that the mutual information is close to 0 and replacing it
        by 0 is a reasonable strategy.

        References
        ----------
        .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating
               mutual information". Phys. Rev. E 69, 2004.
        """
        n_samples = x.size

        x = x.reshape((-1, 1))
        y = y.reshape((-1, 1))
        xy = np.hstack((x, y))

        # Here we rely on NearestNeighbors to select the fastest algorithm.
        nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)

        nn.fit(xy)
        radius = nn.kneighbors()[0]
        radius = np.nextafter(radius[:, -1], 0)

        # KDTree is explicitly fit to allow for the querying of number of
        # neighbors within a specified radius
        kd = KDTree(x, metric="chebyshev")
        nx = kd.query_radius(
            x, radius, count_only=True, return_distance=False
        )
        nx = np.array(nx) - 1.0

        kd = KDTree(y, metric="chebyshev")
        ny = kd.query_radius(
            y, radius, count_only=True, return_distance=False
        )
        ny = np.array(ny) - 1.0

        mi = (
            digamma(n_samples)
            + digamma(n_neighbors)
            - np.mean(digamma(nx + 1))
            - np.mean(digamma(ny + 1))
        )

        return max(0, mi)
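    # _compute_mi_cc above implements the Kraskov-Stogbauer-Grassberger
    # (KSG) estimator cited in its docstring: with N samples and k
    # joint-space neighbors,
    #   I(X; Y) ~= psi(k) + psi(N) - <psi(n_x + 1) + psi(n_y + 1)>,
    # where n_x and n_y count, for each point, the marginal neighbors that
    # fall strictly inside its distance to the k-th Chebyshev neighbor in
    # the joint (x, y) space.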
    @staticmethod
    def _compute_mi_cd(c, d, n_neighbors):
        """Compute mutual information between continuous and discrete
        variables.

        # Author: Nikolay Mayorov
        # License: 3-clause BSD

        Parameters
        ----------
        c : ndarray, shape (n_samples,)
            Samples of a continuous random variable.
        d : ndarray, shape (n_samples,)
            Samples of a discrete random variable.
        n_neighbors : int
            Number of nearest neighbors to search for each point, see [1]_.

        Returns
        -------
        mi : float
            Estimated mutual information. If it turned out to be negative it
            is replaced by 0.

        Notes
        -----
        True mutual information can't be negative. If its estimate by a
        numerical method is negative, it means (providing the method is
        adequate) that the mutual information is close to 0 and replacing it
        by 0 is a reasonable strategy.

        References
        ----------
        .. [1] B. C. Ross "Mutual Information between Discrete and
               Continuous Data Sets". PLoS ONE 9(2), 2014.
        """
        n_samples = c.shape[0]
        if c.ndim == 1:
            c = c.reshape((-1, 1))

        radius = np.empty(n_samples)
        label_counts = np.empty(n_samples)
        k_all = np.empty(n_samples)
        nn = NearestNeighbors()
        for label in np.unique(d):
            mask = d == label
            count = np.sum(mask)
            if count > 1:
                k = min(n_neighbors, count - 1)
                nn.set_params(n_neighbors=k)
                nn.fit(c[mask])
                r = nn.kneighbors()[0]
                radius[mask] = np.nextafter(r[:, -1], 0)
                k_all[mask] = k
            label_counts[mask] = count

        # Ignore points with unique labels.
        mask = label_counts > 1
        n_samples = np.sum(mask)
        label_counts = label_counts[mask]
        k_all = k_all[mask]
        c = c[mask]
        radius = radius[mask]

        if n_samples == 0:
            return 0.0

        kd = (
            BallTree(c, metric="chebyshev")
            if n_samples >= 20
            else KDTree(c, metric="chebyshev")
        )
        m_all = kd.query_radius(
            c, radius, count_only=True, return_distance=False
        )
        m_all = np.array(m_all) - 1.0

        mi = (
            digamma(n_samples)
            + np.mean(digamma(k_all))
            - np.mean(digamma(label_counts))
            - np.mean(digamma(m_all + 1))
        )

        return max(0.0, mi)

    @staticmethod
    def _nearest_distances(X, k=1):
        """
        X = array(N, M)
        N = number of points
        M = number of dimensions

        returns the distance to the kth nearest neighbor for every point in X
        """
        knn = NearestNeighbors(n_neighbors=k + 1)
        knn.fit(X)
        d, _ = knn.kneighbors(X)  # the first nearest neighbor is itself
        return d[:, -1]  # returns the distance to the kth nearest neighbor

    @staticmethod
    def differential_entropy(x, k=1):
        """Returns the entropy of X.

        Parameters
        ==========
        x : array-like, shape (n_samples, n_features)
            The data whose entropy is computed
        k : int, optional
            number of nearest neighbors for density estimation

        Notes
        =====
        Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy
        of a random vector. Probl. Inf. Transm. 23, 95-101.
        See also: Evans, D. 2008 A computationally efficient estimator for
        mutual information, Proc. R. Soc. A 464 (2093), 1203-1215.
        and:
        Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual
        information. Phys Rev E 69(6 Pt 2):066138.

        Differential entropy can be negative
        https://stats.stackexchange.com/questions/73881/
        when-is-the-differential-entropy-negative
        """
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        # Distance to the kth nearest neighbor of every sample
        r = Metrics._nearest_distances(x, k)
        n, d = x.shape
        volume_unit_ball = (np.pi ** (0.5 * d)) / gamma(0.5 * d + 1)
        """
        F. Perez-Cruz, (2008). Estimation of Information Theoretic Measures
        for Continuous Random Variables. Advances in Neural Information
        Processing Systems 21 (NIPS). Vancouver (Canada), December.

        return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
        """
        return (
            d * np.mean(np.log(r + np.finfo(x.dtype).eps))
            + np.log(volume_unit_ball)
            + psi(n)
            - psi(k)
        )
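    # differential_entropy above implements the Kozachenko-Leonenko k-NN
    # estimator cited in its docstring:
    #   H(X) ~= (d / n) * sum_i log(r_i) + log(c_d) + psi(n) - psi(k),
    # where r_i is the distance from sample i to its k-th nearest neighbor,
    # d is the dimension and c_d = pi^(d / 2) / Gamma(d / 2 + 1) is the
    # volume of the d-dimensional unit ball (a small eps is added inside
    # the log to avoid log(0) for duplicated samples).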
    @staticmethod
    def symmetrical_unc_continuous(x, y):
        """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet
        https://github.com/gregversteeg/NPEET

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain_cont(x, y)
            / (
                Metrics.differential_entropy(x, k=len(x) - 1)
                + Metrics.entropy(y)
            )
        )

    @staticmethod
    def symmetrical_unc_continuous_features(x, y):
        """Compute symmetrical uncertainty. Using Greg Ver Steeg's npeet
        https://github.com/gregversteeg/NPEET

        Parameters
        ----------
        x : np.array
            values of the continuous variable
        y : np.array
            values of the continuous variable

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain_cont_features(x, y)
            / (
                Metrics.differential_entropy(x, k=len(x) - 1)
                + Metrics.entropy(y)
            )
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Compute symmetrical uncertainty. Normalizes the information gain
        (mutual information) with the entropies of the features in order to
        compensate for the bias due to high cardinality features. Its range
        is [0, 1].
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels

        Returns
        -------
        float
            symmetrical uncertainty
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """Quantifies the amount of information needed to describe the
        outcome of y given that the value of x is known; computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """Measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when
        the value of x is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )
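
# Minimal usage sketch on synthetic data (illustrative only, not part of the
# library API): a continuous feature whose distribution depends on the label
# carries information about it, independent noise does not, and the
# symmetrical uncertainty of a discrete variable with itself is 1.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    labels = rng.integers(0, 2, size=500)
    # Continuous feature shifted by the label -> informative.
    feature = labels + rng.normal(scale=0.5, size=500)
    # Noise drawn independently of the label -> mutual information near 0.
    noise = rng.normal(size=500)

    mi_feature = Metrics.information_gain_cont(feature, labels)
    mi_noise = Metrics.information_gain_cont(noise, labels)
    su_self = Metrics.symmetrical_uncertainty(labels, labels)
    print("MI(feature, labels):", mi_feature)
    print("MI(noise, labels):", mi_noise)
    print("SU(labels, labels):", su_self)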