Implement Metric methods and tests

This commit is contained in:
2021-05-23 20:32:33 +02:00
parent ba73a7f5c0
commit a19f2cc12a
6 changed files with 265 additions and 2 deletions

3
.gitignore vendored
View File

@@ -128,4 +128,5 @@ dmypy.json
# Pyre type checker
.pyre/
.vscode
junk/*

49
Makefile Normal file
View File

@@ -0,0 +1,49 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
# doc-clean was missing from .PHONY: a file named "doc-clean" would have
# silently shadowed the target.
.PHONY: coverage deps help lint push test doc build doc-clean

coverage: ## Run tests with coverage
	coverage erase
	coverage run -m unittest -v cfs.tests
	coverage report -m

deps: ## Install dependencies
	pip install -r requirements.txt

lint: ## Lint and static-check
	black cfs
	flake8 cfs
	mypy cfs

push: ## Push code with tags
	git push && git push --tags

test: ## Run tests
	python -m unittest -v cfs.tests

doc: ## Update documentation
	$(MAKE) -C docs --makefile=Makefile html

build: ## Build package
	rm -fr dist/*
	python setup.py sdist bdist_wheel

doc-clean: ## Clean generated documentation
	$(MAKE) -C docs --makefile=Makefile clean

help: ## Show help message
	@IFS=$$'\n' ; \
	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
	printf "%s\n\n" "Usage: make [task]"; \
	printf "%-20s %s\n" "task" "help" ; \
	printf "%-20s %s\n" "------" "----" ; \
	for help_line in $${help_lines[@]}; do \
		IFS=$$':' ; \
		help_split=($$help_line) ; \
		help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		printf '\033[36m'; \
		printf "%-20s %s" $$help_command ; \
		printf '\033[0m'; \
		printf "%s\n" $$help_info; \
	done

39
cfs/Entropy.py Normal file
View File

@@ -0,0 +1,39 @@
##Entropy
def entropy(Y):
    """Return the Shannon entropy (base 2) of the label vector *Y*.

    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    # Count occurrences of each distinct value (rows, if Y is 2-D).
    _, counts = np.unique(Y, return_counts=True, axis=0)
    probabilities = counts / len(Y)
    return -np.sum(probabilities * np.log2(probabilities))
# Joint Entropy
def jEntropy(Y, X):
    """Return the joint entropy H(Y;X) of two variables.

    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    # Pair the two variables column-wise so each row is one joint outcome.
    joint = np.c_[Y, X]
    return entropy(joint)
# Conditional Entropy
def cEntropy(Y, X):
    """Return the conditional entropy H(Y|X) = H(Y;X) - H(X).

    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    return jEntropy(Y, X) - entropy(X)
# Information Gain
def gain(Y, X):
    """Return the information gain I(Y;X) = H(Y) - H(Y|X).

    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    return entropy(Y) - cEntropy(Y, X)

cfs/Selection.py View File

@@ -1,3 +1,86 @@
from math import log
import numpy as np
class Metrics:
    """Information-theoretic metrics (entropy, conditional entropy,
    information gain and symmetrical uncertainty) used for
    correlation-based feature selection."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X) = H(X, Y) - H(X)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            conditional entropy of y given x
        """
        # Each row of xy is one joint (x, y) outcome.
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y
        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            entropy of y (0.0 for an empty input)
        """
        # Guard: the original formula divides by len(y) and logs an empty
        # slice, producing nan + RuntimeWarning; entropy of nothing is 0.
        if len(y) == 0:
            return 0.0
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        # log(p)/log(base) == log_base(p); 1/p flips the sign.
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Correlation measure SU(X, Y) = 2 * I(X;Y) / (H(X) + H(Y)),
        a normalized information gain in [0, 1] (base-2 logarithms)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        Returns
        -------
        float
            symmetrical uncertainty of x and y
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )
class CFS:
    def __init__(self, a):
        self.a = a

90
cfs/tests/Metrics_test.py Normal file
View File

@@ -0,0 +1,90 @@
import unittest
from sklearn.datasets import load_iris
from mdlp import MDLP
import numpy as np
from ..Selection import Metrics
class Metrics_test(unittest.TestCase):
    """Unit tests for the Metrics static methods, using the iris dataset
    discretized with MDLP as a shared fixture."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        discretizer = MDLP(random_state=1)
        features, self.y = load_iris(return_X_y=True)
        self.X = discretizer.fit_transform(features, self.y).astype("int64")
        self.m, self.n = self.X.shape

    def test_entropy(self):
        metrics = Metrics()
        cases = [
            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
            ([0, 1, 0, 2, 1, 2], 3, 1.0),
            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
            ([1, 1, 5], 2, 0.9182958340544896),
            (self.y, 3, 0.999999999),
        ]
        for sample, base, expected in cases:
            self.assertAlmostEqual(expected, metrics.entropy(sample, base))

    def test_conditional_entropy(self):
        metrics = Metrics()
        per_feature = [
            0.490953458537736,
            0.7110077966379169,
            0.15663362014829718,
            0.13032469395094992,
        ]
        for col, expected in enumerate(per_feature):
            self.assertAlmostEqual(
                expected,
                metrics.conditional_entropy(self.X[:, col], self.y, 3),
            )
        self.assertAlmostEqual(
            0.6309297535714573,
            metrics.conditional_entropy(
                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
            ),
        )
        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
        self.assertAlmostEqual(
            0.5509775004326938,
            metrics.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
        )

    def test_information_gain(self):
        metrics = Metrics()
        per_feature_base3 = [
            0.5090465414622638,
            0.28899220336208287,
            0.8433663798517026,
            0.8696753060490499,
        ]
        for col, expected in enumerate(per_feature_base3):
            self.assertAlmostEqual(
                expected, metrics.information_gain(self.X[:, col], self.y, 3)
            )
        # https://planetcalc.com/8419/
        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
        per_feature_base2 = [
            0.806819679,
            0.458041805,
            1.336704086,
            1.378402748,
        ]
        for col, expected in enumerate(per_feature_base2):
            self.assertAlmostEqual(
                expected, metrics.information_gain(self.X[:, col], self.y, 2)
            )

    def test_symmetrical_uncertainty(self):
        metrics = Metrics()
        per_feature = [
            0.33296547388990266,
            0.19068147573570668,
            0.810724587460511,
            0.870521418179061,
        ]
        for col, expected in enumerate(per_feature):
            self.assertAlmostEqual(
                expected,
                metrics.symmetrical_uncertainty(self.X[:, col], self.y),
            )

cfs/tests/__init__.py View File

@@ -1,3 +1,4 @@
from .CFS_test import CFS_test
from .Metrics_test import Metrics_test

__all__ = ["CFS_test", "Metrics_test"]