mirror of https://github.com/Doctorado-ML/mufs.git (synced 2025-08-15 15:45:53 +00:00)
Implement Metric methods and tests

.gitignore (vendored, 3 lines changed)
@@ -128,4 +128,5 @@ dmypy.json
 # Pyre type checker
 .pyre/

 .vscode
+junk/*

Makefile (new file, 49 lines)
@@ -0,0 +1,49 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage deps help lint push test doc build doc-clean

coverage: ## Run tests with coverage
	coverage erase
	coverage run -m unittest -v cfs.tests
	coverage report -m

deps: ## Install dependencies
	pip install -r requirements.txt

lint: ## Lint and static-check
	black cfs
	flake8 cfs
	mypy cfs

push: ## Push code with tags
	git push && git push --tags

test: ## Run tests
	python -m unittest -v cfs.tests

doc: ## Update documentation
	make -C docs --makefile=Makefile html

build: ## Build package
	rm -fr dist/*
	python setup.py sdist bdist_wheel

doc-clean: ## Clean documentation build
	make -C docs --makefile=Makefile clean

help: ## Show help message
	@IFS=$$'\n' ; \
	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
	printf "%s\n\n" "Usage: make [task]"; \
	printf "%-20s %s\n" "task" "help" ; \
	printf "%-20s %s\n" "------" "----" ; \
	for help_line in $${help_lines[@]}; do \
		IFS=$$':' ; \
		help_split=($$help_line) ; \
		help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		printf '\033[36m'; \
		printf "%-20s %s" $$help_command ; \
		printf '\033[0m'; \
		printf "%s\n" $$help_info; \
	done

cfs/Entropy.py (new file, 39 lines)
@@ -0,0 +1,39 @@
import numpy as np


# Entropy
def entropy(Y):
    """
    Also known as Shannon Entropy
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    unique, count = np.unique(Y, return_counts=True, axis=0)
    prob = count / len(Y)
    en = -np.sum(prob * np.log2(prob))
    return en


# Joint Entropy
def jEntropy(Y, X):
    """
    H(Y;X)
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    YX = np.c_[Y, X]
    return entropy(YX)


# Conditional Entropy
def cEntropy(Y, X):
    """
    conditional entropy = Joint Entropy - Entropy of X
    H(Y|X) = H(Y;X) - H(X)
    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    return jEntropy(Y, X) - entropy(X)


# Information Gain
def gain(Y, X):
    """
    Information Gain, I(Y;X) = H(Y) - H(Y|X)
    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    return entropy(Y) - cEntropy(Y, X)
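
As a quick sanity check of these helpers (a minimal sketch, not part of the commit; the import path assumes the package root is on sys.path):

import numpy as np
from cfs.Entropy import entropy, cEntropy, gain

Y = np.array([0, 0, 1, 1])
X = np.array([0, 1, 0, 1])

print(entropy(Y))      # 1.0 bit: two equally likely labels
print(cEntropy(Y, X))  # 1.0: X is independent of Y, so knowing X does not help
print(gain(Y, X))      # 0.0 = H(Y) - H(Y|X), no information gained
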
cfs/Selection.py (modified)
@@ -1,3 +1,86 @@
+from math import log
+import numpy as np
+
+
+class Metrics:
+    @staticmethod
+    def conditional_entropy(x, y, base=2):
+        """quantifies the amount of information needed to describe the
+        outcome of y given that the value of x is known; computes H(Y|X)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            conditional entropy of y given x
+        """
+        xy = np.c_[x, y]
+        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
+
+    @staticmethod
+    def entropy(y, base=2):
+        """measure of the uncertainty in predicting the value of y
+
+        Parameters
+        ----------
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            entropy of y
+        """
+        _, count = np.unique(y, return_counts=True, axis=0)
+        proba = count.astype(float) / len(y)
+        proba = proba[proba > 0.0]
+        return np.sum(proba * np.log(1.0 / proba)) / log(base)
+
+    @staticmethod
+    def information_gain(x, y, base=2):
+        """Measures the reduction in uncertainty about the value of y when
+        the value of x is known (also called mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            information gained
+        """
+        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
+            x, y, base
+        )
+
+    @staticmethod
+    def symmetrical_uncertainty(x, y):
+        """Computes 2 * I(X;Y) / (H(X) + H(Y)), a symmetric, normalized
+        variant of information gain that lies in [0, 1]"""
+        return (
+            2.0
+            * Metrics.information_gain(x, y)
+            / (Metrics.entropy(x) + Metrics.entropy(y))
+        )
+
+
 class CFS:
     def __init__(self, a):
         self.a = a

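A short illustration of how these metrics behave at the extremes (a sketch, not part of the commit; the module path cfs.Selection is inferred from the test imports below):

import numpy as np
from cfs.Selection import Metrics

y = np.array([0, 0, 1, 1])

# A perfectly correlated feature: H(y|x) = 0, so both scores reach their maximum
x = np.array([0, 0, 1, 1])
print(Metrics.information_gain(x, y))         # 1.0
print(Metrics.symmetrical_uncertainty(x, y))  # 1.0

# An independent feature: H(y|x) = H(y), so both scores drop to 0
x = np.array([0, 1, 0, 1])
print(Metrics.symmetrical_uncertainty(x, y))  # 0.0
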
cfs/tests/Metrics_test.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import unittest
from sklearn.datasets import load_iris
from mdlp import MDLP

from ..Selection import Metrics


class Metrics_test(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
        X, self.y = load_iris(return_X_y=True)
        self.X = mdlp.fit_transform(X, self.y).astype("int64")
        self.m, self.n = self.X.shape

    def test_entropy(self):
        metric = Metrics()
        datasets = [
            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
            ([0, 1, 0, 2, 1, 2], 3, 1.0),
            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
            ([1, 1, 5], 2, 0.9182958340544896),
            (self.y, 3, 0.999999999),
        ]
        for dataset, base, entropy in datasets:
            computed = metric.entropy(dataset, base)
            self.assertAlmostEqual(entropy, computed)

    def test_conditional_entropy(self):
        metric = Metrics()
        results_expected = [
            0.490953458537736,
            0.7110077966379169,
            0.15663362014829718,
            0.13032469395094992,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.conditional_entropy(self.X[:, col], self.y, 3)
            self.assertAlmostEqual(expected, computed)
        self.assertAlmostEqual(
            0.6309297535714573,
            metric.conditional_entropy(
                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
            ),
        )
        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
        self.assertAlmostEqual(
            0.5509775004326938,
            metric.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
        )

    def test_information_gain(self):
        metric = Metrics()
        results_expected = [
            0.5090465414622638,
            0.28899220336208287,
            0.8433663798517026,
            0.8696753060490499,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.information_gain(self.X[:, col], self.y, 3)
            self.assertAlmostEqual(expected, computed)
        # https://planetcalc.com/8419/
        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
        results_expected = [
            0.806819679,
            0.458041805,
            1.336704086,
            1.378402748,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.information_gain(self.X[:, col], self.y, 2)
            self.assertAlmostEqual(expected, computed)

    def test_symmetrical_uncertainty(self):
        metric = Metrics()
        results_expected = [
            0.33296547388990266,
            0.19068147573570668,
            0.810724587460511,
            0.870521418179061,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.symmetrical_uncertainty(self.X[:, col], self.y)
            self.assertAlmostEqual(expected, computed)
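
The expected values for the small fixtures can be verified by hand. For [1, 1, 5] the label probabilities are 2/3 and 1/3, so

    H = -(2/3) * log2(2/3) - (1/3) * log2(1/3) ≈ 0.9182958340544896

which matches the entry in test_entropy. The whole suite runs with make test (or python -m unittest -v cfs.tests, as wired in the Makefile above).
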
cfs/tests/__init__.py (modified)
@@ -1,3 +1,4 @@
 from .CFS_test import CFS_test
+from .Metrics_test import Metrics_test

-__all__ = ["CFS_test"]
+__all__ = ["CFS_test", "Metrics_test"]