diff --git a/.gitignore b/.gitignore
index c42b4c7..b2e7667 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,4 +128,5 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
-.vscode
\ No newline at end of file
+.vscode
+junk/*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9af4987
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,49 @@
+SHELL := /bin/bash
+.DEFAULT_GOAL := help
+.PHONY: coverage deps help lint push test doc doc-clean build
+
+coverage: ## Run tests with coverage
+	coverage erase
+	coverage run -m unittest -v cfs.tests
+	coverage report -m
+
+deps: ## Install dependencies
+	pip install -r requirements.txt
+
+lint: ## Format, lint and type-check
+	black cfs
+	flake8 cfs
+	mypy cfs
+
+push: ## Push code with tags
+	git push && git push --tags
+
+test: ## Run tests
+	python -m unittest -v cfs.tests
+
+doc: ## Update documentation
+	make -C docs --makefile=Makefile html
+
+build: ## Build package
+	rm -fr dist/*
+	python setup.py sdist bdist_wheel
+
+doc-clean: ## Clean documentation
+	make -C docs --makefile=Makefile clean
+
+help: ## Show help message
+	@IFS=$$'\n' ; \
+	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
+	printf "%s\n\n" "Usage: make [task]"; \
+	printf "%-20s %s\n" "task" "help" ; \
+	printf "%-20s %s\n" "------" "----" ; \
+	for help_line in $${help_lines[@]}; do \
+	IFS=$$':' ; \
+	help_split=($$help_line) ; \
+	help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
+	help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
+	printf '\033[36m'; \
+	printf "%-20s %s" $$help_command ; \
+	printf '\033[0m'; \
+	printf "%s\n" $$help_info; \
+	done
diff --git a/cfs/Entropy.py b/cfs/Entropy.py
new file mode 100644
index 0000000..3f51471
--- /dev/null
+++ b/cfs/Entropy.py
@@ -0,0 +1,42 @@
+import numpy as np
+
+
+# Entropy
+def entropy(Y):
+    """
+    Also known as Shannon entropy
+    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
+    """
+    unique, count = np.unique(Y, return_counts=True, axis=0)
+    prob = count / len(Y)
+    en = -np.sum(prob * np.log2(prob))
+    return en
+
+
+# Joint Entropy
+def jEntropy(Y, X):
+    """
+    H(Y,X)
+    Reference: https://en.wikipedia.org/wiki/Joint_entropy
+    """
+    YX = np.c_[Y, X]
+    return entropy(YX)
+
+
+# Conditional Entropy
+def cEntropy(Y, X):
+    """
+    conditional entropy = Joint Entropy - Entropy of X
+    H(Y|X) = H(Y,X) - H(X)
+    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
+    """
+    return jEntropy(Y, X) - entropy(X)
+
+
+# Information Gain
+def gain(Y, X):
+    """
+    Information Gain, I(Y;X) = H(Y) - H(Y|X)
+    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
+    """
+    return entropy(Y) - cEntropy(Y, X)
diff --git a/cfs/Selection.py b/cfs/Selection.py
index 7402c62..3d655be 100644
--- a/cfs/Selection.py
+++ b/cfs/Selection.py
@@ -1,3 +1,89 @@
+from math import log
+import numpy as np
+
+
+class Metrics:
+    @staticmethod
+    def conditional_entropy(x, y, base=2):
+        """quantifies the amount of information needed to describe the outcome
+        of y given that the value of x is known
+        computes H(Y|X)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            conditional entropy of y given x
+        """
+        xy = np.c_[x, y]
+        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)
+
+    @staticmethod
+    def entropy(y, base=2):
+        """measure of the uncertainty in predicting the value of y
+
+        Parameters
+        ----------
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            entropy of y
+        """
+        _, count = np.unique(y, return_counts=True, axis=0)
+        proba = count.astype(float) / len(y)
+        proba = proba[proba > 0.0]
+        return np.sum(proba * np.log(1.0 / proba)) / log(base)
+
+    @staticmethod
+    def information_gain(x, y, base=2):
+        """Measures the reduction in uncertainty about the value of y when the
+        value of x is known (also called mutual information)
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+
+        Parameters
+        ----------
+        x : np.array
+            values of the variable
+        y : np.array
+            array of labels
+        base : int, optional
+            base of the logarithm, by default 2
+
+        Returns
+        -------
+        float
+            Information gained
+        """
+        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
+            x, y, base
+        )
+
+    @staticmethod
+    def symmetrical_uncertainty(x, y):
+        """Symmetrical uncertainty, 2 * I(X;Y) / (H(X) + H(Y))
+
+        Normalized information gain in the range [0, 1]
+        """
+        return (
+            2.0
+            * Metrics.information_gain(x, y)
+            / (Metrics.entropy(x) + Metrics.entropy(y))
+        )
+
+
 class CFS:
     def __init__(self, a):
         self.a = a
diff --git a/cfs/tests/Metrics_test.py b/cfs/tests/Metrics_test.py
new file mode 100644
index 0000000..be148c2
--- /dev/null
+++ b/cfs/tests/Metrics_test.py
@@ -0,0 +1,90 @@
+import unittest
+from sklearn.datasets import load_iris
+from mdlp import MDLP
+import numpy as np
+from ..Selection import Metrics
+
+
+class Metrics_test(unittest.TestCase):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        mdlp = MDLP(random_state=1)
+        X, self.y = load_iris(return_X_y=True)
+        self.X = mdlp.fit_transform(X, self.y).astype("int64")
+        self.m, self.n = self.X.shape
+
+    # @classmethod
+    # def setup(cls):
+
+    def test_entropy(self):
+        metric = Metrics()
+        datasets = [
+            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
+            ([0, 1, 0, 2, 1, 2], 3, 1.0),
+            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
+            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
+            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
+            ([1, 1, 5], 2, 0.9182958340544896),
+            (self.y, 3, 0.999999999),
+        ]
+        for dataset, base, entropy in datasets:
+            computed = metric.entropy(dataset, base)
+            self.assertAlmostEqual(entropy, computed)
+
+    def test_conditional_entropy(self):
+        metric = Metrics()
+        results_expected = [
+            0.490953458537736,
+            0.7110077966379169,
+            0.15663362014829718,
+            0.13032469395094992,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            computed = metric.conditional_entropy(self.X[:, col], self.y, 3)
+            self.assertAlmostEqual(expected, computed)
+        self.assertAlmostEqual(
+            0.6309297535714573,
+            metric.conditional_entropy(
+                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
+            ),
+        )
+        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
+        self.assertAlmostEqual(
+            0.5509775004326938,
+            metric.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
+        )
+
+    def test_information_gain(self):
+        metric = Metrics()
+        results_expected = [
+            0.5090465414622638,
+            0.28899220336208287,
+            0.8433663798517026,
+            0.8696753060490499,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            computed = metric.information_gain(self.X[:, col], self.y, 3)
+            self.assertAlmostEqual(expected, computed)
+        # https://planetcalc.com/8419/
+        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
+        results_expected = [
+            0.806819679,
+            0.458041805,
+            1.336704086,
+            1.378402748,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            computed = metric.information_gain(self.X[:, col], self.y, 2)
+            self.assertAlmostEqual(expected, computed)
+
+    def test_symmetrical_uncertainty(self):
+        metric = Metrics()
+        results_expected = [
+            0.33296547388990266,
+            0.19068147573570668,
+            0.810724587460511,
+            0.870521418179061,
+        ]
+        for expected, col in zip(results_expected, range(self.n)):
+            computed = metric.symmetrical_uncertainty(self.X[:, col], self.y)
+            self.assertAlmostEqual(expected, computed)
diff --git a/cfs/tests/__init__.py b/cfs/tests/__init__.py
index 9cbd6eb..f1f5c1b 100644
--- a/cfs/tests/__init__.py
+++ b/cfs/tests/__init__.py
@@ -1,3 +1,4 @@
 from .CFS_test import CFS_test
+from .Metrics_test import Metrics_test
 
-__all__ = ["CFS_test"]
+__all__ = ["CFS_test", "Metrics_test"]
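Usage sketch (not part of the diff above): the snippet below shows one way the new Metrics helpers in cfs/Selection.py could be used to rank the iris features by symmetrical uncertainty, mirroring the MDLP discretization used in cfs/tests/Metrics_test.py. Only Metrics with its static methods, mdlp.MDLP, and sklearn's load_iris come from the diff; the script name, the ranking loop, and the printed output format are illustrative assumptions, and it presumes the cfs package and the mdlp dependency are importable.

# rank_features.py -- illustrative sketch, not included in this PR
from sklearn.datasets import load_iris
from mdlp import MDLP

from cfs.Selection import Metrics

# Discretize the continuous iris features, as the tests do
# (MDLP is supervised, so it needs the labels).
X, y = load_iris(return_X_y=True)
X = MDLP(random_state=1).fit_transform(X, y).astype("int64")

# Score each feature against the class labels with symmetrical uncertainty,
# SU = 2 * I(X;Y) / (H(X) + H(Y)), then list the features from best to worst.
scores = [
    (col, Metrics.symmetrical_uncertainty(X[:, col], y))
    for col in range(X.shape[1])
]
for col, su in sorted(scores, key=lambda item: item[1], reverse=True):
    print(f"feature {col}: SU = {su:.4f}")

Given the expected values in test_symmetrical_uncertainty, this ordering would put the two petal features (columns 2 and 3) first.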