Implement Metric methods and tests

This commit is contained in:
2021-05-23 20:32:33 +02:00
parent ba73a7f5c0
commit a19f2cc12a
6 changed files with 265 additions and 2 deletions

3
.gitignore vendored
View File

@@ -128,4 +128,5 @@ dmypy.json
# Pyre type checker
.pyre/
.vscode
junk/*

49
Makefile Normal file
View File

@@ -0,0 +1,49 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
# doc-clean was missing from .PHONY: a file named "doc-clean" would have
# silently shadowed the target.
.PHONY: coverage deps help lint push test doc build doc-clean

coverage: ## Run tests with coverage
	coverage erase
	coverage run -m unittest -v cfs.tests
	coverage report -m

deps: ## Install dependencies
	pip install -r requirements.txt

lint: ## Lint and static-check
	black cfs
	flake8 cfs
	mypy cfs

push: ## Push code with tags
	git push && git push --tags

test: ## Run tests
	python -m unittest -v cfs.tests

doc: ## Update documentation
	$(MAKE) -C docs --makefile=Makefile html

build: ## Build package
	rm -fr dist/*
	python setup.py sdist bdist_wheel

doc-clean: ## Clean generated documentation
	$(MAKE) -C docs --makefile=Makefile clean

help: ## Show help message
	@IFS=$$'\n' ; \
	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
	printf "%s\n\n" "Usage: make [task]"; \
	printf "%-20s %s\n" "task" "help" ; \
	printf "%-20s %s\n" "------" "----" ; \
	for help_line in $${help_lines[@]}; do \
		IFS=$$':' ; \
		help_split=($$help_line) ; \
		help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		printf '\033[36m'; \
		printf "%-20s %s" $$help_command ; \
		printf '\033[0m'; \
		printf "%s\n" $$help_info; \
	done

39
cfs/Entropy.py Normal file
View File

@@ -0,0 +1,39 @@
##Entropy
def entropy(Y):
    """Return the Shannon entropy (base 2) of the label vector *Y*.

    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    # Count occurrences of each distinct value (rows, if Y is 2-D).
    _, counts = np.unique(Y, return_counts=True, axis=0)
    probabilities = counts / len(Y)
    return -np.sum(probabilities * np.log2(probabilities))
# Joint Entropy
def jEntropy(Y, X):
    """Return the joint entropy H(Y;X) of two variables.

    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    # Pair the two variables column-wise so each row is one joint outcome.
    joint = np.c_[Y, X]
    return entropy(joint)
# Conditional Entropy
def cEntropy(Y, X):
    """Return the conditional entropy H(Y|X) = H(Y;X) - H(X).

    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    return jEntropy(Y, X) - entropy(X)
# Information Gain
def gain(Y, X):
    """Return the information gain I(Y;X) = H(Y) - H(Y|X).

    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    return entropy(Y) - cEntropy(Y, X)

cfs/Selection.py View File

@@ -1,3 +1,86 @@
from math import log
import numpy as np
class Metrics:
    """Information-theoretic metrics (entropy, conditional entropy,
    information gain and symmetrical uncertainty) used for
    correlation-based feature selection."""

    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the outcome
        of Y given that the value of X is known
        computes H(Y|X) = H(X, Y) - H(X)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            conditional entropy of y given x
        """
        # Each row of xy is one joint (x, y) outcome.
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y
        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            entropy of y (0.0 for an empty input)
        """
        # Guard: the original formula divides by len(y) and logs an empty
        # slice, producing nan + RuntimeWarning; entropy of nothing is 0.
        if len(y) == 0:
            return 0.0
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        # log(p)/log(base) == log_base(p); 1/p flips the sign.
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when the
        value of X is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2
        Returns
        -------
        float
            Information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """Correlation measure SU(X, Y) = 2 * I(X;Y) / (H(X) + H(Y)),
        a normalized information gain in [0, 1] (base-2 logarithms)
        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        Returns
        -------
        float
            symmetrical uncertainty of x and y
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )
class CFS:
    def __init__(self, a):
        self.a = a

90
cfs/tests/Metrics_test.py Normal file
View File

@@ -0,0 +1,90 @@
import unittest
from sklearn.datasets import load_iris
from mdlp import MDLP
import numpy as np
from ..Selection import Metrics
class Metrics_test(unittest.TestCase):
    """Unit tests for the Metrics static methods, using the iris dataset
    discretized with MDLP as a shared fixture."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        discretizer = MDLP(random_state=1)
        features, self.y = load_iris(return_X_y=True)
        self.X = discretizer.fit_transform(features, self.y).astype("int64")
        self.m, self.n = self.X.shape

    def test_entropy(self):
        metrics = Metrics()
        cases = [
            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
            ([0, 1, 0, 2, 1, 2], 3, 1.0),
            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
            ([1, 1, 5], 2, 0.9182958340544896),
            (self.y, 3, 0.999999999),
        ]
        for sample, base, expected in cases:
            self.assertAlmostEqual(expected, metrics.entropy(sample, base))

    def test_conditional_entropy(self):
        metrics = Metrics()
        per_feature = [
            0.490953458537736,
            0.7110077966379169,
            0.15663362014829718,
            0.13032469395094992,
        ]
        for col, expected in enumerate(per_feature):
            self.assertAlmostEqual(
                expected,
                metrics.conditional_entropy(self.X[:, col], self.y, 3),
            )
        self.assertAlmostEqual(
            0.6309297535714573,
            metrics.conditional_entropy(
                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
            ),
        )
        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
        self.assertAlmostEqual(
            0.5509775004326938,
            metrics.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
        )

    def test_information_gain(self):
        metrics = Metrics()
        per_feature_base3 = [
            0.5090465414622638,
            0.28899220336208287,
            0.8433663798517026,
            0.8696753060490499,
        ]
        for col, expected in enumerate(per_feature_base3):
            self.assertAlmostEqual(
                expected, metrics.information_gain(self.X[:, col], self.y, 3)
            )
        # https://planetcalc.com/8419/
        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
        per_feature_base2 = [
            0.806819679,
            0.458041805,
            1.336704086,
            1.378402748,
        ]
        for col, expected in enumerate(per_feature_base2):
            self.assertAlmostEqual(
                expected, metrics.information_gain(self.X[:, col], self.y, 2)
            )

    def test_symmetrical_uncertainty(self):
        metrics = Metrics()
        per_feature = [
            0.33296547388990266,
            0.19068147573570668,
            0.810724587460511,
            0.870521418179061,
        ]
        for col, expected in enumerate(per_feature):
            self.assertAlmostEqual(
                expected,
                metrics.symmetrical_uncertainty(self.X[:, col], self.y),
            )

cfs/tests/__init__.py View File

@@ -1,3 +1,4 @@
from .CFS_test import CFS_test
from .Metrics_test import Metrics_test

__all__ = ["CFS_test", "Metrics_test"]