mirror of https://github.com/Doctorado-ML/mufs.git (synced 2025-08-15 23:55:56 +00:00)

Commit: Implement Metric methods and tests
.gitignore (vendored)
@@ -128,4 +128,5 @@ dmypy.json
 # Pyre type checker
 .pyre/

 .vscode
+junk/*
Makefile (new file)
@@ -0,0 +1,49 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage deps help lint push test doc doc-clean build

coverage: ## Run tests with coverage
	coverage erase
	coverage run -m unittest -v cfs.tests
	coverage report -m

deps: ## Install dependencies
	pip install -r requirements.txt

lint: ## Lint and static-check
	black cfs
	flake8 cfs
	mypy cfs

push: ## Push code with tags
	git push && git push --tags

test: ## Run tests
	python -m unittest -v cfs.tests

doc: ## Update documentation
	make -C docs --makefile=Makefile html

build: ## Build package
	rm -fr dist/*
	python setup.py sdist bdist_wheel

doc-clean: ## Clean documentation build
	make -C docs --makefile=Makefile clean

help: ## Show help message
	@IFS=$$'\n' ; \
	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
	printf "%s\n\n" "Usage: make [task]"; \
	printf "%-20s %s\n" "task" "help" ; \
	printf "%-20s %s\n" "------" "----" ; \
	for help_line in $${help_lines[@]}; do \
		IFS=$$':' ; \
		help_split=($$help_line) ; \
		help_command=`echo $${help_split[0]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		help_info=`echo $${help_split[2]} | sed -e 's/^ *//' -e 's/ *$$//'` ; \
		printf '\033[36m'; \
		printf "%-20s %s" $$help_command ; \
		printf '\033[0m'; \
		printf "%s\n" $$help_info; \
	done
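For reference, the help target is the common self-documenting-Makefile pattern: it greps the Makefile for targets annotated with a trailing "## description" comment and prints them as a two-column table. A minimal Python sketch of the same extraction (a hypothetical helper, not part of this commit):

import re

def print_make_help(path="Makefile"):
    """Print 'target  description' for every '##'-annotated target."""
    print("Usage: make [task]\n")
    print("%-20s %s" % ("task", "help"))
    print("%-20s %s" % ("------", "----"))
    with open(path) as fh:
        for line in fh:
            m = re.match(r"^([\w-]+):.*##\s*(.*)", line)
            if m:
                print("%-20s %s" % m.groups())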
cfs/Entropy.py (new file)
@@ -0,0 +1,39 @@
import numpy as np


# Entropy
def entropy(Y):
    """
    Also known as Shannon Entropy
    Reference: https://en.wikipedia.org/wiki/Entropy_(information_theory)
    """
    unique, count = np.unique(Y, return_counts=True, axis=0)
    prob = count / len(Y)
    en = -np.sum(prob * np.log2(prob))
    return en


# Joint Entropy
def jEntropy(Y, X):
    """
    H(Y;X)
    Reference: https://en.wikipedia.org/wiki/Joint_entropy
    """
    YX = np.c_[Y, X]
    return entropy(YX)


# Conditional Entropy
def cEntropy(Y, X):
    """
    Conditional entropy = Joint Entropy - Entropy of X
    H(Y|X) = H(Y;X) - H(X)
    Reference: https://en.wikipedia.org/wiki/Conditional_entropy
    """
    return jEntropy(Y, X) - entropy(X)


# Information Gain
def gain(Y, X):
    """
    Information Gain, I(Y;X) = H(Y) - H(Y|X)
    Reference: https://en.wikipedia.org/wiki/Information_gain_in_decision_trees#Formal_definition
    """
    return entropy(Y) - cEntropy(Y, X)
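A quick usage sketch for these helpers (assuming the package is importable as cfs.Entropy; the expected values follow from the definitions above, they are not output recorded in the repository):

import numpy as np
from cfs.Entropy import entropy, cEntropy, gain

Y = np.array([0, 0, 1, 1])  # two equally likely labels: H(Y) = 1 bit
X = np.array([0, 0, 1, 1])  # a feature that determines Y exactly
print(entropy(Y))      # 1.0
print(cEntropy(Y, X))  # 0.0: knowing X leaves no uncertainty about Y
print(gain(Y, X))      # 1.0: the full bit is recovered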
cfs/Selection.py
@@ -1,3 +1,86 @@
from math import log
import numpy as np


class Metrics:
    @staticmethod
    def conditional_entropy(x, y, base=2):
        """quantifies the amount of information needed to describe the
        outcome of y given that the value of x is known
        computes H(Y|X)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            conditional entropy of y given x
        """
        xy = np.c_[x, y]
        return Metrics.entropy(xy, base) - Metrics.entropy(x, base)

    @staticmethod
    def entropy(y, base=2):
        """measure of the uncertainty in predicting the value of y

        Parameters
        ----------
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            entropy of y
        """
        _, count = np.unique(y, return_counts=True, axis=0)
        proba = count.astype(float) / len(y)
        proba = proba[proba > 0.0]
        return np.sum(proba * np.log(1.0 / proba)) / log(base)

    @staticmethod
    def information_gain(x, y, base=2):
        """Measures the reduction in uncertainty about the value of y when
        the value of x is known (also called mutual information)
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)

        Parameters
        ----------
        x : np.array
            values of the variable
        y : np.array
            array of labels
        base : int, optional
            base of the logarithm, by default 2

        Returns
        -------
        float
            information gained
        """
        return Metrics.entropy(y, base) - Metrics.conditional_entropy(
            x, y, base
        )

    @staticmethod
    def symmetrical_uncertainty(x, y):
        """information gain normalized to [0, 1]:
        SU(x, y) = 2 * I(x; y) / (H(x) + H(y))
        """
        return (
            2.0
            * Metrics.information_gain(x, y)
            / (Metrics.entropy(x) + Metrics.entropy(y))
        )


class CFS:
    def __init__(self, a):
        self.a = a
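A minimal usage sketch of the Metrics API (assuming the module is cfs/Selection.py, as the test import below indicates; the expected values follow from the definitions, not from recorded output):

import numpy as np
from cfs.Selection import Metrics

x = np.array([1, 1, 2, 2])
y = np.array([0, 0, 1, 1])  # y is a relabeling of x
print(Metrics.entropy(y))                     # 1.0 (one bit)
print(Metrics.conditional_entropy(x, y))      # 0.0: x determines y
print(Metrics.information_gain(x, y))         # 1.0
print(Metrics.symmetrical_uncertainty(x, y))  # 1.0: maximal association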
cfs/tests/Metrics_test.py (new file)
@@ -0,0 +1,90 @@
import unittest

from sklearn.datasets import load_iris
from mdlp import MDLP
import numpy as np

from ..Selection import Metrics


class Metrics_test(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        mdlp = MDLP(random_state=1)
        X, self.y = load_iris(return_X_y=True)
        self.X = mdlp.fit_transform(X, self.y).astype("int64")
        self.m, self.n = self.X.shape

    # @classmethod
    # def setup(cls):

    def test_entropy(self):
        metric = Metrics()
        datasets = [
            ([0, 0, 0, 0, 1, 1, 1, 1], 2, 1.0),
            ([0, 1, 0, 2, 1, 2], 3, 1.0),
            ([0, 0, 0, 0, 0, 0, 0, 2, 2, 2], 2, 0.8812908992306927),
            ([1, 1, 1, 5, 2, 2, 3, 3, 3], 4, 0.9455305560363263),
            ([1, 1, 1, 2, 2, 3, 3, 3, 5], 4, 0.9455305560363263),
            ([1, 1, 5], 2, 0.9182958340544896),
            (self.y, 3, 0.999999999),
        ]
        for dataset, base, entropy in datasets:
            computed = metric.entropy(dataset, base)
            self.assertAlmostEqual(entropy, computed)

    def test_conditional_entropy(self):
        metric = Metrics()
        results_expected = [
            0.490953458537736,
            0.7110077966379169,
            0.15663362014829718,
            0.13032469395094992,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.conditional_entropy(self.X[:, col], self.y, 3)
            self.assertAlmostEqual(expected, computed)
        self.assertAlmostEqual(
            0.6309297535714573,
            metric.conditional_entropy(
                [1, 3, 2, 3, 2, 1], [1, 2, 0, 1, 1, 2], 3
            ),
        )
        # https://planetcalc.com/8414/?joint=0.4%200%0A0.2%200.4&showDetails=1
        self.assertAlmostEqual(
            0.5509775004326938,
            metric.conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2),
        )

    def test_information_gain(self):
        metric = Metrics()
        results_expected = [
            0.5090465414622638,
            0.28899220336208287,
            0.8433663798517026,
            0.8696753060490499,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.information_gain(self.X[:, col], self.y, 3)
            self.assertAlmostEqual(expected, computed)
        # https://planetcalc.com/8419/
        # ?_d=FrDfFN2COAhqh9Pb5ycqy5CeKgIOxlfSjKgyyIR.Q5L0np-g-hw6yv8M1Q8_
        results_expected = [
            0.806819679,
            0.458041805,
            1.336704086,
            1.378402748,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.information_gain(self.X[:, col], self.y, 2)
            self.assertAlmostEqual(expected, computed)

    def test_symmetrical_uncertainty(self):
        metric = Metrics()
        results_expected = [
            0.33296547388990266,
            0.19068147573570668,
            0.810724587460511,
            0.870521418179061,
        ]
        for expected, col in zip(results_expected, range(self.n)):
            computed = metric.symmetrical_uncertainty(self.X[:, col], self.y)
            self.assertAlmostEqual(expected, computed)
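The planetcalc expectation for conditional_entropy([1, 1, 2, 2, 2], [0, 0, 0, 2, 2], 2) can be checked by hand: H(x, y) ≈ 1.521928 and H(x) ≈ 0.970951, so H(y|x) = H(x, y) - H(x) ≈ 0.550978. A self-contained sketch of the check (the helper H below is hypothetical, written only for this verification):

import numpy as np

def H(values, base=2):
    # Empirical entropy; rows count as symbols for 2-D input.
    _, count = np.unique(values, return_counts=True, axis=0)
    p = count / len(values)
    return -np.sum(p * np.log(p)) / np.log(base)

x = np.array([1, 1, 2, 2, 2])
y = np.array([0, 0, 0, 2, 2])
print(H(np.c_[x, y]) - H(x))  # ~0.5509775004326938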
cfs/tests/__init__.py
@@ -1,3 +1,4 @@
 from .CFS_test import CFS_test
+from .Metrics_test import Metrics_test

-__all__ = ["CFS_test"]
+__all__ = ["CFS_test", "Metrics_test"]