mirror of https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 08:25:51 +00:00
143 lines · 4.8 KiB · Python
import numpy as np
|
|
from math import log
|
|
from types import SimpleNamespace
|
|
|
|
|
|
class PyFImdlp:
    """Discretize a single continuous feature with the Fayyad & Irani
    MDLP (Minimum Description Length Principle) algorithm.

    Cut points are found recursively: within a range of the samples
    (taken in sort order of X), the boundary with the lowest combined
    entropy is proposed, and it is kept only if its information gain
    passes the MDLP stopping criterion; the search then recurses on
    both sub-ranges.
    """

    def __init__(self, proposal=True):
        # NOTE(review): `proposal` is stored but never read in this class —
        # presumably it selects an alternative candidate strategy elsewhere;
        # confirm against callers before removing.
        self.proposal = proposal
        # NOTE(review): despite the name, fit() stores len(X) (number of
        # samples) here; kept as-is for backward compatibility.
        self.n_features_ = None
        self.X_ = None          # feature values, as passed to fit()
        self.y_ = None          # class labels, as passed to fit()
        self.features_ = None
        self.cut_points_ = []   # accepted cuts (SimpleNamespace: index, value)
        # Memoization keyed on sample ranges; valid only for one dataset,
        # so fit() resets both caches.
        self.entropy_cache = {}             # (start, end) -> entropy
        self.information_gain_cache = {}    # (start, cut, end) -> gain

    def fit(self, X, y):
        """Compute the MDLP cut points of X with respect to labels y.

        Parameters
        ----------
        X : 1-d numpy array of continuous feature values.
        y : 1-d numpy array of non-negative integer class labels
            (np.bincount is applied to it in entropy()).

        Returns
        -------
        self
        """
        self.n_features_ = len(X)
        # Fix: reset per-fit state. Previously, refitting the same estimator
        # accumulated cut points across calls and reused entropy values
        # cached from the *previous* dataset (caches are keyed only on
        # index ranges, not on the data).
        self.cut_points_ = []
        self.entropy_cache = {}
        self.information_gain_cache = {}
        self.indices_ = np.argsort(X)
        # Work through the sort permutation rather than materializing
        # sorted copies of X and y.
        self.use_indices = True
        self.X_ = X[self.indices_] if not self.use_indices else X
        self.y_ = y[self.indices_] if not self.use_indices else y
        self.compute_cut_points(0, len(y))
        return self

    def get_cut_points(self):
        """Return the accepted cut values, deduplicated and sorted."""
        return sorted({cut.value for cut in self.cut_points_})

    def compute_cut_points(self, start, end):
        """Recursively search [start, end) for MDLP-accepted cut points."""
        cut = self.get_candidate(start, end)
        if cut.value is None:
            return
        if self.mdlp(cut, start, end):
            # Fix: removed leftover debug print ("¡Ding!") that polluted
            # stdout on every accepted cut.
            self.cut_points_.append(cut)
            self.compute_cut_points(start, cut.index)
            self.compute_cut_points(cut.index, end)

    def mdlp(self, cut, start, end):
        """MDLP stopping criterion (Fayyad & Irani, 1993).

        Accept `cut` iff its information gain exceeds
        (log2(N - 1) + delta) / N, where
        delta = log2(3^k - 2) - (k*ent - k1*ent1 - k2*ent2).
        """
        N = end - start
        k = self.num_classes(start, end)
        k1 = self.num_classes(start, cut.index)
        k2 = self.num_classes(cut.index, end)
        ent = self.entropy(start, end)
        ent1 = self.entropy(start, cut.index)
        ent2 = self.entropy(cut.index, end)
        ig = self.information_gain(start, cut.index, end)
        delta = log(pow(3, k) - 2, 2) - (
            float(k) * ent - float(k1) * ent1 - float(k2) * ent2
        )
        term = 1 / N * (log(N - 1, 2) + delta)
        return ig > term

    def num_classes(self, start, end):
        """Number of distinct labels in the sorted range [start, end)."""
        n_classes = set()
        for i in range(start, end):
            n_classes.add(
                self.y_[self.indices_[i]] if self.use_indices else self.y_[i]
            )
        return len(n_classes)

    def get_candidate(self, start, end):
        """Return the best cutpoint candidate for the given range.

        Parameters
        ----------
        start : int
            Start of the range.
        end : int
            End of the range.

        Returns
        -------
        candidate : SimpleNamespace with attributes index and value
            value == None if no candidate is found.
        """
        candidate = SimpleNamespace()
        candidate.value = None
        min_entropy = float("inf")
        for idx in range(start + 1, end):
            # Only class boundaries (label changes between neighbours in
            # sort order) can be optimal cut points.
            same_label = (
                self.y_[self.indices_[idx]] == self.y_[self.indices_[idx - 1]]
                if self.use_indices
                else self.y_[idx] == self.y_[idx - 1]
            )
            if same_label:
                continue
            # Unweighted sum of the two side entropies; strict < keeps the
            # first (leftmost) boundary on ties.
            entropy_cut = self.entropy(start, idx) + self.entropy(idx, end)
            if entropy_cut < min_entropy:
                min_entropy = entropy_cut
                candidate.index = idx
                # The cut value is the midpoint between the two neighbouring
                # feature values around the boundary.
                if self.use_indices:
                    candidate.value = (
                        self.X_[self.indices_[idx]]
                        + self.X_[self.indices_[idx - 1]]
                    ) / 2
                else:
                    candidate.value = (self.X_[idx] + self.X_[idx - 1]) / 2
        return candidate

    def entropy(self, start, end) -> float:
        """Shannon entropy (base 2) of the labels in [start, end)."""
        n_labels = end - start
        if n_labels <= 1:
            return 0
        if (start, end) in self.entropy_cache:
            return self.entropy_cache[(start, end)]
        if self.use_indices:
            counts = np.bincount(self.y_[self.indices_[start:end]])
        else:
            counts = np.bincount(self.y_[start:end])
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            # Pure range: entropy is 0 (not cached; cheap to recompute).
            return 0
        entropy = 0.0
        # Standard entropy: -sum(p * log2(p)) over non-empty classes.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log(prop, 2)
        self.entropy_cache[(start, end)] = entropy
        return entropy

    def information_gain(self, start, cut, end):
        """Information gain of splitting [start, end) at index `cut`."""
        if (start, cut, end) in self.information_gain_cache:
            return self.information_gain_cache[(start, cut, end)]
        labels = end - start
        if labels == 0:
            return 0.0
        entropy = self.entropy(start, end)
        card_left = cut - start
        entropy_left = self.entropy(start, cut)
        card_right = end - cut
        entropy_right = self.entropy(cut, end)
        # Parent entropy minus the cardinality-weighted child entropies.
        result = (
            entropy
            - (card_left / labels) * entropy_left
            - (card_right / labels) * entropy_right
        )
        self.information_gain_cache[(start, cut, end)] = result
        return result