mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 00:15:52 +00:00
287 lines
12 KiB
C++
287 lines
12 KiB
C++
#include "CPPFImdlp.h"
|
|
#include <numeric>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include "Metrics.h"
|
|
|
|
namespace mdlp {
|
|
ostream& operator << (ostream& os, const cutPoint_t& cut)
|
|
{
|
|
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
|
|
") - (" << cut.fromValue << ", " << cut.toValue << ") "
|
|
<< endl;
|
|
return os;
|
|
|
|
}
|
|
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false)
|
|
{
|
|
divider = pow(10, precision);
|
|
numClasses = 0;
|
|
}
|
|
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug)
|
|
{
|
|
divider = pow(10, precision);
|
|
numClasses = 0;
|
|
}
|
|
CPPFImdlp::~CPPFImdlp()
|
|
= default;
|
|
samples CPPFImdlp::getCutPoints()
|
|
{
|
|
samples output(cutPoints.size());
|
|
::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
|
|
[](cutPoint_t cut) { return cut.toValue; });
|
|
return output;
|
|
}
|
|
labels CPPFImdlp::getDiscretizedValues()
|
|
{
|
|
return xDiscretized;
|
|
}
|
|
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
|
|
{
|
|
X = X_;
|
|
y = y_;
|
|
if (X.size() != y.size()) {
|
|
throw invalid_argument("X and y must have the same size");
|
|
}
|
|
if (X.size() == 0 || y.size() == 0) {
|
|
throw invalid_argument("X and y must have at least one element");
|
|
}
|
|
indices = sortIndices(X_);
|
|
xDiscretized = labels(X.size(), -1);
|
|
numClasses = Metrics::numClasses(y, indices, 0, X.size());
|
|
|
|
if (proposal) {
|
|
computeCutPointsProposal();
|
|
} else {
|
|
computeCutPointsOriginal();
|
|
}
|
|
filterCutPoints();
|
|
// Apply cut points to the input vector
|
|
for (auto cut : cutPoints) {
|
|
for (size_t i = cut.start; i < cut.end; i++) {
|
|
xDiscretized[indices[i]] = cut.classNumber;
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
|
|
{
|
|
int k, k1, k2;
|
|
precision_t ig, delta;
|
|
precision_t ent, ent1, ent2;
|
|
auto N = precision_t(rest.end - rest.start);
|
|
if (N < 2) {
|
|
return false;
|
|
}
|
|
k = Metrics::numClasses(y, indices, rest.start, rest.end);
|
|
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
|
|
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
|
|
ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses);
|
|
ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses);
|
|
ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses);
|
|
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
|
|
delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
|
|
precision_t term = 1 / N * (log2(N - 1) + delta);
|
|
if (debug) {
|
|
cout << "Rest: " << rest;
|
|
cout << "Candidate: " << candidate;
|
|
cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
|
|
cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
|
|
}
|
|
return (ig > term);
|
|
}
|
|
void CPPFImdlp::filterCutPoints()
|
|
{
|
|
cutPoints_t filtered;
|
|
cutPoint_t rest, item;
|
|
int classNumber = 0;
|
|
|
|
rest.start = 0;
|
|
rest.end = X.size();
|
|
rest.fromValue = numeric_limits<precision_t>::lowest();
|
|
rest.toValue = numeric_limits<precision_t>::max();
|
|
rest.classNumber = classNumber;
|
|
bool first = true;
|
|
for (size_t index = 0; index < size_t(cutPoints.size()); index++) {
|
|
item = cutPoints[index];
|
|
if (evaluateCutPoint(rest, item)) {
|
|
if (debug)
|
|
cout << "Accepted: " << item << endl;
|
|
//Assign class number to the interval (cutpoint)
|
|
item.classNumber = classNumber++;
|
|
filtered.push_back(item);
|
|
first = false;
|
|
rest.start = item.end;
|
|
} else {
|
|
if (debug)
|
|
cout << "Rejected: " << item << endl;
|
|
if (index != size_t(cutPoints.size()) - 1) {
|
|
// Try to merge the rejected cutpoint with the next one
|
|
if (first) {
|
|
cutPoints[index + 1].fromValue = numeric_limits<precision_t>::lowest();
|
|
cutPoints[index + 1].start = indices[0];
|
|
} else {
|
|
cutPoints[index + 1].fromValue = item.fromValue;
|
|
cutPoints[index + 1].start = item.start;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (!first) {
|
|
filtered.back().toValue = numeric_limits<precision_t>::max();
|
|
filtered.back().end = X.size() - 1;
|
|
} else {
|
|
filtered.push_back(rest);
|
|
}
|
|
cutPoints = filtered;
|
|
}
|
|
void CPPFImdlp::computeCutPointsProposal()
|
|
{
|
|
cutPoints_t cutPts;
|
|
cutPoint_t cutPoint;
|
|
precision_t xPrev, xCur, xPivot;
|
|
int yPrev, yCur, yPivot;
|
|
size_t idx, numElements, start;
|
|
|
|
xCur = xPrev = X[indices[0]];
|
|
yCur = yPrev = y[indices[0]];
|
|
numElements = indices.size() - 1;
|
|
idx = start = 0;
|
|
bool firstCutPoint = true;
|
|
if (debug)
|
|
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
|
|
while (idx < numElements) {
|
|
xPivot = xCur;
|
|
yPivot = yCur;
|
|
if (debug)
|
|
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
// Read the same values and check class changes
|
|
do {
|
|
idx++;
|
|
xCur = X[indices[idx]];
|
|
yCur = y[indices[idx]];
|
|
if (yCur != yPivot && xCur == xPivot) {
|
|
yPivot = -1;
|
|
}
|
|
if (debug)
|
|
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
}
|
|
while (idx < numElements && xCur == xPivot);
|
|
// Check if the class changed and there are more than 1 element
|
|
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) {
|
|
// Must we add the entropy criteria here?
|
|
// if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point }
|
|
cutPoint.start = start;
|
|
cutPoint.end = idx;
|
|
start = idx;
|
|
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
|
|
cutPoint.toValue = (xPrev + xCur) / 2;
|
|
cutPoint.classNumber = -1;
|
|
firstCutPoint = false;
|
|
if (debug) {
|
|
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
}
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
yPrev = yPivot;
|
|
xPrev = xPivot;
|
|
}
|
|
if (idx == numElements) {
|
|
cutPoint.start = start;
|
|
cutPoint.end = numElements + 1;
|
|
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
|
|
cutPoint.toValue = numeric_limits<precision_t>::max();
|
|
cutPoint.classNumber = -1;
|
|
if (debug)
|
|
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
if (debug) {
|
|
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl;
|
|
for (auto cutPt : cutPts)
|
|
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
|
|
}
|
|
cutPoints = cutPts;
|
|
}
|
|
void CPPFImdlp::computeCutPointsOriginal()
|
|
{
|
|
cutPoints_t cutPts;
|
|
cutPoint_t cutPoint;
|
|
precision_t xPrev;
|
|
int yPrev;
|
|
bool first = true;
|
|
// idxPrev is the index of the init instance of the cutPoint
|
|
size_t index, idxPrev = 0, last, idx = indices[0];
|
|
xPrev = X[idx];
|
|
yPrev = y[idx];
|
|
last = indices.size() - 1;
|
|
for (index = 0; index < last; index++) {
|
|
idx = indices[index];
|
|
// Definition 2 Cut points are always on class boundaries &&
|
|
// there are more than 1 items in the interval
|
|
// if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut)
|
|
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) {
|
|
// Must we add the entropy criteria here?
|
|
if (first) {
|
|
first = false;
|
|
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
|
|
} else {
|
|
cutPoint.fromValue = cutPts.back().toValue;
|
|
}
|
|
cutPoint.start = idxPrev;
|
|
cutPoint.end = index;
|
|
cutPoint.classNumber = -1;
|
|
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
|
|
idxPrev = index;
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
xPrev = X[idx];
|
|
yPrev = y[idx];
|
|
}
|
|
if (first) {
|
|
cutPoint.start = 0;
|
|
cutPoint.classNumber = -1;
|
|
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
|
|
cutPoint.toValue = numeric_limits<precision_t>::max();
|
|
cutPts.push_back(cutPoint);
|
|
} else
|
|
cutPts.back().toValue = numeric_limits<precision_t>::max();
|
|
cutPts.back().end = X.size();
|
|
if (debug) {
|
|
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl;
|
|
for (auto cutPt : cutPts)
|
|
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
|
|
}
|
|
cutPoints = cutPts;
|
|
}
|
|
bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end)
|
|
{
|
|
/*
|
|
Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla
|
|
|
|
|
|
*/
|
|
precision_t entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses);
|
|
precision_t entropyRight = Metrics::entropy(y, indices, cut, end, numClasses);
|
|
precision_t entropyInterval = Metrics::entropy(y, indices, start, end, numClasses);
|
|
if (debug)
|
|
printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval);
|
|
//return (entropyInterval - (entropyLeft + entropyRight) > 0);
|
|
return true;
|
|
}
|
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
|
indices_t CPPFImdlp::sortIndices(samples& X_)
|
|
{
|
|
indices_t idx(X_.size());
|
|
iota(idx.begin(), idx.end(), 0);
|
|
for (size_t i = 0; i < X_.size(); i++)
|
|
stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
|
|
{ return X_[i1] < X_[i2]; });
|
|
return idx;
|
|
}
|
|
void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_)
|
|
{
|
|
cutPoints = cutPoints_;
|
|
}
|
|
}
|