// Mirror of https://github.com/Doctorado-ML/FImdlp.git
// Synced 2025-08-17 08:25:51 +00:00 (277 lines, 10 KiB, C++)
#include "CPPFImdlp.h"
|
|
#include <numeric>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include "Metrics.h"
|
|
namespace mdlp {
|
|
std::ostream& operator << (std::ostream& os, const cutPoint_t& cut)
|
|
{
|
|
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
|
|
") - (" << cut.fromValue << ", " << cut.toValue << ") "
|
|
<< std::endl;
|
|
return os;
|
|
|
|
}
|
|
// Default configuration: proposed algorithm enabled, 6 decimals of precision,
// debug tracing off. Forwards to the three-argument constructor, which also
// derives `divider` and resets `numClasses`.
CPPFImdlp::CPPFImdlp() : CPPFImdlp(true, 6, false)
{
}
|
|
// Builds a discretizer.
//   proposed  - selects computeCutPointsProposed() (true) or
//               computeCutPointsOriginal() (false) when fit() runs.
//   precision - number of decimals; stored and used to derive `divider`.
//   debug     - enables the trace output written to stdout.
CPPFImdlp::CPPFImdlp(bool proposed, int precision, bool debug)
    : proposed(proposed), precision(precision), debug(debug)
{
    // No classes are known until fit() is called.
    numClasses = 0;
    // 10^precision: scale factor used when rounding cut values.
    divider = pow(10, precision);
}
|
|
// Compiler-generated destructor, defined out of line.
CPPFImdlp::~CPPFImdlp() = default;
|
|
std::vector<cutPoint_t> CPPFImdlp::getCutPoints()
|
|
{
|
|
return cutPoints;
|
|
}
|
|
// Returns a copy of the discretized labels currently stored in the object.
labels CPPFImdlp::getDiscretizedValues()
{
    return this->xDiscretized;
}
|
|
// Learns the cut points for the samples X_ with class labels y_ and stores the
// resulting discretization of X_ in xDiscretized.
//
// Input is validated BEFORE any member is modified: X_ and y_ must have the
// same size and must not be empty. On invalid input a message is written to
// std::cerr and the object is returned with its previous state untouched, so
// X/y never get out of sync with indices/cutPoints.
//
// Returns *this so calls can be chained.
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
{
    if (X_.size() != y_.size()) {
        std::cerr << "X and y must have the same size" << std::endl;
        return *this;
    }
    if (X_.size() == 0) {
        std::cerr << "X and y must have at least one element" << std::endl;
        return *this;
    }
    X = X_;
    y = y_;
    // Positions of the samples in ascending order of value.
    indices = sortIndices(X_);
    // -1 marks "not labelled yet".
    xDiscretized = labels(X.size(), -1);
    numClasses = Metrics::numClasses(y, indices, 0, X.size());

    // Generate candidate intervals with the selected algorithm ...
    if (proposed) {
        computeCutPointsProposed();
    } else {
        computeCutPointsOriginal();
    }
    // ... keep only the accepted ones and label the training samples.
    filterCutPoints();
    applyCutPoints();
    return *this;
}
|
|
// Discretizes the samples in X_ with the cut points currently stored in the
// object and returns a reference to the internal label vector.
//
// Each sample receives the classNumber of the first stored interval whose
// upper bound (toValue) is not below the sample; this relies on cutPoints
// being stored in ascending value order, which is how they are built from the
// sorted samples. Samples matched by no interval (e.g. when no cut points
// exist yet) keep the -1 sentinel, the same "unlabelled" value fit() uses.
//
// The previous implementation sorted X_, discarded the result and re-applied
// the cut points to the training data, so X_ never influenced the output.
//
// Note: the returned reference aliases xDiscretized, so the result is
// overwritten by the next call to fit() or transform().
labels& CPPFImdlp::transform(samples& X_)
{
    xDiscretized = labels(X_.size(), -1);
    for (size_t i = 0; i < X_.size(); i++) {
        for (const auto& cut : cutPoints) {
            if (X_[i] <= cut.toValue) {
                xDiscretized[i] = cut.classNumber;
                break;
            }
        }
    }
    return xDiscretized;
}
|
|
// Debug helper: prints the samples of X_ in ascending order together with
// their labels, fits the model on (X_, y_) and then prints, for every
// resulting cut point, its start position and the value X_[end].
//
// Fixes with respect to the previous version:
//  * the listing is sorted from X_ (the argument), not from the member X,
//    which may be empty or belong to an earlier fit and have another size;
//  * the listing is skipped when X_ and y_ differ in size (fit() reports that
//    error) instead of indexing y_ out of range;
//  * size_t values are printed with %zu;
//  * X_[end] is only read when `end` is a valid index; `end` can be equal to
//    X_.size() because intervals are half-open.
void CPPFImdlp::debugPoints(samples& X_, labels& y_)
{
    std::cout << "+++++++++++++++++++++++" << std::endl;
    if (X_.size() == y_.size()) {
        const indices_t indices_n = sortIndices(X_);
        for (size_t i = 0; i < indices_n.size(); i++) {
            printf("(%3zu, %3zu) -> (%3.1f, %d)\n", i, static_cast<size_t>(indices_n[i]), X_[indices_n[i]], y_[indices_n[i]]);
        }
    }
    std::cout << "+++++++++++++++++++++++" << std::endl;
    fit(X_, y_);
    for (const auto& item : cutPoints) {
        std::cout << item.start << " X_[" << item.end << "]=";
        if (item.end < X_.size()) {
            std::cout << X_[item.end];
        } else {
            std::cout << "out of range";
        }
        std::cout << std::endl;
    }
}
|
|
void CPPFImdlp::applyCutPoints()
|
|
{
|
|
for (auto cut : cutPoints) {
|
|
for (size_t i = cut.start; i < cut.end; i++) {
|
|
xDiscretized[indices[i]] = cut.classNumber;
|
|
}
|
|
}
|
|
}
|
|
// Decides whether splitting the range `rest` at position `candidate.end` is
// worth keeping.
//
//   rest      - range [rest.start, rest.end) of sorted positions still to be
//               partitioned.
//   candidate - interval whose `end` is the proposed split position.
//
// The split is accepted when the information gain exceeds
//   (log2(N - 1) + delta) / N
// with N = size of `rest` and
//   delta = log2(3^k - 2) - (k*ent - k1*ent1 - k2*ent2),
// where k/k1/k2 and ent/ent1/ent2 are the number of classes and the entropy of
// the whole range, the left part and the right part. This has the form of the
// Fayyad-Irani MDLP acceptance criterion.
//
// Returns false without evaluating anything when the range is empty or
// inverted, has fewer than two samples, or when the split would leave one side
// empty (candidate.end not strictly inside the range). The last case always
// happens for the trailing interval, whose end equals X.size(); guarding it
// avoids handing empty ranges to the Metrics helpers.
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{
    if (rest.end <= rest.start) {
        return false;
    }
    const auto N = float(rest.end - rest.start);
    if (N < 2) {
        return false;
    }
    if (candidate.end <= rest.start || candidate.end >= rest.end) {
        return false;
    }
    const int k = Metrics::numClasses(y, indices, rest.start, rest.end);
    const int k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
    const int k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
    const float ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses);
    const float ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses);
    const float ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses);
    const float ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
    const float delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
    const float term = 1 / N * (log2(N - 1) + delta);
    if (debug) {
        std::cout << "Rest: " << rest;
        std::cout << "Candidate: " << candidate;
        std::cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << std::endl;
        std::cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << std::endl;
    }
    return (ig > term);
}
|
|
void CPPFImdlp::filterCutPoints()
|
|
{
|
|
cutPoints_t filtered;
|
|
cutPoint_t rest;
|
|
int classNumber = 0;
|
|
|
|
rest.start = 0;
|
|
rest.end = X.size();
|
|
rest.fromValue = std::numeric_limits<float>::lowest();
|
|
rest.toValue = std::numeric_limits<float>::max();
|
|
rest.classNumber = classNumber;
|
|
bool lastReject = false, first = true;
|
|
for (auto item : cutPoints) {
|
|
if (evaluateCutPoint(rest, item)) {
|
|
if (debug)
|
|
std::cout << "Accepted" << std::endl;
|
|
if (lastReject) {
|
|
if (first) {
|
|
item.fromValue = std::numeric_limits<float>::lowest();
|
|
item.start = indices[0];
|
|
} else {
|
|
item.fromValue = filtered.back().toValue;
|
|
item.start = filtered.back().end;
|
|
}
|
|
}
|
|
//Assign class number to the interval (cutpoint)
|
|
item.classNumber = classNumber++;
|
|
filtered.push_back(item);
|
|
first = false;
|
|
rest.start = item.end;
|
|
} else {
|
|
if (debug)
|
|
std::cout << "Rejected" << std::endl;
|
|
lastReject = true;
|
|
}
|
|
}
|
|
if (!first) {
|
|
filtered.back().toValue = std::numeric_limits<float>::max();
|
|
filtered.back().end = X.size() - 1;
|
|
} else {
|
|
filtered.push_back(rest);
|
|
}
|
|
|
|
cutPoints = filtered;
|
|
}
|
|
void CPPFImdlp::computeCutPointsProposed()
|
|
{
|
|
cutPoints_t cutPts;
|
|
cutPoint_t cutPoint;
|
|
float xPrev, xCur, xPivot;
|
|
int yPrev, yCur, yPivot;
|
|
size_t idx, numElements, start;
|
|
|
|
xCur = xPrev = X[indices[0]];
|
|
yCur = yPrev = y[indices[0]];
|
|
numElements = indices.size() - 1;
|
|
idx = start = 0;
|
|
bool firstCutPoint = true;
|
|
if (debug)
|
|
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
|
|
while (idx < numElements) {
|
|
xPivot = xCur;
|
|
yPivot = yCur;
|
|
if (debug)
|
|
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
// Read the same values and check class changes
|
|
do {
|
|
idx++;
|
|
xCur = X[indices[idx]];
|
|
yCur = y[indices[idx]];
|
|
if (yCur != yPivot && xCur == xPivot) {
|
|
yPivot = -1;
|
|
}
|
|
if (debug)
|
|
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
}
|
|
while (idx < numElements && xCur == xPivot);
|
|
if (yPivot == -1 || yPrev != yCur) {
|
|
cutPoint.start = start;
|
|
cutPoint.end = idx - 1;
|
|
start = idx;
|
|
cutPoint.fromValue = firstCutPoint ? std::numeric_limits<float>::lowest() : cutPts.back().toValue;
|
|
cutPoint.toValue = (xPrev + xCur) / 2;
|
|
cutPoint.classNumber = -1;
|
|
firstCutPoint = false;
|
|
if (debug) {
|
|
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
}
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
yPrev = yPivot;
|
|
xPrev = xPivot;
|
|
}
|
|
if (idx == numElements) {
|
|
cutPoint.start = start;
|
|
cutPoint.end = numElements + 1;
|
|
cutPoint.fromValue = firstCutPoint ? std::numeric_limits<float>::lowest() : cutPts.back().toValue;
|
|
cutPoint.toValue = std::numeric_limits<float>::max();
|
|
cutPoint.classNumber = -1;
|
|
if (debug)
|
|
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
for (auto cutPt : cutPts)
|
|
std::cout << "Cut point: " << cutPt;
|
|
cutPoints = cutPts;
|
|
}
|
|
void CPPFImdlp::computeCutPointsOriginal()
|
|
{
|
|
cutPoints_t cutPts;
|
|
cutPoint_t cutPoint;
|
|
float xPrev;
|
|
int yPrev;
|
|
bool first = true;
|
|
// idxPrev is the index of the init instance of the cutPoint
|
|
size_t index, idxPrev = 0, idx = indices[0];
|
|
xPrev = X[idx];
|
|
yPrev = y[idx];
|
|
for (index = 0; index < size_t(indices.size()) - 1; index++) {
|
|
idx = indices[index];
|
|
// Definition 2 Cut points are always on boundaries
|
|
if (y[idx] != yPrev && xPrev < X[idx]) {
|
|
if (first) {
|
|
first = false;
|
|
cutPoint.fromValue = std::numeric_limits<float>::lowest();
|
|
} else {
|
|
cutPoint.fromValue = cutPts.back().toValue;
|
|
}
|
|
cutPoint.start = idxPrev;
|
|
cutPoint.end = index;
|
|
cutPoint.classNumber = -1;
|
|
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
|
|
idxPrev = index;
|
|
cutPts.push_back(cutPoint);
|
|
}
|
|
xPrev = X[idx];
|
|
yPrev = y[idx];
|
|
}
|
|
if (first) {
|
|
cutPoint.start = 0;
|
|
cutPoint.classNumber = -1;
|
|
cutPoint.fromValue = std::numeric_limits<float>::lowest();
|
|
cutPoint.toValue = std::numeric_limits<float>::max();
|
|
cutPts.push_back(cutPoint);
|
|
} else
|
|
cutPts.back().toValue = std::numeric_limits<float>::max();
|
|
cutPts.back().end = X.size();
|
|
if (debug)
|
|
for (auto cutPt : cutPts)
|
|
std::cout << "-Cut point: " << cutPt;
|
|
cutPoints = cutPts;
|
|
}
|
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
|
indices_t CPPFImdlp::sortIndices(samples& X_)
|
|
{
|
|
indices_t idx(X_.size());
|
|
std::iota(idx.begin(), idx.end(), 0);
|
|
for (std::size_t i = 0; i < X_.size(); i++)
|
|
stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
|
|
{ return X_[i1] < X_[i2]; });
|
|
return idx;
|
|
}
|
|
}
|