mirror of
https://github.com/Doctorado-ML/FImdlp.git
synced 2025-08-17 16:35:52 +00:00
Refactor base algorithm
This commit is contained in:
@@ -1,41 +1,20 @@
|
|||||||
#include "CPPFImdlp.h"
|
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <set>
|
||||||
|
#include "CPPFImdlp.h"
|
||||||
#include "Metrics.h"
|
#include "Metrics.h"
|
||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
ostream& operator << (ostream& os, const cutPoint_t& cut)
|
CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
|
||||||
{
|
{
|
||||||
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
|
|
||||||
") - (" << cut.fromValue << ", " << cut.toValue << ") "
|
|
||||||
<< endl;
|
|
||||||
return os;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false)
|
CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
|
||||||
{
|
{
|
||||||
divider = pow(10, precision);
|
|
||||||
numClasses = 0;
|
|
||||||
}
|
|
||||||
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug)
|
|
||||||
{
|
|
||||||
divider = pow(10, precision);
|
|
||||||
numClasses = 0;
|
|
||||||
}
|
}
|
||||||
CPPFImdlp::~CPPFImdlp()
|
CPPFImdlp::~CPPFImdlp()
|
||||||
= default;
|
= default;
|
||||||
samples CPPFImdlp::getCutPoints()
|
|
||||||
{
|
|
||||||
samples output(cutPoints.size());
|
|
||||||
::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
|
|
||||||
[](cutPoint_t cut) { return cut.toValue; });
|
|
||||||
return output;
|
|
||||||
}
|
|
||||||
labels CPPFImdlp::getDiscretizedValues()
|
|
||||||
{
|
|
||||||
return xDiscretized;
|
|
||||||
}
|
|
||||||
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
|
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
|
||||||
{
|
{
|
||||||
X = X_;
|
X = X_;
|
||||||
@@ -47,227 +26,78 @@ namespace mdlp {
|
|||||||
throw invalid_argument("X and y must have at least one element");
|
throw invalid_argument("X and y must have at least one element");
|
||||||
}
|
}
|
||||||
indices = sortIndices(X_);
|
indices = sortIndices(X_);
|
||||||
xDiscretized = labels(X.size(), -1);
|
metrics.setData(y, indices);
|
||||||
numClasses = Metrics::numClasses(y, indices, 0, X.size());
|
computeCutPoints(0, X.size());
|
||||||
|
|
||||||
if (proposal) {
|
|
||||||
computeCutPointsProposal();
|
|
||||||
} else {
|
|
||||||
computeCutPointsOriginal();
|
|
||||||
}
|
|
||||||
filterCutPoints();
|
|
||||||
// Apply cut points to the input vector
|
|
||||||
for (auto cut : cutPoints) {
|
|
||||||
for (size_t i = cut.start; i < cut.end; i++) {
|
|
||||||
xDiscretized[indices[i]] = cut.classNumber;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return *this;
|
return *this;
|
||||||
}
|
}
|
||||||
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
|
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
||||||
|
{
|
||||||
|
int cut;
|
||||||
|
if (end - start < 2)
|
||||||
|
return;
|
||||||
|
cut = getCandidate(start, end);
|
||||||
|
if (cut == -1 || !mdlp(start, cut, end)) {
|
||||||
|
// cut == -1 means that there is no candidate in the interval
|
||||||
|
// No boundary found, so we add both ends of the interval as cutpoints
|
||||||
|
// because they were selected by the algorithm before
|
||||||
|
if (start != 0)
|
||||||
|
cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
|
||||||
|
if (end != X.size())
|
||||||
|
cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
computeCutPoints(start, cut);
|
||||||
|
computeCutPoints(cut, end);
|
||||||
|
}
|
||||||
|
long int CPPFImdlp::getCandidate(size_t start, size_t end)
|
||||||
|
{
|
||||||
|
long int candidate = -1, elements = end - start;
|
||||||
|
precision_t entropy_left, entropy_right, minEntropy = numeric_limits<precision_t>::max();
|
||||||
|
for (auto idx = start + 1; idx < end; idx++) {
|
||||||
|
// Cutpoints are always on boundaries
|
||||||
|
if (y[indices[idx]] == y[indices[idx - 1]])
|
||||||
|
continue;
|
||||||
|
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
|
||||||
|
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
|
||||||
|
if (entropy_left + entropy_right < minEntropy) {
|
||||||
|
minEntropy = entropy_left + entropy_right;
|
||||||
|
candidate = idx;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return candidate;
|
||||||
|
}
|
||||||
|
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
||||||
{
|
{
|
||||||
int k, k1, k2;
|
int k, k1, k2;
|
||||||
float ig, delta;
|
precision_t ig, delta;
|
||||||
float ent, ent1, ent2;
|
precision_t ent, ent1, ent2;
|
||||||
auto N = float(rest.end - rest.start);
|
auto N = precision_t(end - start);
|
||||||
if (N < 2) {
|
if (N < 2) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
k = Metrics::numClasses(y, indices, rest.start, rest.end);
|
k = metrics.computeNumClasses(start, end);
|
||||||
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
|
k1 = metrics.computeNumClasses(start, cut);
|
||||||
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
|
k2 = metrics.computeNumClasses(cut, end);
|
||||||
ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses);
|
ent = metrics.entropy(start, end);
|
||||||
ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses);
|
ent1 = metrics.entropy(start, cut);
|
||||||
ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses);
|
ent2 = metrics.entropy(cut, end);
|
||||||
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
|
ig = metrics.informationGain(start, cut, end);
|
||||||
delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
|
delta = log2(pow(3, precision_t(k)) - 2) -
|
||||||
float term = 1 / N * (log2(N - 1) + delta);
|
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
|
||||||
if (debug) {
|
precision_t term = 1 / N * (log2(N - 1) + delta);
|
||||||
cout << "Rest: " << rest;
|
return ig > term;
|
||||||
cout << "Candidate: " << candidate;
|
|
||||||
cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
|
|
||||||
cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
|
|
||||||
}
|
}
|
||||||
return (ig > term);
|
cutPoints_t CPPFImdlp::getCutPoints()
|
||||||
}
|
|
||||||
void CPPFImdlp::filterCutPoints()
|
|
||||||
{
|
{
|
||||||
cutPoints_t filtered;
|
// Remove duplicates and sort
|
||||||
cutPoint_t rest, item;
|
cutPoints_t output(cutPoints.size());
|
||||||
int classNumber = 0;
|
set<precision_t> s;
|
||||||
|
unsigned size = cutPoints.size();
|
||||||
rest.start = 0;
|
for (unsigned i = 0; i < size; i++)
|
||||||
rest.end = X.size();
|
s.insert(cutPoints[i]);
|
||||||
rest.fromValue = numeric_limits<float>::lowest();
|
output.assign(s.begin(), s.end());
|
||||||
rest.toValue = numeric_limits<float>::max();
|
sort(output.begin(), output.end());
|
||||||
rest.classNumber = classNumber;
|
return output;
|
||||||
bool first = true;
|
|
||||||
for (size_t index = 0; index < size_t(cutPoints.size()); index++) {
|
|
||||||
item = cutPoints[index];
|
|
||||||
if (evaluateCutPoint(rest, item)) {
|
|
||||||
if (debug)
|
|
||||||
cout << "Accepted: " << item << endl;
|
|
||||||
//Assign class number to the interval (cutpoint)
|
|
||||||
item.classNumber = classNumber++;
|
|
||||||
filtered.push_back(item);
|
|
||||||
first = false;
|
|
||||||
rest.start = item.end;
|
|
||||||
} else {
|
|
||||||
if (debug)
|
|
||||||
cout << "Rejected: " << item << endl;
|
|
||||||
if (index != size_t(cutPoints.size()) - 1) {
|
|
||||||
// Try to merge the rejected cutpoint with the next one
|
|
||||||
if (first) {
|
|
||||||
cutPoints[index + 1].fromValue = numeric_limits<float>::lowest();
|
|
||||||
cutPoints[index + 1].start = indices[0];
|
|
||||||
} else {
|
|
||||||
cutPoints[index + 1].fromValue = item.fromValue;
|
|
||||||
cutPoints[index + 1].start = item.start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!first) {
|
|
||||||
filtered.back().toValue = numeric_limits<float>::max();
|
|
||||||
filtered.back().end = X.size() - 1;
|
|
||||||
} else {
|
|
||||||
filtered.push_back(rest);
|
|
||||||
}
|
|
||||||
cutPoints = filtered;
|
|
||||||
}
|
|
||||||
void CPPFImdlp::computeCutPointsProposal()
|
|
||||||
{
|
|
||||||
cutPoints_t cutPts;
|
|
||||||
cutPoint_t cutPoint;
|
|
||||||
float xPrev, xCur, xPivot;
|
|
||||||
int yPrev, yCur, yPivot;
|
|
||||||
size_t idx, numElements, start;
|
|
||||||
|
|
||||||
xCur = xPrev = X[indices[0]];
|
|
||||||
yCur = yPrev = y[indices[0]];
|
|
||||||
numElements = indices.size() - 1;
|
|
||||||
idx = start = 0;
|
|
||||||
bool firstCutPoint = true;
|
|
||||||
if (debug)
|
|
||||||
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
|
|
||||||
while (idx < numElements) {
|
|
||||||
xPivot = xCur;
|
|
||||||
yPivot = yCur;
|
|
||||||
if (debug)
|
|
||||||
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
||||||
// Read the same values and check class changes
|
|
||||||
do {
|
|
||||||
idx++;
|
|
||||||
xCur = X[indices[idx]];
|
|
||||||
yCur = y[indices[idx]];
|
|
||||||
if (yCur != yPivot && xCur == xPivot) {
|
|
||||||
yPivot = -1;
|
|
||||||
}
|
|
||||||
if (debug)
|
|
||||||
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
|
||||||
}
|
|
||||||
while (idx < numElements && xCur == xPivot);
|
|
||||||
// Check if the class changed and there are more than 1 element
|
|
||||||
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) {
|
|
||||||
// Must we add the entropy criteria here?
|
|
||||||
// if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point }
|
|
||||||
cutPoint.start = start;
|
|
||||||
cutPoint.end = idx;
|
|
||||||
start = idx;
|
|
||||||
cutPoint.fromValue = firstCutPoint ? numeric_limits<float>::lowest() : cutPts.back().toValue;
|
|
||||||
cutPoint.toValue = (xPrev + xCur) / 2;
|
|
||||||
cutPoint.classNumber = -1;
|
|
||||||
firstCutPoint = false;
|
|
||||||
if (debug) {
|
|
||||||
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
||||||
}
|
|
||||||
cutPts.push_back(cutPoint);
|
|
||||||
}
|
|
||||||
yPrev = yPivot;
|
|
||||||
xPrev = xPivot;
|
|
||||||
}
|
|
||||||
if (idx == numElements) {
|
|
||||||
cutPoint.start = start;
|
|
||||||
cutPoint.end = numElements + 1;
|
|
||||||
cutPoint.fromValue = firstCutPoint ? numeric_limits<float>::lowest() : cutPts.back().toValue;
|
|
||||||
cutPoint.toValue = numeric_limits<float>::max();
|
|
||||||
cutPoint.classNumber = -1;
|
|
||||||
if (debug)
|
|
||||||
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
|
||||||
cutPts.push_back(cutPoint);
|
|
||||||
}
|
|
||||||
if (debug) {
|
|
||||||
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl;
|
|
||||||
for (auto cutPt : cutPts)
|
|
||||||
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
|
|
||||||
}
|
|
||||||
cutPoints = cutPts;
|
|
||||||
}
|
|
||||||
void CPPFImdlp::computeCutPointsOriginal()
|
|
||||||
{
|
|
||||||
cutPoints_t cutPts;
|
|
||||||
cutPoint_t cutPoint;
|
|
||||||
float xPrev;
|
|
||||||
int yPrev;
|
|
||||||
bool first = true;
|
|
||||||
// idxPrev is the index of the init instance of the cutPoint
|
|
||||||
size_t index, idxPrev = 0, last, idx = indices[0];
|
|
||||||
xPrev = X[idx];
|
|
||||||
yPrev = y[idx];
|
|
||||||
last = indices.size() - 1;
|
|
||||||
for (index = 0; index < last; index++) {
|
|
||||||
idx = indices[index];
|
|
||||||
// Definition 2 Cut points are always on class boundaries &&
|
|
||||||
// there are more than 1 items in the interval
|
|
||||||
// if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut)
|
|
||||||
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) {
|
|
||||||
// Must we add the entropy criteria here?
|
|
||||||
if (first) {
|
|
||||||
first = false;
|
|
||||||
cutPoint.fromValue = numeric_limits<float>::lowest();
|
|
||||||
} else {
|
|
||||||
cutPoint.fromValue = cutPts.back().toValue;
|
|
||||||
}
|
|
||||||
cutPoint.start = idxPrev;
|
|
||||||
cutPoint.end = index;
|
|
||||||
cutPoint.classNumber = -1;
|
|
||||||
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
|
|
||||||
idxPrev = index;
|
|
||||||
cutPts.push_back(cutPoint);
|
|
||||||
}
|
|
||||||
xPrev = X[idx];
|
|
||||||
yPrev = y[idx];
|
|
||||||
}
|
|
||||||
if (first) {
|
|
||||||
cutPoint.start = 0;
|
|
||||||
cutPoint.classNumber = -1;
|
|
||||||
cutPoint.fromValue = numeric_limits<float>::lowest();
|
|
||||||
cutPoint.toValue = numeric_limits<float>::max();
|
|
||||||
cutPts.push_back(cutPoint);
|
|
||||||
} else
|
|
||||||
cutPts.back().toValue = numeric_limits<float>::max();
|
|
||||||
cutPts.back().end = X.size();
|
|
||||||
if (debug) {
|
|
||||||
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl;
|
|
||||||
for (auto cutPt : cutPts)
|
|
||||||
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
|
|
||||||
}
|
|
||||||
cutPoints = cutPts;
|
|
||||||
}
|
|
||||||
bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
Store the entropies in a sparse square matrix (samples, samples) M[start, end] initialised to -1; when an entry has not been computed yet, compute it and store it
|
|
||||||
|
|
||||||
|
|
||||||
*/
|
|
||||||
float entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses);
|
|
||||||
float entropyRight = Metrics::entropy(y, indices, cut, end, numClasses);
|
|
||||||
float entropyInterval = Metrics::entropy(y, indices, start, end, numClasses);
|
|
||||||
if (debug)
|
|
||||||
printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval);
|
|
||||||
//return (entropyInterval - (entropyLeft + entropyRight) > 0);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||||
indices_t CPPFImdlp::sortIndices(samples& X_)
|
indices_t CPPFImdlp::sortIndices(samples& X_)
|
||||||
@@ -275,12 +105,8 @@ namespace mdlp {
|
|||||||
indices_t idx(X_.size());
|
indices_t idx(X_.size());
|
||||||
iota(idx.begin(), idx.end(), 0);
|
iota(idx.begin(), idx.end(), 0);
|
||||||
for (size_t i = 0; i < X_.size(); i++)
|
for (size_t i = 0; i < X_.size(); i++)
|
||||||
stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
|
sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
|
||||||
{ return X_[i1] < X_[i2]; });
|
{ return X_[i1] < X_[i2]; });
|
||||||
return idx;
|
return idx;
|
||||||
}
|
}
|
||||||
void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_)
|
|
||||||
{
|
|
||||||
cutPoints = cutPoints_;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@@ -1,39 +1,30 @@
|
|||||||
#ifndef CPPFIMDLP_H
|
#ifndef CPPFIMDLP_H
|
||||||
#define CPPFIMDLP_H
|
#define CPPFIMDLP_H
|
||||||
#include "typesFImdlp.h"
|
#include "typesFImdlp.h"
|
||||||
|
#include "Metrics.h"
|
||||||
#include <utility>
|
#include <utility>
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
class CPPFImdlp {
|
class CPPFImdlp {
|
||||||
protected:
|
protected:
|
||||||
bool proposal; // proposed algorithm or original algorithm
|
bool proposal; // proposed algorithm or original algorithm
|
||||||
int precision;
|
|
||||||
bool debug;
|
bool debug;
|
||||||
float divider;
|
|
||||||
indices_t indices; // sorted indices to use with X and y
|
indices_t indices; // sorted indices to use with X and y
|
||||||
samples X;
|
samples X;
|
||||||
labels y;
|
labels y;
|
||||||
labels xDiscretized;
|
Metrics metrics;
|
||||||
int numClasses;
|
|
||||||
cutPoints_t cutPoints;
|
cutPoints_t cutPoints;
|
||||||
|
|
||||||
void setCutPoints(cutPoints_t);
|
|
||||||
static indices_t sortIndices(samples&);
|
static indices_t sortIndices(samples&);
|
||||||
void computeCutPointsOriginal();
|
void computeCutPoints(size_t, size_t);
|
||||||
void computeCutPointsProposal();
|
long int getCandidate(size_t, size_t);
|
||||||
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
|
bool mdlp(size_t, size_t, size_t);
|
||||||
void filterCutPoints();
|
|
||||||
bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CPPFImdlp();
|
CPPFImdlp();
|
||||||
CPPFImdlp(bool, int, bool debug = false);
|
CPPFImdlp(bool, bool debug = false);
|
||||||
~CPPFImdlp();
|
~CPPFImdlp();
|
||||||
samples getCutPoints();
|
|
||||||
indices_t getIndices();
|
|
||||||
labels getDiscretizedValues();
|
|
||||||
void debugPoints(samples&, labels&);
|
|
||||||
CPPFImdlp& fit(samples&, labels&);
|
CPPFImdlp& fit(samples&, labels&);
|
||||||
labels transform(samples&);
|
samples getCutPoints();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
@@ -1,46 +1,63 @@
|
|||||||
#include "Metrics.h"
|
#include "Metrics.h"
|
||||||
#include <set>
|
#include <set>
|
||||||
|
#include <iostream>
|
||||||
|
using namespace std;
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
Metrics::Metrics()
|
Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
|
||||||
= default;
|
|
||||||
int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end)
|
|
||||||
{
|
{
|
||||||
std::set<int> numClasses;
|
}
|
||||||
|
int Metrics::computeNumClasses(size_t start, size_t end)
|
||||||
|
{
|
||||||
|
set<int> nClasses;
|
||||||
for (auto i = start; i < end; ++i) {
|
for (auto i = start; i < end; ++i) {
|
||||||
numClasses.insert(y[indices[i]]);
|
nClasses.insert(y[indices[i]]);
|
||||||
}
|
}
|
||||||
return numClasses.size();
|
return nClasses.size();
|
||||||
}
|
}
|
||||||
float Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses)
|
void Metrics::setData(labels& y_, indices_t& indices_)
|
||||||
{
|
{
|
||||||
float entropy = 0;
|
indices = indices_;
|
||||||
|
y = y_;
|
||||||
|
numClasses = computeNumClasses(0, indices.size());
|
||||||
|
}
|
||||||
|
precision_t Metrics::entropy(size_t start, size_t end)
|
||||||
|
{
|
||||||
|
precision_t p, ventropy = 0;
|
||||||
int nElements = 0;
|
int nElements = 0;
|
||||||
labels counts(nClasses + 1, 0);
|
labels counts(numClasses + 1, 0);
|
||||||
|
if (end - start < 2)
|
||||||
|
return 0;
|
||||||
|
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
|
||||||
|
return entropyCache[make_tuple(start, end)];
|
||||||
|
}
|
||||||
for (auto i = &indices[start]; i != &indices[end]; ++i) {
|
for (auto i = &indices[start]; i != &indices[end]; ++i) {
|
||||||
counts[y[*i]]++;
|
counts[y[*i]]++;
|
||||||
nElements++;
|
nElements++;
|
||||||
}
|
}
|
||||||
for (auto count : counts) {
|
for (auto count : counts) {
|
||||||
if (count > 0) {
|
if (count > 0) {
|
||||||
float p = (float)count / nElements;
|
p = (precision_t)count / nElements;
|
||||||
entropy -= p * log2(p);
|
ventropy -= p * log2(p);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return entropy < 0 ? 0 : entropy;
|
entropyCache[make_tuple(start, end)] = ventropy;
|
||||||
|
return ventropy;
|
||||||
}
|
}
|
||||||
float Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses)
|
precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
|
||||||
{
|
{
|
||||||
float iGain;
|
precision_t iGain;
|
||||||
float entropy, entropyLeft, entropyRight;
|
precision_t entropyInterval, entropyLeft, entropyRight;
|
||||||
int nClassesLeft, nClassesRight;
|
int nElementsLeft = cut - start, nElementsRight = end - cut;
|
||||||
int nElementsLeft = cutPoint - start, nElementsRight = end - cutPoint;
|
|
||||||
int nElements = end - start;
|
int nElements = end - start;
|
||||||
nClassesLeft = Metrics::numClasses(y, indices, start, cutPoint);
|
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
|
||||||
nClassesRight = Metrics::numClasses(y, indices, cutPoint, end);
|
cout << "**********Cache IG hit for " << start << " " << end << endl;
|
||||||
entropy = Metrics::entropy(y, indices, start, end, nClasses);
|
return igCache[make_tuple(start, cut, end)];
|
||||||
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft);
|
}
|
||||||
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight);
|
entropyInterval = entropy(start, end);
|
||||||
iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
|
entropyLeft = entropy(start, cut);
|
||||||
|
entropyRight = entropy(cut, end);
|
||||||
|
iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements;
|
||||||
|
igCache[make_tuple(start, cut, end)] = iGain;
|
||||||
return iGain;
|
return iGain;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,14 +1,21 @@
|
|||||||
#ifndef METRICS_H
|
#ifndef CCMETRICS_H
|
||||||
#define METRICS_H
|
#define CCMETRICS_H
|
||||||
#include "typesFImdlp.h"
|
#include "typesFImdlp.h"
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
class Metrics {
|
class Metrics {
|
||||||
|
protected:
|
||||||
|
labels& y;
|
||||||
|
indices_t& indices;
|
||||||
|
int numClasses;
|
||||||
|
cacheEnt_t entropyCache;
|
||||||
|
cacheIg_t igCache;
|
||||||
public:
|
public:
|
||||||
Metrics();
|
Metrics(labels&, indices_t&);
|
||||||
static int numClasses(labels&, indices_t, size_t, size_t);
|
void setData(labels&, indices_t&);
|
||||||
static float entropy(labels&, indices_t&, size_t, size_t, int);
|
int computeNumClasses(size_t, size_t);
|
||||||
static float informationGain(labels&, indices_t&, size_t, size_t, size_t, int);
|
precision_t entropy(size_t, size_t);
|
||||||
|
precision_t informationGain(size_t, size_t, size_t);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
@@ -1,110 +0,0 @@
|
|||||||
#include "ccFImdlp.h"
|
|
||||||
#include <numeric>
|
|
||||||
#include <iostream>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <set>
|
|
||||||
#include "ccMetrics.h"
|
|
||||||
|
|
||||||
namespace mdlp {
|
|
||||||
// Default construction: proposal = true, precision = 6, debug = false.
// Delegates to the three-argument constructor, which derives `divider` from
// the precision and binds `metrics` to this object's `y` and `indices`.
CPPFImdlp::CPPFImdlp(): CPPFImdlp(true, 6, false)
{
}
|
|
||||||
// Builds a discretizer with explicit settings.
//   proposal  - copied into the `proposal` member
//   precision - copied into `precision`; `divider` is set to 10^precision
//   debug     - copied into the `debug` member
// `indices` and `y` start empty, and `metrics` is constructed over those two
// members.
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug)
    : proposal(proposal),
      precision(precision),
      debug(debug),
      divider(pow(10, precision)),
      indices(),
      y(),
      metrics(y, indices)
{
}
|
|
||||||
// Compiler-generated destruction is sufficient: no explicit cleanup code.
CPPFImdlp::~CPPFImdlp() = default;
|
|
||||||
|
|
||||||
// Fits the discretizer to one feature.
//   X_ - sample values
//   y_ - label of each sample; must have the same length as X_
// Throws invalid_argument when the sizes differ or when the input is empty.
// Returns *this so calls can be chained.
CPPFImdlp& CPPFImdlp::fitx(samples& X_, labels& y_)
{
    // Validate before touching any member so that a rejected call leaves the
    // previously fitted state untouched.
    if (X_.size() != y_.size()) {
        throw invalid_argument("X and y must have the same size");
    }
    if (X_.size() == 0 || y_.size() == 0) {
        throw invalid_argument("X and y must have at least one element");
    }
    X = X_;
    y = y_;
    // computeCutPoints only appends, so results of an earlier fit must be
    // dropped explicitly or they would leak into this one.
    xCutPoints.clear();
    indices = sortIndices(X_);
    metrics.setData(y, indices);
    computeCutPoints(0, X.size());
    return *this;
}
|
|
||||||
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
int cut;
|
|
||||||
if (end - start < 2)
|
|
||||||
return;
|
|
||||||
cut = getCandidate(start, end);
|
|
||||||
if (cut == -1 || !mdlp(start, cut, end)) {
|
|
||||||
// cut.value == -1 means that there is no candidate in the interval
|
|
||||||
// that enhances the information gain
|
|
||||||
if (start != 0)
|
|
||||||
xCutPoints.push_back(xcutPoint_t({ start, (X[indices[start]] + X[indices[start - 1]]) / 2 }));
|
|
||||||
if (end != X.size())
|
|
||||||
xCutPoints.push_back(xcutPoint_t({ end, (X[indices[end]] + X[indices[end - 1]]) / 2 }));
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
computeCutPoints(start, cut);
|
|
||||||
computeCutPoints(cut, end);
|
|
||||||
}
|
|
||||||
long int CPPFImdlp::getCandidate(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
long int candidate = -1, elements = end - start;
|
|
||||||
float entropy_left, entropy_right, minEntropy = numeric_limits<float>::max();
|
|
||||||
for (auto idx = start + 1; idx < end; idx++) {
|
|
||||||
// Cutpoints are always on boudndaries
|
|
||||||
if (y[indices[idx]] == y[indices[idx - 1]])
|
|
||||||
continue;
|
|
||||||
entropy_left = float(idx - start) / elements * metrics.entropy(start, idx);
|
|
||||||
entropy_right = float(end - idx) / elements * metrics.entropy(idx, end);
|
|
||||||
if (entropy_left + entropy_right < minEntropy) {
|
|
||||||
minEntropy = entropy_left + entropy_right;
|
|
||||||
candidate = idx;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return candidate;
|
|
||||||
}
|
|
||||||
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
|
|
||||||
{
|
|
||||||
int k, k1, k2;
|
|
||||||
float ig, delta;
|
|
||||||
float ent, ent1, ent2;
|
|
||||||
auto N = float(end - start);
|
|
||||||
if (N < 2) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
k = metrics.computeNumClasses(start, end);
|
|
||||||
k1 = metrics.computeNumClasses(start, cut);
|
|
||||||
k2 = metrics.computeNumClasses(cut, end);
|
|
||||||
ent = metrics.entropy(start, end);
|
|
||||||
ent1 = metrics.entropy(start, cut);
|
|
||||||
ent2 = metrics.entropy(cut, end);
|
|
||||||
ig = metrics.informationGain(start, cut, end);
|
|
||||||
delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2);
|
|
||||||
float term = 1 / N * (log2(N - 1) + delta);
|
|
||||||
return ig > term;
|
|
||||||
}
|
|
||||||
// Returns the cut point values found by the last fit, without duplicates and
// in ascending order.
samples CPPFImdlp::getCutPointsx()
{
    // A set both removes duplicates and keeps the values ordered, so its
    // contents can be copied out as-is with no further sorting.
    set<float> unique;
    for (const auto& point : xCutPoints)
        unique.insert(point.value);
    samples output;
    output.assign(unique.begin(), unique.end());
    return output;
}
|
|
||||||
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
|
||||||
// Returns the positions 0..X_.size()-1 ordered so that the values of X_ they
// refer to are ascending. X_ itself is not modified.
indices_t CPPFImdlp::sortIndices(samples& X_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    // A single sort is enough; re-sorting once per element only repeated the
    // same work X_.size() times.
    sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
        { return X_[i1] < X_[i2]; });
    return idx;
}
|
|
||||||
}
|
|
@@ -1,32 +0,0 @@
|
|||||||
#ifndef CCFIMDLP_H
|
|
||||||
#define CCFIMDLP_H
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
#include "ccMetrics.h"
|
|
||||||
#include <utility>
|
|
||||||
namespace mdlp {
|
|
||||||
class CPPFImdlp {
|
|
||||||
protected:
|
|
||||||
bool proposal; // proposed algorithm or original algorithm
|
|
||||||
int precision;
|
|
||||||
bool debug;
|
|
||||||
float divider;
|
|
||||||
indices_t indices; // sorted indices to use with X and y
|
|
||||||
samples X;
|
|
||||||
labels y;
|
|
||||||
Metrics metrics;
|
|
||||||
xcutPoints_t xCutPoints;
|
|
||||||
|
|
||||||
static indices_t sortIndices(samples&);
|
|
||||||
void computeCutPoints(size_t, size_t);
|
|
||||||
long int getCandidate(size_t, size_t);
|
|
||||||
bool mdlp(size_t, size_t, size_t);
|
|
||||||
|
|
||||||
public:
|
|
||||||
CPPFImdlp();
|
|
||||||
CPPFImdlp(bool, int, bool debug = false);
|
|
||||||
~CPPFImdlp();
|
|
||||||
CPPFImdlp& fitx(samples&, labels&);
|
|
||||||
samples getCutPointsx();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@@ -1,74 +0,0 @@
|
|||||||
#include "ccMetrics.h"
|
|
||||||
#include <set>
|
|
||||||
#include <iostream>
|
|
||||||
using namespace std;
|
|
||||||
namespace mdlp {
|
|
||||||
// Binds the helper to the caller's label and index containers (held by
// reference, not copied), counts the distinct labels over every position of
// `indices`, and starts with empty caches.
Metrics::Metrics(labels& y_, indices_t& indices_)
    : y(y_),
      indices(indices_),
      numClasses(computeNumClasses(0, indices.size())),
      entropyCache(),
      igCache()
{
}
|
|
||||||
int Metrics::computeNumClasses(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
set<int> nClasses;
|
|
||||||
for (auto i = start; i < end; ++i) {
|
|
||||||
nClasses.insert(y[indices[i]]);
|
|
||||||
}
|
|
||||||
return nClasses.size();
|
|
||||||
}
|
|
||||||
// Replaces the data the metrics are computed on.
// `y` and `indices` are reference members, so these assignments copy the
// contents of the arguments into the containers this object was bound to at
// construction. The label count is then recomputed over the whole range.
void Metrics::setData(labels& y_, indices_t& indices_)
{
    indices = indices_;
    y = y_;
    numClasses = computeNumClasses(0, indices.size());
    // Cache entries are keyed by positions only, so anything computed for the
    // previous data would be returned as if it described the new data.
    entropyCache.clear();
    igCache.clear();
}
|
|
||||||
float Metrics::entropy(size_t start, size_t end)
|
|
||||||
{
|
|
||||||
float p, ventropy = 0;
|
|
||||||
int nElements = 0;
|
|
||||||
labels counts(numClasses + 1, 0);
|
|
||||||
if (end - start < 2)
|
|
||||||
return 0;
|
|
||||||
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
|
|
||||||
return entropyCache[make_tuple(start, end)];
|
|
||||||
}
|
|
||||||
for (auto i = &indices[start]; i != &indices[end]; ++i) {
|
|
||||||
counts[y[*i]]++;
|
|
||||||
nElements++;
|
|
||||||
}
|
|
||||||
for (auto count : counts) {
|
|
||||||
if (count > 0) {
|
|
||||||
p = (float)count / nElements;
|
|
||||||
ventropy -= p * log2(p);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
entropyCache[make_tuple(start, end)] = ventropy;
|
|
||||||
return ventropy;
|
|
||||||
}
|
|
||||||
float Metrics::informationGain(size_t start, size_t cut, size_t end)
|
|
||||||
{
|
|
||||||
float iGain;
|
|
||||||
float entropyInterval, entropyLeft, entropyRight;
|
|
||||||
int nElementsLeft = cut - start, nElementsRight = end - cut;
|
|
||||||
int nElements = end - start;
|
|
||||||
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
|
|
||||||
cout << "**********Cache IG hit for " << start << " " << end << endl;
|
|
||||||
return igCache[make_tuple(start, cut, end)];
|
|
||||||
}
|
|
||||||
entropyInterval = entropy(start, end);
|
|
||||||
entropyLeft = entropy(start, cut);
|
|
||||||
entropyRight = entropy(cut, end);
|
|
||||||
iGain = entropyInterval - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
|
|
||||||
igCache[make_tuple(start, cut, end)] = iGain;
|
|
||||||
return iGain;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
cache_t entropyCache;
|
|
||||||
std::map<std::tuple<int, int>, double> c;
|
|
||||||
|
|
||||||
// Set the value at index (3, 5) to 7.8.
|
|
||||||
c[std::make_tuple(3, 5)] = 7.8;
|
|
||||||
|
|
||||||
// Print the value at index (3, 5).
|
|
||||||
std::cout << c[std::make_tuple(3, 5)] << std::endl;
|
|
||||||
*/
|
|
@@ -1,21 +0,0 @@
|
|||||||
#ifndef CCMETRICS_H
|
|
||||||
#define CCMETRICS_H
|
|
||||||
#include "typesFImdlp.h"
|
|
||||||
#include <cmath>
|
|
||||||
namespace mdlp {
|
|
||||||
class Metrics {
|
|
||||||
protected:
|
|
||||||
labels& y;
|
|
||||||
indices_t& indices;
|
|
||||||
int numClasses;
|
|
||||||
cacheEnt_t entropyCache;
|
|
||||||
cacheIg_t igCache;
|
|
||||||
public:
|
|
||||||
Metrics(labels&, indices_t&);
|
|
||||||
void setData(labels&, indices_t&);
|
|
||||||
int computeNumClasses(size_t, size_t);
|
|
||||||
float entropy(size_t, size_t);
|
|
||||||
float informationGain(size_t, size_t, size_t);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
#endif
|
|
@@ -3,16 +3,13 @@
|
|||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
from libcpp cimport bool
|
from libcpp cimport bool
|
||||||
|
|
||||||
cdef extern from "ccFImdlp.h" namespace "mdlp":
|
cdef extern from "CPPFImdlp.h" namespace "mdlp":
|
||||||
cdef struct CutPointBody:
|
ctypedef float precision_t
|
||||||
size_t start, end;
|
|
||||||
int classNumber;
|
|
||||||
float fromValue, toValue;
|
|
||||||
cdef cppclass CPPFImdlp:
|
cdef cppclass CPPFImdlp:
|
||||||
CPPFImdlp() except +
|
CPPFImdlp() except +
|
||||||
CPPFImdlp(bool, int, bool) except +
|
CPPFImdlp(bool, bool) except +
|
||||||
CPPFImdlp& fitx(vector[float]&, vector[int]&)
|
CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
|
||||||
vector[float] getCutPointsx()
|
vector[precision_t] getCutPoints()
|
||||||
|
|
||||||
|
|
||||||
class PcutPoint_t:
|
class PcutPoint_t:
|
||||||
@@ -24,14 +21,14 @@ class PcutPoint_t:
|
|||||||
|
|
||||||
cdef class CFImdlp:
|
cdef class CFImdlp:
|
||||||
cdef CPPFImdlp *thisptr
|
cdef CPPFImdlp *thisptr
|
||||||
def __cinit__(self, precision=6, debug=False, proposal=True):
|
def __cinit__(self, debug=False, proposal=True):
|
||||||
# Proposal or original algorithm
|
# Proposal or original algorithm
|
||||||
self.thisptr = new CPPFImdlp(proposal, precision, debug)
|
self.thisptr = new CPPFImdlp(proposal, debug)
|
||||||
def __dealloc__(self):
|
def __dealloc__(self):
|
||||||
del self.thisptr
|
del self.thisptr
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
self.thisptr.fitx(X, y)
|
self.thisptr.fit(X, y)
|
||||||
return self
|
return self
|
||||||
def get_cut_points(self):
|
def get_cut_points(self):
|
||||||
return self.thisptr.getCutPointsx()
|
return self.thisptr.getCutPoints()
|
||||||
|
|
Binary file not shown.
@@ -1,36 +0,0 @@
|
|||||||
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
// One interval produced by the discretizer.
struct CutPointBody {
    size_t start;      // first index of the interval in the sorted vector
    size_t end;        // end index of the interval in the sorted vector
    int classNumber;   // class assigned to the cut point
    float fromValue;   // lower bound of the interval
    float toValue;     // upper bound of the interval
};
|
|
||||||
typedef CutPointBody cutPoint_t;
|
|
||||||
typedef vector<float> samples;
|
|
||||||
typedef vector<int> labels;
|
|
||||||
typedef vector<size_t> indices_t;
|
|
||||||
typedef vector<cutPoint_t> cutPoints_t;
|
|
||||||
//typedef std::map<std::tuple<int, int>, float> cache_t;
|
|
||||||
// Lightweight (index, value) pair; aliased as xcutPoint_t just below.
// NOTE(review): presumably the position and threshold of a cut point -- confirm against callers.
struct cutPointStruct {
    size_t index;
    float value;
};
|
|
||||||
typedef cutPointStruct xcutPoint_t;
|
|
||||||
typedef vector<xcutPoint_t> xcutPoints_t;
|
|
||||||
class Metrics {
|
|
||||||
private:
|
|
||||||
labels& y;
|
|
||||||
indices_t& indices;
|
|
||||||
int numClasses;
|
|
||||||
public:
|
|
||||||
Metrics(labels&, indices_t&);
|
|
||||||
int computeNumClasses(size_t, size_t);
|
|
||||||
float entropy(size_t, size_t);
|
|
||||||
float informationGain(size_t, size_t, size_t);
|
|
||||||
};
|
|
||||||
// Binds the helper to the caller's label and index vectors (by reference) and
// records the number of classes found over the whole index range.
Metrics::Metrics(labels& y_, indices_t& indices_)
    : y(y_),
      indices(indices_)
{
    const size_t nSamples = indices.size();
    numClasses = computeNumClasses(0, nSamples);
}
|
|
BIN
fimdlp/main
BIN
fimdlp/main
Binary file not shown.
@@ -1,52 +0,0 @@
|
|||||||
#include "CPPFImdlp.h"
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include <sstream>
|
|
||||||
using namespace std;
|
|
||||||
using namespace mdlp;
|
|
||||||
|
|
||||||
int main()
|
|
||||||
{
|
|
||||||
ifstream fin("kdd_JapaneseVowels.arff");
|
|
||||||
if (!fin.is_open()) {
|
|
||||||
cout << "Error opening file" << endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
// Read the Data from the file
|
|
||||||
// as String Vector
|
|
||||||
size_t col;
|
|
||||||
vector<string> row;
|
|
||||||
string line, word;
|
|
||||||
vector<vector<float>> dataset = vector<vector<float>>(15, vector<float>());
|
|
||||||
while (getline(fin, line)) {
|
|
||||||
if (count++ > 215) {
|
|
||||||
stringstream ss(line);
|
|
||||||
col = 0;
|
|
||||||
while (getline(ss, word, ',')) {
|
|
||||||
col = col % 15;
|
|
||||||
dataset[col].push_back(stof(word));
|
|
||||||
cout << col << "-" << word << " ";
|
|
||||||
col++;
|
|
||||||
}
|
|
||||||
cout << endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
labels y = labels(dataset[0].begin(), dataset[0].end());
|
|
||||||
cout << "Column 0 (y): " << y.size() << endl;
|
|
||||||
for (auto item : y) {
|
|
||||||
cout << item << " ";
|
|
||||||
}
|
|
||||||
CPPFImdlp test = CPPFImdlp(false, 6, true);
|
|
||||||
test.fit(dataset[3], y);
|
|
||||||
cout << "Cut points: " << test.getCutPoints().size() << endl;
|
|
||||||
for (auto item : test.getCutPoints()) {
|
|
||||||
cout << item << " ";
|
|
||||||
}
|
|
||||||
fin.close();
|
|
||||||
return 0;
|
|
||||||
}
|
|
@@ -1,6 +1,5 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from .cppfimdlp import CFImdlp
|
from .cppfimdlp import CFImdlp
|
||||||
from .pyfimdlp import PyFImdlp
|
|
||||||
from sklearn.base import BaseEstimator, TransformerMixin
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
from sklearn.utils.multiclass import unique_labels
|
from sklearn.utils.multiclass import unique_labels
|
||||||
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
|
||||||
|
@@ -1,479 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
from math import log2
|
|
||||||
from types import SimpleNamespace
|
|
||||||
|
|
||||||
|
|
||||||
class PyFImdlp:
    """Pure-Python MDLP discretizer for a single feature.

    ``fit`` sorts the samples by value and recursively looks for cut points on
    class boundaries; a candidate is kept only when its information gain
    exceeds the MDLP threshold computed in ``mdlp``.

    Parameters
    ----------
    proposal : bool
        Stored on the instance; not used by the methods of this class.
    debug : bool
        When True, trace information is printed while fitting.
    """

    def __init__(self, proposal=True, debug=False):
        self.proposal = proposal
        self.n_features_ = None
        self.X_ = None
        self.y_ = None
        self.debug = debug
        self.features_ = None
        self.cut_points_ = []
        self.entropy_cache = {}
        self.information_gain_cache = {}

    def fit(self, X, y):
        """Compute the cut points for feature values ``X`` and labels ``y``.

        Parameters
        ----------
        X : 1-D sequence of numbers
            Feature values.
        y : 1-D sequence of non-negative ints
            Class labels (counted with ``np.bincount``).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must have the same size")
        self.n_features_ = len(X)
        # Stable sort so that tied values keep their input order.
        self.indices_ = np.argsort(X, kind="stable")
        self.use_indices = False
        # Work on sorted copies unless index indirection is requested.
        self.X_ = X[self.indices_] if not self.use_indices else X
        self.y_ = y[self.indices_] if not self.use_indices else y
        # Results and caches are keyed by positions, so they are only valid
        # for the data of the current fit.
        self.cut_points_ = []
        self.entropy_cache = {}
        self.information_gain_cache = {}
        self.compute_cut_points(0, len(y))
        return self

    def get_cut_points(self):
        """Return the accepted cut values, sorted and without duplicates."""
        return sorted(list(set([cut.value for cut in self.cut_points_])))

    def compute_cut_points(self, start, end):
        """Recursively split ``[start, end)`` while MDLP accepts the best cut."""
        cut = self.get_candidate(start, end)
        if cut.value is None:
            return
        if self.debug:
            print("cut: ", cut.value, " index: ", cut.index)
        if self.mdlp(cut, start, end):
            if self.debug:
                print("¡Ding!", cut.value, cut.index)
            self.cut_points_.append(cut)
            self.compute_cut_points(start, cut.index)
            self.compute_cut_points(cut.index, end)

    def mdlp(self, cut, start, end):
        """Return True when ``cut`` passes the MDLP acceptance criterion.

        The cut is accepted when the information gain is greater than
        ``(log2(N - 1) + delta) / N`` with
        ``delta = log2(3**k - 2) - (k*ent - k1*ent1 - k2*ent2)``.
        """
        N = end - start
        if N < 2:
            # log2(N - 1) is undefined; an interval this small cannot be split.
            return False
        k = self.num_classes(start, end)
        k1 = self.num_classes(start, cut.index)
        k2 = self.num_classes(cut.index, end)
        ent = self.entropy(start, end)
        ent1 = self.entropy(start, cut.index)
        ent2 = self.entropy(cut.index, end)
        ig = self.information_gain(start, cut.index, end)
        # math.log2 takes a single argument (the base is implicit).
        delta = log2(pow(3, k) - 2) - (
            float(k) * ent - float(k1) * ent1 - float(k2) * ent2
        )
        term = 1 / N * (log2(N - 1) + delta)
        if self.debug:
            print("start: ", start, " cut: ", cut.index, " end: ", end)
            print(
                "k=",
                k,
                " k1=",
                k1,
                " k2=",
                k2,
                " ent=",
                ent,
                " ent1=",
                ent1,
                " ent2=",
                ent2,
            )
            print("ig=", ig, " delta=", delta, " N ", N, " term ", term)
        return ig > term

    def num_classes(self, start, end):
        """Count the distinct labels found in ``[start, end)``."""
        n_classes = set()
        for i in range(start, end):
            n_classes.add(
                self.y_[self.indices_[i]] if self.use_indices else self.y_[i]
            )
        return len(n_classes)

    def get_candidate(self, start, end):
        """Return the best cutpoint candidate for the given range.

        Only positions where the label differs from the previous one are
        considered; the one with the smallest sum of left and right entropy
        wins.

        Parameters
        ----------
        start : int
            Start of the range.
        end : int
            End of the range.

        Returns
        -------
        candidate : SimpleNamespace with attributes index and value
            value == None if no candidate is found.
        """
        candidate = SimpleNamespace()
        candidate.value = None
        minEntropy = float("inf")
        for idx in range(start + 1, end):
            condition = (
                self.y_[self.indices_[idx]] == self.y_[self.indices_[idx - 1]]
                if self.use_indices
                else self.y_[idx] == self.y_[idx - 1]
            )
            if condition:
                continue
            entropy_left = self.entropy(start, idx)
            entropy_right = self.entropy(idx, end)
            entropy_cut = entropy_left + entropy_right
            if self.debug:
                print(
                    "idx: ",
                    idx,
                    " entropy_left: ",
                    entropy_left,
                    " entropy_right : ",
                    entropy_right,
                    " -> ",
                    start,
                    " ",
                    end,
                )
            if entropy_cut < minEntropy:
                minEntropy = entropy_cut
                candidate.index = idx
                # The cut value is the midpoint between the two neighbours.
                if self.use_indices:
                    candidate.value = (
                        self.X_[self.indices_[idx]]
                        + self.X_[self.indices_[idx - 1]]
                    ) / 2
                else:
                    candidate.value = (self.X_[idx] + self.X_[idx - 1]) / 2
        return candidate

    def entropy(self, start, end) -> float:
        """Shannon entropy (base 2) of the labels in ``[start, end)``, cached."""
        n_labels = end - start
        if n_labels <= 1:
            return 0
        if (start, end) in self.entropy_cache:
            return self.entropy_cache[(start, end)]
        if self.use_indices:
            counts = np.bincount(self.y_[self.indices_[start:end]])
        else:
            counts = np.bincount(self.y_[start:end])
        proportions = counts / n_labels
        n_classes = np.count_nonzero(proportions)
        if n_classes <= 1:
            return 0
        entropy = 0.0
        # Compute standard entropy.
        for prop in proportions:
            if prop != 0.0:
                entropy -= prop * log2(prop)
        self.entropy_cache[(start, end)] = entropy
        return entropy

    def information_gain(self, start, cut, end):
        """Entropy of ``[start, end)`` minus the weighted entropy of both sides of ``cut``."""
        if (start, cut, end) in self.information_gain_cache:
            return self.information_gain_cache[(start, cut, end)]
        labels = end - start
        if labels == 0:
            return 0.0
        entropy = self.entropy(start, end)
        card_left = cut - start
        entropy_left = self.entropy(start, cut)
        card_right = end - cut
        entropy_right = self.entropy(cut, end)
        result = (
            entropy
            - (card_left / labels) * entropy_left
            - (card_right / labels) * entropy_right
        )
        self.information_gain_cache[(start, cut, end)] = result
        return result
|
|
@@ -34,7 +34,7 @@ namespace mdlp {
|
|||||||
X = X_;
|
X = X_;
|
||||||
indices = indices_;
|
indices = indices_;
|
||||||
indices_t testSortedIndices = sortIndices(X);
|
indices_t testSortedIndices = sortIndices(X);
|
||||||
float prev = X[testSortedIndices[0]];
|
precision_t prev = X[testSortedIndices[0]];
|
||||||
for (auto i = 0; i < X.size(); ++i) {
|
for (auto i = 0; i < X.size(); ++i) {
|
||||||
EXPECT_EQ(testSortedIndices[i], indices[i]);
|
EXPECT_EQ(testSortedIndices[i], indices[i]);
|
||||||
EXPECT_LE(prev, X[testSortedIndices[i]]);
|
EXPECT_LE(prev, X[testSortedIndices[i]]);
|
||||||
@@ -162,7 +162,7 @@ namespace mdlp {
|
|||||||
fit(X, y);
|
fit(X, y);
|
||||||
computeCutPointsOriginal();
|
computeCutPointsOriginal();
|
||||||
cutPoints_t expected;
|
cutPoints_t expected;
|
||||||
vector<float> computed = getCutPoints();
|
vector<precision_t> computed = getCutPoints();
|
||||||
expected = {
|
expected = {
|
||||||
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
|
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
|
||||||
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 }
|
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 }
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
#include "../Metrics.h"
|
#include "../Metrics.h"
|
||||||
|
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
float precision = 0.000001;
|
precision_t precision = 0.000001;
|
||||||
TEST(MetricTest, NumClasses)
|
TEST(MetricTest, NumClasses)
|
||||||
{
|
{
|
||||||
labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
|
||||||
|
286
fimdlp/tests/bak/CPPFImdlp.cpp
Normal file
286
fimdlp/tests/bak/CPPFImdlp.cpp
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
#include "CPPFImdlp.h"
|
||||||
|
#include <numeric>
|
||||||
|
#include <iostream>
|
||||||
|
#include <algorithm>
|
||||||
|
#include "Metrics.h"
|
||||||
|
|
||||||
|
namespace mdlp {
|
||||||
|
// Streams a cut point as "<class> -> (<start>, <end>) - (<from>, <to>) ",
// terminated with endl (newline plus flush).
ostream& operator << (ostream& os, const cutPoint_t& cut)
{
    os << cut.classNumber;
    os << " -> (" << cut.start << ", " << cut.end;
    os << ") - (" << cut.fromValue << ", " << cut.toValue << ") ";
    os << endl;
    return os;
}
|
||||||
|
// Default configuration: proposed algorithm, 6 decimals of precision, no debug output.
CPPFImdlp::CPPFImdlp(): CPPFImdlp(true, 6, false)
{
}
// divider is 10^precision; it is used to round cut values in computeCutPointsOriginal.
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug)
    : proposal(proposal),
      precision(precision),
      debug(debug),
      divider(pow(10, precision)),
      numClasses(0)
{
}
CPPFImdlp::~CPPFImdlp() = default;
|
||||||
|
// Returns the upper bound (toValue) of every stored cut point, in order.
samples CPPFImdlp::getCutPoints()
{
    samples upperBounds;
    upperBounds.reserve(cutPoints.size());
    for (const auto& cut : cutPoints) {
        upperBounds.push_back(cut.toValue);
    }
    return upperBounds;
}
|
||||||
|
// Returns a copy of the per-sample interval numbers assigned by fit().
labels CPPFImdlp::getDiscretizedValues()
{
    return this->xDiscretized;
}
|
||||||
|
// Learns the cut points for feature X_ with labels y_ and labels every sample
// with the class number of the interval it falls in.
// Throws invalid_argument when the sizes differ or the input is empty.
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
{
    X = X_;
    y = y_;
    const bool sameSize = X.size() == y.size();
    if (!sameSize) {
        throw invalid_argument("X and y must have the same size");
    }
    if (X.empty() || y.empty()) {
        throw invalid_argument("X and y must have at least one element");
    }
    indices = sortIndices(X_);
    xDiscretized = labels(X.size(), -1);   // -1 marks "not assigned yet"
    numClasses = Metrics::numClasses(y, indices, 0, X.size());

    if (proposal) {
        computeCutPointsProposal();
    } else {
        computeCutPointsOriginal();
    }
    filterCutPoints();
    // Apply cut points to the input vector
    for (const auto& interval : cutPoints) {
        for (size_t pos = interval.start; pos < interval.end; ++pos) {
            xDiscretized[indices[pos]] = interval.classNumber;
        }
    }
    return *this;
}
|
||||||
|
// MDLP acceptance test: splits [rest.start, rest.end) at candidate.end and
// accepts the split when the information gain exceeds
// (log2(N - 1) + delta) / N. Intervals with fewer than two samples are rejected.
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{
    const auto N = precision_t(rest.end - rest.start);
    if (N < 2) {
        return false;
    }
    // Number of classes in the whole interval and on each side of the split.
    const int k = Metrics::numClasses(y, indices, rest.start, rest.end);
    const int k1 = Metrics::numClasses(y, indices, rest.start, candidate.end);
    const int k2 = Metrics::numClasses(y, indices, candidate.end, rest.end);
    // Entropies of the same three ranges.
    const precision_t ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses);
    const precision_t ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses);
    const precision_t ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses);
    const precision_t ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses);
    const precision_t delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
    const precision_t term = 1 / N * (log2(N - 1) + delta);
    if (debug) {
        cout << "Rest: " << rest;
        cout << "Candidate: " << candidate;
        cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
        cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
    }
    return ig > term;
}
|
||||||
|
void CPPFImdlp::filterCutPoints()
|
||||||
|
{
|
||||||
|
cutPoints_t filtered;
|
||||||
|
cutPoint_t rest, item;
|
||||||
|
int classNumber = 0;
|
||||||
|
|
||||||
|
rest.start = 0;
|
||||||
|
rest.end = X.size();
|
||||||
|
rest.fromValue = numeric_limits<precision_t>::lowest();
|
||||||
|
rest.toValue = numeric_limits<precision_t>::max();
|
||||||
|
rest.classNumber = classNumber;
|
||||||
|
bool first = true;
|
||||||
|
for (size_t index = 0; index < size_t(cutPoints.size()); index++) {
|
||||||
|
item = cutPoints[index];
|
||||||
|
if (evaluateCutPoint(rest, item)) {
|
||||||
|
if (debug)
|
||||||
|
cout << "Accepted: " << item << endl;
|
||||||
|
//Assign class number to the interval (cutpoint)
|
||||||
|
item.classNumber = classNumber++;
|
||||||
|
filtered.push_back(item);
|
||||||
|
first = false;
|
||||||
|
rest.start = item.end;
|
||||||
|
} else {
|
||||||
|
if (debug)
|
||||||
|
cout << "Rejected: " << item << endl;
|
||||||
|
if (index != size_t(cutPoints.size()) - 1) {
|
||||||
|
// Try to merge the rejected cutpoint with the next one
|
||||||
|
if (first) {
|
||||||
|
cutPoints[index + 1].fromValue = numeric_limits<precision_t>::lowest();
|
||||||
|
cutPoints[index + 1].start = indices[0];
|
||||||
|
} else {
|
||||||
|
cutPoints[index + 1].fromValue = item.fromValue;
|
||||||
|
cutPoints[index + 1].start = item.start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!first) {
|
||||||
|
filtered.back().toValue = numeric_limits<precision_t>::max();
|
||||||
|
filtered.back().end = X.size() - 1;
|
||||||
|
} else {
|
||||||
|
filtered.push_back(rest);
|
||||||
|
}
|
||||||
|
cutPoints = filtered;
|
||||||
|
}
|
||||||
|
void CPPFImdlp::computeCutPointsProposal()
|
||||||
|
{
|
||||||
|
cutPoints_t cutPts;
|
||||||
|
cutPoint_t cutPoint;
|
||||||
|
precision_t xPrev, xCur, xPivot;
|
||||||
|
int yPrev, yCur, yPivot;
|
||||||
|
size_t idx, numElements, start;
|
||||||
|
|
||||||
|
xCur = xPrev = X[indices[0]];
|
||||||
|
yCur = yPrev = y[indices[0]];
|
||||||
|
numElements = indices.size() - 1;
|
||||||
|
idx = start = 0;
|
||||||
|
bool firstCutPoint = true;
|
||||||
|
if (debug)
|
||||||
|
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
|
||||||
|
while (idx < numElements) {
|
||||||
|
xPivot = xCur;
|
||||||
|
yPivot = yCur;
|
||||||
|
if (debug)
|
||||||
|
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
||||||
|
// Read the same values and check class changes
|
||||||
|
do {
|
||||||
|
idx++;
|
||||||
|
xCur = X[indices[idx]];
|
||||||
|
yCur = y[indices[idx]];
|
||||||
|
if (yCur != yPivot && xCur == xPivot) {
|
||||||
|
yPivot = -1;
|
||||||
|
}
|
||||||
|
if (debug)
|
||||||
|
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
|
||||||
|
}
|
||||||
|
while (idx < numElements && xCur == xPivot);
|
||||||
|
// Check if the class changed and there are more than 1 element
|
||||||
|
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) {
|
||||||
|
// Must we add the entropy criteria here?
|
||||||
|
// if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point }
|
||||||
|
cutPoint.start = start;
|
||||||
|
cutPoint.end = idx;
|
||||||
|
start = idx;
|
||||||
|
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
|
||||||
|
cutPoint.toValue = (xPrev + xCur) / 2;
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
firstCutPoint = false;
|
||||||
|
if (debug) {
|
||||||
|
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
||||||
|
}
|
||||||
|
cutPts.push_back(cutPoint);
|
||||||
|
}
|
||||||
|
yPrev = yPivot;
|
||||||
|
xPrev = xPivot;
|
||||||
|
}
|
||||||
|
if (idx == numElements) {
|
||||||
|
cutPoint.start = start;
|
||||||
|
cutPoint.end = numElements + 1;
|
||||||
|
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
|
||||||
|
cutPoint.toValue = numeric_limits<precision_t>::max();
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
if (debug)
|
||||||
|
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
|
||||||
|
cutPts.push_back(cutPoint);
|
||||||
|
}
|
||||||
|
if (debug) {
|
||||||
|
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl;
|
||||||
|
for (auto cutPt : cutPts)
|
||||||
|
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
|
||||||
|
}
|
||||||
|
cutPoints = cutPts;
|
||||||
|
}
|
||||||
|
void CPPFImdlp::computeCutPointsOriginal()
|
||||||
|
{
|
||||||
|
cutPoints_t cutPts;
|
||||||
|
cutPoint_t cutPoint;
|
||||||
|
precision_t xPrev;
|
||||||
|
int yPrev;
|
||||||
|
bool first = true;
|
||||||
|
// idxPrev is the index of the init instance of the cutPoint
|
||||||
|
size_t index, idxPrev = 0, last, idx = indices[0];
|
||||||
|
xPrev = X[idx];
|
||||||
|
yPrev = y[idx];
|
||||||
|
last = indices.size() - 1;
|
||||||
|
for (index = 0; index < last; index++) {
|
||||||
|
idx = indices[index];
|
||||||
|
// Definition 2 Cut points are always on class boundaries &&
|
||||||
|
// there are more than 1 items in the interval
|
||||||
|
// if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut)
|
||||||
|
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) {
|
||||||
|
// Must we add the entropy criteria here?
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
|
||||||
|
} else {
|
||||||
|
cutPoint.fromValue = cutPts.back().toValue;
|
||||||
|
}
|
||||||
|
cutPoint.start = idxPrev;
|
||||||
|
cutPoint.end = index;
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
|
||||||
|
idxPrev = index;
|
||||||
|
cutPts.push_back(cutPoint);
|
||||||
|
}
|
||||||
|
xPrev = X[idx];
|
||||||
|
yPrev = y[idx];
|
||||||
|
}
|
||||||
|
if (first) {
|
||||||
|
cutPoint.start = 0;
|
||||||
|
cutPoint.classNumber = -1;
|
||||||
|
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
|
||||||
|
cutPoint.toValue = numeric_limits<precision_t>::max();
|
||||||
|
cutPts.push_back(cutPoint);
|
||||||
|
} else
|
||||||
|
cutPts.back().toValue = numeric_limits<precision_t>::max();
|
||||||
|
cutPts.back().end = X.size();
|
||||||
|
if (debug) {
|
||||||
|
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl;
|
||||||
|
for (auto cutPt : cutPts)
|
||||||
|
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
|
||||||
|
}
|
||||||
|
cutPoints = cutPts;
|
||||||
|
}
|
||||||
|
bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla
|
||||||
|
|
||||||
|
|
||||||
|
*/
|
||||||
|
precision_t entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses);
|
||||||
|
precision_t entropyRight = Metrics::entropy(y, indices, cut, end, numClasses);
|
||||||
|
precision_t entropyInterval = Metrics::entropy(y, indices, start, end, numClasses);
|
||||||
|
if (debug)
|
||||||
|
printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval);
|
||||||
|
//return (entropyInterval - (entropyLeft + entropyRight) > 0);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
|
||||||
|
// Returns the permutation that sorts X_ in ascending order (argsort).
// Equal values keep their input order because the sort is stable.
indices_t CPPFImdlp::sortIndices(samples& X_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    // One stable sort is enough; repeating it once per sample did not change the result.
    stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
        { return X_[i1] < X_[i2]; });
    return idx;
}
|
||||||
|
// Replaces the stored cut points with the given ones.
void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_)
{
    // The parameter is already a private copy, so it can be moved into place.
    this->cutPoints = std::move(cutPoints_);
}
|
||||||
|
}
|
39
fimdlp/tests/bak/CPPFImdlp.h
Normal file
39
fimdlp/tests/bak/CPPFImdlp.h
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
#ifndef CPPFIMDLP_H
|
||||||
|
#define CPPFIMDLP_H
|
||||||
|
#include "typesFImdlp.h"
|
||||||
|
#include <utility>
|
||||||
|
namespace mdlp {
|
||||||
|
class CPPFImdlp {
|
||||||
|
protected:
|
||||||
|
bool proposal; // proposed algorithm or original algorithm
|
||||||
|
int precision;
|
||||||
|
bool debug;
|
||||||
|
precision_t divider;
|
||||||
|
indices_t indices; // sorted indices to use with X and y
|
||||||
|
samples X;
|
||||||
|
labels y;
|
||||||
|
labels xDiscretized;
|
||||||
|
int numClasses;
|
||||||
|
cutPoints_t cutPoints;
|
||||||
|
|
||||||
|
void setCutPoints(cutPoints_t);
|
||||||
|
static indices_t sortIndices(samples&);
|
||||||
|
void computeCutPointsOriginal();
|
||||||
|
void computeCutPointsProposal();
|
||||||
|
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
|
||||||
|
void filterCutPoints();
|
||||||
|
bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy
|
||||||
|
|
||||||
|
public:
|
||||||
|
CPPFImdlp();
|
||||||
|
CPPFImdlp(bool, int, bool debug = false);
|
||||||
|
~CPPFImdlp();
|
||||||
|
samples getCutPoints();
|
||||||
|
indices_t getIndices();
|
||||||
|
labels getDiscretizedValues();
|
||||||
|
void debugPoints(samples&, labels&);
|
||||||
|
CPPFImdlp& fit(samples&, labels&);
|
||||||
|
labels transform(samples&);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif
|
47
fimdlp/tests/bak/Metrics.cpp
Normal file
47
fimdlp/tests/bak/Metrics.cpp
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
#include "Metrics.h"
|
||||||
|
#include <set>
|
||||||
|
namespace mdlp {
|
||||||
|
// Compiler-generated default constructor.
Metrics::Metrics() = default;
|
||||||
|
/**
 * Counts the distinct label values found in the half-open range [start, end)
 * of the index vector.
 *
 * @param y       labels, addressed through indices
 * @param indices positions into y (received by value, i.e. copied per call)
 * @param start   first position of the range in indices
 * @param end     one past the last position of the range in indices
 * @return number of different labels seen in the range (0 if start >= end)
 */
int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end)
{
    std::set<int> distinctLabels;
    size_t pos = start;
    while (pos < end) {
        distinctLabels.insert(y[indices[pos]]);
        ++pos;
    }
    return static_cast<int>(distinctLabels.size());
}
|
||||||
|
/**
 * Shannon entropy (base 2) of the labels referenced by the half-open range
 * [start, end) of the index vector.
 *
 * Labels are used directly as histogram positions, so they must be
 * non-negative.
 *
 * @param y        labels, addressed through indices
 * @param indices  positions into y
 * @param start    first position of the range in indices
 * @param end      one past the last position of the range in indices
 * @param nClasses lower bound for the largest label value; the histogram is
 *                 grown beyond it when the range contains a larger label
 * @return entropy of the range, never negative; 0 for an empty range
 */
precision_t Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses)
{
    if (start >= end) {
        // Nothing to count. This also keeps the loops below from running on a reversed range.
        return 0;
    }
    // The histogram is indexed by label VALUE, not by how many distinct labels
    // exist. Callers may pass a distinct-label count that is smaller than the
    // largest label (e.g. a range holding only label 2 has 1 class), so size
    // the histogram from the labels actually present to avoid writing past it.
    int maxLabel = nClasses > 0 ? nClasses : 0;
    for (size_t pos = start; pos < end; ++pos) {
        const int label = y[indices[pos]];
        if (label > maxLabel) {
            maxLabel = label;
        }
    }
    labels counts(static_cast<size_t>(maxLabel) + 1, 0);
    // Index-based iteration: taking &indices[end] would call operator[] one
    // past the last element whenever end == indices.size().
    for (size_t pos = start; pos < end; ++pos) {
        counts[y[indices[pos]]]++;
    }
    const int nElements = static_cast<int>(end - start);
    precision_t result = 0;
    for (auto count : counts) {
        if (count > 0) {
            const precision_t p = static_cast<precision_t>(count) / nElements;
            result -= p * log2(p);
        }
    }
    // Rounding can leave a tiny negative value for a pure range; clamp it.
    return result < 0 ? 0 : result;
}
|
||||||
|
/**
 * Information gain obtained by splitting the range [start, end) of the index
 * vector at cutPoint into [start, cutPoint) and [cutPoint, end):
 * the entropy of the whole range minus the size-weighted mean of the
 * entropies of both parts.
 *
 * @param y        labels, addressed through indices
 * @param indices  positions into y
 * @param start    first position of the range in indices
 * @param end      one past the last position of the range in indices
 * @param cutPoint position in indices where the range is split
 * @param nClasses class bound forwarded to entropy() for all three ranges
 * @return information gain of the split (the division by end - start is not
 *         guarded, so an empty range does not yield a meaningful value)
 */
precision_t Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses)
{
    const int nElementsLeft = static_cast<int>(cutPoint - start);
    const int nElementsRight = static_cast<int>(end - cutPoint);
    const int nElements = static_cast<int>(end - start);
    // entropy() sizes its label histogram from the class bound it receives.
    // Both partitions only contain labels that also occur in the whole range,
    // so the parent's bound is valid for them. The number of distinct labels
    // in a partition is not: a part holding only label 2 has 1 class, and a
    // histogram sized for 1 class is too small to count label 2.
    const precision_t entropyAll = Metrics::entropy(y, indices, start, end, nClasses);
    const precision_t entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClasses);
    const precision_t entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClasses);
    const precision_t weighted = static_cast<precision_t>(nElementsLeft) * entropyLeft
        + static_cast<precision_t>(nElementsRight) * entropyRight;
    return entropyAll - weighted / nElements;
}
|
||||||
|
|
||||||
|
}
|
14
fimdlp/tests/bak/Metrics.h
Normal file
14
fimdlp/tests/bak/Metrics.h
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
#ifndef METRICS_H
|
||||||
|
#define METRICS_H
|
||||||
|
#include "typesFImdlp.h"
|
||||||
|
#include <cmath>
|
||||||
|
namespace mdlp {
|
||||||
|
class Metrics {
|
||||||
|
public:
|
||||||
|
Metrics();
|
||||||
|
static int numClasses(labels&, indices_t, size_t, size_t);
|
||||||
|
static precision_t entropy(labels&, indices_t&, size_t, size_t, int);
|
||||||
|
static precision_t informationGain(labels&, indices_t&, size_t, size_t, size_t, int);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
#endif
|
@@ -5,21 +5,12 @@
|
|||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
namespace mdlp {
|
namespace mdlp {
|
||||||
struct CutPointBody {
|
typedef float precision_t;
|
||||||
size_t start, end; // indices of the sorted vector
|
typedef vector<precision_t> samples;
|
||||||
};
|
|
||||||
typedef CutPointBody cutPoint_t;
|
|
||||||
typedef vector<float> samples;
|
|
||||||
typedef vector<int> labels;
|
typedef vector<int> labels;
|
||||||
typedef vector<size_t> indices_t;
|
typedef vector<size_t> indices_t;
|
||||||
typedef vector<cutPoint_t> cutPoints_t;
|
typedef vector<precision_t> cutPoints_t;
|
||||||
typedef map<tuple<int, int>, float> cacheEnt_t;
|
typedef map<tuple<int, int>, precision_t> cacheEnt_t;
|
||||||
typedef map<tuple<int, int, int>, float> cacheIg_t;
|
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
|
||||||
struct cutPointStruct {
|
|
||||||
size_t index;
|
|
||||||
float value;
|
|
||||||
};
|
|
||||||
typedef cutPointStruct xcutPoint_t;
|
|
||||||
typedef vector<xcutPoint_t> xcutPoints_t;
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
@@ -13,7 +13,7 @@ namespace FImdlp {
|
|||||||
int n = X.size();
|
int n = X.size();
|
||||||
for (i = 1; i < n; i++) {
|
for (i = 1; i < n; i++) {
|
||||||
if (X.at(i) != ant) {
|
if (X.at(i) != ant) {
|
||||||
cutPts.push_back(float(X.at(i) + ant) / 2);
|
cutPts.push_back(precision_t(X.at(i) + ant) / 2);
|
||||||
ant = X.at(i);
|
ant = X.at(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -5,7 +5,7 @@ from libcpp.vector cimport vector
|
|||||||
cdef extern from "FImdlp.h" namespace "FImdlp":
|
cdef extern from "FImdlp.h" namespace "FImdlp":
|
||||||
cdef cppclass FImdlp:
|
cdef cppclass FImdlp:
|
||||||
FImdlp() except +
|
FImdlp() except +
|
||||||
vector[float] cutPoints(vector[int]&, vector[int]&)
|
vector[precision_t] cutPoints(vector[int]&, vector[int]&)
|
||||||
|
|
||||||
cdef class CFImdlp:
|
cdef class CFImdlp:
|
||||||
cdef FImdlp *thisptr
|
cdef FImdlp *thisptr
|
||||||
|
6
setup.py
6
setup.py
@@ -12,10 +12,8 @@ setup(
|
|||||||
name="cppfimdlp",
|
name="cppfimdlp",
|
||||||
sources=[
|
sources=[
|
||||||
"fimdlp/cfimdlp.pyx",
|
"fimdlp/cfimdlp.pyx",
|
||||||
# "fimdlp/CPPFImdlp.cpp",
|
"fimdlp/CPPFImdlp.cpp",
|
||||||
# "fimdlp/Metrics.cpp",
|
"fimdlp/Metrics.cpp",
|
||||||
"fimdlp/ccMetrics.cc",
|
|
||||||
"fimdlp/ccFImdlp.cc",
|
|
||||||
],
|
],
|
||||||
language="c++",
|
language="c++",
|
||||||
include_dirs=["fimdlp"],
|
include_dirs=["fimdlp"],
|
||||||
|
Reference in New Issue
Block a user