Refactor base algorithm

This commit is contained in:
2022-12-08 22:28:21 +01:00
parent c4e5cf1629
commit 4939a5b673
25 changed files with 538 additions and 1130 deletions

View File

@@ -1,41 +1,20 @@
#include "CPPFImdlp.h"
#include <numeric> #include <numeric>
#include <iostream> #include <iostream>
#include <algorithm> #include <algorithm>
#include <set>
#include "CPPFImdlp.h"
#include "Metrics.h" #include "Metrics.h"
namespace mdlp { namespace mdlp {
ostream& operator << (ostream& os, const cutPoint_t& cut) CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
{ {
os << cut.classNumber << " -> (" << cut.start << ", " << cut.end <<
") - (" << cut.fromValue << ", " << cut.toValue << ") "
<< endl;
return os;
} }
CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false) CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices))
{ {
divider = pow(10, precision);
numClasses = 0;
}
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug)
{
divider = pow(10, precision);
numClasses = 0;
} }
CPPFImdlp::~CPPFImdlp() CPPFImdlp::~CPPFImdlp()
= default; = default;
samples CPPFImdlp::getCutPoints()
{
samples output(cutPoints.size());
::transform(cutPoints.begin(), cutPoints.end(), output.begin(),
[](cutPoint_t cut) { return cut.toValue; });
return output;
}
labels CPPFImdlp::getDiscretizedValues()
{
return xDiscretized;
}
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_) CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
{ {
X = X_; X = X_;
@@ -47,227 +26,78 @@ namespace mdlp {
throw invalid_argument("X and y must have at least one element"); throw invalid_argument("X and y must have at least one element");
} }
indices = sortIndices(X_); indices = sortIndices(X_);
xDiscretized = labels(X.size(), -1); metrics.setData(y, indices);
numClasses = Metrics::numClasses(y, indices, 0, X.size()); computeCutPoints(0, X.size());
if (proposal) {
computeCutPointsProposal();
} else {
computeCutPointsOriginal();
}
filterCutPoints();
// Apply cut points to the input vector
for (auto cut : cutPoints) {
for (size_t i = cut.start; i < cut.end; i++) {
xDiscretized[indices[i]] = cut.classNumber;
}
}
return *this; return *this;
} }
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate) void CPPFImdlp::computeCutPoints(size_t start, size_t end)
{
int cut;
if (end - start < 2)
return;
cut = getCandidate(start, end);
if (cut == -1 || !mdlp(start, cut, end)) {
// cut.value == -1 means that there is no candidate in the interval
// No boundary found, so we add both ends of the interval as cutpoints
// because they were selected by the algorithm before
if (start != 0)
cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2);
if (end != X.size())
cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2);
return;
}
computeCutPoints(start, cut);
computeCutPoints(cut, end);
}
long int CPPFImdlp::getCandidate(size_t start, size_t end)
{
long int candidate = -1, elements = end - start;
precision_t entropy_left, entropy_right, minEntropy = numeric_limits<precision_t>::max();
for (auto idx = start + 1; idx < end; idx++) {
// Cutpoints are always on boudndaries
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{ {
int k, k1, k2; int k, k1, k2;
float ig, delta; precision_t ig, delta;
float ent, ent1, ent2; precision_t ent, ent1, ent2;
auto N = float(rest.end - rest.start); auto N = precision_t(end - start);
if (N < 2) { if (N < 2) {
return false; return false;
} }
k = Metrics::numClasses(y, indices, rest.start, rest.end); k = metrics.computeNumClasses(start, end);
k1 = Metrics::numClasses(y, indices, rest.start, candidate.end); k1 = metrics.computeNumClasses(start, cut);
k2 = Metrics::numClasses(y, indices, candidate.end, rest.end); k2 = metrics.computeNumClasses(cut, end);
ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses); ent = metrics.entropy(start, end);
ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses); ent1 = metrics.entropy(start, cut);
ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses); ent2 = metrics.entropy(cut, end);
ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses); ig = metrics.informationGain(start, cut, end);
delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2); delta = log2(pow(3, precision_t(k)) - 2) -
float term = 1 / N * (log2(N - 1) + delta); (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
if (debug) { precision_t term = 1 / N * (log2(N - 1) + delta);
cout << "Rest: " << rest; return ig > term;
cout << "Candidate: " << candidate;
cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
} }
return (ig > term); cutPoints_t CPPFImdlp::getCutPoints()
}
void CPPFImdlp::filterCutPoints()
{ {
cutPoints_t filtered; // Remove duplicates and sort
cutPoint_t rest, item; cutPoints_t output(cutPoints.size());
int classNumber = 0; set<precision_t> s;
unsigned size = cutPoints.size();
rest.start = 0; for (unsigned i = 0; i < size; i++)
rest.end = X.size(); s.insert(cutPoints[i]);
rest.fromValue = numeric_limits<float>::lowest(); output.assign(s.begin(), s.end());
rest.toValue = numeric_limits<float>::max(); sort(output.begin(), output.end());
rest.classNumber = classNumber; return output;
bool first = true;
for (size_t index = 0; index < size_t(cutPoints.size()); index++) {
item = cutPoints[index];
if (evaluateCutPoint(rest, item)) {
if (debug)
cout << "Accepted: " << item << endl;
//Assign class number to the interval (cutpoint)
item.classNumber = classNumber++;
filtered.push_back(item);
first = false;
rest.start = item.end;
} else {
if (debug)
cout << "Rejected: " << item << endl;
if (index != size_t(cutPoints.size()) - 1) {
// Try to merge the rejected cutpoint with the next one
if (first) {
cutPoints[index + 1].fromValue = numeric_limits<float>::lowest();
cutPoints[index + 1].start = indices[0];
} else {
cutPoints[index + 1].fromValue = item.fromValue;
cutPoints[index + 1].start = item.start;
}
}
}
}
if (!first) {
filtered.back().toValue = numeric_limits<float>::max();
filtered.back().end = X.size() - 1;
} else {
filtered.push_back(rest);
}
cutPoints = filtered;
}
void CPPFImdlp::computeCutPointsProposal()
{
cutPoints_t cutPts;
cutPoint_t cutPoint;
float xPrev, xCur, xPivot;
int yPrev, yCur, yPivot;
size_t idx, numElements, start;
xCur = xPrev = X[indices[0]];
yCur = yPrev = y[indices[0]];
numElements = indices.size() - 1;
idx = start = 0;
bool firstCutPoint = true;
if (debug)
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
while (idx < numElements) {
xPivot = xCur;
yPivot = yCur;
if (debug)
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
// Read the same values and check class changes
do {
idx++;
xCur = X[indices[idx]];
yCur = y[indices[idx]];
if (yCur != yPivot && xCur == xPivot) {
yPivot = -1;
}
if (debug)
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
}
while (idx < numElements && xCur == xPivot);
// Check if the class changed and there are more than 1 element
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) {
// Must we add the entropy criteria here?
// if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point }
cutPoint.start = start;
cutPoint.end = idx;
start = idx;
cutPoint.fromValue = firstCutPoint ? numeric_limits<float>::lowest() : cutPts.back().toValue;
cutPoint.toValue = (xPrev + xCur) / 2;
cutPoint.classNumber = -1;
firstCutPoint = false;
if (debug) {
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
}
cutPts.push_back(cutPoint);
}
yPrev = yPivot;
xPrev = xPivot;
}
if (idx == numElements) {
cutPoint.start = start;
cutPoint.end = numElements + 1;
cutPoint.fromValue = firstCutPoint ? numeric_limits<float>::lowest() : cutPts.back().toValue;
cutPoint.toValue = numeric_limits<float>::max();
cutPoint.classNumber = -1;
if (debug)
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
cutPts.push_back(cutPoint);
}
if (debug) {
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl;
for (auto cutPt : cutPts)
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
void CPPFImdlp::computeCutPointsOriginal()
{
cutPoints_t cutPts;
cutPoint_t cutPoint;
float xPrev;
int yPrev;
bool first = true;
// idxPrev is the index of the init instance of the cutPoint
size_t index, idxPrev = 0, last, idx = indices[0];
xPrev = X[idx];
yPrev = y[idx];
last = indices.size() - 1;
for (index = 0; index < last; index++) {
idx = indices[index];
// Definition 2 Cut points are always on class boundaries &&
// there are more than 1 items in the interval
// if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut)
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) {
// Must we add the entropy criteria here?
if (first) {
first = false;
cutPoint.fromValue = numeric_limits<float>::lowest();
} else {
cutPoint.fromValue = cutPts.back().toValue;
}
cutPoint.start = idxPrev;
cutPoint.end = index;
cutPoint.classNumber = -1;
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
idxPrev = index;
cutPts.push_back(cutPoint);
}
xPrev = X[idx];
yPrev = y[idx];
}
if (first) {
cutPoint.start = 0;
cutPoint.classNumber = -1;
cutPoint.fromValue = numeric_limits<float>::lowest();
cutPoint.toValue = numeric_limits<float>::max();
cutPts.push_back(cutPoint);
} else
cutPts.back().toValue = numeric_limits<float>::max();
cutPts.back().end = X.size();
if (debug) {
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl;
for (auto cutPt : cutPts)
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end)
{
/*
Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla
*/
float entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses);
float entropyRight = Metrics::entropy(y, indices, cut, end, numClasses);
float entropyInterval = Metrics::entropy(y, indices, start, end, numClasses);
if (debug)
printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval);
//return (entropyInterval - (entropyLeft + entropyRight) > 0);
return true;
} }
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples& X_) indices_t CPPFImdlp::sortIndices(samples& X_)
@@ -275,12 +105,8 @@ namespace mdlp {
indices_t idx(X_.size()); indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0); iota(idx.begin(), idx.end(), 0);
for (size_t i = 0; i < X_.size(); i++) for (size_t i = 0; i < X_.size(); i++)
stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
{ return X_[i1] < X_[i2]; }); { return X_[i1] < X_[i2]; });
return idx; return idx;
} }
void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_)
{
cutPoints = cutPoints_;
}
} }

View File

@@ -1,39 +1,30 @@
#ifndef CPPFIMDLP_H #ifndef CPPFIMDLP_H
#define CPPFIMDLP_H #define CPPFIMDLP_H
#include "typesFImdlp.h" #include "typesFImdlp.h"
#include "Metrics.h"
#include <utility> #include <utility>
namespace mdlp { namespace mdlp {
class CPPFImdlp { class CPPFImdlp {
protected: protected:
bool proposal; // proposed algorithm or original algorithm bool proposal; // proposed algorithm or original algorithm
int precision;
bool debug; bool debug;
float divider;
indices_t indices; // sorted indices to use with X and y indices_t indices; // sorted indices to use with X and y
samples X; samples X;
labels y; labels y;
labels xDiscretized; Metrics metrics;
int numClasses;
cutPoints_t cutPoints; cutPoints_t cutPoints;
void setCutPoints(cutPoints_t);
static indices_t sortIndices(samples&); static indices_t sortIndices(samples&);
void computeCutPointsOriginal(); void computeCutPoints(size_t, size_t);
void computeCutPointsProposal(); long int getCandidate(size_t, size_t);
bool evaluateCutPoint(cutPoint_t, cutPoint_t); bool mdlp(size_t, size_t, size_t);
void filterCutPoints();
bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy
public: public:
CPPFImdlp(); CPPFImdlp();
CPPFImdlp(bool, int, bool debug = false); CPPFImdlp(bool, bool debug = false);
~CPPFImdlp(); ~CPPFImdlp();
samples getCutPoints();
indices_t getIndices();
labels getDiscretizedValues();
void debugPoints(samples&, labels&);
CPPFImdlp& fit(samples&, labels&); CPPFImdlp& fit(samples&, labels&);
labels transform(samples&); samples getCutPoints();
}; };
} }
#endif #endif

View File

@@ -1,46 +1,63 @@
#include "Metrics.h" #include "Metrics.h"
#include <set> #include <set>
#include <iostream>
using namespace std;
namespace mdlp { namespace mdlp {
Metrics::Metrics() Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t())
= default;
int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end)
{ {
std::set<int> numClasses; }
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) { for (auto i = start; i < end; ++i) {
numClasses.insert(y[indices[i]]); nClasses.insert(y[indices[i]]);
} }
return numClasses.size(); return nClasses.size();
} }
float Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses) void Metrics::setData(labels& y_, indices_t& indices_)
{ {
float entropy = 0; indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
}
precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p, ventropy = 0;
int nElements = 0; int nElements = 0;
labels counts(nClasses + 1, 0); labels counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
return entropyCache[make_tuple(start, end)];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) { for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++; counts[y[*i]]++;
nElements++; nElements++;
} }
for (auto count : counts) { for (auto count : counts) {
if (count > 0) { if (count > 0) {
float p = (float)count / nElements; p = (precision_t)count / nElements;
entropy -= p * log2(p); ventropy -= p * log2(p);
} }
} }
return entropy < 0 ? 0 : entropy; entropyCache[make_tuple(start, end)] = ventropy;
return ventropy;
} }
float Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses) precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{ {
float iGain; precision_t iGain;
float entropy, entropyLeft, entropyRight; precision_t entropyInterval, entropyLeft, entropyRight;
int nClassesLeft, nClassesRight; int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElementsLeft = cutPoint - start, nElementsRight = end - cutPoint;
int nElements = end - start; int nElements = end - start;
nClassesLeft = Metrics::numClasses(y, indices, start, cutPoint); if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
nClassesRight = Metrics::numClasses(y, indices, cutPoint, end); cout << "**********Cache IG hit for " << start << " " << end << endl;
entropy = Metrics::entropy(y, indices, start, end, nClasses); return igCache[make_tuple(start, cut, end)];
entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft); }
entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight); entropyInterval = entropy(start, end);
iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements; entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements;
igCache[make_tuple(start, cut, end)] = iGain;
return iGain; return iGain;
} }

View File

@@ -1,14 +1,21 @@
#ifndef METRICS_H #ifndef CCMETRICS_H
#define METRICS_H #define CCMETRICS_H
#include "typesFImdlp.h" #include "typesFImdlp.h"
#include <cmath> #include <cmath>
namespace mdlp { namespace mdlp {
class Metrics { class Metrics {
protected:
labels& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache;
cacheIg_t igCache;
public: public:
Metrics(); Metrics(labels&, indices_t&);
static int numClasses(labels&, indices_t, size_t, size_t); void setData(labels&, indices_t&);
static float entropy(labels&, indices_t&, size_t, size_t, int); int computeNumClasses(size_t, size_t);
static float informationGain(labels&, indices_t&, size_t, size_t, size_t, int); precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t);
}; };
} }
#endif #endif

View File

@@ -1,110 +0,0 @@
#include "ccFImdlp.h"
#include <numeric>
#include <iostream>
#include <algorithm>
#include <set>
#include "ccMetrics.h"
namespace mdlp {
// Default configuration: proposed algorithm, 6 decimals of precision, no debug.
// Delegates to the full constructor, which sets the same member values.
CPPFImdlp::CPPFImdlp(): CPPFImdlp(true, 6, false)
{
}
// Build a discretizer with the given settings.
//   proposal:  stored flag (proposed vs. original algorithm)
//   precision: number of decimals; divider is set to 10^precision
//   debug:     stored flag
// metrics is bound to this object's own y and indices members.
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug)
    : proposal(proposal),
      precision(precision),
      debug(debug),
      divider(pow(10, precision)),
      indices(),
      y(),
      metrics(y, indices)
{
}
// No explicit cleanup is needed: the compiler-generated destructor is used.
CPPFImdlp::~CPPFImdlp()
= default;
// Compute the cut points of the feature X_ given the class labels y_.
//   X_: values of one feature, one per sample
//   y_: label of each sample, same length as X_
// Returns *this so calls can be chained.
// Throws invalid_argument when the sizes differ or the input is empty.
CPPFImdlp& CPPFImdlp::fitx(samples& X_, labels& y_)
{
    // Validate before touching any state so a failed call leaves the object untouched
    if (X_.size() != y_.size()) {
        throw invalid_argument("X and y must have the same size");
    }
    if (X_.empty()) {
        throw invalid_argument("X and y must have at least one element");
    }
    X = X_;
    y = y_;
    // Discard the cut points of a previous fit; otherwise they would accumulate
    xCutPoints.clear();
    indices = sortIndices(X_);
    metrics.setData(y, indices);
    computeCutPoints(0, X.size());
    return *this;
}
// Recursively split the interval [start, end) of the sorted sample.
// When the interval has no candidate, or the candidate is rejected by mdlp(),
// the interval is final and its borders (except the borders of the whole
// sample) are stored as cut points, each one placed halfway between the two
// neighbouring values. Otherwise both halves are processed recursively.
void CPPFImdlp::computeCutPoints(size_t start, size_t end)
{
    if (end - start < 2)
        return;
    // Keep the full width returned by getCandidate (long int); storing it in an
    // int would truncate positions in very large samples
    const long int cut = getCandidate(start, end);
    if (cut == -1 || !mdlp(start, cut, end)) {
        // cut == -1 means that there is no candidate in the interval
        // that enhances the information gain
        if (start != 0)
            xCutPoints.push_back(xcutPoint_t({ start, (X[indices[start]] + X[indices[start - 1]]) / 2 }));
        if (end != X.size())
            xCutPoints.push_back(xcutPoint_t({ end, (X[indices[end]] + X[indices[end - 1]]) / 2 }));
        return;
    }
    computeCutPoints(start, cut);
    computeCutPoints(cut, end);
}
// Look for the best cut candidate inside [start, end) of the sorted sample.
// Only positions where the label differs from the previous one are considered;
// for each of them the size-weighted entropies of both sides are added and the
// position with the lowest sum wins.
// Returns the winning position, or -1 when the interval has no label change.
long int CPPFImdlp::getCandidate(size_t start, size_t end)
{
    long int best = -1;
    long int elements = end - start;
    float lowest = numeric_limits<float>::max();
    for (auto pos = start + 1; pos < end; pos++) {
        // Cut points are always on class boundaries
        if (y[indices[pos]] != y[indices[pos - 1]]) {
            float left = float(pos - start) / elements * metrics.entropy(start, pos);
            float right = float(end - pos) / elements * metrics.entropy(pos, end);
            if (left + right < lowest) {
                lowest = left + right;
                best = pos;
            }
        }
    }
    return best;
}
// Decide whether splitting [start, end) at cut is accepted.
// The information gain of the split is compared against the threshold
//   (log2(N - 1) + delta) / N
// where N is the interval size and
//   delta = log2(3^k - 2) - (k * ent - k1 * ent1 - k2 * ent2)
// with k/ent, k1/ent1, k2/ent2 the number of classes and the entropy of the
// whole interval, the left side and the right side.
// Returns true when the gain is strictly greater than the threshold; intervals
// with fewer than two samples are always rejected.
bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
    const auto nSamples = float(end - start);
    if (nSamples < 2) {
        return false;
    }
    const int kAll = metrics.computeNumClasses(start, end);
    const int kLeft = metrics.computeNumClasses(start, cut);
    const int kRight = metrics.computeNumClasses(cut, end);
    const float entAll = metrics.entropy(start, end);
    const float entLeft = metrics.entropy(start, cut);
    const float entRight = metrics.entropy(cut, end);
    const float gain = metrics.informationGain(start, cut, end);
    const float delta = log2(pow(3, float(kAll)) - 2) - (float(kAll) * entAll - float(kLeft) * entLeft - float(kRight) * entRight);
    const float threshold = 1 / nSamples * (log2(nSamples - 1) + delta);
    return gain > threshold;
}
// Return the values of the stored cut points, without duplicates and in
// ascending order.
samples CPPFImdlp::getCutPointsx()
{
    // A set both removes duplicates and keeps the values ordered, so no extra
    // sort (nor a pre-sized output vector) is needed
    set<float> unique;
    for (const auto& cutPoint : xCutPoints)
        unique.insert(cutPoint.value);
    return samples(unique.begin(), unique.end());
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
// Return the positions of X_ ordered by ascending value (argsort).
// X_ itself is not modified.
indices_t CPPFImdlp::sortIndices(samples& X_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    // Sort once: the previous version wrapped this call in a loop over every
    // element and re-sorted the whole vector X_.size() times
    sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
        { return X_[i1] < X_[i2]; });
    return idx;
}
}

View File

@@ -1,32 +0,0 @@
#ifndef CCFIMDLP_H
#define CCFIMDLP_H
#include "typesFImdlp.h"
#include "ccMetrics.h"
#include <utility>
namespace mdlp {
// Discretizer of one continuous feature: fitx() computes the cut points of a
// feature given the class labels and getCutPointsx() returns them.
// NOTE(review): the constructors build metrics from the y and indices members,
// and Metrics stores references, so a copy of this object presumably keeps
// referring to the original's vectors - confirm before copying instances.
class CPPFImdlp {
protected:
bool proposal; // proposed algorithm or original algorithm
// number of decimals; the constructors use it to compute divider
int precision;
bool debug;
// 10^precision, set by the constructors
float divider;
indices_t indices; // sorted indices to use with X and y
// copy of the feature values received by fitx()
samples X;
// copy of the labels received by fitx()
labels y;
// entropy / information gain helper built from y and indices
Metrics metrics;
// cut points collected by computeCutPoints()
xcutPoints_t xCutPoints;
// argsort: positions of the samples ordered by value
static indices_t sortIndices(samples&);
// recursive partition of the interval [start, end)
void computeCutPoints(size_t, size_t);
// best cut position in [start, end), or -1 if there is none
long int getCandidate(size_t, size_t);
// acceptance test for the cut (start, cut, end)
bool mdlp(size_t, size_t, size_t);
public:
CPPFImdlp();
// (proposal, precision, debug)
CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp();
// computes the cut points for (X, y); returns *this
CPPFImdlp& fitx(samples&, labels&);
// cut point values, sorted and without duplicates
samples getCutPointsx();
};
}
#endif

View File

@@ -1,74 +0,0 @@
#include "ccMetrics.h"
#include <set>
#include <iostream>
using namespace std;
namespace mdlp {
// Bind the metrics to the given labels and sorted indices (kept as references)
// and count the classes of the whole sample. Both caches start empty.
Metrics::Metrics(labels& y_, indices_t& indices_)
    : y(y_),
      indices(indices_),
      numClasses(computeNumClasses(0, indices.size())),
      entropyCache(),
      igCache()
{
}
int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return nClasses.size();
}
// Replace the data the metrics work on and recompute the number of classes.
// y and indices are references, so the assignments copy the new contents into
// the vectors this object is bound to.
void Metrics::setData(labels& y_, indices_t& indices_)
{
    indices = indices_;
    y = y_;
    numClasses = computeNumClasses(0, indices.size());
    // Cached values are keyed by index ranges of the previous data and would
    // be returned for the new data if they were kept
    entropyCache.clear();
    igCache.clear();
}
float Metrics::entropy(size_t start, size_t end)
{
float p, ventropy = 0;
int nElements = 0;
labels counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) {
return entropyCache[make_tuple(start, end)];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = (float)count / nElements;
ventropy -= p * log2(p);
}
}
entropyCache[make_tuple(start, end)] = ventropy;
return ventropy;
}
float Metrics::informationGain(size_t start, size_t cut, size_t end)
{
float iGain;
float entropyInterval, entropyLeft, entropyRight;
int nElementsLeft = cut - start, nElementsRight = end - cut;
int nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
cout << "**********Cache IG hit for " << start << " " << end << endl;
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements;
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}
}
/*
cache_t entropyCache;
std::map<std::tuple<int, int>, double> c;
// Set the value at index (3, 5) to 7.8.
c[std::make_tuple(3, 5)] = 7.8;
// Print the value at index (3, 5).
std::cout << c[std::make_tuple(3, 5)] << std::endl;
*/

View File

@@ -1,21 +0,0 @@
#ifndef CCMETRICS_H
#define CCMETRICS_H
#include "typesFImdlp.h"
#include <cmath>
namespace mdlp {
// Entropy and information gain computations over index ranges of a labelled,
// sorted sample. The object does not own the data: it keeps references to the
// labels and the sorted indices it was built with.
class Metrics {
protected:
// labels of the samples (reference to the caller's vector)
labels& y;
// sorted positions of the samples (reference to the caller's vector)
indices_t& indices;
// number of distinct labels in the whole sample
int numClasses;
// entropy results keyed by (start, end)
cacheEnt_t entropyCache;
// information gain results keyed by (start, cut, end)
cacheIg_t igCache;
public:
Metrics(labels&, indices_t&);
// replaces the contents of y and indices and recomputes numClasses
void setData(labels&, indices_t&);
// number of distinct labels in [start, end)
int computeNumClasses(size_t, size_t);
// entropy of the labels in [start, end)
float entropy(size_t, size_t);
// information gain of the split (start, cut, end)
float informationGain(size_t, size_t, size_t);
};
}
#endif

View File

@@ -3,16 +3,13 @@
from libcpp.vector cimport vector from libcpp.vector cimport vector
from libcpp cimport bool from libcpp cimport bool
cdef extern from "ccFImdlp.h" namespace "mdlp": cdef extern from "CPPFImdlp.h" namespace "mdlp":
cdef struct CutPointBody: ctypedef float precision_t
size_t start, end;
int classNumber;
float fromValue, toValue;
cdef cppclass CPPFImdlp: cdef cppclass CPPFImdlp:
CPPFImdlp() except + CPPFImdlp() except +
CPPFImdlp(bool, int, bool) except + CPPFImdlp(bool, bool) except +
CPPFImdlp& fitx(vector[float]&, vector[int]&) CPPFImdlp& fit(vector[precision_t]&, vector[int]&)
vector[float] getCutPointsx() vector[precision_t] getCutPoints()
class PcutPoint_t: class PcutPoint_t:
@@ -24,14 +21,14 @@ class PcutPoint_t:
cdef class CFImdlp: cdef class CFImdlp:
cdef CPPFImdlp *thisptr cdef CPPFImdlp *thisptr
def __cinit__(self, precision=6, debug=False, proposal=True): def __cinit__(self, debug=False, proposal=True):
# Proposal or original algorithm # Proposal or original algorithm
self.thisptr = new CPPFImdlp(proposal, precision, debug) self.thisptr = new CPPFImdlp(proposal, debug)
def __dealloc__(self): def __dealloc__(self):
del self.thisptr del self.thisptr
def fit(self, X, y): def fit(self, X, y):
self.thisptr.fitx(X, y) self.thisptr.fit(X, y)
return self return self
def get_cut_points(self): def get_cut_points(self):
return self.thisptr.getCutPointsx() return self.thisptr.getCutPoints()

View File

@@ -1,36 +0,0 @@
#include <vector>
using namespace std;
// Interval delimited by a cut point.
struct CutPointBody {
    // indices of the sorted vector
    size_t start;
    size_t end;
    // class assigned to the cut point
    int classNumber;
    // bounds of the interval in feature values
    float fromValue;
    float toValue;
};
typedef CutPointBody cutPoint_t;
typedef vector<float> samples;
typedef vector<int> labels;
typedef vector<size_t> indices_t;
typedef vector<cutPoint_t> cutPoints_t;
//typedef std::map<std::tuple<int, int>, float> cache_t;
// Cut point: a position paired with a feature value.
struct cutPointStruct {
    size_t index;  // position
    float value;   // feature value
};
typedef cutPointStruct xcutPoint_t;
typedef vector<xcutPoint_t> xcutPoints_t;
// Entropy and information gain computations over index ranges of a labelled
// sample. The object keeps references to the labels and indices it was built
// with; it does not own them.
class Metrics {
private:
// labels of the samples (reference to the caller's vector)
labels& y;
// positions of the samples (reference to the caller's vector)
indices_t& indices;
// number of distinct labels in the whole sample, set by the constructor
int numClasses;
public:
Metrics(labels&, indices_t&);
// number of distinct labels in [start, end)
int computeNumClasses(size_t, size_t);
// entropy of the labels in [start, end)
float entropy(size_t, size_t);
// information gain of the split (start, cut, end)
float informationGain(size_t, size_t, size_t);
};
// Bind the metrics to the given labels and indices (kept as references) and
// count the classes of the whole sample.
Metrics::Metrics(labels& y_, indices_t& indices_)
    : y(y_),
      indices(indices_),
      numClasses(computeNumClasses(0, indices.size()))
{
}

Binary file not shown.

View File

@@ -1,52 +0,0 @@
#include "CPPFImdlp.h"
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>
using namespace std;
using namespace mdlp;
int main()
{
ifstream fin("kdd_JapaneseVowels.arff");
if (!fin.is_open()) {
cout << "Error opening file" << endl;
return 1;
}
int count = 0;
// Read the Data from the file
// as String Vector
size_t col;
vector<string> row;
string line, word;
vector<vector<float>> dataset = vector<vector<float>>(15, vector<float>());
while (getline(fin, line)) {
if (count++ > 215) {
stringstream ss(line);
col = 0;
while (getline(ss, word, ',')) {
col = col % 15;
dataset[col].push_back(stof(word));
cout << col << "-" << word << " ";
col++;
}
cout << endl;
}
}
labels y = labels(dataset[0].begin(), dataset[0].end());
cout << "Column 0 (y): " << y.size() << endl;
for (auto item : y) {
cout << item << " ";
}
CPPFImdlp test = CPPFImdlp(false, 6, true);
test.fit(dataset[3], y);
cout << "Cut points: " << test.getCutPoints().size() << endl;
for (auto item : test.getCutPoints()) {
cout << item << " ";
}
fin.close();
return 0;
}

View File

@@ -1,6 +1,5 @@
import numpy as np import numpy as np
from .cppfimdlp import CFImdlp from .cppfimdlp import CFImdlp
from .pyfimdlp import PyFImdlp
from sklearn.base import BaseEstimator, TransformerMixin from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

View File

@@ -1,479 +0,0 @@
import numpy as np
from math import log2
from types import SimpleNamespace
class PyFImdlp:
def __init__(self, proposal=True, debug=False):
self.proposal = proposal
self.n_features_ = None
self.X_ = None
self.y_ = None
self.debug = debug
self.features_ = None
self.cut_points_ = []
self.entropy_cache = {}
self.information_gain_cache = {}
def fit(self, X, y):
self.n_features_ = len(X)
self.indices_ = np.argsort(X)
self.use_indices = False
X = [
4.3,
4.4,
4.4,
4.4,
4.5,
4.6,
4.6,
4.6,
4.6,
4.7,
4.7,
4.8,
4.8,
4.8,
4.8,
4.8,
4.9,
4.9,
4.9,
4.9,
4.9,
4.9,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.1,
5.2,
5.2,
5.2,
5.2,
5.3,
5.4,
5.4,
5.4,
5.4,
5.4,
5.4,
5.5,
5.5,
5.5,
5.5,
5.5,
5.5,
5.5,
5.6,
5.6,
5.6,
5.6,
5.6,
5.6,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.7,
5.8,
5.8,
5.8,
5.8,
5.8,
5.8,
5.8,
5.9,
5.9,
5.9,
6,
6,
6,
6,
6,
6,
6.1,
6.1,
6.1,
6.1,
6.1,
6.1,
6.2,
6.2,
6.2,
6.2,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.3,
6.4,
6.4,
6.4,
6.4,
6.4,
6.4,
6.4,
6.5,
6.5,
6.5,
6.5,
6.5,
6.6,
6.6,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.7,
6.8,
6.8,
6.8,
6.9,
6.9,
6.9,
6.9,
7,
7.1,
7.2,
7.2,
7.2,
7.3,
7.4,
7.6,
7.7,
7.7,
7.7,
7.7,
7.9,
]
y = [
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
2,
0,
0,
1,
0,
0,
0,
0,
1,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1,
0,
0,
0,
0,
0,
0,
1,
0,
1,
1,
1,
1,
1,
0,
1,
1,
2,
1,
1,
1,
1,
1,
1,
0,
1,
2,
0,
1,
1,
2,
0,
1,
2,
1,
2,
2,
1,
1,
2,
1,
1,
1,
2,
1,
2,
2,
1,
1,
1,
1,
2,
2,
1,
1,
2,
2,
1,
2,
2,
1,
2,
1,
2,
2,
1,
2,
2,
2,
1,
2,
2,
2,
1,
2,
2,
1,
1,
2,
2,
2,
2,
2,
1,
1,
1,
2,
2,
1,
2,
1,
2,
2,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
2,
]
# self.X_ = X[self.indices_] if not self.use_indices else X
# self.y_ = y[self.indices_] if not self.use_indices else y
self.X_ = X
self.y_ = y
self.compute_cut_points(0, len(y))
return self
def get_cut_points(self):
return sorted(list(set([cut.value for cut in self.cut_points_])))
def compute_cut_points(self, start, end):
# print((start, end))
cut = self.get_candidate(start, end)
if cut.value is None:
return
print("cut: ", cut.value, " index: ", cut.index)
if self.mdlp(cut, start, end):
print("¡Ding!", cut.value, cut.index)
self.cut_points_.append(cut)
self.compute_cut_points(start, cut.index)
self.compute_cut_points(cut.index, end)
def mdlp(self, cut, start, end):
N = end - start
k = self.num_classes(start, end)
k1 = self.num_classes(start, cut.index)
k2 = self.num_classes(cut.index, end)
ent = self.entropy(start, end)
ent1 = self.entropy(start, cut.index)
ent2 = self.entropy(cut.index, end)
ig = self.information_gain(start, cut.index, end)
delta = log2(pow(3, k) - 2, 2) - (
float(k) * ent - float(k1) * ent1 - float(k2) * ent2
)
term = 1 / N * (log2(N - 1, 2) + delta)
print("start: ", start, " cut: ", cut.index, " end: ", end)
print(
"k=",
k,
" k1=",
k1,
" k2=",
k2,
" ent=",
ent,
" ent1=",
ent1,
" ent2=",
ent2,
)
print("ig=", ig, " delta=", delta, " N ", N, " term ", term)
return ig > term
def num_classes(self, start, end):
n_classes = set()
for i in range(start, end):
n_classes.add(
self.y_[self.indices_[i]] if self.use_indices else self.y_[i]
)
return len(n_classes)
def get_candidate(self, start, end):
    """Pick the most promising cut position inside start..end-1.

    Only positions whose label differs from the previous position's label are
    considered. Among those, the one with the smallest sum of left and right
    entropies wins; the first such position is kept on ties.

    Parameters
    ----------
    start : int
        First position of the range.
    end : int
        Position one past the last element of the range.

    Returns
    -------
    SimpleNamespace
        ``value`` is the midpoint between the two samples around the chosen
        position and ``index`` is that position. ``value`` is None (and
        ``index`` is not set) when no position qualifies.
    """

    def label_at(pos):
        return self.y_[self.indices_[pos]] if self.use_indices else self.y_[pos]

    def sample_at(pos):
        return self.X_[self.indices_[pos]] if self.use_indices else self.X_[pos]

    best = SimpleNamespace(value=None)
    lowest_entropy = float("inf")
    for pos in range(start + 1, end):
        # A cut is only meaningful where the class changes.
        if label_at(pos) == label_at(pos - 1):
            continue
        left = self.entropy(start, pos)
        right = self.entropy(pos, end)
        combined = left + right
        print(
            "idx: ", pos,
            " entropy_left: ", left,
            " entropy_right : ", right,
            " -> ", start,
            " ", end,
        )
        if combined < lowest_entropy:
            lowest_entropy = combined
            best.index = pos
            best.value = (sample_at(pos) + sample_at(pos - 1)) / 2
    return best
def entropy(self, start, end) -> float:
    """Return the base-2 entropy of the labels at positions start..end-1.

    Ranges with at most one element or a single class yield 0 and are not
    cached. Every other result is stored in ``self.entropy_cache`` under the
    key ``(start, end)`` and reused on later calls.
    """
    n_labels = end - start
    if n_labels <= 1:
        return 0
    key = (start, end)
    if key in self.entropy_cache:
        return self.entropy_cache[key]
    selected = (
        self.y_[self.indices_[start:end]]
        if self.use_indices
        else self.y_[start:end]
    )
    proportions = np.bincount(selected) / n_labels
    if np.count_nonzero(proportions) <= 1:
        return 0
    result = 0.0
    # Classes that do not occur in the range contribute nothing.
    for share in proportions[proportions != 0.0]:
        result -= share * log2(share, 2)
    self.entropy_cache[key] = result
    return result
def information_gain(self, start, cut, end):
    """Return the entropy reduction obtained by splitting start..end-1 at ``cut``.

    The gain is the entropy of the whole range minus the size-weighted
    entropies of the two halves. Results are memoised in
    ``self.information_gain_cache`` under ``(start, cut, end)``; an empty
    range yields 0.0 and is not cached.
    """
    key = (start, cut, end)
    cache = self.information_gain_cache
    if key in cache:
        return cache[key]
    total = end - start
    if total == 0:
        return 0.0
    whole = self.entropy(start, end)
    size_left = cut - start
    left = self.entropy(start, cut)
    size_right = end - cut
    right = self.entropy(cut, end)
    gain = whole - (size_left / total) * left - (size_right / total) * right
    cache[key] = gain
    return gain

View File

@@ -34,7 +34,7 @@ namespace mdlp {
X = X_; X = X_;
indices = indices_; indices = indices_;
indices_t testSortedIndices = sortIndices(X); indices_t testSortedIndices = sortIndices(X);
float prev = X[testSortedIndices[0]]; precision_t prev = X[testSortedIndices[0]];
for (auto i = 0; i < X.size(); ++i) { for (auto i = 0; i < X.size(); ++i) {
EXPECT_EQ(testSortedIndices[i], indices[i]); EXPECT_EQ(testSortedIndices[i], indices[i]);
EXPECT_LE(prev, X[testSortedIndices[i]]); EXPECT_LE(prev, X[testSortedIndices[i]]);
@@ -162,7 +162,7 @@ namespace mdlp {
fit(X, y); fit(X, y);
computeCutPointsOriginal(); computeCutPointsOriginal();
cutPoints_t expected; cutPoints_t expected;
vector<float> computed = getCutPoints(); vector<precision_t> computed = getCutPoints();
expected = { expected = {
{ 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 },
{ 6, 10, -1, 5.45, 3.4028234663852886e+38 } { 6, 10, -1, 5.45, 3.4028234663852886e+38 }

View File

@@ -2,7 +2,7 @@
#include "../Metrics.h" #include "../Metrics.h"
namespace mdlp { namespace mdlp {
float precision = 0.000001; precision_t precision = 0.000001;
TEST(MetricTest, NumClasses) TEST(MetricTest, NumClasses)
{ {
labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };

View File

@@ -0,0 +1,286 @@
#include "CPPFImdlp.h"
#include <numeric>
#include <iostream>
#include <algorithm>
#include "Metrics.h"
namespace mdlp {
// Streams a cut point as "class -> (start, end) - (fromValue, toValue) "
// followed by a line break (the stream is flushed by endl).
ostream& operator << (ostream& os, const cutPoint_t& cut)
{
    os << cut.classNumber << " -> (";
    os << cut.start << ", " << cut.end << ") - (";
    os << cut.fromValue << ", " << cut.toValue << ") ";
    os << endl;
    return os;
}
// Default configuration: proposal algorithm, 6 decimal digits, no debug trace.
CPPFImdlp::CPPFImdlp(): CPPFImdlp(true, 6, false)
{
}
// Stores the configuration and derives the rounding divider 10^precision.
// No data is attached yet, so the class count starts at 0.
CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug):
    proposal(proposal),
    precision(precision),
    debug(debug),
    divider(pow(10, precision)),
    numClasses(0)
{
}
CPPFImdlp::~CPPFImdlp() = default;
// Returns the upper bound (toValue) of every stored interval, in stored order.
samples CPPFImdlp::getCutPoints()
{
    samples output;
    output.reserve(cutPoints.size());
    for (const auto& cut : cutPoints) {
        output.push_back(cut.toValue);
    }
    return output;
}
// Returns a copy of the per-sample interval numbers computed by the last fit().
// Entries that no interval covered keep the initial value -1 assigned in fit().
labels CPPFImdlp::getDiscretizedValues()
{
return xDiscretized;
}
// Learns the intervals for feature X_ with labels y_ and discretizes X_.
// Steps: copy the data, validate it, sort the indices, generate candidate
// intervals (proposal or original variant), filter them, and finally label
// every sample with the class number of the interval that contains it.
// Throws invalid_argument when the sizes differ or the input is empty.
// Returns *this so calls can be chained.
CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_)
{
    X = X_;
    y = y_;
    const bool sameSize = X.size() == y.size();
    if (!sameSize) {
        throw invalid_argument("X and y must have the same size");
    }
    if (X.empty() || y.empty()) {
        throw invalid_argument("X and y must have at least one element");
    }
    indices = sortIndices(X_);
    // -1 marks samples that have not been assigned to an interval.
    xDiscretized = labels(X.size(), -1);
    numClasses = Metrics::numClasses(y, indices, 0, X.size());
    if (!proposal) {
        computeCutPointsOriginal();
    } else {
        computeCutPointsProposal();
    }
    filterCutPoints();
    // Intervals hold positions of the sorted order; map them back through indices.
    for (const auto& interval : cutPoints) {
        for (size_t pos = interval.start; pos < interval.end; ++pos) {
            xDiscretized[indices[pos]] = interval.classNumber;
        }
    }
    return *this;
}
// MDLP acceptance test for splitting the range [rest.start, rest.end) at
// candidate.end. The cut is accepted when the information gain is strictly
// greater than (log2(N - 1) + delta) / N, where N is the range size and delta
// is derived from the class counts and entropies of the range and its halves.
// Ranges with fewer than two samples are always rejected.
bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate)
{
    const auto N = precision_t(rest.end - rest.start);
    if (N < 2) {
        return false;
    }
    const auto from = rest.start;
    const auto cut = candidate.end;
    const auto to = rest.end;
    // Distinct classes in the whole range and on each side of the cut.
    const int k = Metrics::numClasses(y, indices, from, to);
    const int k1 = Metrics::numClasses(y, indices, from, cut);
    const int k2 = Metrics::numClasses(y, indices, cut, to);
    // Entropies of the same three ranges.
    const precision_t ent = Metrics::entropy(y, indices, from, to, numClasses);
    const precision_t ent1 = Metrics::entropy(y, indices, from, cut, numClasses);
    const precision_t ent2 = Metrics::entropy(y, indices, cut, to, numClasses);
    const precision_t ig = Metrics::informationGain(y, indices, from, to, cut, numClasses);
    const precision_t delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
    const precision_t term = 1 / N * (log2(N - 1) + delta);
    if (debug) {
        cout << "Rest: " << rest;
        cout << "Candidate: " << candidate;
        cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl;
        cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl;
    }
    return ig > term;
}
// Keeps only the candidate intervals that pass evaluateCutPoint().
//
// `rest` is the part of the data not yet covered by an accepted interval,
// expressed as the half-open range [rest.start, rest.end) of sorted positions.
// Accepted intervals receive consecutive class numbers starting at 0. A
// rejected interval is merged into the next candidate by moving that
// candidate's lower bound back. The last accepted interval is stretched to the
// end of the data; when nothing is accepted, a single interval covering all
// samples is stored.
void CPPFImdlp::filterCutPoints()
{
    cutPoints_t filtered;
    cutPoint_t rest, item;
    int classNumber = 0;
    rest.start = 0;
    rest.end = X.size();
    rest.fromValue = numeric_limits<precision_t>::lowest();
    rest.toValue = numeric_limits<precision_t>::max();
    rest.classNumber = classNumber;
    bool first = true;
    const size_t nCandidates = cutPoints.size();
    for (size_t index = 0; index < nCandidates; index++) {
        item = cutPoints[index];
        if (evaluateCutPoint(rest, item)) {
            if (debug)
                cout << "Accepted: " << item << endl;
            // Assign class number to the interval (cutpoint)
            item.classNumber = classNumber++;
            filtered.push_back(item);
            first = false;
            rest.start = item.end;
        } else {
            if (debug)
                cout << "Rejected: " << item << endl;
            if (index + 1 != nCandidates) {
                // Try to merge the rejected cutpoint with the next one
                if (first) {
                    cutPoints[index + 1].fromValue = numeric_limits<precision_t>::lowest();
                    // start is a position in the sorted order, so the beginning
                    // of the data is position 0 (not indices[0], which is the
                    // original index of the smallest sample).
                    cutPoints[index + 1].start = 0;
                } else {
                    cutPoints[index + 1].fromValue = item.fromValue;
                    cutPoints[index + 1].start = item.start;
                }
            }
        }
    }
    if (!first) {
        filtered.back().toValue = numeric_limits<precision_t>::max();
        // Intervals are half-open, so the last one must end at X.size();
        // otherwise fit() never labels the largest sample.
        filtered.back().end = X.size();
    } else {
        filtered.push_back(rest);
    }
    cutPoints = filtered;
}
// Candidate-interval generation for the "proposal" variant.
// Walks the samples in ascending X order (through `indices`), consuming each
// run of equal X values as a unit. An interval [start, idx) is closed when it
// holds more than one sample, the class changed (inside the run or with
// respect to the previous run) and goodCut() agrees. Every interval gets
// classNumber -1; the resulting list replaces `cutPoints`.
// NOTE(review): -1 is used as the "mixed classes" marker for yPivot, which
// presumably assumes labels are never -1 -- confirm.
// NOTE(review): the debug printf calls pass size_t values to %lu; confirm this
// matches the target platform.
void CPPFImdlp::computeCutPointsProposal()
{
cutPoints_t cutPts;
cutPoint_t cutPoint;
precision_t xPrev, xCur, xPivot;
int yPrev, yCur, yPivot;
size_t idx, numElements, start;
xCur = xPrev = X[indices[0]];
yCur = yPrev = y[indices[0]];
// numElements is the position of the last sample, not the sample count.
numElements = indices.size() - 1;
idx = start = 0;
bool firstCutPoint = true;
if (debug)
printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements);
while (idx < numElements) {
// The pivot is the first sample of the run of equal X values being read.
xPivot = xCur;
yPivot = yCur;
if (debug)
printf("<idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
// Read the same values and check class changes
do {
idx++;
xCur = X[indices[idx]];
yCur = y[indices[idx]];
// Same X value with a different class: flag the run as mixed.
if (yCur != yPivot && xCur == xPivot) {
yPivot = -1;
}
if (debug)
printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur);
}
while (idx < numElements && xCur == xPivot);
// Check if the class changed and there are more than 1 element
if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) {
// Must we add the entropy criteria here?
// if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point }
cutPoint.start = start;
cutPoint.end = idx;
start = idx;
// The first interval is open towards the lowest representable value;
// later ones start where the previous interval ended.
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
cutPoint.toValue = (xPrev + xCur) / 2;
cutPoint.classNumber = -1;
firstCutPoint = false;
if (debug) {
printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
}
cutPts.push_back(cutPoint);
}
// Remember the run just processed before moving on to the next one.
yPrev = yPivot;
xPrev = xPivot;
}
// Close the trailing interval, which extends to the largest representable value.
if (idx == numElements) {
cutPoint.start = start;
cutPoint.end = numElements + 1;
cutPoint.fromValue = firstCutPoint ? numeric_limits<precision_t>::lowest() : cutPts.back().toValue;
cutPoint.toValue = numeric_limits<precision_t>::max();
cutPoint.classNumber = -1;
if (debug)
printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue);
cutPts.push_back(cutPoint);
}
if (debug) {
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl;
for (auto cutPt : cutPts)
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
void CPPFImdlp::computeCutPointsOriginal()
{
cutPoints_t cutPts;
cutPoint_t cutPoint;
precision_t xPrev;
int yPrev;
bool first = true;
// idxPrev is the index of the init instance of the cutPoint
size_t index, idxPrev = 0, last, idx = indices[0];
xPrev = X[idx];
yPrev = y[idx];
last = indices.size() - 1;
for (index = 0; index < last; index++) {
idx = indices[index];
// Definition 2 Cut points are always on class boundaries &&
// there are more than 1 items in the interval
// if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut)
if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) {
// Must we add the entropy criteria here?
if (first) {
first = false;
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
} else {
cutPoint.fromValue = cutPts.back().toValue;
}
cutPoint.start = idxPrev;
cutPoint.end = index;
cutPoint.classNumber = -1;
cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider;
idxPrev = index;
cutPts.push_back(cutPoint);
}
xPrev = X[idx];
yPrev = y[idx];
}
if (first) {
cutPoint.start = 0;
cutPoint.classNumber = -1;
cutPoint.fromValue = numeric_limits<precision_t>::lowest();
cutPoint.toValue = numeric_limits<precision_t>::max();
cutPts.push_back(cutPoint);
} else
cutPts.back().toValue = numeric_limits<precision_t>::max();
cutPts.back().end = X.size();
if (debug) {
cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl;
for (auto cutPt : cutPts)
cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt;
}
cutPoints = cutPts;
}
bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end)
{
/*
Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla
*/
precision_t entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses);
precision_t entropyRight = Metrics::entropy(y, indices, cut, end, numClasses);
precision_t entropyInterval = Metrics::entropy(y, indices, start, end, numClasses);
if (debug)
printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval);
//return (entropyInterval - (entropyLeft + entropyRight) > 0);
return true;
}
// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
// Returns the positions 0..n-1 of X_ ordered by ascending value. The sort is
// stable, so equal values keep their original relative order. A single pass is
// enough: repeating the sort once per element gave the same result at n times
// the cost.
indices_t CPPFImdlp::sortIndices(samples& X_)
{
    indices_t idx(X_.size());
    iota(idx.begin(), idx.end(), 0);
    stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2)
        { return X_[i1] < X_[i2]; });
    return idx;
}
// Replaces the stored intervals with the given list.
void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_)
{
    // The parameter is already a private copy, so hand its contents over.
    cutPoints = std::move(cutPoints_);
}
}

View File

@@ -0,0 +1,39 @@
#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H
#include "typesFImdlp.h"
#include <utility>
namespace mdlp {
// Discretizer for one continuous feature: fit() builds candidate intervals
// from the sorted samples, filters them with the MDLP test and labels every
// sample with the number of the interval it falls in.
class CPPFImdlp {
protected:
bool proposal; // proposed algorithm or original algorithm
// Number of decimal digits; the constructors derive divider = 10^precision.
int precision;
// When true, the algorithm prints a trace of its decisions.
bool debug;
// Used by computeCutPointsOriginal() to round cut values.
precision_t divider;
indices_t indices; // sorted indices to use with X and y
// Copies of the data passed to fit().
samples X;
labels y;
// Interval number of each sample, in the original sample order.
labels xDiscretized;
// Number of distinct labels in y, computed in fit().
int numClasses;
// Current list of intervals (candidates first, filtered list after fit()).
cutPoints_t cutPoints;
void setCutPoints(cutPoints_t);
// Argsort: positions of the samples ordered by ascending value.
static indices_t sortIndices(samples&);
// Two alternative generators of candidate intervals.
void computeCutPointsOriginal();
void computeCutPointsProposal();
// MDLP acceptance test of a candidate against the remaining range.
bool evaluateCutPoint(cutPoint_t, cutPoint_t);
// Drops the candidates rejected by evaluateCutPoint().
void filterCutPoints();
bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy
public:
// Default: proposal algorithm, precision 6, debug off.
CPPFImdlp();
// Arguments: proposal flag, precision (decimal digits), debug flag.
CPPFImdlp(bool, int, bool debug = false);
~CPPFImdlp();
// Upper bound of every stored interval.
samples getCutPoints();
// NOTE(review): getIndices, debugPoints and transform are only declared in
// this listing; their definitions are not visible here.
indices_t getIndices();
labels getDiscretizedValues();
void debugPoints(samples&, labels&);
// Learns the intervals and discretizes the given samples; returns *this.
CPPFImdlp& fit(samples&, labels&);
labels transform(samples&);
};
}
#endif

View File

@@ -0,0 +1,47 @@
#include "Metrics.h"
#include <set>
namespace mdlp {
// Nothing to set up: the functions defined in this file take all their data
// as arguments.
Metrics::Metrics()
= default;
// Counts the distinct labels of the samples at sorted positions [start, end).
int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end)
{
    std::set<int> distinct;
    size_t pos = start;
    while (pos < end) {
        distinct.insert(y[indices[pos]]);
        ++pos;
    }
    return distinct.size();
}
// Base-2 entropy of the labels of the samples at sorted positions [start, end).
// nClasses sizes the counting table (nClasses + 1 slots).
// NOTE(review): this assumes every label lies in [0, nClasses] -- confirm with callers.
// An empty range (start >= end) yields 0.
precision_t Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses)
{
    precision_t entropy = 0;
    int nElements = 0;
    labels counts(nClasses + 1, 0);
    // Index-based scan: never evaluates indices[end], which is out of range
    // when end == indices.size(), and stops cleanly when start >= end.
    for (size_t pos = start; pos < end; ++pos) {
        counts[y[indices[pos]]]++;
        nElements++;
    }
    for (auto count : counts) {
        if (count > 0) {
            const precision_t p = static_cast<precision_t>(count) / nElements;
            entropy -= p * log2(p);
        }
    }
    // Rounding can leave a tiny negative value; entropy is never below zero.
    return entropy < 0 ? 0 : entropy;
}
// Information gain of splitting the sorted positions [start, end) at cutPoint:
// the entropy of the whole range minus the size-weighted entropies of
// [start, cutPoint) and [cutPoint, end). Returns 0 for an empty range.
precision_t Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses)
{
    const int nElementsLeft = cutPoint - start;
    const int nElementsRight = end - cutPoint;
    const int nElements = end - start;
    if (nElements == 0) {
        // Avoids the 0/0 division below.
        return 0;
    }
    // Every entropy call gets the full nClasses: the counting table is indexed
    // by label value, so sizing it by the number of distinct labels found in a
    // sub-range would overflow it (e.g. a sub-range holding only label 2).
    const precision_t entropy = Metrics::entropy(y, indices, start, end, nClasses);
    const precision_t entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClasses);
    const precision_t entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClasses);
    return entropy - (static_cast<precision_t>(nElementsLeft) * entropyLeft + static_cast<precision_t>(nElementsRight) * entropyRight) / nElements;
}
}

View File

@@ -0,0 +1,14 @@
#ifndef METRICS_H
#define METRICS_H
#include "typesFImdlp.h"
#include <cmath>
namespace mdlp {
// Stateless helpers that measure the labels of a range of sorted positions.
// All functions take the labels, the sorted indices and a [start, end) range.
class Metrics {
public:
Metrics();
// Number of distinct labels in the range (the index vector is passed by value).
static int numClasses(labels&, indices_t, size_t, size_t);
// Base-2 entropy of the labels in the range; the last argument sizes the
// per-class counting table.
static precision_t entropy(labels&, indices_t&, size_t, size_t, int);
// Entropy reduction obtained by splitting the range (start, end) at the
// position given as fifth argument; the last argument is the class count.
static precision_t informationGain(labels&, indices_t&, size_t, size_t, size_t, int);
};
}
#endif

View File

@@ -5,21 +5,12 @@
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
struct CutPointBody { typedef float precision_t;
size_t start, end; // indices of the sorted vector typedef vector<precision_t> samples;
};
typedef CutPointBody cutPoint_t;
typedef vector<float> samples;
typedef vector<int> labels; typedef vector<int> labels;
typedef vector<size_t> indices_t; typedef vector<size_t> indices_t;
typedef vector<cutPoint_t> cutPoints_t; typedef vector<precision_t> cutPoints_t;
typedef map<tuple<int, int>, float> cacheEnt_t; typedef map<tuple<int, int>, precision_t> cacheEnt_t;
typedef map<tuple<int, int, int>, float> cacheIg_t; typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
struct cutPointStruct {
size_t index;
float value;
};
typedef cutPointStruct xcutPoint_t;
typedef vector<xcutPoint_t> xcutPoints_t;
} }
#endif #endif

View File

@@ -13,7 +13,7 @@ namespace FImdlp {
int n = X.size(); int n = X.size();
for (i = 1; i < n; i++) { for (i = 1; i < n; i++) {
if (X.at(i) != ant) { if (X.at(i) != ant) {
cutPts.push_back(float(X.at(i) + ant) / 2); cutPts.push_back(precision_t(X.at(i) + ant) / 2);
ant = X.at(i); ant = X.at(i);
} }
} }

View File

@@ -5,7 +5,7 @@ from libcpp.vector cimport vector
cdef extern from "FImdlp.h" namespace "FImdlp": cdef extern from "FImdlp.h" namespace "FImdlp":
cdef cppclass FImdlp: cdef cppclass FImdlp:
FImdlp() except + FImdlp() except +
vector[float] cutPoints(vector[int]&, vector[int]&) vector[precision_t] cutPoints(vector[int]&, vector[int]&)
cdef class CFImdlp: cdef class CFImdlp:
cdef FImdlp *thisptr cdef FImdlp *thisptr

View File

@@ -12,10 +12,8 @@ setup(
name="cppfimdlp", name="cppfimdlp",
sources=[ sources=[
"fimdlp/cfimdlp.pyx", "fimdlp/cfimdlp.pyx",
# "fimdlp/CPPFImdlp.cpp", "fimdlp/CPPFImdlp.cpp",
# "fimdlp/Metrics.cpp", "fimdlp/Metrics.cpp",
"fimdlp/ccMetrics.cc",
"fimdlp/ccFImdlp.cc",
], ],
language="c++", language="c++",
include_dirs=["fimdlp"], include_dirs=["fimdlp"],