diff --git a/fimdlp/CPPFImdlp.cpp b/fimdlp/CPPFImdlp.cpp index cdafb99..8113c55 100644 --- a/fimdlp/CPPFImdlp.cpp +++ b/fimdlp/CPPFImdlp.cpp @@ -1,41 +1,20 @@ -#include "CPPFImdlp.h" #include #include #include +#include +#include "CPPFImdlp.h" #include "Metrics.h" namespace mdlp { - ostream& operator << (ostream& os, const cutPoint_t& cut) + CPPFImdlp::CPPFImdlp(): proposal(true), debug(false), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) { - os << cut.classNumber << " -> (" << cut.start << ", " << cut.end << - ") - (" << cut.fromValue << ", " << cut.toValue << ") " - << endl; - return os; - } - CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false) + CPPFImdlp::CPPFImdlp(bool proposal, bool debug): proposal(proposal), debug(debug), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) { - divider = pow(10, precision); - numClasses = 0; - } - CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug) - { - divider = pow(10, precision); - numClasses = 0; } CPPFImdlp::~CPPFImdlp() = default; - samples CPPFImdlp::getCutPoints() - { - samples output(cutPoints.size()); - ::transform(cutPoints.begin(), cutPoints.end(), output.begin(), - [](cutPoint_t cut) { return cut.toValue; }); - return output; - } - labels CPPFImdlp::getDiscretizedValues() - { - return xDiscretized; - } + CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_) { X = X_; @@ -47,227 +26,78 @@ namespace mdlp { throw invalid_argument("X and y must have at least one element"); } indices = sortIndices(X_); - xDiscretized = labels(X.size(), -1); - numClasses = Metrics::numClasses(y, indices, 0, X.size()); - - if (proposal) { - computeCutPointsProposal(); - } else { - computeCutPointsOriginal(); - } - filterCutPoints(); - // Apply cut points to the input vector - for (auto cut : cutPoints) { - for (size_t i = cut.start; i < cut.end; i++) { - xDiscretized[indices[i]] = cut.classNumber; - } - } + metrics.setData(y, indices); + computeCutPoints(0, X.size()); return *this; } - bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate) + void CPPFImdlp::computeCutPoints(size_t start, size_t end) + { + int cut; + if (end - start < 2) + return; + cut = getCandidate(start, end); + if (cut == -1 || !mdlp(start, cut, end)) { + // cut.value == -1 means that there is no candidate in the interval + // No boundary found, so we add both ends of the interval as cutpoints + // because they were selected by the algorithm before + if (start != 0) + cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2); + if (end != X.size()) + cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2); + return; + } + computeCutPoints(start, cut); + computeCutPoints(cut, end); + } + long int CPPFImdlp::getCandidate(size_t start, size_t end) + { + long int candidate = -1, elements = end - start; + precision_t entropy_left, entropy_right, minEntropy = numeric_limits::max(); + for (auto idx = start + 1; idx < end; idx++) { + // Cutpoints are always on boudndaries + if (y[indices[idx]] == y[indices[idx - 1]]) + continue; + entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx); + entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end); + if (entropy_left + entropy_right < minEntropy) { + minEntropy = entropy_left + entropy_right; + candidate = idx; + } + } + return candidate; + } + bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) { int k, k1, k2; - float ig, delta; - float ent, ent1, ent2; - auto N = float(rest.end - rest.start); + precision_t ig, delta; + precision_t ent, ent1, ent2; + auto N = precision_t(end - start); if (N < 2) { return false; } - k = Metrics::numClasses(y, indices, rest.start, rest.end); - k1 = Metrics::numClasses(y, indices, rest.start, candidate.end); - k2 = Metrics::numClasses(y, indices, candidate.end, rest.end); - ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses); - ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses); - ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses); - ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses); - delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2); - float term = 1 / N * (log2(N - 1) + delta); - if (debug) { - cout << "Rest: " << rest; - cout << "Candidate: " << candidate; - cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl; - cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl; - } - return (ig > term); + k = metrics.computeNumClasses(start, end); + k1 = metrics.computeNumClasses(start, cut); + k2 = metrics.computeNumClasses(cut, end); + ent = metrics.entropy(start, end); + ent1 = metrics.entropy(start, cut); + ent2 = metrics.entropy(cut, end); + ig = metrics.informationGain(start, cut, end); + delta = log2(pow(3, precision_t(k)) - 2) - + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); + precision_t term = 1 / N * (log2(N - 1) + delta); + return ig > term; } - void CPPFImdlp::filterCutPoints() + cutPoints_t CPPFImdlp::getCutPoints() { - cutPoints_t filtered; - cutPoint_t rest, item; - int classNumber = 0; - - rest.start = 0; - rest.end = X.size(); - rest.fromValue = numeric_limits::lowest(); - rest.toValue = numeric_limits::max(); - rest.classNumber = classNumber; - bool first = true; - for (size_t index = 0; index < size_t(cutPoints.size()); index++) { - item = cutPoints[index]; - if (evaluateCutPoint(rest, item)) { - if (debug) - cout << "Accepted: " << item << endl; - //Assign class number to the interval (cutpoint) - item.classNumber = classNumber++; - filtered.push_back(item); - first = false; - rest.start = item.end; - } else { - if (debug) - cout << "Rejected: " << item << endl; - if (index != size_t(cutPoints.size()) - 1) { - // Try to merge the rejected cutpoint with the next one - if (first) { - cutPoints[index + 1].fromValue = numeric_limits::lowest(); - cutPoints[index + 1].start = indices[0]; - } else { - cutPoints[index + 1].fromValue = item.fromValue; - cutPoints[index + 1].start = item.start; - } - } - } - } - if (!first) { - filtered.back().toValue = numeric_limits::max(); - filtered.back().end = X.size() - 1; - } else { - filtered.push_back(rest); - } - cutPoints = filtered; - } - void CPPFImdlp::computeCutPointsProposal() - { - cutPoints_t cutPts; - cutPoint_t cutPoint; - float xPrev, xCur, xPivot; - int yPrev, yCur, yPivot; - size_t idx, numElements, start; - - xCur = xPrev = X[indices[0]]; - yCur = yPrev = y[indices[0]]; - numElements = indices.size() - 1; - idx = start = 0; - bool firstCutPoint = true; - if (debug) - printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements); - while (idx < numElements) { - xPivot = xCur; - yPivot = yCur; - if (debug) - printf(" Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); - // Read the same values and check class changes - do { - idx++; - xCur = X[indices[idx]]; - yCur = y[indices[idx]]; - if (yCur != yPivot && xCur == xPivot) { - yPivot = -1; - } - if (debug) - printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); - } - while (idx < numElements && xCur == xPivot); - // Check if the class changed and there are more than 1 element - if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) { - // Must we add the entropy criteria here? - // if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point } - cutPoint.start = start; - cutPoint.end = idx; - start = idx; - cutPoint.fromValue = firstCutPoint ? numeric_limits::lowest() : cutPts.back().toValue; - cutPoint.toValue = (xPrev + xCur) / 2; - cutPoint.classNumber = -1; - firstCutPoint = false; - if (debug) { - printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue); - } - cutPts.push_back(cutPoint); - } - yPrev = yPivot; - xPrev = xPivot; - } - if (idx == numElements) { - cutPoint.start = start; - cutPoint.end = numElements + 1; - cutPoint.fromValue = firstCutPoint ? numeric_limits::lowest() : cutPts.back().toValue; - cutPoint.toValue = numeric_limits::max(); - cutPoint.classNumber = -1; - if (debug) - printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue); - cutPts.push_back(cutPoint); - } - if (debug) { - cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl; - for (auto cutPt : cutPts) - cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt; - } - cutPoints = cutPts; - } - void CPPFImdlp::computeCutPointsOriginal() - { - cutPoints_t cutPts; - cutPoint_t cutPoint; - float xPrev; - int yPrev; - bool first = true; - // idxPrev is the index of the init instance of the cutPoint - size_t index, idxPrev = 0, last, idx = indices[0]; - xPrev = X[idx]; - yPrev = y[idx]; - last = indices.size() - 1; - for (index = 0; index < last; index++) { - idx = indices[index]; - // Definition 2 Cut points are always on class boundaries && - // there are more than 1 items in the interval - // if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut) - if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) { - // Must we add the entropy criteria here? - if (first) { - first = false; - cutPoint.fromValue = numeric_limits::lowest(); - } else { - cutPoint.fromValue = cutPts.back().toValue; - } - cutPoint.start = idxPrev; - cutPoint.end = index; - cutPoint.classNumber = -1; - cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider; - idxPrev = index; - cutPts.push_back(cutPoint); - } - xPrev = X[idx]; - yPrev = y[idx]; - } - if (first) { - cutPoint.start = 0; - cutPoint.classNumber = -1; - cutPoint.fromValue = numeric_limits::lowest(); - cutPoint.toValue = numeric_limits::max(); - cutPts.push_back(cutPoint); - } else - cutPts.back().toValue = numeric_limits::max(); - cutPts.back().end = X.size(); - if (debug) { - cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl; - for (auto cutPt : cutPts) - cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt; - } - cutPoints = cutPts; - } - bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end) - { - /* - Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla - - - */ - float entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses); - float entropyRight = Metrics::entropy(y, indices, cut, end, numClasses); - float entropyInterval = Metrics::entropy(y, indices, start, end, numClasses); - if (debug) - printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval); - //return (entropyInterval - (entropyLeft + entropyRight) > 0); - return true; + // Remove duplicates and sort + cutPoints_t output(cutPoints.size()); + set s; + unsigned size = cutPoints.size(); + for (unsigned i = 0; i < size; i++) + s.insert(cutPoints[i]); + output.assign(s.begin(), s.end()); + sort(output.begin(), output.end()); + return output; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes indices_t CPPFImdlp::sortIndices(samples& X_) @@ -275,12 +105,8 @@ namespace mdlp { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) + sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) { return X_[i1] < X_[i2]; }); return idx; } - void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_) - { - cutPoints = cutPoints_; - } } diff --git a/fimdlp/CPPFImdlp.h b/fimdlp/CPPFImdlp.h index 926b6e6..7d467cc 100644 --- a/fimdlp/CPPFImdlp.h +++ b/fimdlp/CPPFImdlp.h @@ -1,39 +1,30 @@ #ifndef CPPFIMDLP_H #define CPPFIMDLP_H #include "typesFImdlp.h" +#include "Metrics.h" #include namespace mdlp { class CPPFImdlp { protected: bool proposal; // proposed algorithm or original algorithm - int precision; bool debug; - float divider; indices_t indices; // sorted indices to use with X and y samples X; labels y; - labels xDiscretized; - int numClasses; + Metrics metrics; cutPoints_t cutPoints; - void setCutPoints(cutPoints_t); static indices_t sortIndices(samples&); - void computeCutPointsOriginal(); - void computeCutPointsProposal(); - bool evaluateCutPoint(cutPoint_t, cutPoint_t); - void filterCutPoints(); - bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy + void computeCutPoints(size_t, size_t); + long int getCandidate(size_t, size_t); + bool mdlp(size_t, size_t, size_t); public: CPPFImdlp(); - CPPFImdlp(bool, int, bool debug = false); + CPPFImdlp(bool, bool debug = false); ~CPPFImdlp(); - samples getCutPoints(); - indices_t getIndices(); - labels getDiscretizedValues(); - void debugPoints(samples&, labels&); CPPFImdlp& fit(samples&, labels&); - labels transform(samples&); + samples getCutPoints(); }; } #endif \ No newline at end of file diff --git a/fimdlp/Metrics.cpp b/fimdlp/Metrics.cpp index ffc1806..041ecf4 100644 --- a/fimdlp/Metrics.cpp +++ b/fimdlp/Metrics.cpp @@ -1,46 +1,63 @@ #include "Metrics.h" #include +#include +using namespace std; namespace mdlp { - Metrics::Metrics() - = default; - int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end) + Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) { - std::set numClasses; - for (auto i = start; i < end; ++i) { - numClasses.insert(y[indices[i]]); - } - return numClasses.size(); } - float Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses) + int Metrics::computeNumClasses(size_t start, size_t end) { - float entropy = 0; + set nClasses; + for (auto i = start; i < end; ++i) { + nClasses.insert(y[indices[i]]); + } + return nClasses.size(); + } + void Metrics::setData(labels& y_, indices_t& indices_) + { + indices = indices_; + y = y_; + numClasses = computeNumClasses(0, indices.size()); + } + precision_t Metrics::entropy(size_t start, size_t end) + { + precision_t p, ventropy = 0; int nElements = 0; - labels counts(nClasses + 1, 0); + labels counts(numClasses + 1, 0); + if (end - start < 2) + return 0; + if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) { + return entropyCache[make_tuple(start, end)]; + } for (auto i = &indices[start]; i != &indices[end]; ++i) { counts[y[*i]]++; nElements++; } for (auto count : counts) { if (count > 0) { - float p = (float)count / nElements; - entropy -= p * log2(p); + p = (precision_t)count / nElements; + ventropy -= p * log2(p); } } - return entropy < 0 ? 0 : entropy; + entropyCache[make_tuple(start, end)] = ventropy; + return ventropy; } - float Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses) + precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) { - float iGain; - float entropy, entropyLeft, entropyRight; - int nClassesLeft, nClassesRight; - int nElementsLeft = cutPoint - start, nElementsRight = end - cutPoint; + precision_t iGain; + precision_t entropyInterval, entropyLeft, entropyRight; + int nElementsLeft = cut - start, nElementsRight = end - cut; int nElements = end - start; - nClassesLeft = Metrics::numClasses(y, indices, start, cutPoint); - nClassesRight = Metrics::numClasses(y, indices, cutPoint, end); - entropy = Metrics::entropy(y, indices, start, end, nClasses); - entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft); - entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight); - iGain = entropy - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements; + if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { + cout << "**********Cache IG hit for " << start << " " << end << endl; + return igCache[make_tuple(start, cut, end)]; + } + entropyInterval = entropy(start, end); + entropyLeft = entropy(start, cut); + entropyRight = entropy(cut, end); + iGain = entropyInterval - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements; + igCache[make_tuple(start, cut, end)] = iGain; return iGain; } diff --git a/fimdlp/Metrics.h b/fimdlp/Metrics.h index 41b9b2c..79bc286 100644 --- a/fimdlp/Metrics.h +++ b/fimdlp/Metrics.h @@ -1,14 +1,21 @@ -#ifndef METRICS_H -#define METRICS_H +#ifndef CCMETRICS_H +#define CCMETRICS_H #include "typesFImdlp.h" #include namespace mdlp { class Metrics { + protected: + labels& y; + indices_t& indices; + int numClasses; + cacheEnt_t entropyCache; + cacheIg_t igCache; public: - Metrics(); - static int numClasses(labels&, indices_t, size_t, size_t); - static float entropy(labels&, indices_t&, size_t, size_t, int); - static float informationGain(labels&, indices_t&, size_t, size_t, size_t, int); + Metrics(labels&, indices_t&); + void setData(labels&, indices_t&); + int computeNumClasses(size_t, size_t); + precision_t entropy(size_t, size_t); + precision_t informationGain(size_t, size_t, size_t); }; } #endif \ No newline at end of file diff --git a/fimdlp/ccFImdlp.cc b/fimdlp/ccFImdlp.cc deleted file mode 100644 index 629b762..0000000 --- a/fimdlp/ccFImdlp.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "ccFImdlp.h" -#include -#include -#include -#include -#include "ccMetrics.h" - -namespace mdlp { - CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false), divider(pow(10, precision)), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) - { - } - CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug), divider(pow(10, precision)), indices(indices_t()), y(labels()), metrics(Metrics(y, indices)) - { - } - CPPFImdlp::~CPPFImdlp() - = default; - - CPPFImdlp& CPPFImdlp::fitx(samples& X_, labels& y_) - { - X = X_; - y = y_; - if (X.size() != y.size()) { - throw invalid_argument("X and y must have the same size"); - } - if (X.size() == 0 || y.size() == 0) { - throw invalid_argument("X and y must have at least one element"); - } - indices = sortIndices(X_); - metrics.setData(y, indices); - computeCutPoints(0, X.size()); - return *this; - } - void CPPFImdlp::computeCutPoints(size_t start, size_t end) - { - int cut; - if (end - start < 2) - return; - cut = getCandidate(start, end); - if (cut == -1 || !mdlp(start, cut, end)) { - // cut.value == -1 means that there is no candidate in the interval - // that enhances the information gain - if (start != 0) - xCutPoints.push_back(xcutPoint_t({ start, (X[indices[start]] + X[indices[start - 1]]) / 2 })); - if (end != X.size()) - xCutPoints.push_back(xcutPoint_t({ end, (X[indices[end]] + X[indices[end - 1]]) / 2 })); - return; - } - computeCutPoints(start, cut); - computeCutPoints(cut, end); - } - long int CPPFImdlp::getCandidate(size_t start, size_t end) - { - long int candidate = -1, elements = end - start; - float entropy_left, entropy_right, minEntropy = numeric_limits::max(); - for (auto idx = start + 1; idx < end; idx++) { - // Cutpoints are always on boudndaries - if (y[indices[idx]] == y[indices[idx - 1]]) - continue; - entropy_left = float(idx - start) / elements * metrics.entropy(start, idx); - entropy_right = float(end - idx) / elements * metrics.entropy(idx, end); - if (entropy_left + entropy_right < minEntropy) { - minEntropy = entropy_left + entropy_right; - candidate = idx; - } - } - return candidate; - } - bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) - { - int k, k1, k2; - float ig, delta; - float ent, ent1, ent2; - auto N = float(end - start); - if (N < 2) { - return false; - } - k = metrics.computeNumClasses(start, end); - k1 = metrics.computeNumClasses(start, cut); - k2 = metrics.computeNumClasses(cut, end); - ent = metrics.entropy(start, end); - ent1 = metrics.entropy(start, cut); - ent2 = metrics.entropy(cut, end); - ig = metrics.informationGain(start, cut, end); - delta = log2(pow(3, float(k)) - 2) - (float(k) * ent - float(k1) * ent1 - float(k2) * ent2); - float term = 1 / N * (log2(N - 1) + delta); - return ig > term; - } - samples CPPFImdlp::getCutPointsx() - { - // Remove duplicates and sort - samples output(xCutPoints.size()); - set s; - unsigned size = xCutPoints.size(); - for (unsigned i = 0; i < size; i++) - s.insert(xCutPoints[i].value); - output.assign(s.begin(), s.end()); - sort(output.begin(), output.end()); - return output; - } - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples& X_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) - { return X_[i1] < X_[i2]; }); - return idx; - } -} diff --git a/fimdlp/ccFImdlp.h b/fimdlp/ccFImdlp.h deleted file mode 100644 index 00f3f9f..0000000 --- a/fimdlp/ccFImdlp.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef CCFIMDLP_H -#define CCFIMDLP_H -#include "typesFImdlp.h" -#include "ccMetrics.h" -#include -namespace mdlp { - class CPPFImdlp { - protected: - bool proposal; // proposed algorithm or original algorithm - int precision; - bool debug; - float divider; - indices_t indices; // sorted indices to use with X and y - samples X; - labels y; - Metrics metrics; - xcutPoints_t xCutPoints; - - static indices_t sortIndices(samples&); - void computeCutPoints(size_t, size_t); - long int getCandidate(size_t, size_t); - bool mdlp(size_t, size_t, size_t); - - public: - CPPFImdlp(); - CPPFImdlp(bool, int, bool debug = false); - ~CPPFImdlp(); - CPPFImdlp& fitx(samples&, labels&); - samples getCutPointsx(); - }; -} -#endif \ No newline at end of file diff --git a/fimdlp/ccMetrics.cc b/fimdlp/ccMetrics.cc deleted file mode 100644 index 06ddb9a..0000000 --- a/fimdlp/ccMetrics.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "ccMetrics.h" -#include -#include -using namespace std; -namespace mdlp { - Metrics::Metrics(labels& y_, indices_t& indices_): y(y_), indices(indices_), numClasses(computeNumClasses(0, indices.size())), entropyCache(cacheEnt_t()), igCache(cacheIg_t()) - { - } - int Metrics::computeNumClasses(size_t start, size_t end) - { - set nClasses; - for (auto i = start; i < end; ++i) { - nClasses.insert(y[indices[i]]); - } - return nClasses.size(); - } - void Metrics::setData(labels& y_, indices_t& indices_) - { - indices = indices_; - y = y_; - numClasses = computeNumClasses(0, indices.size()); - } - float Metrics::entropy(size_t start, size_t end) - { - float p, ventropy = 0; - int nElements = 0; - labels counts(numClasses + 1, 0); - if (end - start < 2) - return 0; - if (entropyCache.find(make_tuple(start, end)) != entropyCache.end()) { - return entropyCache[make_tuple(start, end)]; - } - for (auto i = &indices[start]; i != &indices[end]; ++i) { - counts[y[*i]]++; - nElements++; - } - for (auto count : counts) { - if (count > 0) { - p = (float)count / nElements; - ventropy -= p * log2(p); - } - } - entropyCache[make_tuple(start, end)] = ventropy; - return ventropy; - } - float Metrics::informationGain(size_t start, size_t cut, size_t end) - { - float iGain; - float entropyInterval, entropyLeft, entropyRight; - int nElementsLeft = cut - start, nElementsRight = end - cut; - int nElements = end - start; - if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { - cout << "**********Cache IG hit for " << start << " " << end << endl; - return igCache[make_tuple(start, cut, end)]; - } - entropyInterval = entropy(start, end); - entropyLeft = entropy(start, cut); - entropyRight = entropy(cut, end); - iGain = entropyInterval - ((float)nElementsLeft * entropyLeft + (float)nElementsRight * entropyRight) / nElements; - igCache[make_tuple(start, cut, end)] = iGain; - return iGain; - } - -} -/* - cache_t entropyCache; - std::map, double> c; - - // Set the value at index (3, 5) to 7.8. - c[std::make_tuple(3, 5)] = 7.8; - - // Print the value at index (3, 5). - std::cout << c[std::make_tuple(3, 5)] << std::endl; -*/ \ No newline at end of file diff --git a/fimdlp/ccMetrics.h b/fimdlp/ccMetrics.h deleted file mode 100644 index b4c5752..0000000 --- a/fimdlp/ccMetrics.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef CCMETRICS_H -#define CCMETRICS_H -#include "typesFImdlp.h" -#include -namespace mdlp { - class Metrics { - protected: - labels& y; - indices_t& indices; - int numClasses; - cacheEnt_t entropyCache; - cacheIg_t igCache; - public: - Metrics(labels&, indices_t&); - void setData(labels&, indices_t&); - int computeNumClasses(size_t, size_t); - float entropy(size_t, size_t); - float informationGain(size_t, size_t, size_t); - }; -} -#endif \ No newline at end of file diff --git a/fimdlp/cfimdlp.pyx b/fimdlp/cfimdlp.pyx index 1a04922..db87af1 100644 --- a/fimdlp/cfimdlp.pyx +++ b/fimdlp/cfimdlp.pyx @@ -3,16 +3,13 @@ from libcpp.vector cimport vector from libcpp cimport bool -cdef extern from "ccFImdlp.h" namespace "mdlp": - cdef struct CutPointBody: - size_t start, end; - int classNumber; - float fromValue, toValue; +cdef extern from "CPPFImdlp.h" namespace "mdlp": + ctypedef float precision_t cdef cppclass CPPFImdlp: CPPFImdlp() except + - CPPFImdlp(bool, int, bool) except + - CPPFImdlp& fitx(vector[float]&, vector[int]&) - vector[float] getCutPointsx() + CPPFImdlp(bool, bool) except + + CPPFImdlp& fit(vector[precision_t]&, vector[int]&) + vector[precision_t] getCutPoints() class PcutPoint_t: @@ -24,14 +21,14 @@ class PcutPoint_t: cdef class CFImdlp: cdef CPPFImdlp *thisptr - def __cinit__(self, precision=6, debug=False, proposal=True): + def __cinit__(self, debug=False, proposal=True): # Proposal or original algorithm - self.thisptr = new CPPFImdlp(proposal, precision, debug) + self.thisptr = new CPPFImdlp(proposal, debug) def __dealloc__(self): del self.thisptr def fit(self, X, y): - self.thisptr.fitx(X, y) + self.thisptr.fit(X, y) return self def get_cut_points(self): - return self.thisptr.getCutPointsx() + return self.thisptr.getCutPoints() \ No newline at end of file diff --git a/fimdlp/cppfimdlp.cpython-310-darwin.so b/fimdlp/cppfimdlp.cpython-310-darwin.so index 681ead1..2995578 100755 Binary files a/fimdlp/cppfimdlp.cpython-310-darwin.so and b/fimdlp/cppfimdlp.cpython-310-darwin.so differ diff --git a/fimdlp/m2.cpp b/fimdlp/m2.cpp deleted file mode 100644 index 73dadda..0000000 --- a/fimdlp/m2.cpp +++ /dev/null @@ -1,36 +0,0 @@ - -#include - -using namespace std; -struct CutPointBody { - size_t start, end; // indices of the sorted vector - int classNumber; // class assigned to the cut point - float fromValue, toValue; -}; -typedef CutPointBody cutPoint_t; -typedef vector samples; -typedef vector labels; -typedef vector indices_t; -typedef vector cutPoints_t; -//typedef std::map, float> cache_t; -struct cutPointStruct { - size_t index; - float value; -}; -typedef cutPointStruct xcutPoint_t; -typedef vector xcutPoints_t; -class Metrics { -private: - labels& y; - indices_t& indices; - int numClasses; -public: - Metrics(labels&, indices_t&); - int computeNumClasses(size_t, size_t); - float entropy(size_t, size_t); - float informationGain(size_t, size_t, size_t); -}; -Metrics::Metrics(labels& y_, indices_t& indices_) : y(y_), indices(indices_) -{ - numClasses = computeNumClasses(0, indices.size()); -} \ No newline at end of file diff --git a/fimdlp/main b/fimdlp/main deleted file mode 100755 index 5e9e630..0000000 Binary files a/fimdlp/main and /dev/null differ diff --git a/fimdlp/main.cpp b/fimdlp/main.cpp deleted file mode 100644 index 7c1c4a1..0000000 --- a/fimdlp/main.cpp +++ /dev/null @@ -1,52 +0,0 @@ -#include "CPPFImdlp.h" -#include -#include -#include -#include -#include -using namespace std; -using namespace mdlp; - -int main() -{ - ifstream fin("kdd_JapaneseVowels.arff"); - if (!fin.is_open()) { - cout << "Error opening file" << endl; - return 1; - } - - int count = 0; - - // Read the Data from the file - // as String Vector - size_t col; - vector row; - string line, word; - vector> dataset = vector>(15, vector()); - while (getline(fin, line)) { - if (count++ > 215) { - stringstream ss(line); - col = 0; - while (getline(ss, word, ',')) { - col = col % 15; - dataset[col].push_back(stof(word)); - cout << col << "-" << word << " "; - col++; - } - cout << endl; - } - } - labels y = labels(dataset[0].begin(), dataset[0].end()); - cout << "Column 0 (y): " << y.size() << endl; - for (auto item : y) { - cout << item << " "; - } - CPPFImdlp test = CPPFImdlp(false, 6, true); - test.fit(dataset[3], y); - cout << "Cut points: " << test.getCutPoints().size() << endl; - for (auto item : test.getCutPoints()) { - cout << item << " "; - } - fin.close(); - return 0; -} \ No newline at end of file diff --git a/fimdlp/mdlp.py b/fimdlp/mdlp.py index 2d3f610..50e5ca7 100644 --- a/fimdlp/mdlp.py +++ b/fimdlp/mdlp.py @@ -1,6 +1,5 @@ import numpy as np from .cppfimdlp import CFImdlp -from .pyfimdlp import PyFImdlp from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils.multiclass import unique_labels from sklearn.utils.validation import check_X_y, check_array, check_is_fitted diff --git a/fimdlp/pyfimdlp.py b/fimdlp/pyfimdlp.py deleted file mode 100644 index 91c01c7..0000000 --- a/fimdlp/pyfimdlp.py +++ /dev/null @@ -1,479 +0,0 @@ -import numpy as np -from math import log2 -from types import SimpleNamespace - - -class PyFImdlp: - def __init__(self, proposal=True, debug=False): - self.proposal = proposal - self.n_features_ = None - self.X_ = None - self.y_ = None - self.debug = debug - self.features_ = None - self.cut_points_ = [] - self.entropy_cache = {} - self.information_gain_cache = {} - - def fit(self, X, y): - self.n_features_ = len(X) - self.indices_ = np.argsort(X) - self.use_indices = False - X = [ - 4.3, - 4.4, - 4.4, - 4.4, - 4.5, - 4.6, - 4.6, - 4.6, - 4.6, - 4.7, - 4.7, - 4.8, - 4.8, - 4.8, - 4.8, - 4.8, - 4.9, - 4.9, - 4.9, - 4.9, - 4.9, - 4.9, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 5.1, - 5.1, - 5.1, - 5.1, - 5.1, - 5.1, - 5.1, - 5.1, - 5.1, - 5.2, - 5.2, - 5.2, - 5.2, - 5.3, - 5.4, - 5.4, - 5.4, - 5.4, - 5.4, - 5.4, - 5.5, - 5.5, - 5.5, - 5.5, - 5.5, - 5.5, - 5.5, - 5.6, - 5.6, - 5.6, - 5.6, - 5.6, - 5.6, - 5.7, - 5.7, - 5.7, - 5.7, - 5.7, - 5.7, - 5.7, - 5.7, - 5.8, - 5.8, - 5.8, - 5.8, - 5.8, - 5.8, - 5.8, - 5.9, - 5.9, - 5.9, - 6, - 6, - 6, - 6, - 6, - 6, - 6.1, - 6.1, - 6.1, - 6.1, - 6.1, - 6.1, - 6.2, - 6.2, - 6.2, - 6.2, - 6.3, - 6.3, - 6.3, - 6.3, - 6.3, - 6.3, - 6.3, - 6.3, - 6.3, - 6.4, - 6.4, - 6.4, - 6.4, - 6.4, - 6.4, - 6.4, - 6.5, - 6.5, - 6.5, - 6.5, - 6.5, - 6.6, - 6.6, - 6.7, - 6.7, - 6.7, - 6.7, - 6.7, - 6.7, - 6.7, - 6.7, - 6.8, - 6.8, - 6.8, - 6.9, - 6.9, - 6.9, - 6.9, - 7, - 7.1, - 7.2, - 7.2, - 7.2, - 7.3, - 7.4, - 7.6, - 7.7, - 7.7, - 7.7, - 7.7, - 7.9, - ] - y = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 2, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 0, - 1, - 1, - 1, - 1, - 1, - 0, - 1, - 1, - 2, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 1, - 2, - 0, - 1, - 1, - 2, - 0, - 1, - 2, - 1, - 2, - 2, - 1, - 1, - 2, - 1, - 1, - 1, - 2, - 1, - 2, - 2, - 1, - 1, - 1, - 1, - 2, - 2, - 1, - 1, - 2, - 2, - 1, - 2, - 2, - 1, - 2, - 1, - 2, - 2, - 1, - 2, - 2, - 2, - 1, - 2, - 2, - 2, - 1, - 2, - 2, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 1, - 1, - 1, - 2, - 2, - 1, - 2, - 1, - 2, - 2, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - ] - # self.X_ = X[self.indices_] if not self.use_indices else X - # self.y_ = y[self.indices_] if not self.use_indices else y - self.X_ = X - self.y_ = y - self.compute_cut_points(0, len(y)) - return self - - def get_cut_points(self): - return sorted(list(set([cut.value for cut in self.cut_points_]))) - - def compute_cut_points(self, start, end): - # print((start, end)) - cut = self.get_candidate(start, end) - if cut.value is None: - return - print("cut: ", cut.value, " index: ", cut.index) - if self.mdlp(cut, start, end): - print("¡Ding!", cut.value, cut.index) - self.cut_points_.append(cut) - self.compute_cut_points(start, cut.index) - self.compute_cut_points(cut.index, end) - - def mdlp(self, cut, start, end): - N = end - start - k = self.num_classes(start, end) - k1 = self.num_classes(start, cut.index) - k2 = self.num_classes(cut.index, end) - ent = self.entropy(start, end) - ent1 = self.entropy(start, cut.index) - ent2 = self.entropy(cut.index, end) - ig = self.information_gain(start, cut.index, end) - delta = log2(pow(3, k) - 2, 2) - ( - float(k) * ent - float(k1) * ent1 - float(k2) * ent2 - ) - term = 1 / N * (log2(N - 1, 2) + delta) - print("start: ", start, " cut: ", cut.index, " end: ", end) - print( - "k=", - k, - " k1=", - k1, - " k2=", - k2, - " ent=", - ent, - " ent1=", - ent1, - " ent2=", - ent2, - ) - print("ig=", ig, " delta=", delta, " N ", N, " term ", term) - return ig > term - - def num_classes(self, start, end): - n_classes = set() - for i in range(start, end): - n_classes.add( - self.y_[self.indices_[i]] if self.use_indices else self.y_[i] - ) - return len(n_classes) - - def get_candidate(self, start, end): - """Return the best cutpoint candidate for the given range. - - Parameters - ---------- - start : int - Start of the range. - end : int - End of the range. - - Returns - ------- - candidate : SimpleNamespace with attributes index and value - value == None if no candidate is found. - """ - candidate = SimpleNamespace() - candidate.value = None - minEntropy = float("inf") - for idx in range(start + 1, end): - condition = ( - self.y_[self.indices_[idx]] == self.y_[self.indices_[idx - 1]] - if self.use_indices - else self.y_[idx] == self.y_[idx - 1] - ) - if condition: - continue - entropy_left = self.entropy(start, idx) - entropy_right = self.entropy(idx, end) - entropy_cut = entropy_left + entropy_right - print( - "idx: ", - idx, - " entropy_left: ", - entropy_left, - " entropy_right : ", - entropy_right, - " -> ", - start, - " ", - end, - ) - if entropy_cut < minEntropy: - minEntropy = entropy_cut - candidate.index = idx - if self.use_indices: - candidate.value = ( - self.X_[self.indices_[idx]] - + self.X_[self.indices_[idx - 1]] - ) / 2 - else: - candidate.value = (self.X_[idx] + self.X_[idx - 1]) / 2 - return candidate - - def entropy(self, start, end) -> float: - n_labels = end - start - if n_labels <= 1: - return 0 - if (start, end) in self.entropy_cache: - return self.entropy_cache[(start, end)] - if self.use_indices: - counts = np.bincount(self.y_[self.indices_[start:end]]) - else: - counts = np.bincount(self.y_[start:end]) - proportions = counts / n_labels - n_classes = np.count_nonzero(proportions) - if n_classes <= 1: - return 0 - entropy = 0.0 - # Compute standard entropy. - for prop in proportions: - if prop != 0.0: - entropy -= prop * log2(prop, 2) - self.entropy_cache[(start, end)] = entropy - return entropy - - def information_gain(self, start, cut, end): - if (start, cut, end) in self.information_gain_cache: - return self.information_gain_cache[(start, cut, end)] - labels = end - start - if labels == 0: - return 0.0 - entropy = self.entropy(start, end) - card_left = cut - start - entropy_left = self.entropy(start, cut) - card_right = end - cut - entropy_right = self.entropy(cut, end) - result = ( - entropy - - (card_left / labels) * entropy_left - - (card_right / labels) * entropy_right - ) - self.information_gain_cache[(start, cut, end)] = result - return result diff --git a/fimdlp/testcpp/FImdlp_unittest.cc b/fimdlp/testcpp/FImdlp_unittest.cc index df90f23..3bdc69d 100644 --- a/fimdlp/testcpp/FImdlp_unittest.cc +++ b/fimdlp/testcpp/FImdlp_unittest.cc @@ -34,7 +34,7 @@ namespace mdlp { X = X_; indices = indices_; indices_t testSortedIndices = sortIndices(X); - float prev = X[testSortedIndices[0]]; + precision_t prev = X[testSortedIndices[0]]; for (auto i = 0; i < X.size(); ++i) { EXPECT_EQ(testSortedIndices[i], indices[i]); EXPECT_LE(prev, X[testSortedIndices[i]]); @@ -162,7 +162,7 @@ namespace mdlp { fit(X, y); computeCutPointsOriginal(); cutPoints_t expected; - vector computed = getCutPoints(); + vector computed = getCutPoints(); expected = { { 0, 4, -1, -3.4028234663852886e+38, 5.15 }, { 4, 6, -1, 5.15, 5.45 }, { 6, 10, -1, 5.45, 3.4028234663852886e+38 } diff --git a/fimdlp/testcpp/Metrics_unittest.cc b/fimdlp/testcpp/Metrics_unittest.cc index c04ec0f..0bea1c1 100644 --- a/fimdlp/testcpp/Metrics_unittest.cc +++ b/fimdlp/testcpp/Metrics_unittest.cc @@ -2,7 +2,7 @@ #include "../Metrics.h" namespace mdlp { - float precision = 0.000001; + precision_t precision = 0.000001; TEST(MetricTest, NumClasses) { labels y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; diff --git a/fimdlp/tests/bak/CPPFImdlp.cpp b/fimdlp/tests/bak/CPPFImdlp.cpp new file mode 100644 index 0000000..7f35562 --- /dev/null +++ b/fimdlp/tests/bak/CPPFImdlp.cpp @@ -0,0 +1,286 @@ +#include "CPPFImdlp.h" +#include +#include +#include +#include "Metrics.h" + +namespace mdlp { + ostream& operator << (ostream& os, const cutPoint_t& cut) + { + os << cut.classNumber << " -> (" << cut.start << ", " << cut.end << + ") - (" << cut.fromValue << ", " << cut.toValue << ") " + << endl; + return os; + + } + CPPFImdlp::CPPFImdlp(): proposal(true), precision(6), debug(false) + { + divider = pow(10, precision); + numClasses = 0; + } + CPPFImdlp::CPPFImdlp(bool proposal, int precision, bool debug): proposal(proposal), precision(precision), debug(debug) + { + divider = pow(10, precision); + numClasses = 0; + } + CPPFImdlp::~CPPFImdlp() + = default; + samples CPPFImdlp::getCutPoints() + { + samples output(cutPoints.size()); + ::transform(cutPoints.begin(), cutPoints.end(), output.begin(), + [](cutPoint_t cut) { return cut.toValue; }); + return output; + } + labels CPPFImdlp::getDiscretizedValues() + { + return xDiscretized; + } + CPPFImdlp& CPPFImdlp::fit(samples& X_, labels& y_) + { + X = X_; + y = y_; + if (X.size() != y.size()) { + throw invalid_argument("X and y must have the same size"); + } + if (X.size() == 0 || y.size() == 0) { + throw invalid_argument("X and y must have at least one element"); + } + indices = sortIndices(X_); + xDiscretized = labels(X.size(), -1); + numClasses = Metrics::numClasses(y, indices, 0, X.size()); + + if (proposal) { + computeCutPointsProposal(); + } else { + computeCutPointsOriginal(); + } + filterCutPoints(); + // Apply cut points to the input vector + for (auto cut : cutPoints) { + for (size_t i = cut.start; i < cut.end; i++) { + xDiscretized[indices[i]] = cut.classNumber; + } + } + return *this; + } + bool CPPFImdlp::evaluateCutPoint(cutPoint_t rest, cutPoint_t candidate) + { + int k, k1, k2; + precision_t ig, delta; + precision_t ent, ent1, ent2; + auto N = precision_t(rest.end - rest.start); + if (N < 2) { + return false; + } + k = Metrics::numClasses(y, indices, rest.start, rest.end); + k1 = Metrics::numClasses(y, indices, rest.start, candidate.end); + k2 = Metrics::numClasses(y, indices, candidate.end, rest.end); + ent = Metrics::entropy(y, indices, rest.start, rest.end, numClasses); + ent1 = Metrics::entropy(y, indices, rest.start, candidate.end, numClasses); + ent2 = Metrics::entropy(y, indices, candidate.end, rest.end, numClasses); + ig = Metrics::informationGain(y, indices, rest.start, rest.end, candidate.end, numClasses); + delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); + precision_t term = 1 / N * (log2(N - 1) + delta); + if (debug) { + cout << "Rest: " << rest; + cout << "Candidate: " << candidate; + cout << "k=" << k << " k1=" << k1 << " k2=" << k2 << " ent=" << ent << " ent1=" << ent1 << " ent2=" << ent2 << endl; + cout << "ig=" << ig << " delta=" << delta << " N " << N << " term " << term << endl; + } + return (ig > term); + } + void CPPFImdlp::filterCutPoints() + { + cutPoints_t filtered; + cutPoint_t rest, item; + int classNumber = 0; + + rest.start = 0; + rest.end = X.size(); + rest.fromValue = numeric_limits::lowest(); + rest.toValue = numeric_limits::max(); + rest.classNumber = classNumber; + bool first = true; + for (size_t index = 0; index < size_t(cutPoints.size()); index++) { + item = cutPoints[index]; + if (evaluateCutPoint(rest, item)) { + if (debug) + cout << "Accepted: " << item << endl; + //Assign class number to the interval (cutpoint) + item.classNumber = classNumber++; + filtered.push_back(item); + first = false; + rest.start = item.end; + } else { + if (debug) + cout << "Rejected: " << item << endl; + if (index != size_t(cutPoints.size()) - 1) { + // Try to merge the rejected cutpoint with the next one + if (first) { + cutPoints[index + 1].fromValue = numeric_limits::lowest(); + cutPoints[index + 1].start = indices[0]; + } else { + cutPoints[index + 1].fromValue = item.fromValue; + cutPoints[index + 1].start = item.start; + } + } + } + } + if (!first) { + filtered.back().toValue = numeric_limits::max(); + filtered.back().end = X.size() - 1; + } else { + filtered.push_back(rest); + } + cutPoints = filtered; + } + void CPPFImdlp::computeCutPointsProposal() + { + cutPoints_t cutPts; + cutPoint_t cutPoint; + precision_t xPrev, xCur, xPivot; + int yPrev, yCur, yPivot; + size_t idx, numElements, start; + + xCur = xPrev = X[indices[0]]; + yCur = yPrev = y[indices[0]]; + numElements = indices.size() - 1; + idx = start = 0; + bool firstCutPoint = true; + if (debug) + printf("*idx=%lu -> (-1, -1) Prev(%3.1f, %d) Elementos: %lu\n", idx, xCur, yCur, numElements); + while (idx < numElements) { + xPivot = xCur; + yPivot = yCur; + if (debug) + printf(" Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); + // Read the same values and check class changes + do { + idx++; + xCur = X[indices[idx]]; + yCur = y[indices[idx]]; + if (yCur != yPivot && xCur == xPivot) { + yPivot = -1; + } + if (debug) + printf(">idx=%lu -> Prev(%3.1f, %d) Pivot(%3.1f, %d) Cur(%3.1f, %d) \n", idx, xPrev, yPrev, xPivot, yPivot, xCur, yCur); + } + while (idx < numElements && xCur == xPivot); + // Check if the class changed and there are more than 1 element + if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && goodCut(start, idx, numElements + 1)) { + // Must we add the entropy criteria here? + // if (totalEntropy - (entropyLeft + entropyRight) > 0) { Accept cut point } + cutPoint.start = start; + cutPoint.end = idx; + start = idx; + cutPoint.fromValue = firstCutPoint ? numeric_limits::lowest() : cutPts.back().toValue; + cutPoint.toValue = (xPrev + xCur) / 2; + cutPoint.classNumber = -1; + firstCutPoint = false; + if (debug) { + printf("Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue); + } + cutPts.push_back(cutPoint); + } + yPrev = yPivot; + xPrev = xPivot; + } + if (idx == numElements) { + cutPoint.start = start; + cutPoint.end = numElements + 1; + cutPoint.fromValue = firstCutPoint ? numeric_limits::lowest() : cutPts.back().toValue; + cutPoint.toValue = numeric_limits::max(); + cutPoint.classNumber = -1; + if (debug) + printf("Final Cutpoint idx=%lu Cur(%3.1f, %d) Prev(%3.1f, %d) Pivot(%3.1f, %d) = (%3.1g, %3.1g] \n", idx, xCur, yCur, xPrev, yPrev, xPivot, yPivot, cutPoint.fromValue, cutPoint.toValue); + cutPts.push_back(cutPoint); + } + if (debug) { + cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, numElements + 1, numClasses) << endl; + for (auto cutPt : cutPts) + cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << " :Proposal: Cut point: " << cutPt; + } + cutPoints = cutPts; + } + void CPPFImdlp::computeCutPointsOriginal() + { + cutPoints_t cutPts; + cutPoint_t cutPoint; + precision_t xPrev; + int yPrev; + bool first = true; + // idxPrev is the index of the init instance of the cutPoint + size_t index, idxPrev = 0, last, idx = indices[0]; + xPrev = X[idx]; + yPrev = y[idx]; + last = indices.size() - 1; + for (index = 0; index < last; index++) { + idx = indices[index]; + // Definition 2 Cut points are always on class boundaries && + // there are more than 1 items in the interval + // if (entropy of interval) > (entropyLeft + entropyRight)) { Accept cut point } (goodCut) + if (y[idx] != yPrev && xPrev < X[idx] && idxPrev != index - 1 && goodCut(idxPrev, idx, last + 1)) { + // Must we add the entropy criteria here? + if (first) { + first = false; + cutPoint.fromValue = numeric_limits::lowest(); + } else { + cutPoint.fromValue = cutPts.back().toValue; + } + cutPoint.start = idxPrev; + cutPoint.end = index; + cutPoint.classNumber = -1; + cutPoint.toValue = round(divider * (X[idx] + xPrev) / 2) / divider; + idxPrev = index; + cutPts.push_back(cutPoint); + } + xPrev = X[idx]; + yPrev = y[idx]; + } + if (first) { + cutPoint.start = 0; + cutPoint.classNumber = -1; + cutPoint.fromValue = numeric_limits::lowest(); + cutPoint.toValue = numeric_limits::max(); + cutPts.push_back(cutPoint); + } else + cutPts.back().toValue = numeric_limits::max(); + cutPts.back().end = X.size(); + if (debug) { + cout << "Entropy of the dataset: " << Metrics::entropy(y, indices, 0, indices.size(), numClasses) << endl; + for (auto cutPt : cutPts) + cout << "Entropy: " << Metrics::entropy(y, indices, cutPt.start, cutPt.end, numClasses) << ": Original: Cut point: " << cutPt; + } + cutPoints = cutPts; + } + bool CPPFImdlp::goodCut(size_t start, size_t cut, size_t end) + { + /* + Meter las entropías en una matríz cuadrada dispersa (samples, samples) M[start, end] iniciada a -1 y si no se ha calculado calcularla y almacenarla + + + */ + precision_t entropyLeft = Metrics::entropy(y, indices, start, cut, numClasses); + precision_t entropyRight = Metrics::entropy(y, indices, cut, end, numClasses); + precision_t entropyInterval = Metrics::entropy(y, indices, start, end, numClasses); + if (debug) + printf("Entropy L, R, T: L(%5.3g) + R(%5.3g) - T(%5.3g) \t", entropyLeft, entropyRight, entropyInterval); + //return (entropyInterval - (entropyLeft + entropyRight) > 0); + return true; + } + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes + indices_t CPPFImdlp::sortIndices(samples& X_) + { + indices_t idx(X_.size()); + iota(idx.begin(), idx.end(), 0); + for (size_t i = 0; i < X_.size(); i++) + stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) + { return X_[i1] < X_[i2]; }); + return idx; + } + void CPPFImdlp::setCutPoints(cutPoints_t cutPoints_) + { + cutPoints = cutPoints_; + } +} diff --git a/fimdlp/tests/bak/CPPFImdlp.h b/fimdlp/tests/bak/CPPFImdlp.h new file mode 100644 index 0000000..608b817 --- /dev/null +++ b/fimdlp/tests/bak/CPPFImdlp.h @@ -0,0 +1,39 @@ +#ifndef CPPFIMDLP_H +#define CPPFIMDLP_H +#include "typesFImdlp.h" +#include +namespace mdlp { + class CPPFImdlp { + protected: + bool proposal; // proposed algorithm or original algorithm + int precision; + bool debug; + precision_t divider; + indices_t indices; // sorted indices to use with X and y + samples X; + labels y; + labels xDiscretized; + int numClasses; + cutPoints_t cutPoints; + + void setCutPoints(cutPoints_t); + static indices_t sortIndices(samples&); + void computeCutPointsOriginal(); + void computeCutPointsProposal(); + bool evaluateCutPoint(cutPoint_t, cutPoint_t); + void filterCutPoints(); + bool goodCut(size_t, size_t, size_t); // if the cut candidate reduces entropy + + public: + CPPFImdlp(); + CPPFImdlp(bool, int, bool debug = false); + ~CPPFImdlp(); + samples getCutPoints(); + indices_t getIndices(); + labels getDiscretizedValues(); + void debugPoints(samples&, labels&); + CPPFImdlp& fit(samples&, labels&); + labels transform(samples&); + }; +} +#endif \ No newline at end of file diff --git a/fimdlp/tests/bak/Metrics.cpp b/fimdlp/tests/bak/Metrics.cpp new file mode 100644 index 0000000..d43d314 --- /dev/null +++ b/fimdlp/tests/bak/Metrics.cpp @@ -0,0 +1,47 @@ +#include "Metrics.h" +#include +namespace mdlp { + Metrics::Metrics() + = default; + int Metrics::numClasses(labels& y, indices_t indices, size_t start, size_t end) + { + std::set numClasses; + for (auto i = start; i < end; ++i) { + numClasses.insert(y[indices[i]]); + } + return numClasses.size(); + } + precision_t Metrics::entropy(labels& y, indices_t& indices, size_t start, size_t end, int nClasses) + { + precision_t entropy = 0; + int nElements = 0; + labels counts(nClasses + 1, 0); + for (auto i = &indices[start]; i != &indices[end]; ++i) { + counts[y[*i]]++; + nElements++; + } + for (auto count : counts) { + if (count > 0) { + precision_t p = (precision_t)count / nElements; + entropy -= p * log2(p); + } + } + return entropy < 0 ? 0 : entropy; + } + precision_t Metrics::informationGain(labels& y, indices_t& indices, size_t start, size_t end, size_t cutPoint, int nClasses) + { + precision_t iGain; + precision_t entropy, entropyLeft, entropyRight; + int nClassesLeft, nClassesRight; + int nElementsLeft = cutPoint - start, nElementsRight = end - cutPoint; + int nElements = end - start; + nClassesLeft = Metrics::numClasses(y, indices, start, cutPoint); + nClassesRight = Metrics::numClasses(y, indices, cutPoint, end); + entropy = Metrics::entropy(y, indices, start, end, nClasses); + entropyLeft = Metrics::entropy(y, indices, start, cutPoint, nClassesLeft); + entropyRight = Metrics::entropy(y, indices, cutPoint, end, nClassesRight); + iGain = entropy - ((precision_t)nElementsLeft * entropyLeft + (precision_t)nElementsRight * entropyRight) / nElements; + return iGain; + } + +} \ No newline at end of file diff --git a/fimdlp/tests/bak/Metrics.h b/fimdlp/tests/bak/Metrics.h new file mode 100644 index 0000000..5054998 --- /dev/null +++ b/fimdlp/tests/bak/Metrics.h @@ -0,0 +1,14 @@ +#ifndef METRICS_H +#define METRICS_H +#include "typesFImdlp.h" +#include +namespace mdlp { + class Metrics { + public: + Metrics(); + static int numClasses(labels&, indices_t, size_t, size_t); + static precision_t entropy(labels&, indices_t&, size_t, size_t, int); + static precision_t informationGain(labels&, indices_t&, size_t, size_t, size_t, int); + }; +} +#endif \ No newline at end of file diff --git a/fimdlp/typesFImdlp.h b/fimdlp/typesFImdlp.h index f23b78e..b94b943 100644 --- a/fimdlp/typesFImdlp.h +++ b/fimdlp/typesFImdlp.h @@ -5,21 +5,12 @@ using namespace std; namespace mdlp { - struct CutPointBody { - size_t start, end; // indices of the sorted vector - }; - typedef CutPointBody cutPoint_t; - typedef vector samples; + typedef float precision_t; + typedef vector samples; typedef vector labels; typedef vector indices_t; - typedef vector cutPoints_t; - typedef map, float> cacheEnt_t; - typedef map, float> cacheIg_t; - struct cutPointStruct { - size_t index; - float value; - }; - typedef cutPointStruct xcutPoint_t; - typedef vector xcutPoints_t; + typedef vector cutPoints_t; + typedef map, precision_t> cacheEnt_t; + typedef map, precision_t> cacheIg_t; } #endif \ No newline at end of file diff --git a/prueba/FImdlp.cpp b/prueba/FImdlp.cpp index 68c2f69..0e18c7a 100644 --- a/prueba/FImdlp.cpp +++ b/prueba/FImdlp.cpp @@ -13,7 +13,7 @@ namespace FImdlp { int n = X.size(); for (i = 1; i < n; i++) { if (X.at(i) != ant) { - cutPts.push_back(float(X.at(i) + ant) / 2); + cutPts.push_back(precision_t(X.at(i) + ant) / 2); ant = X.at(i); } } diff --git a/prueba/cfimdlp.pyx b/prueba/cfimdlp.pyx index cfa00b2..f7ba7f0 100644 --- a/prueba/cfimdlp.pyx +++ b/prueba/cfimdlp.pyx @@ -5,7 +5,7 @@ from libcpp.vector cimport vector cdef extern from "FImdlp.h" namespace "FImdlp": cdef cppclass FImdlp: FImdlp() except + - vector[float] cutPoints(vector[int]&, vector[int]&) + vector[precision_t] cutPoints(vector[int]&, vector[int]&) cdef class CFImdlp: cdef FImdlp *thisptr diff --git a/setup.py b/setup.py index e62a83a..b1ce695 100644 --- a/setup.py +++ b/setup.py @@ -12,10 +12,8 @@ setup( name="cppfimdlp", sources=[ "fimdlp/cfimdlp.pyx", - # "fimdlp/CPPFImdlp.cpp", - # "fimdlp/Metrics.cpp", - "fimdlp/ccMetrics.cc", - "fimdlp/ccFImdlp.cc", + "fimdlp/CPPFImdlp.cpp", + "fimdlp/Metrics.cpp", ], language="c++", include_dirs=["fimdlp"],