#include #include #include #include #include "CPPFImdlp.h" #include "Metrics.h" namespace mdlp { CPPFImdlp::CPPFImdlp(int algorithm) : algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) { } CPPFImdlp::~CPPFImdlp() = default; CPPFImdlp &CPPFImdlp::fit(samples_t &X_, labels_t &y_) { X = X_; y = y_; cutPoints.clear(); if (X.size() != y.size()) { throw invalid_argument("X and y must have the same size"); } if (X.empty() || y.empty()) { throw invalid_argument("X and y must have at least one element"); } indices = sortIndices(X_, y_); metrics.setData(y, indices); switch (algorithm) { case 0: computeCutPoints(0, X.size()); break; case 1: computeCutPointsAlternative(0, X.size()); break; case 2: indices = sortIndices1(X_); metrics.setData(y, indices); computeCutPointsClassic(0, X.size()); break; default: throw invalid_argument("algorithm must be 0, 1 or 2"); } return *this; } precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx) { size_t idxPrev = idx - 1; precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]]; // definition 2 of the paper => X[t-1] < X[t] while (idxPrev-- > start && actual == previous) { previous = X[indices[idxPrev]]; } return (previous + actual) / 2; } tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) { size_t idxPrev = cut - 1; bool fforward = false; precision_t previous, actual; previous = X[indices[idxPrev]]; actual = X[indices[cut]]; // definition 2 of the paper => X[t-1] < X[t] while (idxPrev-- > start && actual == previous) { previous = X[indices[idxPrev]]; } // get the last equal value of X in the interval while (actual == X[indices[cut]] && cut + 1 < end) { cut++; fforward = true; } if (fforward) cut--; // try to get the next value if it can't be found backwards if (previous == actual && cut + 1 < end) actual = X[indices[cut + 1]]; return make_tuple((previous + actual) / 2, cut); } void CPPFImdlp::computeCutPoints(size_t start, size_t end) { size_t cut; tuple result; if (end - start < 2) return; cut = getCandidate(start, end); if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { result = completeValueCutPoint(start, cut, end); cut = get<1>(result); cutPoints.push_back(get<0>(result)); computeCutPoints(start, cut); computeCutPoints(cut, end); } } void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) { size_t cut; if (end - start < 2) return; cut = getCandidate(start, end); if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { cutPoints.push_back(halfWayValueCutPoint(start, cut)); computeCutPointsAlternative(start, cut); computeCutPointsAlternative(cut, end); } } void CPPFImdlp::computeCutPointsClassic(size_t start, size_t end) { size_t cut; cut = getCandidate(start, end); if (cut == numeric_limits::max() || !mdlp(start, cut, end)) { // cut.value == -1 means that there is no candidate in the interval // No boundary found, so we add both ends of the interval as cutpoints // because they were selected by the algorithm before if (start == end) return; if (start != 0) cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2); if (end != X.size()) cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2); return; } computeCutPoints(start, cut); computeCutPoints(cut, end); } size_t CPPFImdlp::getCandidate(size_t start, size_t end) { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which E(A, TA; S) is minimal amongst all the candidate cut points. */ size_t candidate = numeric_limits::max(), elements = end - start; bool same_values = true; precision_t entropy_left, entropy_right, minEntropy; minEntropy = metrics.entropy(start, end); for (auto idx = start + 1; idx < end; idx++) { if (X[indices[idx]] != X[indices[idx - 1]]) same_values = false; // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx); entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end); if (entropy_left + entropy_right < minEntropy) { minEntropy = entropy_left + entropy_right; candidate = idx; } } // If all the values of the variable in the interval are the same, it doesn't consider the cut point if (same_values) candidate = numeric_limits::max(); return candidate; } bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) { int k, k1, k2; precision_t ig, delta; precision_t ent, ent1, ent2; auto N = precision_t(end - start); if (N < 2) { return false; } k = metrics.computeNumClasses(start, end); k1 = metrics.computeNumClasses(start, cut); k2 = metrics.computeNumClasses(cut, end); ent = metrics.entropy(start, end); ent1 = metrics.entropy(start, cut); ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes indices_t CPPFImdlp::sortIndices(samples_t &X_, labels_t &y_) { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); for (size_t i = 0; i < X_.size(); i++) stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; else return X_[i1] < X_[i2]; }); return idx; } // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes indices_t CPPFImdlp::sortIndices1(samples_t &X_) { indices_t idx(X_.size()); iota(idx.begin(), idx.end(), 0); for (size_t i = 0; i < X_.size(); i++) stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) { return X_[i1] < X_[i2]; }); return idx; } cutPoints_t CPPFImdlp::getCutPoints() { // Remove duplicates and sort cutPoints_t output(cutPoints.size()); set s; unsigned size = cutPoints.size(); for (unsigned i = 0; i < size; i++) s.insert(cutPoints[i]); output.assign(s.begin(), s.end()); sort(output.begin(), output.end()); return output; } }