diff --git a/src/Platform/x/CPPFImdlp.cpp b/src/Platform/x/CPPFImdlp.cpp deleted file mode 100644 index 8e5f27b..0000000 --- a/src/Platform/x/CPPFImdlp.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include -#include -#include -#include -#include "CPPFImdlp.h" -#include "Metrics.h" - -namespace mdlp { - - CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_), - max_depth(max_depth_), - proposed_cuts(proposed) - { - } - - CPPFImdlp::CPPFImdlp() = default; - - CPPFImdlp::~CPPFImdlp() = default; - - size_t CPPFImdlp::compute_max_num_cut_points() const - { - // Set the actual maximum number of cut points as a number or as a percentage of the number of samples - if (proposed_cuts == 0) { - return numeric_limits::max(); - } - if (proposed_cuts < 0 || proposed_cuts > static_cast(X.size())) { - throw invalid_argument("wrong proposed num_cuts value"); - } - if (proposed_cuts < 1) - return static_cast(round(static_cast(X.size()) * proposed_cuts)); - return static_cast(proposed_cuts); - } - - void CPPFImdlp::fit(samples_t& X_, labels_t& y_) - { - X = X_; - y = y_; - num_cut_points = compute_max_num_cut_points(); - depth = 0; - discretizedData.clear(); - cutPoints.clear(); - if (X.size() != y.size()) { - throw invalid_argument("X and y must have the same size"); - } - if (X.empty() || y.empty()) { - throw invalid_argument("X and y must have at least one element"); - } - if (min_length < 3) { - throw invalid_argument("min_length must be greater than 2"); - } - if (max_depth < 1) { - throw invalid_argument("max_depth must be greater than 0"); - } - indices = sortIndices(X_, y_); - metrics.setData(y, indices); - computeCutPoints(0, X.size(), 1); - sort(cutPoints.begin(), cutPoints.end()); - if (num_cut_points > 0) { - // Select the best (with lower entropy) cut points - while (cutPoints.size() > num_cut_points) { - resizeCutPoints(); - } - } - } - - pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) - { - size_t n; - size_t m; - size_t idxPrev = cut - 1 >= start ? cut - 1 : cut; - size_t idxNext = cut + 1 < end ? cut + 1 : cut; - bool backWall; // true if duplicates reach beginning of the interval - precision_t previous; - precision_t actual; - precision_t next; - previous = X[indices[idxPrev]]; - actual = X[indices[cut]]; - next = X[indices[idxNext]]; - // definition 2 of the paper => X[t-1] < X[t] - // get the first equal value of X in the interval - while (idxPrev > start && actual == previous) { - previous = X[indices[--idxPrev]]; - } - backWall = idxPrev == start && actual == previous; - // get the last equal value of X in the interval - while (idxNext < end - 1 && actual == next) { - next = X[indices[++idxNext]]; - } - // # of duplicates before cutpoint - n = cut - 1 - idxPrev; - // # of duplicates after cutpoint - m = idxNext - cut - 1; - // Decide which values to use - cut = cut + (backWall ? m + 1 : -n); - actual = X[indices[cut]]; - return { (actual + previous) / 2, cut }; - } - - void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) - { - size_t cut; - pair result; - // Check if the interval length and the depth are Ok - if (end - start < min_length || depth_ > max_depth) - return; - depth = depth_ > depth ? depth_ : depth; - cut = getCandidate(start, end); - if (cut == numeric_limits::max()) - return; - if (mdlp(start, cut, end)) { - result = valueCutPoint(start, cut, end); - cut = result.second; - cutPoints.push_back(result.first); - computeCutPoints(start, cut, depth_ + 1); - computeCutPoints(cut, end, depth_ + 1); - } - } - - size_t CPPFImdlp::getCandidate(size_t start, size_t end) - { - /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which - E(A, TA; S) is minimal amongst all the candidate cut points. */ - size_t candidate = numeric_limits::max(); - size_t elements = end - start; - bool sameValues = true; - precision_t entropy_left; - precision_t entropy_right; - precision_t minEntropy; - // Check if all the values of the variable in the interval are the same - for (size_t idx = start + 1; idx < end; idx++) { - if (X[indices[idx]] != X[indices[start]]) { - sameValues = false; - break; - } - } - if (sameValues) - return candidate; - minEntropy = metrics.entropy(start, end); - for (size_t idx = start + 1; idx < end; idx++) { - // Cutpoints are always on boundaries (definition 2) - if (y[indices[idx]] == y[indices[idx - 1]]) - continue; - entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); - entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); - if (entropy_left + entropy_right < minEntropy) { - minEntropy = entropy_left + entropy_right; - candidate = idx; - } - } - return candidate; - } - - bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) - { - int k; - int k1; - int k2; - precision_t ig; - precision_t delta; - precision_t ent; - precision_t ent1; - precision_t ent2; - auto N = precision_t(end - start); - k = metrics.computeNumClasses(start, end); - k1 = metrics.computeNumClasses(start, cut); - k2 = metrics.computeNumClasses(cut, end); - ent = metrics.entropy(start, end); - ent1 = metrics.entropy(start, cut); - ent2 = metrics.entropy(cut, end); - ig = metrics.informationGain(start, cut, end); - delta = static_cast(log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); - precision_t term = 1 / N * (log2(N - 1) + delta); - return ig > term; - } - - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { - if (X_[i1] == X_[i2]) - return y_[i1] < y_[i2]; - else - return X_[i1] < X_[i2]; - }); - return idx; - } - - void CPPFImdlp::resizeCutPoints() - { - //Compute entropy of each of the whole cutpoint set and discards the biggest value - precision_t maxEntropy = 0; - precision_t entropy; - size_t maxEntropyIdx = 0; - size_t begin = 0; - size_t end; - for (size_t idx = 0; idx < cutPoints.size(); idx++) { - end = begin; - while (X[indices[end]] < cutPoints[idx] && end < X.size()) - end++; - entropy = metrics.entropy(begin, end); - if (entropy > maxEntropy) { - maxEntropy = entropy; - maxEntropyIdx = idx; - } - begin = end; - } - cutPoints.erase(cutPoints.begin() + static_cast(maxEntropyIdx)); - } - labels_t& CPPFImdlp::transform(const samples_t& data) - { - discretizedData.reserve(data.size()); - for (const precision_t& item : data) { - auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item); - discretizedData.push_back(upper - cutPoints.begin()); - } - return discretizedData; - } -} diff --git a/src/Platform/x/CPPFImdlp.h b/src/Platform/x/CPPFImdlp.h deleted file mode 100644 index 1fb0cab..0000000 --- a/src/Platform/x/CPPFImdlp.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef CPPFIMDLP_H -#define CPPFIMDLP_H - -#include "typesFImdlp.h" -#include "Metrics.h" -#include -#include -#include - -namespace mdlp { - class CPPFImdlp { - protected: - size_t min_length = 3; - int depth = 0; - int max_depth = numeric_limits::max(); - float proposed_cuts = 0; - indices_t indices = indices_t(); - samples_t X = samples_t(); - labels_t y = labels_t(); - Metrics metrics = Metrics(y, indices); - cutPoints_t cutPoints; - size_t num_cut_points = numeric_limits::max(); - labels_t discretizedData = labels_t(); - - static indices_t sortIndices(samples_t&, labels_t&); - - void computeCutPoints(size_t, size_t, int); - void resizeCutPoints(); - bool mdlp(size_t, size_t, size_t); - size_t getCandidate(size_t, size_t); - size_t compute_max_num_cut_points() const; - pair valueCutPoint(size_t, size_t, size_t); - - public: - CPPFImdlp(); - CPPFImdlp(size_t, int, float); - ~CPPFImdlp(); - void fit(samples_t&, labels_t&); - inline cutPoints_t getCutPoints() const { return cutPoints; }; - labels_t& transform(const samples_t&); - inline int get_depth() const { return depth; }; - static inline string version() { return "1.1.2"; }; - }; -} -#endif diff --git a/src/Platform/x/Metrics.cpp b/src/Platform/x/Metrics.cpp deleted file mode 100644 index 71a3c07..0000000 --- a/src/Platform/x/Metrics.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include "Metrics.h" -#include -#include - -using namespace std; -namespace mdlp { - Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), - numClasses(computeNumClasses(0, indices.size())) - { - } - - int Metrics::computeNumClasses(size_t start, size_t end) - { - set nClasses; - for (auto i = start; i < end; ++i) { - nClasses.insert(y[indices[i]]); - } - return static_cast(nClasses.size()); - } - - void Metrics::setData(const labels_t& y_, const indices_t& indices_) - { - indices = indices_; - y = y_; - numClasses = computeNumClasses(0, indices.size()); - entropyCache.clear(); - igCache.clear(); - } - - precision_t Metrics::entropy(size_t start, size_t end) - { - precision_t p; - precision_t ventropy = 0; - int nElements = 0; - labels_t counts(numClasses + 1, 0); - if (end - start < 2) - return 0; - if (entropyCache.find({ start, end }) != entropyCache.end()) { - return entropyCache[{start, end}]; - } - for (auto i = &indices[start]; i != &indices[end]; ++i) { - counts[y[*i]]++; - nElements++; - } - for (auto count : counts) { - if (count > 0) { - p = static_cast(count) / static_cast(nElements); - ventropy -= p * log2(p); - } - } - entropyCache[{start, end}] = ventropy; - return ventropy; - } - - precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) - { - precision_t iGain; - precision_t entropyInterval; - precision_t entropyLeft; - precision_t entropyRight; - size_t nElementsLeft = cut - start; - size_t nElementsRight = end - cut; - size_t nElements = end - start; - if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { - return igCache[make_tuple(start, cut, end)]; - } - entropyInterval = entropy(start, end); - entropyLeft = entropy(start, cut); - entropyRight = entropy(cut, end); - iGain = entropyInterval - - (static_cast(nElementsLeft) * entropyLeft + - static_cast(nElementsRight) * entropyRight) / - static_cast(nElements); - igCache[make_tuple(start, cut, end)] = iGain; - return iGain; - } - -} \ No newline at end of file diff --git a/src/Platform/x/Metrics.h b/src/Platform/x/Metrics.h deleted file mode 100644 index 4f8151a..0000000 --- a/src/Platform/x/Metrics.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef CCMETRICS_H -#define CCMETRICS_H - -#include "typesFImdlp.h" - -namespace mdlp { - class Metrics { - protected: - labels_t& y; - indices_t& indices; - int numClasses; - cacheEnt_t entropyCache = cacheEnt_t(); - cacheIg_t igCache = cacheIg_t(); - public: - Metrics(labels_t&, indices_t&); - void setData(const labels_t&, const indices_t&); - int computeNumClasses(size_t, size_t); - precision_t entropy(size_t, size_t); - precision_t informationGain(size_t, size_t, size_t); - }; -} -#endif \ No newline at end of file diff --git a/src/Platform/x/typesFImdlp.h b/src/Platform/x/typesFImdlp.h deleted file mode 100644 index b28b2ca..0000000 --- a/src/Platform/x/typesFImdlp.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef TYPES_H -#define TYPES_H - -#include -#include -#include - -using namespace std; -namespace mdlp { - typedef float precision_t; - typedef vector samples_t; - typedef vector labels_t; - typedef vector indices_t; - typedef vector cutPoints_t; - typedef map, precision_t> cacheEnt_t; - typedef map, precision_t> cacheIg_t; -} -#endif