From f449f438ef86199b6360443bd22ab98438fbaa36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 21 Dec 2022 11:33:55 +0100 Subject: [PATCH] Refactor Algorithm --- CPPFImdlp.cpp | 165 +++++++------------------------------- CPPFImdlp.h | 18 ++--- Metrics.cpp | 2 +- README.md | 9 ++- feature0 | 152 ----------------------------------- sample/ArffFiles.cpp | 1 - sample/test.cpp | 95 ---------------------- sample/tests/01.arff | 35 -------- sample/tests/02.arff | 25 ------ sample/tests/03.arff | 24 ------ tests/FImdlp_unittest.cpp | 118 +++++++++++++-------------- 11 files changed, 101 insertions(+), 543 deletions(-) delete mode 100644 feature0 delete mode 100644 sample/test.cpp delete mode 100755 sample/tests/01.arff delete mode 100755 sample/tests/02.arff delete mode 100755 sample/tests/03.arff diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 010e168..a23c148 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -4,15 +4,12 @@ #include #include "CPPFImdlp.h" #include "Metrics.h" -// OJO QUITAR ESTO -#include namespace mdlp { CPPFImdlp::CPPFImdlp(int algorithm):algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) { } CPPFImdlp::~CPPFImdlp() = default; - CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) { X = X_; @@ -24,22 +21,21 @@ namespace mdlp { if (X.size() == 0 || y.size() == 0) { throw invalid_argument("X and y must have at least one element"); } - indices = sortIndices2(X_, y_); + indices = sortIndices(X_, y_); metrics.setData(y, indices); switch (algorithm) { case 0: computeCutPoints(0, X.size()); break; case 1: - computeCutPointsProposal(0, X.size()); - break; - case 2: computeCutPointsAlternative(0, X.size()); break; + default: + throw invalid_argument("algorithm must be 0 or 1"); } return *this; } - precision_t CPPFImdlp::value_cut_point(size_t start, size_t idx) + precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx) { size_t idxPrev = idx - 1; precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]]; @@ -49,7 +45,7 @@ namespace mdlp { } return (previous + actual) / 2; } - tuple CPPFImdlp::value_proposal_cut_point(size_t start, size_t cut, size_t end) + tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) { size_t idxPrev = cut - 1; precision_t previous, next, actual; @@ -66,62 +62,22 @@ namespace mdlp { cut--; return make_tuple((previous + actual) / 2, cut); } - // void CPPFImdlp::computeCutPoints(size_t start, size_t end) - // { - // size_t cut; - // if (end - start < 2) - // return; - // cut = getCandidate(start, end); - // if (cut == numeric_limits::max() || !mdlp(start, cut, end)) { - // // cut == max means that there is no candidate in the interval - // // No boundary found, so we add both ends of the interval as cutpoints - // // because they were selected by the algorithm before - // if (start != 0) - // cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2); - // if (end != X.size()) - // cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2); - // //cout << "!!!Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - // return; - // } - // // cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - // computeCutPoints(start, cut); - // computeCutPoints(cut, end); - // } - // void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) - // { - // size_t cut; - // if (end - start < 2) - // return; - // cut = getCandidate(start, end); - // if (cut == numeric_limits::max() || !mdlp(start, cut, end)) { - // // cut == max means that there is no candidate in the interval - // // No boundary found, so we add both ends of the interval as cutpoints - // // because they were selected by the algorithm before - // if (start != 0) - // cutPoints.push_back(value_cut_point(0, start)); - // if (end != X.size()) - // cutPoints.push_back(value_cut_point(start, end)); - // //cout << "!!!Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - // return; - // } - // // cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - // computeCutPointsAlternative(start, cut); - // computeCutPointsAlternative(cut, end); - // } void CPPFImdlp::computeCutPoints(size_t start, size_t end) { size_t cut; + tuple result; if (end - start < 2) return; cut = getCandidate(start, end); if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { - cutPoints.push_back(value_cut_point(start, cut)); - //cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; + result = completeValueCutPoint(start, cut, end); + cut = get<1>(result); + cutPoints.push_back(get<0>(result)); + computeCutPoints(start, cut); + computeCutPoints(cut, end); } - computeCutPoints(start, cut); - computeCutPoints(cut, end); } void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) { @@ -132,67 +88,11 @@ namespace mdlp { if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { - cutPoints.push_back(value_cut_point(start, cut)); - //cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; + cutPoints.push_back(halfWayValueCutPoint(start, cut)); computeCutPointsAlternative(start, cut); computeCutPointsAlternative(cut, end); } } - // void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) - // { - // size_t cut; - // if (end - start < 2) - // return; - // cut = getCandidateWeka(start, end); - // if (cut == numeric_limits::max()) - // return; - // if (mdlp(start, cut, end)) { - // cutPoints.push_back(value_cut_point(start, cut)); - // //cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - // } - // computeCutPointsAlternative(start, cut); - // computeCutPointsAlternative(cut, end); - // } - void CPPFImdlp::computeCutPointsProposal(size_t start, size_t end) - { - size_t cut; - tuple result; - if (end - start < 2) - return; - cut = getCandidate(start, end); - if (cut == numeric_limits::max()) - return; - if (mdlp(start, cut, end)) { - //cout << "+Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - result = value_proposal_cut_point(start, cut, end); - cut = get<1>(result); - cutPoints.push_back(get<0>(result)); - //cout << "*Alg: " << algorithm << " Cut: " << cut << " Start: " << start << " End: " << end << endl; - computeCutPointsProposal(start, cut); - computeCutPointsProposal(cut, end); - } - - } - size_t CPPFImdlp::getCandidateWeka(size_t start, size_t end) - { - /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which - E(A, TA; S) is minimal amogst all the candidate cut points. */ - size_t candidate = numeric_limits::max(), elements = end - start; - precision_t entropy_left, entropy_right, minEntropy; - minEntropy = metrics.entropy(start, end); - for (auto idx = start + 1; idx < end; idx++) { - // Cutpoints are always on boundaries (definition 2) - if (X[indices[idx - 1]] < X[indices[idx]]) { - entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx); - entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end); - if (entropy_left + entropy_right < minEntropy) { - minEntropy = entropy_left + entropy_right; - candidate = idx; - } - } - } - return candidate; - } size_t CPPFImdlp::getCandidate(size_t start, size_t end) { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which @@ -229,11 +129,25 @@ namespace mdlp { ent1 = metrics.entropy(start, cut); ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); - delta = log(pow(3, precision_t(k)) - 2) - + delta = log2(pow(3, precision_t(k)) - 2) - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); - precision_t term = 1 / N * (log(N - 1) + delta); + precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes + indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) + { + indices_t idx(X_.size()); + iota(idx.begin(), idx.end(), 0); + for (size_t i = 0; i < X_.size(); i++) + stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) + { + if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; + else + return X_[i1] < X_[i2]; + }); + return idx; + } cutPoints_t CPPFImdlp::getCutPoints() { // Remove duplicates and sort @@ -246,27 +160,4 @@ namespace mdlp { sort(output.begin(), output.end()); return output; } - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t& X_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) - { return X_[i1] < X_[i2]; }); - return idx; - } - indices_t CPPFImdlp::sortIndices2(samples_t& X_, labels_t& y_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) - { - if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; - else - return X_[i1] < X_[i2]; - }); - return idx; - } } diff --git a/CPPFImdlp.h b/CPPFImdlp.h index dfddfb0..56e9006 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -9,29 +9,25 @@ namespace mdlp { class CPPFImdlp { protected: int algorithm; - indices_t indices; // sorted indices to use with X and y + indices_t indices; samples_t X; labels_t y; Metrics metrics; cutPoints_t cutPoints; - static indices_t sortIndices(samples_t&); - static indices_t sortIndices2(samples_t&, labels_t&); + static indices_t sortIndices(samples_t&, labels_t&); void computeCutPoints(size_t, size_t); + void computeCutPointsAlternative(size_t, size_t); bool mdlp(size_t, size_t, size_t); size_t getCandidate(size_t, size_t); - size_t getCandidateWeka(size_t, size_t); - void computeCutPointsAlternative(size_t, size_t); - void computeCutPointsProposal(size_t, size_t); - precision_t value_cut_point(size_t, size_t); - tuple value_proposal_cut_point(size_t, size_t, size_t); - + precision_t halfWayValueCutPoint(size_t, size_t); + tuple completeValueCutPoint(size_t, size_t, size_t); public: - CPPFImdlp(int); + CPPFImdlp(int algorithm = 0); ~CPPFImdlp(); CPPFImdlp& fit(samples_t&, labels_t&); samples_t getCutPoints(); - inline string version() { return "0.8.1"; }; + inline string version() { return "0.9.7"; }; }; } #endif \ No newline at end of file diff --git a/Metrics.cpp b/Metrics.cpp index dc7bc27..1275b00 100644 --- a/Metrics.cpp +++ b/Metrics.cpp @@ -39,7 +39,7 @@ namespace mdlp { for (auto count : counts) { if (count > 0) { p = (precision_t)count / nElements; - ventropy -= p * log(p); + ventropy -= p * log2(p); } } entropyCache[make_tuple(start, end)] = ventropy; diff --git a/README.md b/README.md index 56d9a7d..a8d04ce 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,9 @@ # mdlp -Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning +Discretization algorithm based on the paper by Fayyad & Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf) + +The implementation tries to mitigate the problem of different label values with the same value of the variable: + +- Sorts the values of the variable using the label values as a tie-breaker +- Once found a valid candidate for the split, it checks if the previous value is the same as actual one, and tries to get previous one, or next if the former is not possible. + +The algorithm returns the cut points for the variable. \ No newline at end of file diff --git a/feature0 b/feature0 deleted file mode 100644 index e835df9..0000000 --- a/feature0 +++ /dev/null @@ -1,152 +0,0 @@ -+++++++++++++++++++++++ -( 0, 13) -> (4.3, 0) -( 1, 8) -> (4.4, 0) -( 2, 38) -> (4.4, 0) -( 3, 42) -> (4.4, 0) -( 4, 41) -> (4.5, 0) -( 5, 3) -> (4.6, 0) -( 6, 6) -> (4.6, 0) -( 7, 22) -> (4.6, 0) -( 8, 47) -> (4.6, 0) -( 9, 2) -> (4.7, 0) -( 10, 29) -> (4.7, 0) -( 11, 11) -> (4.8, 0) -( 12, 12) -> (4.8, 0) -( 13, 24) -> (4.8, 0) -( 14, 30) -> (4.8, 0) -( 15, 45) -> (4.8, 0) -( 16, 1) -> (4.9, 0) -( 17, 9) -> (4.9, 0) -( 18, 34) -> (4.9, 0) -( 19, 37) -> (4.9, 0) -( 20, 57) -> (4.9, 1) candidate Total Entropy: 0.633 E. left: 0.000 E. right: 0.855 = 0.539 (0, 54) No -( 21, 106) -> (4.9, 2) -( 22, 4) -> (5.0, 0) -( 23, 7) -> (5.0, 0) -( 24, 25) -> (5.0, 0) -( 25, 26) -> (5.0, 0) -( 26, 35) -> (5.0, 0) -( 27, 40) -> (5.0, 0) -( 28, 43) -> (5.0, 0) -( 29, 49) -> (5.0, 0) -( 30, 60) -> (5.0, 1) -( 31, 93) -> (5.0, 1) -( 32, 0) -> (5.1, 0) -( 33, 17) -> (5.1, 0) -( 34, 19) -> (5.1, 0) -( 35, 21) -> (5.1, 0) -( 36, 23) -> (5.1, 0) -( 37, 39) -> (5.1, 0) -( 38, 44) -> (5.1, 0) -( 39, 46) -> (5.1, 0) -( 40, 98) -> (5.1, 1) -( 41, 27) -> (5.2, 0) -( 42, 28) -> (5.2, 0) -( 43, 32) -> (5.2, 0) -( 44, 59) -> (5.2, 1) -( 45, 48) -> (5.3, 0) -( 46, 5) -> (5.4, 0) -( 47, 10) -> (5.4, 0) -( 48, 16) -> (5.4, 0) -( 49, 20) -> (5.4, 0) -( 50, 31) -> (5.4, 0) -( 51, 84) -> (5.4, 1) -( 52, 33) -> (5.5, 0) -( 53, 36) -> (5.5, 0) -( 54, 53) -> (5.5, 1) 1st cut Total Entropy: 1.585 E. left: 0.633 E. right: 1.167 = 0.975 (0, 150) Sí => 5.450 -( 55, 80) -> (5.5, 1) -( 56, 81) -> (5.5, 1) -( 57, 89) -> (5.5, 1) -( 58, 90) -> (5.5, 1) -( 59, 64) -> (5.6, 1) -( 60, 66) -> (5.6, 1) -( 61, 69) -> (5.6, 1) -( 62, 88) -> (5.6, 1) -( 63, 94) -> (5.6, 1) -( 64, 121) -> (5.6, 2) Candidate Total Entropy: 1.167 E. left: 0.966 E. right: 0.939 = 0.946 (54, 77) No -( 65, 15) -> (5.7, 0) -( 66, 18) -> (5.7, 0) -( 67, 55) -> (5.7, 1) -( 68, 79) -> (5.7, 1) -( 69, 95) -> (5.7, 1) -( 70, 96) -> (5.7, 1) -( 71, 99) -> (5.7, 1) -( 72, 113) -> (5.7, 2) -( 73, 14) -> (5.8, 0) -( 74, 67) -> (5.8, 1) -( 75, 82) -> (5.8, 1) -( 76, 92) -> (5.8, 1) -( 77, 101) -> (5.8, 2) 2nd cut Total Entropy: 1.167 E. left: 0.966 E. right: 0.939 = 0.946 (54, 150) Sí => 5.750 -( 78, 114) -> (5.8, 2) -( 79, 142) -> (5.8, 2) -( 80, 61) -> (5.9, 1) -( 81, 70) -> (5.9, 1) -( 82, 149) -> (5.9, 2) -( 83, 62) -> (6.0, 1) -( 84, 78) -> (6.0, 1) -( 85, 83) -> (6.0, 1) -( 86, 85) -> (6.0, 1) -( 87, 119) -> (6.0, 2) -( 88, 138) -> (6.0, 2) -( 89, 63) -> (6.1, 1) -( 90, 71) -> (6.1, 1) -( 91, 73) -> (6.1, 1) -( 92, 91) -> (6.1, 1) -( 93, 127) -> (6.1, 2) -( 94, 134) -> (6.1, 2) -( 95, 68) -> (6.2, 1) -( 96, 97) -> (6.2, 1) -( 97, 126) -> (6.2, 2) -( 98, 148) -> (6.2, 2) -( 99, 56) -> (6.3, 1) -(100, 72) -> (6.3, 1) -(101, 87) -> (6.3, 1) -(102, 100) -> (6.3, 2) -(103, 103) -> (6.3, 2) -(104, 123) -> (6.3, 2) -(105, 133) -> (6.3, 2) -(106, 136) -> (6.3, 2) -(107, 146) -> (6.3, 2) -(108, 51) -> (6.4, 1) -(109, 74) -> (6.4, 1) -(110, 111) -> (6.4, 2) -(111, 115) -> (6.4, 2) -(112, 128) -> (6.4, 2) -(113, 132) -> (6.4, 2) -(114, 137) -> (6.4, 2) -(115, 54) -> (6.5, 1) -(116, 104) -> (6.5, 2) -(117, 110) -> (6.5, 2) -(118, 116) -> (6.5, 2) -(119, 147) -> (6.5, 2) -(120, 58) -> (6.6, 1) -(121, 75) -> (6.6, 1) -(122, 65) -> (6.7, 1) -(123, 77) -> (6.7, 1) -(124, 86) -> (6.7, 1) -(125, 108) -> (6.7, 2) -(126, 124) -> (6.7, 2) -(127, 140) -> (6.7, 2) -(128, 144) -> (6.7, 2) -(129, 145) -> (6.7, 2) -(130, 76) -> (6.8, 1) -(131, 112) -> (6.8, 2) -(132, 143) -> (6.8, 2) -(133, 52) -> (6.9, 1) -(134, 120) -> (6.9, 2) -(135, 139) -> (6.9, 2) -(136, 141) -> (6.9, 2) -(137, 50) -> (7.0, 1) -(138, 102) -> (7.1, 2) candidate Total Entropy: 0.939 E. left: 0.984 E. right: 0.000 = 0.822 (77, 150) No -(139, 109) -> (7.2, 2) -(140, 125) -> (7.2, 2) -(141, 129) -> (7.2, 2) -(142, 107) -> (7.3, 2) -(143, 130) -> (7.4, 2) -(144, 105) -> (7.6, 2) -(145, 117) -> (7.7, 2) -(146, 118) -> (7.7, 2) -(147, 122) -> (7.7, 2) -(148, 135) -> (7.7, 2) -(149, 131) -> (7.9, 2) -+++++++++++++++++++++++ \ No newline at end of file diff --git a/sample/ArffFiles.cpp b/sample/ArffFiles.cpp index 9baf861..7b59ef8 100644 --- a/sample/ArffFiles.cpp +++ b/sample/ArffFiles.cpp @@ -1,5 +1,4 @@ #include "ArffFiles.h" - #include #include #include diff --git a/sample/test.cpp b/sample/test.cpp deleted file mode 100644 index 44bdfa9..0000000 --- a/sample/test.cpp +++ /dev/null @@ -1,95 +0,0 @@ -#include "ArffFiles.h" -#include -#include -#include -#include "../CPPFImdlp.h" - -using namespace std; -using namespace mdlp; - -tuple getCutPoint(samples_t& X, labels_t& y, size_t start, size_t cut, size_t end) -{ - size_t idxPrev = cut - 1; - precision_t previous, next, actual; - previous = X[idxPrev]; - next = actual = X[cut]; - // definition 2 of the paper => X[t-1] < X[t] - while (idxPrev-- > start && actual == previous) { - previous = X[idxPrev]; - } - // get the last equal value of X in the interval - while (actual == X[cut++] && cut < end); - if (previous == actual && cut < end) - actual = X[cut]; - cut--; - return make_tuple((previous + actual) / 2, cut); -} - -void show_points(samples_t& X, labels_t& y, size_t start, size_t end) -{ - cout << "Interval: " << start << " - " << end << endl; - tuple cutPoint; - size_t cut = start + 1; - if (start >= end) { - return; - } - while (y[cut - 1] == y[cut] && cut < end) - cut++; - if (cut != end) { - cutPoint = getCutPoint(X, y, start, cut, end); - cout << cut << ": " << fixed << setprecision(1) << X[cut] << " " << y[cut] << endl; - cout << "Cut point: " << get<0>(cutPoint) << " at " << get<1>(cutPoint) << endl; - show_points(X, y, start, get<1>(cutPoint)); - show_points(X, y, get<1>(cutPoint), end); - } - -} - -int main(int argc, char** argv) -{ - ArffFiles file; - vector lines; - string path = "../tests/"; - map datasets = { - {"01", true}, - {"02", true}, - {"03", true}, - {"04", true} - }; - if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { - cout << "Usage: " << argv[0] << " {01, 02, 03, 04}" << endl; - return 1; - } - - file.load(path + argv[1] + ".arff", datasets[argv[1]]); - auto attributes = file.getAttributes(); - int items = file.getSize(); - cout << "Number of lines: " << items << endl; - cout << "Attributes: " << endl; - for (auto attribute : attributes) { - cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl; - } - cout << "Class name: " << file.getClassName() << endl; - cout << "Class type: " << file.getClassType() << endl; - cout << "Data: " << endl; - vector& X = file.getX(); - labels_t& y = file.getY(); - for (int i = 0; i < y.size(); i++) { - for (auto feature : X) { - cout << i << ": " << fixed << setprecision(1) << feature[i] << " "; - } - cout << y[i] << endl; - } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0); - for (auto i = 0; i < attributes.size(); i++) { - cout << "Cut points for " << get<0>(attributes[i]) << endl; - cout << "--------------------------" << setprecision(3) << endl; - test.fit(X[i], y); - for (auto item : test.getCutPoints()) { - cout << item << endl; - } - } - cout << "Function test" << endl; - show_points(X[0], y, 0, items); - return 0; -} diff --git a/sample/tests/01.arff b/sample/tests/01.arff deleted file mode 100755 index aaeacb6..0000000 --- a/sample/tests/01.arff +++ /dev/null @@ -1,35 +0,0 @@ -% . - -@RELATION 01 - -@ATTRIBUTE X REAL -@ATTRIBUTE class {0,1,2} - -@DATA -1, 0 -1, 0 -1, 0 -1, 0 -1, 0 -1, 0 -1, 0 -2, 0 -2, 0 -2, 0 -2, 1 -2, 2 -2, 2 -2, 2 -2, 2 -3, 0 -3, 0 -3, 0 -3, 0 -3, 0 -3, 1 -3, 1 -3, 1 -3, 2 -3, 2 -4, 0 -4, 1 \ No newline at end of file diff --git a/sample/tests/02.arff b/sample/tests/02.arff deleted file mode 100755 index 71df45b..0000000 --- a/sample/tests/02.arff +++ /dev/null @@ -1,25 +0,0 @@ -% . - -@RELATION 01 - -@ATTRIBUTE X REAL -@ATTRIBUTE class {0,1,2} - -@DATA -2, 0 -3, 0 -3, 0 -3, 0 -3, 0 -3, 0 -3, 1 -3, 1 -3, 1 -3, 2 -3, 2 -4, 0 -4, 1 -4, 1 -4, 1 -4, 1 -4, 1 \ No newline at end of file diff --git a/sample/tests/03.arff b/sample/tests/03.arff deleted file mode 100755 index 601043d..0000000 --- a/sample/tests/03.arff +++ /dev/null @@ -1,24 +0,0 @@ -% . - -@RELATION 01 - -@ATTRIBUTE X REAL -@ATTRIBUTE class {0,1,2} - -@DATA -3, 0 -3, 0 -3, 0 -3, 0 -3, 0 -3, 1 -3, 1 -3, 1 -3, 2 -3, 2 -4, 0 -4, 1 -4, 1 -4, 1 -4, 1 -4, 1 \ No newline at end of file diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index 2958bee..06d3d52 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -4,31 +4,26 @@ #include namespace mdlp { - class TestFImdlp : public CPPFImdlp, public testing::Test { + class TestFImdlp: public CPPFImdlp, public testing::Test { public: precision_t precision = 0.000001; - - TestFImdlp() : CPPFImdlp(false) {} - - void SetUp() { + TestFImdlp(): CPPFImdlp() {} + void SetUp() + { // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) - X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9}; - y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2}; + X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; algorithm = false; fit(X, y); } - - void setalgorithm(bool value) { + void setalgorithm(bool value) + { algorithm = value; } - - // void initIndices() - // { - // indices = indices_t(); - // } - void checkSortedVector() { - indices_t testSortedIndices = sortIndices(X); + void checkSortedVector() + { + indices_t testSortedIndices = sortIndices(X, y); precision_t prev = X[testSortedIndices[0]]; for (auto i = 0; i < X.size(); ++i) { EXPECT_EQ(testSortedIndices[i], indices[i]); @@ -36,54 +31,55 @@ namespace mdlp { prev = X[testSortedIndices[i]]; } } - - void checkCutPoints(cutPoints_t &expected) { + void checkCutPoints(cutPoints_t& expected) + { int expectedSize = expected.size(); EXPECT_EQ(cutPoints.size(), expectedSize); for (auto i = 0; i < cutPoints.size(); i++) { EXPECT_NEAR(cutPoints[i], expected[i], precision); } } - template - void checkVectors(std::vector const &expected, std::vector const &computed) { + void checkVectors(std::vector const& expected, std::vector const& computed) + { EXPECT_EQ(expected.size(), computed.size()); ASSERT_EQ(expected.size(), computed.size()); for (auto i = 0; i < expected.size(); i++) { - EXPECT_NEAR(expected[i], computed[i],precision); + EXPECT_NEAR(expected[i], computed[i], precision); } } }; - - TEST_F(TestFImdlp, FitErrorEmptyDataset) { + TEST_F(TestFImdlp, FitErrorEmptyDataset) + { X = samples_t(); y = labels_t(); EXPECT_THROW(fit(X, y), std::invalid_argument); } - - TEST_F(TestFImdlp, FitErrorDifferentSize) { - X = {1, 2, 3}; - y = {1, 2}; + TEST_F(TestFImdlp, FitErrorDifferentSize) + { + X = { 1, 2, 3 }; + y = { 1, 2 }; EXPECT_THROW(fit(X, y), std::invalid_argument); } - - TEST_F(TestFImdlp, SortIndices) { - X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9}; - indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7}; + TEST_F(TestFImdlp, SortIndices) + { + X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; checkSortedVector(); - X = {5.77, 5.88, 5.99}; - indices = {0, 1, 2}; + X = { 5.77, 5.88, 5.99 }; + indices = { 0, 1, 2 }; checkSortedVector(); - X = {5.33, 5.22, 5.11}; - indices = {2, 1, 0}; + X = { 5.33, 5.22, 5.11 }; + indices = { 2, 1, 0 }; checkSortedVector(); } - TEST_F(TestFImdlp, TestDataset) { - algorithm = false; + TEST_F(TestFImdlp, TestDataset) + { + algorithm = 0; fit(X, y); - computeCutPointsOriginal(0, 10); - cutPoints_t expected = {5.6499996185302734}; + computeCutPoints(0, 10); + cutPoints_t expected = { 5.6499996185302734 }; vector computed = getCutPoints(); computed = getCutPoints(); int expectedSize = expected.size(); @@ -92,49 +88,49 @@ namespace mdlp { EXPECT_NEAR(computed[i], expected[i], precision); } } - - TEST_F(TestFImdlp, ComputeCutPointsOriginal) { - cutPoints_t expected = {5.65}; + TEST_F(TestFImdlp, ComputeCutPoints) + { + cutPoints_t expected = { 5.65 }; algorithm = false; - computeCutPointsOriginal(0, 10); + computeCutPoints(0, 10); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) { + TEST_F(TestFImdlp, ComputeCutPointsGCase) + { cutPoints_t expected; algorithm = false; - expected = {2}; - samples_t X_ = {0, 1, 2, 2}; - labels_t y_ = {1, 1, 1, 2}; + expected = { 2 }; + samples_t X_ = { 0, 1, 2, 2 }; + labels_t y_ = { 1, 1, 1, 2 }; fit(X_, y_); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsalgorithm) { + TEST_F(TestFImdlp, ComputeCutPointsalAlternative) + { algorithm = true; cutPoints_t expected; expected = {}; fit(X, y); - computeCutPointsalgorithm(); + computeCutPointsAlternative(0, 10); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsalgorithmGCase) { + TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase) + { cutPoints_t expected; - expected = {1.5}; + expected = { 1.5 }; algorithm = true; - samples_t X_ = {0, 1, 2, 2}; - labels_t y_ = {1, 1, 1, 2}; + samples_t X_ = { 0, 1, 2, 2 }; + labels_t y_ = { 1, 1, 1, 2 }; fit(X_, y_); checkCutPoints(expected); } - - TEST_F(TestFImdlp, GetCutPoints) { - samples_t computed, expected = {5.65}; + TEST_F(TestFImdlp, GetCutPoints) + { + samples_t computed, expected = { 5.65 }; algorithm = false; - computeCutPointsOriginal(0, 10); + computeCutPoints(0, 10); computed = getCutPoints(); - for (auto item: cutPoints) + for (auto item : cutPoints) cout << setprecision(6) << item << endl; checkVectors(expected, computed); }