diff --git a/.gitignore b/.gitignore index fbe8f7d..be772d2 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ **/lcoverage .idea cmake-* +**/CMakeFiles diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 443ffd7..a23c148 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -4,14 +4,12 @@ #include #include "CPPFImdlp.h" #include "Metrics.h" - namespace mdlp { - CPPFImdlp::CPPFImdlp(int proposal):proposal(proposal), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) + CPPFImdlp::CPPFImdlp(int algorithm):algorithm(algorithm), indices(indices_t()), X(samples_t()), y(labels_t()), metrics(Metrics(y, indices)) { } CPPFImdlp::~CPPFImdlp() = default; - CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) { X = X_; @@ -23,93 +21,87 @@ namespace mdlp { if (X.size() == 0 || y.size() == 0) { throw invalid_argument("X and y must have at least one element"); } - indices = sortIndices(X_); + indices = sortIndices(X_, y_); metrics.setData(y, indices); - switch (proposal) { + switch (algorithm) { case 0: computeCutPoints(0, X.size()); break; case 1: - computeCutPointsProposal(); - break; - case 2: computeCutPointsAlternative(0, X.size()); break; + default: + throw invalid_argument("algorithm must be 0 or 1"); } return *this; } + precision_t CPPFImdlp::halfWayValueCutPoint(size_t start, size_t idx) + { + size_t idxPrev = idx - 1; + precision_t previous = X[indices[idxPrev]], actual = X[indices[idx]]; + // definition 2 of the paper => X[t-1] < X[t] + while (idxPrev-- > start && actual == previous) { + previous = X[indices[idxPrev]]; + } + return (previous + actual) / 2; + } + tuple CPPFImdlp::completeValueCutPoint(size_t start, size_t cut, size_t end) + { + size_t idxPrev = cut - 1; + precision_t previous, next, actual; + previous = X[indices[idxPrev]]; + next = actual = X[indices[cut]]; + // definition 2 of the paper => X[t-1] < X[t] + while (idxPrev-- > start && actual == previous) { + previous = X[indices[idxPrev]]; + } + // get the last equal 
value of X in the interval + while (actual == X[indices[cut++]] && cut < end); + if (previous == actual && cut < end) + actual = X[indices[cut]]; + cut--; + return make_tuple((previous + actual) / 2, cut); + } void CPPFImdlp::computeCutPoints(size_t start, size_t end) { - int cut; + size_t cut; + tuple result; if (end - start < 2) return; cut = getCandidate(start, end); - if (cut == -1 || !mdlp(start, cut, end)) { - // cut.value == -1 means that there is no candidate in the interval - // No boundary found, so we add both ends of the interval as cutpoints - // because they were selected by the algorithm before - if (start != 0) - cutPoints.push_back((X[indices[start]] + X[indices[start - 1]]) / 2); - if (end != X.size()) - cutPoints.push_back((X[indices[end]] + X[indices[end - 1]]) / 2); + if (cut == numeric_limits::max()) return; + if (mdlp(start, cut, end)) { + result = completeValueCutPoint(start, cut, end); + cut = get<1>(result); + cutPoints.push_back(get<0>(result)); + computeCutPoints(start, cut); + computeCutPoints(cut, end); } - computeCutPoints(start, cut); - computeCutPoints(cut, end); } void CPPFImdlp::computeCutPointsAlternative(size_t start, size_t end) { - precision_t cut; + size_t cut; if (end - start < 2) return; cut = getCandidate(start, end); - if (cut == -1) + if (cut == numeric_limits::max()) return; if (mdlp(start, cut, end)) { - cutPoints.push_back((X[indices[cut]] + X[indices[cut - 1]]) / 2); - } - computeCutPointsAlternative(start, cut); - computeCutPointsAlternative(cut, end); - } - void CPPFImdlp::computeCutPointsProposal() - { - precision_t xPrev, xCur, xPivot, cutPoint; - int yPrev, yCur, yPivot; - size_t idx, numElements, start; - - xCur = xPrev = X[indices[0]]; - yCur = yPrev = y[indices[0]]; - numElements = indices.size() - 1; - idx = start = 0; - while (idx < numElements) { - xPivot = xCur; - yPivot = yCur; - // Read the same values and check class changes - do { - idx++; - xCur = X[indices[idx]]; - yCur = y[indices[idx]]; - if (yCur 
!= yPivot && xCur == xPivot) { - yPivot = -1; - } - } - while (idx < numElements && xCur == xPivot); - // Check if the class changed and there are more than 1 element - if ((idx - start > 1) && (yPivot == -1 || yPrev != yCur) && mdlp(start, idx, indices.size())) { - start = idx; - cutPoint = (xPrev + xCur) / 2; - cutPoints.push_back(cutPoint); - } - yPrev = yPivot; - xPrev = xPivot; + cutPoints.push_back(halfWayValueCutPoint(start, cut)); + computeCutPointsAlternative(start, cut); + computeCutPointsAlternative(cut, end); } } - long int CPPFImdlp::getCandidate(size_t start, size_t end) + size_t CPPFImdlp::getCandidate(size_t start, size_t end) { - long int candidate = -1, elements = end - start; - precision_t entropy_left, entropy_right, minEntropy = numeric_limits::max(); + /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which + E(A, TA; S) is minimal amongst all the candidate cut points. */ + size_t candidate = numeric_limits::max(), elements = end - start; + precision_t entropy_left, entropy_right, minEntropy; + minEntropy = metrics.entropy(start, end); for (auto idx = start + 1; idx < end; idx++) { - // Cutpoints are always on boundaries + // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx); @@ -142,6 +134,20 @@ namespace mdlp { precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes + indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) + { + indices_t idx(X_.size()); + iota(idx.begin(), idx.end(), 0); + for (size_t i = 0; i < X_.size(); i++) + stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) + { + if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; + else + return X_[i1] < X_[i2]; + }); + return idx; + } cutPoints_t CPPFImdlp::getCutPoints() { //
Remove duplicates and sort @@ -154,14 +160,4 @@ namespace mdlp { sort(output.begin(), output.end()); return output; } - // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes - indices_t CPPFImdlp::sortIndices(samples_t& X_) - { - indices_t idx(X_.size()); - iota(idx.begin(), idx.end(), 0); - for (size_t i = 0; i < X_.size(); i++) - stable_sort(idx.begin(), idx.end(), [&X_](size_t i1, size_t i2) - { return X_[i1] < X_[i2]; }); - return idx; - } } diff --git a/CPPFImdlp.h b/CPPFImdlp.h index 64b9fa1..56e9006 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -3,28 +3,31 @@ #include "typesFImdlp.h" #include "Metrics.h" #include +#include +#include namespace mdlp { class CPPFImdlp { protected: - int proposal; - indices_t indices; // sorted indices to use with X and y + int algorithm; + indices_t indices; samples_t X; labels_t y; Metrics metrics; cutPoints_t cutPoints; - static indices_t sortIndices(samples_t&); + static indices_t sortIndices(samples_t&, labels_t&); void computeCutPoints(size_t, size_t); - bool mdlp(size_t, size_t, size_t); - long int getCandidate(size_t, size_t); void computeCutPointsAlternative(size_t, size_t); - void computeCutPointsProposal(); - + bool mdlp(size_t, size_t, size_t); + size_t getCandidate(size_t, size_t); + precision_t halfWayValueCutPoint(size_t, size_t); + tuple completeValueCutPoint(size_t, size_t, size_t); public: - CPPFImdlp(int); + CPPFImdlp(int algorithm = 0); ~CPPFImdlp(); CPPFImdlp& fit(samples_t&, labels_t&); samples_t getCutPoints(); + inline string version() { return "0.9.7"; }; }; } #endif \ No newline at end of file diff --git a/README.md b/README.md index 56d9a7d..892d922 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ # mdlp -Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning +Discretization algorithm based on the paper by Fayyad & Irani [Multi-Interval Discretization of 
Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf) + +The implementation tries to mitigate the problem of samples that share the same value of the variable but have different label values: + +- Sorts the values of the variable using the label values as a tie-breaker +- Once a valid candidate for the split is found, it checks whether the previous value is the same as the current one and, if so, tries to use the previous different value, or the next one if that is not possible. + +The algorithm returns the cut points for the variable. + +## Sample + +To run the sample, just execute the following commands: + +```bash +cd sample +mkdir build +cd build +cmake .. +make +./sample iris +``` \ No newline at end of file diff --git a/sample/ArffFiles.cpp b/sample/ArffFiles.cpp index 9baf861..7b59ef8 100644 --- a/sample/ArffFiles.cpp +++ b/sample/ArffFiles.cpp @@ -1,5 +1,4 @@ #include "ArffFiles.h" - #include #include #include diff --git a/sample/sample.cpp b/sample/sample.cpp index 772e150..6c65255 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -5,6 +5,7 @@ #include "../CPPFImdlp.h" using namespace std; +using namespace mdlp; int main(int argc, char** argv) { @@ -33,8 +34,8 @@ int main(int argc, char** argv) cout << "Class name: " << file.getClassName() << endl; cout << "Class type: " << file.getClassType() << endl; cout << "Data: " << endl; - vector>& X = file.getX(); - vector& y = file.getY(); + vector& X = file.getX(); + labels_t& y = file.getY(); for (int i = 0; i < 50; i++) { for (auto feature : X) { cout << fixed << setprecision(1) << feature[i] << " "; diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index 0fc0ae9..06d3d52 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -4,31 +4,26 @@ #include namespace mdlp { - class TestFImdlp : public CPPFImdlp, public testing::Test { + class TestFImdlp: public CPPFImdlp, public testing::Test { public: precision_t precision = 0.000001; - - TestFImdlp() : CPPFImdlp(false) {} - -
void SetUp() { + TestFImdlp(): CPPFImdlp() {} + void SetUp() + { // 5.0, 5.1, 5.1, 5.1, 5.2, 5.3, 5.6, 5.7, 5.9, 6.0] //(5.0, 1) (5.1, 1) (5.1, 2) (5.1, 2) (5.2, 1) (5.3, 1) (5.6, 2) (5.7, 1) (5.9, 2) (6.0, 2) - X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9}; - y = {1, 1, 1, 1, 1, 2, 2, 2, 2, 2}; - proposal = false; + X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + y = { 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; + algorithm = false; fit(X, y); } - - void setProposal(bool value) { - proposal = value; + void setalgorithm(bool value) + { + algorithm = value; } - - // void initIndices() - // { - // indices = indices_t(); - // } - void checkSortedVector() { - indices_t testSortedIndices = sortIndices(X); + void checkSortedVector() + { + indices_t testSortedIndices = sortIndices(X, y); precision_t prev = X[testSortedIndices[0]]; for (auto i = 0; i < X.size(); ++i) { EXPECT_EQ(testSortedIndices[i], indices[i]); @@ -36,54 +31,55 @@ namespace mdlp { prev = X[testSortedIndices[i]]; } } - - void checkCutPoints(cutPoints_t &expected) { + void checkCutPoints(cutPoints_t& expected) + { int expectedSize = expected.size(); EXPECT_EQ(cutPoints.size(), expectedSize); for (auto i = 0; i < cutPoints.size(); i++) { EXPECT_NEAR(cutPoints[i], expected[i], precision); } } - template - void checkVectors(std::vector const &expected, std::vector const &computed) { + void checkVectors(std::vector const& expected, std::vector const& computed) + { EXPECT_EQ(expected.size(), computed.size()); ASSERT_EQ(expected.size(), computed.size()); for (auto i = 0; i < expected.size(); i++) { - EXPECT_NEAR(expected[i], computed[i],precision); + EXPECT_NEAR(expected[i], computed[i], precision); } } }; - - TEST_F(TestFImdlp, FitErrorEmptyDataset) { + TEST_F(TestFImdlp, FitErrorEmptyDataset) + { X = samples_t(); y = labels_t(); EXPECT_THROW(fit(X, y), std::invalid_argument); } - - TEST_F(TestFImdlp, FitErrorDifferentSize) { - X = {1, 2, 3}; - y = {1, 2}; + TEST_F(TestFImdlp, FitErrorDifferentSize) + 
{ + X = { 1, 2, 3 }; + y = { 1, 2 }; EXPECT_THROW(fit(X, y), std::invalid_argument); } - - TEST_F(TestFImdlp, SortIndices) { - X = {5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9}; - indices = {4, 3, 6, 8, 2, 1, 5, 0, 9, 7}; + TEST_F(TestFImdlp, SortIndices) + { + X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + indices = { 4, 3, 6, 8, 2, 1, 5, 0, 9, 7 }; checkSortedVector(); - X = {5.77, 5.88, 5.99}; - indices = {0, 1, 2}; + X = { 5.77, 5.88, 5.99 }; + indices = { 0, 1, 2 }; checkSortedVector(); - X = {5.33, 5.22, 5.11}; - indices = {2, 1, 0}; + X = { 5.33, 5.22, 5.11 }; + indices = { 2, 1, 0 }; checkSortedVector(); } - TEST_F(TestFImdlp, TestDataset) { - proposal = false; + TEST_F(TestFImdlp, TestDataset) + { + algorithm = 0; fit(X, y); - computeCutPointsOriginal(0, 10); - cutPoints_t expected = {5.6499996185302734}; + computeCutPoints(0, 10); + cutPoints_t expected = { 5.6499996185302734 }; vector computed = getCutPoints(); computed = getCutPoints(); int expectedSize = expected.size(); @@ -92,49 +88,49 @@ namespace mdlp { EXPECT_NEAR(computed[i], expected[i], precision); } } - - TEST_F(TestFImdlp, ComputeCutPointsOriginal) { - cutPoints_t expected = {5.65}; - proposal = false; - computeCutPointsOriginal(0, 10); + TEST_F(TestFImdlp, ComputeCutPoints) + { + cutPoints_t expected = { 5.65 }; + algorithm = false; + computeCutPoints(0, 10); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsOriginalGCase) { + TEST_F(TestFImdlp, ComputeCutPointsGCase) + { cutPoints_t expected; - proposal = false; - expected = {2}; - samples_t X_ = {0, 1, 2, 2}; - labels_t y_ = {1, 1, 1, 2}; + algorithm = false; + expected = { 2 }; + samples_t X_ = { 0, 1, 2, 2 }; + labels_t y_ = { 1, 1, 1, 2 }; fit(X_, y_); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsProposal) { - proposal = true; + TEST_F(TestFImdlp, ComputeCutPointsalAlternative) + { + algorithm = true; cutPoints_t expected; expected = {}; fit(X, y); - computeCutPointsProposal(); + 
computeCutPointsAlternative(0, 10); checkCutPoints(expected); } - - TEST_F(TestFImdlp, ComputeCutPointsProposalGCase) { + TEST_F(TestFImdlp, ComputeCutPointsAlternativeGCase) + { cutPoints_t expected; - expected = {1.5}; - proposal = true; - samples_t X_ = {0, 1, 2, 2}; - labels_t y_ = {1, 1, 1, 2}; + expected = { 1.5 }; + algorithm = true; + samples_t X_ = { 0, 1, 2, 2 }; + labels_t y_ = { 1, 1, 1, 2 }; fit(X_, y_); checkCutPoints(expected); } - - TEST_F(TestFImdlp, GetCutPoints) { - samples_t computed, expected = {5.65}; - proposal = false; - computeCutPointsOriginal(0, 10); + TEST_F(TestFImdlp, GetCutPoints) + { + samples_t computed, expected = { 5.65 }; + algorithm = false; + computeCutPoints(0, 10); computed = getCutPoints(); - for (auto item: cutPoints) + for (auto item : cutPoints) cout << setprecision(6) << item << endl; checkVectors(expected, computed); }