From d77d27459ba6fbddcbc54469fab718ab4337290d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 11 Apr 2023 19:24:31 +0200 Subject: [PATCH] refactor system types in library Add new test taken from join_fit in FImdlp python Update instructions in README --- CPPFImdlp.cpp | 8 ++++---- README.md | 5 ++--- tests/ArffFiles.cpp | 4 ++-- tests/ArffFiles.h | 5 +++-- tests/FImdlp_unittest.cpp | 10 ++++++++++ tests/Metrics_unittest.cpp | 8 ++++++++ typesFImdlp.h | 1 + 7 files changed, 30 insertions(+), 11 deletions(-) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 5778219..a57c9a9 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -128,8 +128,8 @@ namespace mdlp { // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; - entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); - entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); + entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); + entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); if (entropy_left + entropy_right < minEntropy) { minEntropy = entropy_left + entropy_right; candidate = idx; @@ -155,8 +155,8 @@ namespace mdlp { ent1 = metrics.entropy(start, cut); ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); - delta = static_cast(log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); + delta = static_cast(log2(pow(3, precision_t(k)) - 2) - + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } diff --git a/README.md b/README.md index 445a4dc..064056c 100644 --- a/README.md +++ b/README.md @@ -24,9 +24,8 @@ To run the sample, just execute the following commands: ```bash cd sample -mkdir build +cmake -B build cd build -cmake .. make ./sample -f iris -m 2 ./sample -h @@ -34,7 +33,7 @@ make ## Test -To run the tests, execute the following commands: +To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands: ```bash cd tests diff --git a/tests/ArffFiles.cpp b/tests/ArffFiles.cpp index 0d88818..d815000 100644 --- a/tests/ArffFiles.cpp +++ b/tests/ArffFiles.cpp @@ -27,7 +27,7 @@ string ArffFiles::getClassType() const { return classType; } -vector> &ArffFiles::getX() { +vector &ArffFiles::getX() { return X; } @@ -80,7 +80,7 @@ void ArffFiles::load(const string &fileName, bool classLast) { } void ArffFiles::generateDataset(bool classLast) { - X = vector>(attributes.size(), vector(lines.size())); + X = vector(attributes.size(), mdlp::samples_t(lines.size())); auto yy = vector(lines.size(), ""); int labelIndex = classLast ? static_cast(attributes.size()) : 0; for (size_t i = 0; i < lines.size(); i++) { diff --git a/tests/ArffFiles.h b/tests/ArffFiles.h index 38531af..9b0aa2b 100644 --- a/tests/ArffFiles.h +++ b/tests/ArffFiles.h @@ -3,6 +3,7 @@ #include #include +#include "../typesFImdlp.h" using namespace std; @@ -12,7 +13,7 @@ private: vector> attributes; string className; string classType; - vector> X; + vector X; vector y; void generateDataset(bool); @@ -32,7 +33,7 @@ public: static string trim(const string &); - vector> &getX(); + vector &getX(); vector &getY(); diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index 3b090bb..1559fa3 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -111,6 +111,16 @@ namespace mdlp { EXPECT_THROW_WITH_MESSAGE(testDepth.fit(X, y), invalid_argument, "max_depth must be greater than 0"); } + TEST_F(TestFImdlp, JoinFit) { + samples_t X_ = {1, 2, 2, 3, 4, 2, 3}; + labels_t y_ = {0, 0, 1, 2, 3, 4, 5}; + cutPoints_t expected = {1.5f, 2.5f}; + fit(X_, y_); + auto computed = getCutPoints(); + EXPECT_EQ(computed.size(), expected.size()); + checkCutPoints(computed, expected); + } + TEST_F(TestFImdlp, FitErrorMaxCutPoints) { auto testmin = CPPFImdlp(2, 10, -1); auto testmax = CPPFImdlp(3, 0, 200); diff --git a/tests/Metrics_unittest.cpp b/tests/Metrics_unittest.cpp index e059fac..d8ee8db 100644 --- a/tests/Metrics_unittest.cpp +++ b/tests/Metrics_unittest.cpp @@ -30,6 +30,14 @@ namespace mdlp { ASSERT_NEAR(0.468996f, entropy(0, 10), precision); } + TEST_F(TestMetrics, EntropyDouble) { + y = {0, 0, 1, 2, 3}; + samples_t expected_entropies = {0.0, 0.0, 0.91829583, 1.5, 1.4575424759098898}; + for (auto idx = 0; idx < y.size(); ++idx) { + ASSERT_NEAR(expected_entropies[idx], entropy(0, idx + 1), precision); + } + } + TEST_F(TestMetrics, InformationGain) { ASSERT_NEAR(1, informationGain(0, 5, 10), precision); ASSERT_NEAR(1, informationGain(0, 5, 10), precision); // For cache diff --git a/typesFImdlp.h b/typesFImdlp.h index 753e333..b28b2ca 100644 --- a/typesFImdlp.h +++ b/typesFImdlp.h @@ -1,5 +1,6 @@ #ifndef TYPES_H #define TYPES_H + #include #include #include