Refactor system types in library

Add new test taken from join_fit in FImdlp python. Update instructions in README.
2025-08-15 15:35:55 +00:00 · 2023-04-11 19:24:31 +02:00
parent 49c08bfe12
commit d77d27459b
7 changed files with 30 additions and 11 deletions
--- a/CPPFImdlp.cpp
+++ b/CPPFImdlp.cpp
@@ -128,8 +128,8 @@ namespace mdlp {
            // Cutpoints are always on boundaries (definition 2)
            if (y[indices[idx]] == y[indices[idx - 1]])
                continue;
-            entropy_left = precision_t(idx - start) / static_cast<float>(elements) * metrics.entropy(start, idx);
-            entropy_right = precision_t(end - idx) / static_cast<float>(elements) * metrics.entropy(idx, end);
+            entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
+            entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
            if (entropy_left + entropy_right < minEntropy) {
                minEntropy = entropy_left + entropy_right;
                candidate = idx;
@@ -155,8 +155,8 @@ namespace mdlp {
        ent1 = metrics.entropy(start, cut);
        ent2 = metrics.entropy(cut, end);
        ig = metrics.informationGain(start, cut, end);
-        delta = static_cast<float>(log2(pow(3, precision_t(k)) - 2) -
-                                   (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
+        delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
+                                         (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
        precision_t term = 1 / N * (log2(N - 1) + delta);
        return ig > term;
    }
--- a/README.md
+++ b/README.md
@@ -24,9 +24,8 @@ To run the sample, just execute the following commands:

 ```bash
 cd sample
-mkdir build
+cmake -B build
 cd build
-cmake ..
 make
 ./sample -f iris -m 2
 ./sample -h
@@ -34,7 +33,7 @@ make

 ## Test

-To run the tests, execute the following commands:
+To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands:

 ```bash
 cd tests
--- a/tests/ArffFiles.cpp
+++ b/tests/ArffFiles.cpp
@@ -27,7 +27,7 @@ string ArffFiles::getClassType() const {
    return classType;
 }

-vector<vector<float>> &ArffFiles::getX() {
+vector<mdlp::samples_t> &ArffFiles::getX() {
    return X;
 }

@@ -80,7 +80,7 @@ void ArffFiles::load(const string &fileName, bool classLast) {
 }

 void ArffFiles::generateDataset(bool classLast) {
-    X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
+    X = vector<mdlp::samples_t>(attributes.size(), mdlp::samples_t(lines.size()));
    auto yy = vector<string>(lines.size(), "");
    int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
    for (size_t i = 0; i < lines.size(); i++) {
--- a/tests/ArffFiles.h
+++ b/tests/ArffFiles.h
@@ -3,6 +3,7 @@

 #include <string>
 #include <vector>
+#include "../typesFImdlp.h"

 using namespace std;

@@ -12,7 +13,7 @@ private:
    vector<pair<string, string>> attributes;
    string className;
    string classType;
-    vector<vector<float>> X;
+    vector<mdlp::samples_t> X;
    vector<int> y;

    void generateDataset(bool);
@@ -32,7 +33,7 @@ public:

    static string trim(const string &);

-    vector<vector<float>> &getX();
+    vector<mdlp::samples_t> &getX();

    vector<int> &getY();

--- a/tests/FImdlp_unittest.cpp
+++ b/tests/FImdlp_unittest.cpp
@@ -111,6 +111,16 @@ namespace mdlp {
        EXPECT_THROW_WITH_MESSAGE(testDepth.fit(X, y), invalid_argument, "max_depth must be greater than 0");
    }

+    TEST_F(TestFImdlp, JoinFit) { // Fits a small hand-written dataset and compares the resulting cut points with `expected`
+        samples_t X_ = {1, 2, 2, 3, 4, 2, 3}; // value 2 occurs three times (labels 0, 1, 4) and value 3 twice (labels 2, 5)
+        labels_t y_ = {0, 0, 1, 2, 3, 4, 5};
+        cutPoints_t expected = {1.5f, 2.5f};
+        fit(X_, y_);
+        auto computed = getCutPoints();
+        EXPECT_EQ(computed.size(), expected.size()); // non-fatal check: checkCutPoints below still runs if the sizes differ
+        checkCutPoints(computed, expected);
+    }
+
    TEST_F(TestFImdlp, FitErrorMaxCutPoints) {
        auto testmin = CPPFImdlp(2, 10, -1);
        auto testmax = CPPFImdlp(3, 0, 200);
--- a/tests/Metrics_unittest.cpp
+++ b/tests/Metrics_unittest.cpp
@@ -30,6 +30,14 @@ namespace mdlp {
        ASSERT_NEAR(0.468996f, entropy(0, 10), precision);
    }

+    TEST_F(TestMetrics, EntropyDouble) { // Checks entropy(0, end) for every prefix of y against the listed expected values
+        y = {0, 0, 1, 2, 3};
+        samples_t expected_entropies = {0.0, 0.0, 0.91829583, 1.5, 1.4575424759098898};
+        for (size_t idx = 0; idx < y.size(); ++idx) { // unsigned index: avoids the signed/unsigned comparison with y.size()
+            ASSERT_NEAR(expected_entropies[idx], entropy(0, idx + 1), precision); // prefix [0, idx + 1) of y
+        }
+    }
+
    TEST_F(TestMetrics, InformationGain) {
        ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
        ASSERT_NEAR(1, informationGain(0, 5, 10), precision); // For cache
--- a/typesFImdlp.h
+++ b/typesFImdlp.h
@@ -1,5 +1,6 @@
 #ifndef TYPES_H
 #define TYPES_H
+
 #include <vector>
 #include <map>
 #include <stdexcept>