From 638bb2a59e4a83d750edc752d584051623dddd68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Wed, 5 Jun 2024 17:53:08 +0200 Subject: [PATCH] Discretizer (#8) * Add better check in testKBins.py * Add Discretizer base class for Both discretizers * Refactor order of constructors init --- .vscode/launch.json | 11 ++--- .vscode/settings.json | 86 +++++++++++++++++++++++++++++++++- BinDisc.cpp | 55 ++++------------------ BinDisc.h | 12 ++--- CPPFImdlp.cpp | 19 ++------ CPPFImdlp.h | 16 +++---- Discretizer.h | 31 ++++++++++++ tests/BinDisc_unittest.cpp | 71 +++++++++++++--------------- tests/CMakeLists.txt | 23 ++++++--- tests/Discretizer_unittest.cpp | 74 +++++++++++++++++++++++++++++ tests/test | 4 +- tests/testKbins.py | 46 ++++++++++-------- 12 files changed, 294 insertions(+), 154 deletions(-) create mode 100644 Discretizer.h create mode 100644 tests/Discretizer_unittest.cpp diff --git a/.vscode/launch.json b/.vscode/launch.json index e2daa52..1342f2d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -8,15 +8,10 @@ "name": "C++ Launch config", "type": "cppdbg", "request": "launch", - "program": "${workspaceFolder}/build/sample/sample", - "cwd": "${workspaceFolder}/build/sample", - "args": [ - "-f", - "glass" - ], - "targetArchitecture": "arm64", + "program": "${workspaceFolder}/tests/build/BinDisc_unittest", + "cwd": "${workspaceFolder}/tests/build", + "args": [], "launchCompleteCommand": "exec-run", - "preLaunchTask": "CMake: build", "stopAtEntry": false, "linux": { "MIMode": "gdb", diff --git a/.vscode/settings.json b/.vscode/settings.json index 85f6c83..cbb0d84 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,5 +5,89 @@ }, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", "cmake.configureOnOpen": true, - "sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json" + "sonarlint.pathToCompileCommands": "${workspaceFolder}/build/compile_commands.json", + "files.associations": { + "*.rmd": "markdown", + "*.py": "python", + "vector": "cpp", + "__bit_reference": "cpp", + "__bits": "cpp", + "__config": "cpp", + "__debug": "cpp", + "__errc": "cpp", + "__hash_table": "cpp", + "__locale": "cpp", + "__mutex_base": "cpp", + "__node_handle": "cpp", + "__nullptr": "cpp", + "__split_buffer": "cpp", + "__string": "cpp", + "__threading_support": "cpp", + "__tuple": "cpp", + "array": "cpp", + "atomic": "cpp", + "bitset": "cpp", + "cctype": "cpp", + "chrono": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "exception": "cpp", + "initializer_list": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "locale": "cpp", + "memory": "cpp", + "mutex": "cpp", + "new": "cpp", + "optional": "cpp", + "ostream": "cpp", + "ratio": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "string": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "unordered_map": "cpp", + "variant": "cpp", + "algorithm": "cpp", + "iostream": "cpp", + "iomanip": "cpp", + "numeric": "cpp", + "set": "cpp", + "__tree": "cpp", + "deque": "cpp", + "list": "cpp", + "map": "cpp", + "unordered_set": "cpp", + "any": "cpp", + "condition_variable": "cpp", + "forward_list": "cpp", + "fstream": "cpp", + "stack": "cpp", + "thread": "cpp", + "__memory": "cpp", + "filesystem": "cpp", + "*.toml": "toml", + "utility": "cpp", + "span": "cpp", + "*.tcc": "cpp" + } } \ No newline at end of file diff --git a/BinDisc.cpp b/BinDisc.cpp index 4865fdc..626b908 100644 --- a/BinDisc.cpp +++ b/BinDisc.cpp @@ -7,7 +7,8 @@ namespace mdlp { - BinDisc::BinDisc(int n_bins, strategy_t strategy) : n_bins{ n_bins }, strategy{ strategy } + BinDisc::BinDisc(int n_bins, strategy_t strategy) : + Discretizer(), n_bins{ n_bins }, strategy{ strategy } { if (n_bins < 3) { throw std::invalid_argument("n_bins must be greater than 2"); @@ -16,6 +17,7 @@ namespace mdlp { BinDisc::~BinDisc() = default; void BinDisc::fit(samples_t& X) { + // y is included for compatibility with the Discretizer interface cutPoints.clear(); if (X.empty()) { cutPoints.push_back(std::numeric_limits::max()); @@ -27,6 +29,10 @@ namespace mdlp { fit_uniform(X); } } + void BinDisc::fit(samples_t& X, labels_t& y) + { + fit(X); + } std::vector linspace(precision_t start, precision_t end, int num) { // Doesn't include end point as it is not needed @@ -90,49 +96,4 @@ namespace mdlp { // Remove first as it is not needed cutPoints.erase(cutPoints.begin()); } - labels_t& BinDisc::transform(const samples_t& X) - { - discretizedData.clear(); - discretizedData.reserve(X.size()); - for (const precision_t& item : X) { - auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item); - discretizedData.push_back(upper - cutPoints.begin()); - } - return discretizedData; - } -} -// void BinDisc::fit_quantile(samples_t& X) - // { - // cutPoints.clear(); - // if (X.empty()) { - // cutPoints.push_back(std::numeric_limits::max()); - // return; - // } - // samples_t data = X; - // std::sort(data.begin(), data.end()); - // float min_val = data.front(); - // float max_val = data.back(); - // // Handle case of all data points having the same value - // if (min_val == max_val) { - // cutPoints.push_back(std::numeric_limits::max()); - // return; - // } - // int first = X.size() / n_bins; - // cutPoints.push_back(data.at(first - 1)); - // int bins_done = 1; - // int prev = first - 1; - // while (bins_done < n_bins) { - // int next = first * (bins_done + 1) - 1; - // while (next < X.size() && data.at(next) == data[prev]) { - // ++next; - // } - // if (next == X.size() || bins_done == n_bins - 1) { - // cutPoints.push_back(std::numeric_limits::max()); - // break; - // } else { - // cutPoints.push_back(data[next]); - // bins_done++; - // prev = next; - // } - // } - // } \ No newline at end of file +} \ No newline at end of file diff --git a/BinDisc.h b/BinDisc.h index b0ab432..76736f8 100644 --- a/BinDisc.h +++ b/BinDisc.h @@ -2,6 +2,7 @@ #define BINDISC_H #include "typesFImdlp.h" +#include "Discretizer.h" #include namespace mdlp { @@ -10,22 +11,19 @@ namespace mdlp { UNIFORM, QUANTILE }; - class BinDisc { + class BinDisc : public Discretizer { public: BinDisc(int n_bins = 3, strategy_t strategy = strategy_t::UNIFORM); ~BinDisc(); - void fit(samples_t&); - inline cutPoints_t getCutPoints() const { return cutPoints; }; - labels_t& transform(const samples_t&); - static inline std::string version() { return "1.0.0"; }; + // y is included for compatibility with the Discretizer interface + void fit(samples_t& X_, labels_t& y) override; + void fit(samples_t& X); private: void fit_uniform(samples_t&); void fit_quantile(samples_t&); void normalizeCutPoints(); int n_bins; strategy_t strategy; - labels_t discretizedData = labels_t(); - cutPoints_t cutPoints; }; } #endif diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index 4b62222..c2d4733 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -6,16 +6,14 @@ namespace mdlp { - CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_), + CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : + Discretizer(), + min_length(min_length_), max_depth(max_depth_), proposed_cuts(proposed) { } - CPPFImdlp::CPPFImdlp() = default; - - CPPFImdlp::~CPPFImdlp() = default; - size_t CPPFImdlp::compute_max_num_cut_points() const { // Set the actual maximum number of cut points as a number or as a percentage of the number of samples @@ -208,14 +206,5 @@ namespace mdlp { } cutPoints.erase(cutPoints.begin() + static_cast(maxEntropyIdx)); } - labels_t& CPPFImdlp::transform(const samples_t& data) - { - discretizedData.clear(); - discretizedData.reserve(data.size()); - for (const precision_t& item : data) { - auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item); - discretizedData.push_back(upper - cutPoints.begin()); - } - return discretizedData; - } + } diff --git a/CPPFImdlp.h b/CPPFImdlp.h index 4286c9e..b832423 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -6,18 +6,16 @@ #include #include #include "Metrics.h" +#include "Discretizer.h" namespace mdlp { - class CPPFImdlp { + class CPPFImdlp : public Discretizer { public: - CPPFImdlp(); - CPPFImdlp(size_t, int, float); - ~CPPFImdlp(); - void fit(samples_t&, labels_t&); - inline cutPoints_t getCutPoints() const { return cutPoints; }; - labels_t& transform(const samples_t&); + CPPFImdlp() = default; + CPPFImdlp(size_t min_length_, int max_depth_, float proposed); + virtual ~CPPFImdlp() = default; + void fit(samples_t& X_, labels_t& y_) override; inline int get_depth() const { return depth; }; - static inline std::string version() { return "1.1.3"; }; protected: size_t min_length = 3; int depth = 0; @@ -27,9 +25,7 @@ namespace mdlp { samples_t X = samples_t(); labels_t y = labels_t(); Metrics metrics = Metrics(y, indices); - cutPoints_t cutPoints; size_t num_cut_points = numeric_limits::max(); - labels_t discretizedData = labels_t(); static indices_t sortIndices(samples_t&, labels_t&); void computeCutPoints(size_t, size_t, int); void resizeCutPoints(); diff --git a/Discretizer.h b/Discretizer.h new file mode 100644 index 0000000..d3556e8 --- /dev/null +++ b/Discretizer.h @@ -0,0 +1,31 @@ +#ifndef DISCRETIZER_H +#define DISCRETIZER_H + +#include +#include +#include "typesFImdlp.h" + +namespace mdlp { + class Discretizer { + public: + Discretizer() = default; + virtual ~Discretizer() = default; + virtual void fit(samples_t& X_, labels_t& y_) = 0; + inline cutPoints_t getCutPoints() const { return cutPoints; }; + labels_t& transform(const samples_t& data) + { + discretizedData.clear(); + discretizedData.reserve(data.size()); + for (const precision_t& item : data) { + auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item); + discretizedData.push_back(upper - cutPoints.begin()); + } + return discretizedData; + }; + static inline std::string version() { return "1.1.3"; }; + protected: + labels_t discretizedData = labels_t(); + cutPoints_t cutPoints; + }; +} +#endif diff --git a/tests/BinDisc_unittest.cpp b/tests/BinDisc_unittest.cpp index f6e78dc..e888a5c 100644 --- a/tests/BinDisc_unittest.cpp +++ b/tests/BinDisc_unittest.cpp @@ -37,12 +37,13 @@ namespace mdlp { TEST_F(TestBinDisc3U, Easy3BinsUniform) { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; - fit(X); + auto y = labels_t(); + fit(X, y); auto cuts = getCutPoints(); - EXPECT_NEAR(3.66667, cuts[0], margin); - EXPECT_NEAR(6.33333, cuts[1], margin); - EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); + ASSERT_EQ(3, cuts.size()); + EXPECT_NEAR(3.66667, cuts.at(0), margin); + EXPECT_NEAR(6.33333, cuts.at(1), margin); + EXPECT_EQ(numeric_limits::max(), cuts.at(2)); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -52,10 +53,10 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_NEAR(3.666667, cuts[0], margin); EXPECT_NEAR(6.333333, cuts[1], margin); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -65,10 +66,10 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_EQ(4.0, cuts[0]); EXPECT_EQ(7.0, cuts[1]); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -78,10 +79,10 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_EQ(4, cuts[0]); EXPECT_EQ(7, cuts[1]); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -91,10 +92,10 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_NEAR(4.33333, cuts[0], margin); EXPECT_NEAR(7.66667, cuts[1], margin); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -104,10 +105,10 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_NEAR(4.33333, cuts[0], margin); EXPECT_NEAR(7.66667, cuts[1], margin); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -117,8 +118,8 @@ namespace mdlp { samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(1, cuts.size()); EXPECT_EQ(numeric_limits::max(), cuts[0]); - EXPECT_EQ(1, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 0, 0 }; EXPECT_EQ(expected, labels); @@ -128,8 +129,8 @@ namespace mdlp { samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(numeric_limits::max(), cuts[0]); EXPECT_EQ(1, cuts.size()); + EXPECT_EQ(numeric_limits::max(), cuts[0]); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 0, 0 }; EXPECT_EQ(expected, labels); @@ -139,16 +140,16 @@ namespace mdlp { samples_t X = {}; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(numeric_limits::max(), cuts[0]); EXPECT_EQ(1, cuts.size()); + EXPECT_EQ(numeric_limits::max(), cuts[0]); } TEST_F(TestBinDisc3Q, EmptyQuantile) { samples_t X = {}; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(numeric_limits::max(), cuts[0]); EXPECT_EQ(1, cuts.size()); + EXPECT_EQ(numeric_limits::max(), cuts[0]); } TEST(TestBinDisc3, ExceptionNumberBins) { @@ -159,44 +160,38 @@ namespace mdlp { samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_NEAR(1.66667, cuts[0], margin); EXPECT_NEAR(2.33333, cuts[1], margin); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 }; EXPECT_EQ(expected, labels); - EXPECT_EQ(3.0, X[0]); // X is not modified + ASSERT_EQ(3.0, X[0]); // X is not modified } TEST_F(TestBinDisc3Q, EasyRepeated) { samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - std::cout << "cuts: "; - for (auto cut : cuts) { - std::cout << cut << " "; - } - std::cout << std::endl; - std::cout << std::string(80, '-') << std::endl; + EXPECT_EQ(2, cuts.size()); EXPECT_NEAR(1.66667, cuts[0], margin); EXPECT_EQ(numeric_limits::max(), cuts[1]); - EXPECT_EQ(2, cuts.size()); auto labels = transform(X); labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 }; EXPECT_EQ(expected, labels); - EXPECT_EQ(3.0, X[0]); // X is not modified + ASSERT_EQ(3.0, X[0]); // X is not modified } TEST_F(TestBinDisc4U, Easy4BinsUniform) { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(3.75, cuts[0]); + EXPECT_EQ(4, cuts.size()); + ASSERT_EQ(3.75, cuts[0]); EXPECT_EQ(6.5, cuts[1]); EXPECT_EQ(9.25, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -206,11 +201,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(3.75, cuts[0]); + EXPECT_EQ(4, cuts.size()); + ASSERT_EQ(3.75, cuts[0]); EXPECT_EQ(6.5, cuts[1]); EXPECT_EQ(9.25, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -220,11 +215,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.0, cuts[0]); EXPECT_EQ(7.0, cuts[1]); EXPECT_EQ(10.0, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -234,11 +229,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.0, cuts[0]); EXPECT_EQ(7.0, cuts[1]); EXPECT_EQ(10.0, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -248,11 +243,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.25, cuts[0]); EXPECT_EQ(7.5, cuts[1]); EXPECT_EQ(10.75, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -262,11 +257,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.25, cuts[0]); EXPECT_EQ(7.5, cuts[1]); EXPECT_EQ(10.75, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -276,11 +271,11 @@ namespace mdlp { samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.5, cuts[0]); EXPECT_EQ(8, cuts[1]); EXPECT_EQ(11.5, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 }; EXPECT_EQ(expected, labels); @@ -290,11 +285,11 @@ namespace mdlp { samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(4.5, cuts[0]); EXPECT_EQ(8, cuts[1]); EXPECT_EQ(11.5, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 }; EXPECT_EQ(expected, labels); @@ -305,11 +300,11 @@ namespace mdlp { // 0 1 2 3 4 5 6 7 8 9 fit(X); auto cuts = getCutPoints(); + EXPECT_EQ(4, cuts.size()); EXPECT_EQ(1.0, cuts[0]); EXPECT_EQ(2.0, cuts[1]); - EXPECT_EQ(3.0, cuts[2]); + ASSERT_EQ(3.0, cuts[2]); EXPECT_EQ(numeric_limits::max(), cuts[3]); - EXPECT_EQ(4, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -320,10 +315,10 @@ namespace mdlp { // 0 1 2 3 4 5 6 7 8 9 fit(X); auto cuts = getCutPoints(); + ASSERT_EQ(3, cuts.size()); EXPECT_EQ(2.0, cuts[0]); - EXPECT_EQ(3.0, cuts[1]); + ASSERT_EQ(3.0, cuts[1]); EXPECT_EQ(numeric_limits::max(), cuts[2]); - EXPECT_EQ(3, cuts.size()); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a382899..5fa52af 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -16,19 +16,28 @@ FetchContent_MakeAvailable(googletest) enable_testing() add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp) -add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp) -add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp) target_link_libraries(Metrics_unittest GTest::gtest_main) -target_link_libraries(FImdlp_unittest GTest::gtest_main) -target_link_libraries(BinDisc_unittest GTest::gtest_main) target_compile_options(Metrics_unittest PRIVATE --coverage) -target_compile_options(FImdlp_unittest PRIVATE --coverage) -target_compile_options(BinDisc_unittest PRIVATE --coverage) target_link_options(Metrics_unittest PRIVATE --coverage) + +add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp) +target_link_libraries(FImdlp_unittest GTest::gtest_main) +target_compile_options(FImdlp_unittest PRIVATE --coverage) target_link_options(FImdlp_unittest PRIVATE --coverage) + +add_executable(BinDisc_unittest ../BinDisc.cpp ArffFiles.cpp BinDisc_unittest.cpp) +target_link_libraries(BinDisc_unittest GTest::gtest_main) +target_compile_options(BinDisc_unittest PRIVATE --coverage) target_link_options(BinDisc_unittest PRIVATE --coverage) +add_executable(Discretizer_unittest ../BinDisc.cpp ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp Discretizer_unittest.cpp) +target_link_libraries(Discretizer_unittest GTest::gtest_main) +target_compile_options(Discretizer_unittest PRIVATE --coverage) +target_link_options(Discretizer_unittest PRIVATE --coverage) + include(GoogleTest) + gtest_discover_tests(Metrics_unittest) gtest_discover_tests(FImdlp_unittest) -gtest_discover_tests(BinDisc_unittest) \ No newline at end of file +gtest_discover_tests(BinDisc_unittest) +gtest_discover_tests(Discretizer_unittest) \ No newline at end of file diff --git a/tests/Discretizer_unittest.cpp b/tests/Discretizer_unittest.cpp new file mode 100644 index 0000000..8c8f201 --- /dev/null +++ b/tests/Discretizer_unittest.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include "gtest/gtest.h" +#include "ArffFiles.h" +#include "../Discretizer.h" +#include "../BinDisc.h" +#include "../CPPFImdlp.h" + +namespace mdlp { + const float margin = 1e-4; + static std::string set_data_path() + { + std::string path = "../datasets/"; + std::ifstream file(path + "iris.arff"); + if (file.is_open()) { + file.close(); + return path; + } + return "../../tests/datasets/"; + } + const std::string data_path = set_data_path(); + + TEST(Discretizer, BinIrisUniform) + { + ArffFiles file; + Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM); + file.load(data_path + "iris.arff", true); + vector& X = file.getX(); + auto y = labels_t(); + disc->fit(X[0], y); + auto Xt = disc->transform(X[0]); + labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; + delete disc; + EXPECT_EQ(expected, Xt); + } + TEST(Discretizer, BinIrisQuantile) + { + ArffFiles file; + Discretizer* disc = new BinDisc(4, strategy_t::QUANTILE); + file.load(data_path + "iris.arff", true); + vector& X = file.getX(); + auto y = labels_t(); + disc->fit(X[0], y); + auto Xt = disc->transform(X[0]); + labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; + delete disc; + EXPECT_EQ(expected, Xt); + } + TEST(Discretizer, FImdlpIris) + { + labels_t expected = { + 5, 3, 4, 4, 5, 5, 5, 5, 2, 4, 5, 5, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 4, 5, 3, 5, 5, 5, 4, 4, 5, 5, 5, 4, 4, 5, 4, 3, 5, 5, 0, 4, 5, + 5, 3, 5, 4, 5, 4, 4, 4, 4, 0, 1, 1, 4, 0, 2, 0, 0, 3, 0, 2, 2, 4, + 3, 0, 0, 0, 4, 1, 0, 1, 2, 3, 1, 3, 2, 0, 0, 0, 0, 0, 3, 5, 4, 0, + 3, 0, 0, 3, 0, 0, 0, 3, 2, 2, 0, 1, 4, 0, 3, 2, 3, 3, 0, 2, 0, 5, + 4, 0, 3, 0, 1, 4, 3, 5, 0, 0, 4, 1, 1, 0, 4, 4, 1, 3, 1, 3, 1, 5, + 1, 1, 0, 3, 5, 4, 3, 4, 4, 4, 0, 4, 4, 3, 0, 3, 5, 3 + }; + ArffFiles file; + Discretizer* disc = new CPPFImdlp(); + file.load(data_path + "iris.arff", true); + vector& X = file.getX(); + labels_t& y = file.getY(); + disc->fit(X[1], y); + auto computed = disc->transform(X[1]); + delete disc; + EXPECT_EQ(computed.size(), expected.size()); + for (unsigned long i = 0; i < computed.size(); i++) { + EXPECT_EQ(computed[i], expected[i]); + } + } +} diff --git a/tests/test b/tests/test index 552d755..5291af8 100755 --- a/tests/test +++ b/tests/test @@ -5,7 +5,7 @@ fi if [ -d gcovr-report ] ; then rm -fr gcovr-report fi -cmake -S . -B build -Wno-dev +cmake -S . -B build -Wno-dev -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS="--coverage" -DCMAKE_C_FLAGS="--coverage" cmake --build build cd build ctest --output-on-failure @@ -15,4 +15,4 @@ mkdir gcovr-report #lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q #lcov --list lcoverage/main_coverage.info cd .. -gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines +gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --gcov-filter "BinDisc.cpp" --gcov-filter "Discretizer.h" --txt --sonarqube=tests/gcovr-report/coverage.xml --exclude-noncode-lines diff --git a/tests/testKbins.py b/tests/testKbins.py index e2f8fea..5f8a671 100644 --- a/tests/testKbins.py +++ b/tests/testKbins.py @@ -89,6 +89,7 @@ print(f"Quaintile {clf4q.bin_edges_=}") print("-" * 80) # data, meta = loadarff("tests/datasets/iris.arff") + labelsu = [ 0, 0, @@ -117,12 +118,12 @@ labelsu = [ 0, 0, 0, - 1, - 1, + 0, + 0, 0, 0, 1, - 1, + 0, 1, 0, 0, @@ -149,11 +150,11 @@ labelsu = [ 2, 0, 2, - 1, + 0, 0, 1, 1, - 2, + 1, 1, 2, 1, @@ -161,9 +162,9 @@ labelsu = [ 2, 1, 1, + 1, 2, - 2, - 2, + 1, 2, 2, 2, @@ -181,7 +182,7 @@ labelsu = [ 1, 1, 1, - 2, + 1, 1, 0, 1, @@ -217,14 +218,14 @@ labelsu = [ 2, 3, 2, - 2, + 1, 2, 3, 3, 3, 2, 2, - 2, + 1, 3, 2, 2, @@ -393,12 +394,19 @@ labelsq = [ 2, 2, ] -test(clf4u, data["sepallength"], labelsu, title="IrisUniform") -test(clf4q, data["sepallength"], labelsq, title="IrisQuantile") -# print("Labels") -# print(labels) -# print("Expected") -# print(expected) -# for i in range(len(labels)): -# if labels[i] != expected[i]: -# print(f"Error at {i} {labels[i]} != {expected[i]}") +# test(clf4u, data["sepallength"], labelsu, title="IrisUniform") +# test(clf4q, data["sepallength"], labelsq, title="IrisQuantile") +sepallength = [[x] for x in data["sepallength"]] +clf4u.fit(sepallength) +clf4q.fit(sepallength) +computedu = clf4u.transform(sepallength) +computedq = clf4q.transform(sepallength) +wrongu = 0 +wrongq = 0 +for i in range(len(labelsu)): + if labelsu[i] != computedu[i]: + wrongu += 1 + if labelsq[i] != computedq[i]: + wrongq += 1 +print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Uniform ={wrongu:3d}") +print(f"Iris sepallength diff. between BinDisc & sklearn::KBins Quantile ={wrongq:3d}")