diff --git a/.vscode/launch.json b/.vscode/launch.json index fa381ef..1342f2d 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -8,7 +8,7 @@ "name": "C++ Launch config", "type": "cppdbg", "request": "launch", - "program": "${workspaceFolder}/tests/build/Metrics_unittest", + "program": "${workspaceFolder}/tests/build/BinDisc_unittest", "cwd": "${workspaceFolder}/tests/build", "args": [], "launchCompleteCommand": "exec-run", diff --git a/BinDisc.cpp b/BinDisc.cpp index 551192c..afc2e8d 100644 --- a/BinDisc.cpp +++ b/BinDisc.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "BinDisc.h" #include @@ -20,7 +19,8 @@ namespace mdlp { // y is included for compatibility with the Discretizer interface cutPoints.clear(); if (X.empty()) { - cutPoints.push_back(std::numeric_limits::max()); + cutPoints.push_back(0.0); + cutPoints.push_back(0.0); return; } if (strategy == strategy_t::QUANTILE) { @@ -35,13 +35,12 @@ namespace mdlp { } std::vector linspace(precision_t start, precision_t end, int num) { - // Doesn't include end point as it is not needed if (start == end) { - return { 0 }; + return { start, end }; } precision_t delta = (end - start) / static_cast(num - 1); std::vector linspc; - for (size_t i = 0; i < num - 1; ++i) { + for (size_t i = 0; i < num; ++i) { precision_t val = start + delta * static_cast(i); linspc.push_back(val); } @@ -55,6 +54,7 @@ namespace mdlp { { // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html std::vector results; + bool first = true; results.reserve(percentiles.size()); for (auto percentile : percentiles) { const size_t i = static_cast(std::floor(static_cast(data.size() - 1) * percentile / 100.)); @@ -64,8 +64,9 @@ namespace mdlp { (percentile / 100.0 - percentI) / (static_cast(indexLower + 1) / static_cast(data.size() - 1) - percentI); const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; - if (value != results.back()) + if (value != results.back() || first) // first needed as results.back() return is undefined for empty vectors results.push_back(value); + first = false; } return results; } @@ -75,25 +76,16 @@ namespace mdlp { auto data = X; std::sort(data.begin(), data.end()); if (data.front() == data.back() || data.size() == 1) { - // if X is constant - cutPoints.push_back(std::numeric_limits::max()); + // if X is constant, pass any two given points that shall be ignored in transform + cutPoints.push_back(data.front()); + cutPoints.push_back(data.front()); return; } cutPoints = percentile(data, quantiles); - normalizeCutPoints(); } void BinDisc::fit_uniform(samples_t& X) { - auto minmax = std::minmax_element(X.begin(), X.end()); cutPoints = linspace(*minmax.first, *minmax.second, n_bins + 1); - normalizeCutPoints(); - } - void BinDisc::normalizeCutPoints() - { - // Add max value to the end - cutPoints.push_back(std::numeric_limits::max()); - // Remove first as it is not needed - cutPoints.erase(cutPoints.begin()); } } \ No newline at end of file diff --git a/BinDisc.h b/BinDisc.h index d1bb94b..eaa7ddf 100644 --- a/BinDisc.h +++ b/BinDisc.h @@ -20,7 +20,6 @@ namespace mdlp { private: void fit_uniform(samples_t&); void fit_quantile(samples_t&); - void normalizeCutPoints(); int n_bins; strategy_t strategy; }; diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index c2d4733..f9fc660 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -25,7 +25,7 @@ namespace mdlp { } if (proposed_cuts < 1) return static_cast(round(static_cast(X.size()) * proposed_cuts)); - return static_cast(proposed_cuts); + return static_cast(proposed_cuts); // As the first and last cutpoints shall be ignored in transform } void CPPFImdlp::fit(samples_t& X_, labels_t& y_) @@ -58,6 +58,10 @@ namespace mdlp { resizeCutPoints(); } } + // Insert first & last X value to the cutpoints as them shall be ignored in transform + auto minmax = std::minmax_element(X.begin(), X.end()); + cutPoints.push_back(*minmax.second); + cutPoints.insert(cutPoints.begin(), *minmax.first); } pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) diff --git a/Discretizer.cpp b/Discretizer.cpp index 9d637ca..1a30d38 100644 --- a/Discretizer.cpp +++ b/Discretizer.cpp @@ -5,9 +5,19 @@ namespace mdlp { { discretizedData.clear(); discretizedData.reserve(data.size()); + // CutPoints always have more than two items + // Have to ignore first and last cut points provided + auto first = cutPoints.begin() + 1; + auto last = cutPoints.end() - 1; for (const precision_t& item : data) { - auto upper = std::upper_bound(cutPoints.begin(), cutPoints.end(), item); - discretizedData.push_back(upper - cutPoints.begin()); + auto upper = std::lower_bound(first, last, item); + int number = upper - first; + /* + OJO + */ + if (number < 0) + throw std::runtime_error("number is less than 0 in discretizer::transform"); + discretizedData.push_back(number); } return discretizedData; } diff --git a/Discretizer.h b/Discretizer.h index 9749af8..0c7fafe 100644 --- a/Discretizer.h +++ b/Discretizer.h @@ -18,10 +18,10 @@ namespace mdlp { void fit_t(torch::Tensor& X_, torch::Tensor& y_); torch::Tensor transform_t(torch::Tensor& X_); torch::Tensor fit_transform_t(torch::Tensor& X_, torch::Tensor& y_); - static inline std::string version() { return "1.2.2"; }; + static inline std::string version() { return "1.2.3"; }; protected: labels_t discretizedData = labels_t(); - cutPoints_t cutPoints; + cutPoints_t cutPoints; // At least two cutpoints must be provided, the first and the last will be ignored in transform }; } #endif diff --git a/tests/BinDisc_unittest.cpp b/tests/BinDisc_unittest.cpp index 2d4437c..cdcc895 100644 --- a/tests/BinDisc_unittest.cpp +++ b/tests/BinDisc_unittest.cpp @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "ArffFiles.h" #include "../BinDisc.h" +#include "Experiments.hpp" namespace mdlp { const float margin = 1e-4; @@ -40,10 +41,11 @@ namespace mdlp { auto y = labels_t(); fit(X, y); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_NEAR(3.66667, cuts.at(0), margin); - EXPECT_NEAR(6.33333, cuts.at(1), margin); - EXPECT_EQ(numeric_limits::max(), cuts.at(2)); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(3.66667, cuts.at(1), margin); + EXPECT_NEAR(6.33333, cuts.at(2), margin); + EXPECT_NEAR(9.0, cuts.at(3), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -53,10 +55,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_NEAR(3.666667, cuts[0], margin); - EXPECT_NEAR(6.333333, cuts[1], margin); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts[0], margin); + EXPECT_NEAR(3.666667, cuts[1], margin); + EXPECT_NEAR(6.333333, cuts[2], margin); + EXPECT_NEAR(9, cuts[3], margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -66,12 +69,13 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_EQ(4.0, cuts[0]); - EXPECT_EQ(7.0, cuts[1]); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(4.0, cuts.at(1), margin); + EXPECT_NEAR(7.0, cuts.at(2), margin); + EXPECT_NEAR(10.0, cuts.at(3), margin); auto labels = transform(X); - labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc3Q, X10BinsQuantile) @@ -79,12 +83,13 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_EQ(4, cuts[0]); - EXPECT_EQ(7, cuts[1]); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(4.0, cuts.at(1), margin); + EXPECT_NEAR(7.0, cuts.at(2), margin); + EXPECT_NEAR(10.0, cuts.at(3), margin); auto labels = transform(X); - labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc3U, X11BinsUniform) @@ -92,10 +97,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_NEAR(4.33333, cuts[0], margin); - EXPECT_NEAR(7.66667, cuts[1], margin); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(4.33333, cuts.at(1), margin); + EXPECT_NEAR(7.66667, cuts.at(2), margin); + EXPECT_NEAR(11.0, cuts.at(3), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -105,10 +111,11 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_NEAR(4.33333, cuts[0], margin); - EXPECT_NEAR(7.66667, cuts[1], margin); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(4.33333, cuts.at(1), margin); + EXPECT_NEAR(7.66667, cuts.at(2), margin); + EXPECT_NEAR(11.0, cuts.at(3), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 }; EXPECT_EQ(expected, labels); @@ -118,8 +125,9 @@ namespace mdlp { samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(1, cuts.size()); - EXPECT_EQ(numeric_limits::max(), cuts[0]); + ASSERT_EQ(2, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(1, cuts.at(1), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 0, 0 }; EXPECT_EQ(expected, labels); @@ -129,8 +137,9 @@ namespace mdlp { samples_t X = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(1, cuts.size()); - EXPECT_EQ(numeric_limits::max(), cuts[0]); + ASSERT_EQ(2, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(1, cuts.at(1), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 0, 0 }; EXPECT_EQ(expected, labels); @@ -140,16 +149,18 @@ namespace mdlp { samples_t X = {}; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(1, cuts.size()); - EXPECT_EQ(numeric_limits::max(), cuts[0]); + ASSERT_EQ(2, cuts.size()); + EXPECT_NEAR(0, cuts.at(0), margin); + EXPECT_NEAR(0, cuts.at(1), margin); } TEST_F(TestBinDisc3Q, EmptyQuantile) { samples_t X = {}; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(1, cuts.size()); - EXPECT_EQ(numeric_limits::max(), cuts[0]); + ASSERT_EQ(2, cuts.size()); + EXPECT_NEAR(0, cuts.at(0), margin); + EXPECT_NEAR(0, cuts.at(1), margin); } TEST(TestBinDisc3, ExceptionNumberBins) { @@ -160,10 +171,11 @@ namespace mdlp { samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_NEAR(1.66667, cuts[0], margin); - EXPECT_NEAR(2.33333, cuts[1], margin); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(4, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(1.66667, cuts.at(1), margin); + EXPECT_NEAR(2.33333, cuts.at(2), margin); + EXPECT_NEAR(3.0, cuts.at(3), margin); auto labels = transform(X); labels_t expected = { 2, 0, 0, 2, 0, 0, 2, 0, 0 }; EXPECT_EQ(expected, labels); @@ -174,9 +186,10 @@ namespace mdlp { samples_t X = { 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(2, cuts.size()); - EXPECT_NEAR(1.66667, cuts[0], margin); - EXPECT_EQ(numeric_limits::max(), cuts[1]); + ASSERT_EQ(3, cuts.size()); + EXPECT_NEAR(1, cuts.at(0), margin); + EXPECT_NEAR(1.66667, cuts.at(1), margin); + EXPECT_NEAR(3.0, cuts.at(2), margin); auto labels = transform(X); labels_t expected = { 1, 0, 0, 1, 0, 0, 1, 0, 0 }; EXPECT_EQ(expected, labels); @@ -187,11 +200,12 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - ASSERT_EQ(3.75, cuts[0]); - EXPECT_EQ(6.5, cuts[1]); - EXPECT_EQ(9.25, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(3.75, cuts.at(1), margin); + EXPECT_NEAR(6.5, cuts.at(2), margin); + EXPECT_NEAR(9.25, cuts.at(3), margin); + EXPECT_NEAR(12.0, cuts.at(4), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -201,11 +215,12 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - ASSERT_EQ(3.75, cuts[0]); - EXPECT_EQ(6.5, cuts[1]); - EXPECT_EQ(9.25, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(3.75, cuts.at(1), margin); + EXPECT_NEAR(6.5, cuts.at(2), margin); + EXPECT_NEAR(9.25, cuts.at(3), margin); + EXPECT_NEAR(12.0, cuts.at(4), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -215,13 +230,14 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.0, cuts[0]); - EXPECT_EQ(7.0, cuts[1]); - EXPECT_EQ(10.0, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.0, cuts.at(1), margin); + EXPECT_NEAR(7.0, cuts.at(2), margin); + EXPECT_NEAR(10.0, cuts.at(3), margin); + EXPECT_NEAR(13.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc4Q, X13BinsQuantile) @@ -229,13 +245,14 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.0, cuts[0]); - EXPECT_EQ(7.0, cuts[1]); - EXPECT_EQ(10.0, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.0, cuts.at(1), margin); + EXPECT_NEAR(7.0, cuts.at(2), margin); + EXPECT_NEAR(10.0, cuts.at(3), margin); + EXPECT_NEAR(13.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc4U, X14BinsUniform) @@ -243,11 +260,12 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.25, cuts[0]); - EXPECT_EQ(7.5, cuts[1]); - EXPECT_EQ(10.75, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.25, cuts.at(1), margin); + EXPECT_NEAR(7.5, cuts.at(2), margin); + EXPECT_NEAR(10.75, cuts.at(3), margin); + EXPECT_NEAR(14.0, cuts.at(4), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -257,11 +275,12 @@ namespace mdlp { samples_t X = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.25, cuts[0]); - EXPECT_EQ(7.5, cuts[1]); - EXPECT_EQ(10.75, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.25, cuts.at(1), margin); + EXPECT_NEAR(7.5, cuts.at(2), margin); + EXPECT_NEAR(10.75, cuts.at(3), margin); + EXPECT_NEAR(14.0, cuts.at(4), margin); auto labels = transform(X); labels_t expected = { 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; EXPECT_EQ(expected, labels); @@ -271,13 +290,14 @@ namespace mdlp { samples_t X = { 15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.5, cuts[0]); - EXPECT_EQ(8, cuts[1]); - EXPECT_EQ(11.5, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.5, cuts.at(1), margin); + EXPECT_NEAR(8, cuts.at(2), margin); + EXPECT_NEAR(11.5, cuts.at(3), margin); + EXPECT_NEAR(15.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 3, 2, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 }; + labels_t expected = { 3, 1, 3, 3, 1, 0, 3, 2, 2, 2, 1, 0, 0, 1, 0 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc4Q, X15BinsQuantile) @@ -285,13 +305,14 @@ namespace mdlp { samples_t X = { 15.0, 13.0, 12.0, 14.0, 6.0, 1.0, 8.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0 }; fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(4.5, cuts[0]); - EXPECT_EQ(8, cuts[1]); - EXPECT_EQ(11.5, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(1.0, cuts.at(0), margin); + EXPECT_NEAR(4.5, cuts.at(1), margin); + EXPECT_NEAR(8, cuts.at(2), margin); + EXPECT_NEAR(11.5, cuts.at(3), margin); + EXPECT_NEAR(15.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 3, 3, 3, 3, 1, 0, 2, 2, 2, 2, 1, 0, 0, 1, 0 }; + labels_t expected = { 3, 3, 3, 3, 1, 0, 1, 2, 2, 2, 1, 0, 0, 1, 0 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc4U, RepeatedValuesUniform) @@ -300,13 +321,14 @@ namespace mdlp { // 0 1 2 3 4 5 6 7 8 9 fit(X); auto cuts = getCutPoints(); - EXPECT_EQ(4, cuts.size()); - EXPECT_EQ(1.0, cuts[0]); - EXPECT_EQ(2.0, cuts[1]); - ASSERT_EQ(3.0, cuts[2]); - EXPECT_EQ(numeric_limits::max(), cuts[3]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(0.0, cuts.at(0), margin); + EXPECT_NEAR(1.0, cuts.at(1), margin); + EXPECT_NEAR(2.0, cuts.at(2), margin); + EXPECT_NEAR(3.0, cuts.at(3), margin); + EXPECT_NEAR(4.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 0, 1, 1, 1, 2, 2, 3, 3, 3, 3 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 }; EXPECT_EQ(expected, labels); } TEST_F(TestBinDisc4Q, RepeatedValuesQuantile) @@ -315,50 +337,80 @@ namespace mdlp { // 0 1 2 3 4 5 6 7 8 9 fit(X); auto cuts = getCutPoints(); - ASSERT_EQ(3, cuts.size()); - EXPECT_EQ(2.0, cuts[0]); - ASSERT_EQ(3.0, cuts[1]); - EXPECT_EQ(numeric_limits::max(), cuts[2]); + ASSERT_EQ(5, cuts.size()); + EXPECT_NEAR(0.0, cuts.at(0), margin); + EXPECT_NEAR(1.0, cuts.at(1), margin); + EXPECT_NEAR(2.0, cuts.at(2), margin); + EXPECT_NEAR(3.0, cuts.at(3), margin); + EXPECT_NEAR(4.0, cuts.at(4), margin); auto labels = transform(X); - labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 2 }; + labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 }; EXPECT_EQ(expected, labels); } - TEST_F(TestBinDisc4U, irisUniform) + // TEST_F(TestBinDisc4U, irisUniform) + // { + // ArffFiles file; + // file.load(data_path + "iris.arff", true); + // vector& X = file.getX(); + // fit(X[0]); + // auto Xt = transform(X[0]); + // labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; + // EXPECT_EQ(expected, Xt); + // auto Xtt = fit_transform(X[0], file.getY()); + // EXPECT_EQ(expected, Xtt); + // auto Xt_t = torch::tensor(X[0], torch::kFloat32); + // auto y_t = torch::tensor(file.getY(), torch::kInt32); + // auto Xtt_t = fit_transform_t(Xt_t, y_t); + // for (int i = 0; i < expected.size(); i++) + // EXPECT_EQ(expected[i], Xtt_t[i].item()); + // } + // TEST_F(TestBinDisc4Q, irisQuantile) + // { + // ArffFiles file; + // file.load(data_path + "iris.arff", true); + // vector& X = file.getX(); + // fit(X[0]); + // auto Xt = transform(X[0]); + // labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; + // EXPECT_EQ(expected, Xt); + // auto Xtt = fit_transform(X[0], file.getY()); + // EXPECT_EQ(expected, Xtt); + // auto Xt_t = torch::tensor(X[0], torch::kFloat32); + // auto y_t = torch::tensor(file.getY(), torch::kInt32); + // auto Xtt_t = fit_transform_t(Xt_t, y_t); + // for (int i = 0; i < expected.size(); i++) + // EXPECT_EQ(expected[i], Xtt_t[i].item()); + // fit_t(Xt_t, y_t); + // auto Xt_t2 = transform_t(Xt_t); + // for (int i = 0; i < expected.size(); i++) + // EXPECT_EQ(expected[i], Xt_t2[i].item()); + // } + TEST(TestBinDiscGeneric, Fileset) { - ArffFiles file; - file.load(data_path + "iris.arff", true); - vector& X = file.getX(); - fit(X[0]); - auto Xt = transform(X[0]); - labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; - EXPECT_EQ(expected, Xt); - auto Xtt = fit_transform(X[0], file.getY()); - EXPECT_EQ(expected, Xtt); - auto Xt_t = torch::tensor(X[0], torch::kFloat32); - auto y_t = torch::tensor(file.getY(), torch::kInt32); - auto Xtt_t = fit_transform_t(Xt_t, y_t); - for (int i = 0; i < expected.size(); i++) - EXPECT_EQ(expected[i], Xtt_t[i].item()); - } - TEST_F(TestBinDisc4Q, irisQuantile) - { - ArffFiles file; - file.load(data_path + "iris.arff", true); - vector& X = file.getX(); - fit(X[0]); - auto Xt = transform(X[0]); - labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; - EXPECT_EQ(expected, Xt); - auto Xtt = fit_transform(X[0], file.getY()); - EXPECT_EQ(expected, Xtt); - auto Xt_t = torch::tensor(X[0], torch::kFloat32); - auto y_t = torch::tensor(file.getY(), torch::kInt32); - auto Xtt_t = fit_transform_t(Xt_t, y_t); - for (int i = 0; i < expected.size(); i++) - EXPECT_EQ(expected[i], Xtt_t[i].item()); - fit_t(Xt_t, y_t); - auto Xt_t2 = transform_t(Xt_t); - for (int i = 0; i < expected.size(); i++) - EXPECT_EQ(expected[i], Xt_t2[i].item()); + Experiments exps(data_path + "tests.txt"); + int num = 0; + while (exps.is_next()) { + Experiment exp = exps.next(); + std::cout << "Exp #: " << ++num << " From: " << exp.from_ << " To: " << exp.to_ << " Step: " << exp.step_ << " Bins: " << exp.n_bins_ << " Strategy: " << exp.strategy_ << std::endl; + BinDisc disc(exp.n_bins_, exp.strategy_ == "Q" ? strategy_t::QUANTILE : strategy_t::UNIFORM); + std::vector test; + for (float i = exp.from_; i < exp.to_; i += exp.step_) { + test.push_back(i); + } + // show_vector(test, "Test"); + auto empty = std::vector(); + auto Xt = disc.fit_transform(test, empty); + auto cuts = disc.getCutPoints(); + EXPECT_EQ(exp.discretized_data_.size(), Xt.size()); + for (int i = 0; i < exp.discretized_data_.size(); ++i) { + if (exp.discretized_data_.at(i) != Xt.at(i)) { + std::cout << "Error at " << i << " Expected: " << exp.discretized_data_.at(i) << " Got: " << Xt.at(i) << std::endl; + } + } + EXPECT_EQ(exp.cutpoints_.size(), cuts.size()); + for (int i = 0; i < exp.cutpoints_.size(); ++i) { + EXPECT_NEAR(exp.cutpoints_.at(i), cuts.at(i), margin); + } + } } } diff --git a/tests/Discretizer_unittest.cpp b/tests/Discretizer_unittest.cpp index 8c8f201..4fcd856 100644 --- a/tests/Discretizer_unittest.cpp +++ b/tests/Discretizer_unittest.cpp @@ -21,6 +21,15 @@ namespace mdlp { } const std::string data_path = set_data_path(); + TEST(Discretizer, Version) + { + Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM); + auto version = disc->version(); + delete disc; + std::cout << "Version computed: " << version; + EXPECT_EQ("1.2.3", version); + } + TEST(Discretizer, BinIrisUniform) { ArffFiles file; diff --git a/tests/Experiments.hpp b/tests/Experiments.hpp new file mode 100644 index 0000000..166c5fb --- /dev/null +++ b/tests/Experiments.hpp @@ -0,0 +1,102 @@ +#ifndef EXPERIMENTS_HPP +#define EXPERIMENTS_HPP +#include +#include +#include +#include +#include +#include +#include "../typesFImdlp.h" +class Experiment { +public: + Experiment(float from_, float to_, float step_, int n_bins, std::string strategy, std::vector data_discretized, std::vector cutpoints) : + from_{ from_ }, to_{ to_ }, step_{ step_ }, n_bins_{ n_bins }, strategy_{ strategy }, discretized_data_{ data_discretized }, cutpoints_{ cutpoints } + { + if (strategy != "Q" && strategy != "U") { + throw std::invalid_argument("Invalid strategy " + strategy); + } + } + float from_; + float to_; + float step_; + int n_bins_; + std::string strategy_; + std::vector discretized_data_; + std::vector cutpoints_; +}; +class Experiments { +public: + Experiments(const std::string filename) : filename{ filename } + { + test_file.open(filename); + if (!test_file.is_open()) { + throw std::runtime_error("File " + filename + " not found"); + } + exp_end = false; + } + ~Experiments() + { + test_file.close(); + } + bool end() const + { + return exp_end; + } + bool is_next() + { + while (std::getline(test_file, line) && line[0] == '#'); + if (test_file.eof()) { + exp_end = true; + return false; + } + return true; + } + Experiment next() + { + return parse_experiment(line); + } +private: + std::tuple parse_header(const std::string& line) + { + std::istringstream iss(line); + std::string from_, to_, step_, n_bins, strategy; + iss >> from_ >> to_ >> step_ >> n_bins >> strategy; + return { std::stof(from_), std::stof(to_), std::stof(step_), std::stoi(n_bins), strategy }; + } + template + std::vector parse_vector(const std::string& line) + { + std::istringstream iss(line); + std::vector data; + std::string d; + while (iss >> d) { + data.push_back(std::is_same::value ? std::stof(d) : std::stoi(d)); + } + return data; + } + Experiment parse_experiment(std::string& line) + { + auto [from_, to_, step_, n_bins, strategy] = parse_header(line); + std::getline(test_file, line); + auto data_discretized = parse_vector(line); + std::getline(test_file, line); + auto cutpoints = parse_vector(line); + return Experiment{ from_, to_, step_, n_bins, strategy, data_discretized, cutpoints }; + } + std::ifstream test_file; + std::string filename; + std::string line; + bool exp_end; +}; +template +void show_vector(const std::vector& data, std::string title) +{ + std::cout << title << ": "; + std::string sep = ""; + for (const auto& d : data) { + std::cout << sep << d; + sep = ", "; + } + std::cout << std::endl; +} +#endif \ No newline at end of file diff --git a/tests/datasets/tests.txt b/tests/datasets/tests.txt new file mode 100644 index 0000000..6712244 --- /dev/null +++ b/tests/datasets/tests.txt @@ -0,0 +1,35 @@ +# +# from, to, step, #bins, Q/U +# discretized data +# cut points +# +0, 100, 1, 4, Q +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +0.0, 24.75, 49.5, 74.25, 99.0 +0, 50, 1, 4, Q +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +0.0, 12.25, 24.5, 36.75, 49.0 +0, 100, 1, 3, Q +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +0.0, 33.0, 66.0, 99.0 +0, 50, 1, 3, Q +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +0.0, 16.33333, 32.66667, 49.0 +0, 10, 1, 3, Q +0, 0, 0, 0, 1, 1, 1, 2, 2, 2 +0.0, 3.0, 6.0, 9.0 +0, 100, 1, 4, U +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +0.0, 24.75, 49.5, 74.25, 99.0 +0, 50, 1, 4, U +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 +0.0, 12.25, 24.5, 36.75, 49.0 +0, 100, 1, 3, U +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +0.0, 33.0, 66.0, 99.0 +0, 50, 1, 3, U +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 +0.0, 16.33333, 32.66667, 49.0 +0, 10, 1, 3, U +0, 0, 0, 1, 1, 1, 2, 2, 2, 2 +0.0, 3.0, 6.0, 9.0 diff --git a/tests/k b/tests/k new file mode 100755 index 0000000..331da27 Binary files /dev/null and b/tests/k differ diff --git a/tests/k.cpp b/tests/k.cpp new file mode 100644 index 0000000..70ba2a3 --- /dev/null +++ b/tests/k.cpp @@ -0,0 +1,32 @@ +#include +#include +#include // For std::lower_bound + +std::vector searchsorted(const std::vector& cuts, const std::vector& data) { + std::vector indices; + indices.reserve(data.size()); + + for (const float& value : data) { + // Find the first position in 'a' where 'value' could be inserted to maintain order + auto it = std::lower_bound(cuts.begin(), cuts.end(), value); + // Calculate the index + int index = it - cuts.begin(); + indices.push_back(index); + } + + return indices; +} + +int main() { + std::vector cuts = { 10.0 }; + std::vector data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0 }; + + std::vector result = searchsorted(cuts, data); + + for (int idx : result) { + std::cout << idx << " "; + } + + return 0; +} + diff --git a/tests/t b/tests/t new file mode 100755 index 0000000..4242a1f Binary files /dev/null and b/tests/t differ diff --git a/tests/t.cpp b/tests/t.cpp new file mode 100644 index 0000000..6b19ed7 --- /dev/null +++ b/tests/t.cpp @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include + +typedef float precision_t; + +std::vector transform(const std::vector cutPoints, const std::vector& data) +{ + std::vector discretizedData; + discretizedData.reserve(data.size()); + for (const float& item : data) { + auto upper = std::lower_bound(cutPoints.begin(), cutPoints.end(), item); + discretizedData.push_back(upper - cutPoints.begin()); + } + return discretizedData; +} +template +void show_vector(const std::vector& data, std::string title) +{ + std::cout << title << ": "; + std::string sep = ""; + for (const auto& d : data) { + std::cout << sep << d; + sep = ", "; + } + std::cout << std::endl; +} +std::vector linspace(precision_t start, precision_t end, int num) +{ + if (start == end) { + return { start, end }; + } + precision_t delta = (end - start) / static_cast(num - 1); + std::vector linspc; + for (size_t i = 0; i < num - 1; ++i) { + precision_t val = start + delta * static_cast(i); + linspc.push_back(val); + } + return linspc; +} +size_t clip(const size_t n, size_t lower, size_t upper) +{ + return std::max(lower, std::min(n, upper)); +} +std::vector percentile(std::vector& data, std::vector& percentiles) +{ + // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html + std::vector results; + results.reserve(percentiles.size()); + for (auto percentile : percentiles) { + const size_t i = static_cast(std::floor(static_cast(data.size() - 1) * percentile / 100.)); + const auto indexLower = clip(i, 0, data.size() - 2); + const double percentI = static_cast(indexLower) / static_cast(data.size() - 1); + const double fraction = + (percentile / 100.0 - percentI) / + (static_cast(indexLower + 1) / static_cast(data.size() - 1) - percentI); + const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; + if (value != results.back()) + results.push_back(value); + } + return results; +} +int main() +{ + // std::vector test; + // std::vector cuts = { 0, 24.75, 49.5, 74.25, 10000 }; + // for (int i = 0; i < 100; ++i) { + // test.push_back(i); + // } + // auto Xt = transform(cuts, test); + // show_vector(Xt, "Discretized data:"); + // std::vector test2 = { 0,1,2,3,4,5,6,7,8,9,10,11 }; + // std::vector cuts2 = { 0,1,2,3,4,5,6,7,8,9 }; + // auto Xt2 = transform(cuts2, test2); + // show_vector(Xt2, "discretized data2: "); + auto quantiles = linspace(0.0, 100.0, 3 + 1); + std::vector data = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 }; + std::vector cutPoints; + std::sort(data.begin(), data.end()); + cutPoints = percentile(data, quantiles); + cutPoints.push_back(std::numeric_limits::max()); + data.push_back(15); + data.push_back(0); + cutPoints.pop_back(); + cutPoints.erase(cutPoints.begin()); + cutPoints.clear(); + cutPoints.push_back(9.0); + auto Xt = transform(cutPoints, data); + show_vector(data, "Original data"); + show_vector(Xt, "Discretized data"); + show_vector(cutPoints, "Cutpoints"); + return 0; +} +/* +n_bins = 3 +data = [1,2,3,4,5,6,7,8,9,10] +quantiles = np.linspace(0, 100, n_bins + 1) +bin_edges = np.percentile(data, quantiles) + +*/ \ No newline at end of file diff --git a/tests/tests_do.py b/tests/tests_do.py new file mode 100644 index 0000000..3cfb500 --- /dev/null +++ b/tests/tests_do.py @@ -0,0 +1,39 @@ +from sklearn.preprocessing import KBinsDiscretizer + +with open("datasets/tests.txt") as f: + data = f.readlines() + +data = [x.strip() for x in data if x[0] != "#"] + +for i in range(0, len(data), 3): + print("Experiment:", data[i]) + from_, to_, step_, n_bins_, strategy_ = data[i].split(",") + strategy = "quantile" if strategy_.strip() == "Q" else "uniform" + disc = KBinsDiscretizer( + n_bins=int(n_bins_), + encode="ordinal", + strategy=strategy, + ) + X = [[float(x)] for x in range(int(from_), int(to_), int(step_))] + # result = disc.fit_transform(X) + disc.fit(X) + result = disc.transform(X) + result = [int(x) for x in result.flatten()] + expected = [int(x) for x in data[i + 1].split(",")] + assert len(result) == len(expected) + for j in range(len(result)): + if result[j] != expected[j]: + print("Error at", j, "Expected=", expected[j], "Result=", result[j]) + expected_cuts = disc.bin_edges_[0] + computed_cuts = [float(x) for x in data[i + 2].split(",")] + assert len(expected_cuts) == len(computed_cuts) + for j in range(len(expected_cuts)): + if round(expected_cuts[j], 5) != computed_cuts[j]: + print( + "Error at", + j, + "Expected=", + expected_cuts[j], + "Result=", + computed_cuts[j], + ) diff --git a/tests/tests_generate.ipynb b/tests/tests_generate.ipynb new file mode 100644 index 0000000..376c76d --- /dev/null +++ b/tests/tests_generate.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import KBinsDiscretizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "experiments = [\n", + " [0, 100, 1, 4, \"Q\"],\n", + " [0, 50, 1, 4, \"Q\"],\n", + " [0, 100, 1, 3, \"Q\"],\n", + " [0, 50, 1, 3, \"Q\"],\n", + " [0, 10, 1, 3, \"Q\"],\n", + " [0, 100, 1, 4, \"U\"],\n", + " [0, 50, 1, 4, \"U\"],\n", + " [0, 100, 1, 3, \"U\"],\n", + " [0, 50, 1, 3, \"U\"],\n", + " [0, 10, 1, 3, \"U\"],\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"datasets/tests.txt\", \"w\") as file:\n", + " file.write(\"#\\n\")\n", + " file.write(\"# from, to, step, #bins, Q/U\\n\")\n", + " file.write(\"# discretized data\\n\")\n", + " file.write(\"# cut points\\n\")\n", + " file.write(\"#\\n\")\n", + " for experiment in experiments:\n", + " (from_, to_, step_, bins_, strategy) = experiment\n", + " disc = KBinsDiscretizer(n_bins=bins_, encode='ordinal', strategy='quantile' if strategy.strip() == \"Q\" else 'uniform')\n", + " data = [[x] for x in range(from_, to_, step_)]\n", + " disc.fit(data)\n", + " result = disc.transform(data)\n", + " file.write(f\"{from_}, {to_}, {step_}, {bins_}, {strategy}\\n\")\n", + " sep = \"\"\n", + " for res in result:\n", + " file.write(f\"{sep}{int(res):d}\")\n", + " sep= \", \"\n", + " file.write(\"\\n\")\n", + " sep = \"\"\n", + " for res in disc.bin_edges_[0]:\n", + " file.write(sep + str(round(res,5)))\n", + " sep = \", \"\n", + " file.write(\"\\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}