From c488ace719267e731a9c53ab1afa2da5cd76f311 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Tue, 2 Jul 2024 11:50:42 +0200 Subject: [PATCH] Fix FImdlp tests --- CPPFImdlp.cpp | 2 +- tests/BinDisc_unittest.cpp | 76 ++++++++++++------------- tests/Experiments.hpp | 8 ++- tests/FImdlp_unittest.cpp | 87 ++++++++++++++-------------- tests/datasets/tests.txt | 114 +++++++++++++++++++++++++++++++++++++ tests/tests_do.py | 25 +++++--- tests/tests_generate.ipynb | 74 +++++++++++++++++++----- 7 files changed, 283 insertions(+), 103 deletions(-) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index f9fc660..7e38497 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -25,7 +25,7 @@ namespace mdlp { } if (proposed_cuts < 1) return static_cast(round(static_cast(X.size()) * proposed_cuts)); - return static_cast(proposed_cuts); // As the first and last cutpoints shall be ignored in transform + return static_cast(proposed_cuts); // The 2 extra cutpoints should not be considered here as this parameter is considered before they are added } void CPPFImdlp::fit(samples_t& X_, labels_t& y_) diff --git a/tests/BinDisc_unittest.cpp b/tests/BinDisc_unittest.cpp index cdcc895..8827922 100644 --- a/tests/BinDisc_unittest.cpp +++ b/tests/BinDisc_unittest.cpp @@ -347,44 +347,44 @@ namespace mdlp { labels_t expected = { 0, 0, 0, 0, 1, 1, 2, 2, 2, 3 }; EXPECT_EQ(expected, labels); } - // TEST_F(TestBinDisc4U, irisUniform) - // { - // ArffFiles file; - // file.load(data_path + "iris.arff", true); - // vector& X = file.getX(); - // fit(X[0]); - // auto Xt = transform(X[0]); - // labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; - // EXPECT_EQ(expected, Xt); - // auto Xtt = fit_transform(X[0], file.getY()); - // EXPECT_EQ(expected, Xtt); - // auto Xt_t = torch::tensor(X[0], torch::kFloat32); - // auto y_t = torch::tensor(file.getY(), torch::kInt32); - // auto Xtt_t = fit_transform_t(Xt_t, y_t); - // for (int i = 0; i < expected.size(); i++) - // EXPECT_EQ(expected[i], Xtt_t[i].item()); - // } - // TEST_F(TestBinDisc4Q, irisQuantile) - // { - // ArffFiles file; - // file.load(data_path + "iris.arff", true); - // vector& X = file.getX(); - // fit(X[0]); - // auto Xt = transform(X[0]); - // labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; - // EXPECT_EQ(expected, Xt); - // auto Xtt = fit_transform(X[0], file.getY()); - // EXPECT_EQ(expected, Xtt); - // auto Xt_t = torch::tensor(X[0], torch::kFloat32); - // auto y_t = torch::tensor(file.getY(), torch::kInt32); - // auto Xtt_t = fit_transform_t(Xt_t, y_t); - // for (int i = 0; i < expected.size(); i++) - // EXPECT_EQ(expected[i], Xtt_t[i].item()); - // fit_t(Xt_t, y_t); - // auto Xt_t2 = transform_t(Xt_t); - // for (int i = 0; i < expected.size(); i++) - // EXPECT_EQ(expected[i], Xt_t2[i].item()); - // } + TEST_F(TestBinDisc4U, irisUniform) + { + ArffFiles file; + file.load(data_path + "iris.arff", true); + vector& X = file.getX(); + fit(X[0]); + auto Xt = transform(X[0]); + labels_t expected = { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 2, 2, 1, 2, 1, 2, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 0, 1, 2, 1, 3, 2, 2, 3, 0, 3, 2, 3, 2, 2, 2, 1, 1, 2, 2, 3, 3, 1, 2, 1, 3, 2, 2, 3, 2, 1, 2, 3, 3, 3, 2, 2, 1, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; + EXPECT_EQ(expected, Xt); + auto Xtt = fit_transform(X[0], file.getY()); + EXPECT_EQ(expected, Xtt); + auto Xt_t = torch::tensor(X[0], torch::kFloat32); + auto y_t = torch::tensor(file.getY(), torch::kInt32); + auto Xtt_t = fit_transform_t(Xt_t, y_t); + for (int i = 0; i < expected.size(); i++) + EXPECT_EQ(expected[i], Xtt_t[i].item()); + } + TEST_F(TestBinDisc4Q, irisQuantile) + { + ArffFiles file; + file.load(data_path + "iris.arff", true); + vector& X = file.getX(); + fit(X[0]); + auto Xt = transform(X[0]); + labels_t expected = { 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 3, 3, 3, 1, 3, 1, 2, 0, 3, 1, 0, 2, 2, 2, 1, 3, 1, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 3, 2, 1, 1, 1, 2, 2, 1, 2, 3, 2, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2, 1, 1, 2, 2, 3, 2, 3, 3, 0, 3, 3, 3, 3, 3, 3, 1, 2, 3, 3, 3, 3, 2, 3, 1, 3, 2, 3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 2, 2 }; + EXPECT_EQ(expected, Xt); + auto Xtt = fit_transform(X[0], file.getY()); + EXPECT_EQ(expected, Xtt); + auto Xt_t = torch::tensor(X[0], torch::kFloat32); + auto y_t = torch::tensor(file.getY(), torch::kInt32); + auto Xtt_t = fit_transform_t(Xt_t, y_t); + for (int i = 0; i < expected.size(); i++) + EXPECT_EQ(expected[i], Xtt_t[i].item()); + fit_t(Xt_t, y_t); + auto Xt_t2 = transform_t(Xt_t); + for (int i = 0; i < expected.size(); i++) + EXPECT_EQ(expected[i], Xt_t2[i].item()); + } TEST(TestBinDiscGeneric, Fileset) { Experiments exps(data_path + "tests.txt"); diff --git a/tests/Experiments.hpp b/tests/Experiments.hpp index 166c5fb..b41e84a 100644 --- a/tests/Experiments.hpp +++ b/tests/Experiments.hpp @@ -76,7 +76,13 @@ private: } Experiment parse_experiment(std::string& line) { - auto [from_, to_, step_, n_bins, strategy] = parse_header(line); + if (line == "RANGE") { + std::getline(test_file, line); + auto [from_, to_, step_, n_bins, strategy] = parse_header(line); + } else { + std::getline(test_file, line); + + } std::getline(test_file, line); auto data_discretized = parse_vector(line); std::getline(test_file, line); diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index b439631..d68b983 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -124,7 +124,7 @@ namespace mdlp { { samples_t X_ = { 1, 2, 2, 3, 4, 2, 3 }; labels_t y_ = { 0, 0, 1, 2, 3, 4, 5 }; - cutPoints_t expected = { 1.5f, 2.5f }; + cutPoints_t expected = { 1.0, 1.5f, 2.5f, 4.0 }; fit(X_, y_); auto computed = getCutPoints(); EXPECT_EQ(computed.size(), expected.size()); @@ -167,29 +167,31 @@ namespace mdlp { y = { 1 }; fit(X, y); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 0); + EXPECT_EQ(computed.size(), 2); X = { 1, 3 }; y = { 1, 2 }; fit(X, y); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 0); + EXPECT_EQ(computed.size(), 2); X = { 2, 4 }; y = { 1, 2 }; fit(X, y); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 0); + EXPECT_EQ(computed.size(), 2); X = { 1, 2, 3 }; y = { 1, 2, 2 }; fit(X, y); computed = getCutPoints(); - EXPECT_EQ(computed.size(), 1); - EXPECT_NEAR(computed[0], 1.5, precision); + EXPECT_EQ(computed.size(), 3); + EXPECT_NEAR(computed[0], 1, precision); + EXPECT_NEAR(computed[1], 1.5, precision); + EXPECT_NEAR(computed[2], 3, precision); } TEST_F(TestFImdlp, TestArtificialDataset) { fit(X, y); - cutPoints_t expected = { 5.05f }; + cutPoints_t expected = { 4.7, 5.05, 6.0 }; vector computed = getCutPoints(); EXPECT_EQ(computed.size(), expected.size()); for (unsigned long i = 0; i < computed.size(); i++) { @@ -200,10 +202,10 @@ namespace mdlp { TEST_F(TestFImdlp, TestIris) { vector expected = { - {5.45f, 5.75f}, - {2.75f, 2.85f, 2.95f, 3.05f, 3.35f}, - {2.45f, 4.75f, 5.05f}, - {0.8f, 1.75f} + {4.3, 5.45f, 5.75f, 7.9}, + {2, 2.75f, 2.85f, 2.95f, 3.05f, 3.35f, 4.4}, + {1, 2.45f, 4.75f, 5.05f, 6.9}, + {0.1, 0.8f, 1.75f, 2.5} }; vector depths = { 3, 5, 4, 3 }; auto test = CPPFImdlp(); @@ -213,7 +215,7 @@ namespace mdlp { TEST_F(TestFImdlp, ComputeCutPointsGCase) { cutPoints_t expected; - expected = { 1.5 }; + expected = { 0, 1.5, 2 }; samples_t X_ = { 0, 1, 2, 2, 2 }; labels_t y_ = { 1, 1, 1, 2, 2 }; fit(X_, y_); @@ -247,10 +249,10 @@ namespace mdlp { // Set max_depth to 1 auto test = CPPFImdlp(3, 1, 0); vector expected = { - {5.45f}, - {3.35f}, - {2.45f}, - {0.8f} + {4.3, 5.45f, 7.9}, + {2, 3.35f, 4.4}, + {1, 2.45f, 6.9}, + {0.1, 0.8f, 2.5} }; vector depths = { 1, 1, 1, 1 }; test_dataset(test, "iris", expected, depths); @@ -261,10 +263,10 @@ namespace mdlp { auto test = CPPFImdlp(75, 100, 0); // Set min_length to 75 vector expected = { - {5.45f, 5.75f}, - {2.85f, 3.35f}, - {2.45f, 4.75f}, - {0.8f, 1.75f} + {4.3, 5.45f, 5.75f, 7.9}, + {2, 2.85f, 3.35f, 4.4}, + {1, 2.45f, 4.75f, 6.9}, + {0.1, 0.8f, 1.75f, 2.5} }; vector depths = { 3, 2, 2, 2 }; test_dataset(test, "iris", expected, depths); @@ -275,10 +277,10 @@ namespace mdlp { // Set min_length to 75 auto test = CPPFImdlp(75, 2, 0); vector expected = { - {5.45f, 5.75f}, - {2.85f, 3.35f}, - {2.45f, 4.75f}, - {0.8f, 1.75f} + {4.3, 5.45f, 5.75f, 7.9}, + {2, 2.85f, 3.35f, 4.4}, + {1, 2.45f, 4.75f, 6.9}, + {0.1, 0.8f, 1.75f, 2.5} }; vector depths = { 2, 2, 2, 2 }; test_dataset(test, "iris", expected, depths); @@ -289,10 +291,10 @@ namespace mdlp { // Set min_length to 75 auto test = CPPFImdlp(75, 2, 1); vector expected = { - {5.45f}, - {2.85f}, - {2.45f}, - {0.8f} + {4.3, 5.45f, 7.9}, + {2, 2.85f, 4.4}, + {1, 2.45f, 6.9}, + {0.1, 0.8f, 2.5} }; vector depths = { 2, 2, 2, 2 }; test_dataset(test, "iris", expected, depths); @@ -304,10 +306,10 @@ namespace mdlp { // Set min_length to 75 auto test = CPPFImdlp(75, 2, 0.2f); vector expected = { - {5.45f, 5.75f}, - {2.85f, 3.35f}, - {2.45f, 4.75f}, - {0.8f, 1.75f} + {4.3, 5.45f, 5.75f, 7.9}, + {2, 2.85f, 3.35f, 4.4}, + {1, 2.45f, 4.75f, 6.9}, + {0.1, 0.8f, 1.75f, 2.5} }; vector depths = { 2, 2, 2, 2 }; test_dataset(test, "iris", expected, depths); @@ -327,7 +329,6 @@ namespace mdlp { computed = compute_max_num_cut_points(); ASSERT_EQ(expected, computed); } - } TEST_F(TestFImdlp, TransformTest) { @@ -345,15 +346,15 @@ namespace mdlp { vector& X = file.getX(); labels_t& y = file.getY(); fit(X[1], y); - // auto computed = transform(X[1]); - // EXPECT_EQ(computed.size(), expected.size()); - // for (unsigned long i = 0; i < computed.size(); i++) { - // EXPECT_EQ(computed[i], expected[i]); - // } - // auto computed_ft = fit_transform(X[1], y); - // EXPECT_EQ(computed_ft.size(), expected.size()); - // for (unsigned long i = 0; i < computed_ft.size(); i++) { - // EXPECT_EQ(computed_ft[i], expected[i]); - // } + auto computed = transform(X[1]); + EXPECT_EQ(computed.size(), expected.size()); + for (unsigned long i = 0; i < computed.size(); i++) { + EXPECT_EQ(computed[i], expected[i]); + } + auto computed_ft = fit_transform(X[1], y); + EXPECT_EQ(computed_ft.size(), expected.size()); + for (unsigned long i = 0; i < computed_ft.size(); i++) { + EXPECT_EQ(computed_ft[i], expected[i]); + } } } diff --git a/tests/datasets/tests.txt b/tests/datasets/tests.txt index 6712244..91e772e 100644 --- a/tests/datasets/tests.txt +++ b/tests/datasets/tests.txt @@ -3,33 +3,147 @@ # discretized data # cut points # +RANGE 0, 100, 1, 4, Q 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 0.0, 24.75, 49.5, 74.25, 99.0 +RANGE 0, 50, 1, 4, Q 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 0.0, 12.25, 24.5, 36.75, 49.0 +RANGE 0, 100, 1, 3, Q 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 0.0, 33.0, 66.0, 99.0 +RANGE 0, 50, 1, 3, Q 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 0.0, 16.33333, 32.66667, 49.0 +RANGE 0, 10, 1, 3, Q 0, 0, 0, 0, 1, 1, 1, 2, 2, 2 0.0, 3.0, 6.0, 9.0 +RANGE 0, 100, 1, 4, U 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 0.0, 24.75, 49.5, 74.25, 99.0 +RANGE 0, 50, 1, 4, U 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 0.0, 12.25, 24.5, 36.75, 49.0 +RANGE 0, 100, 1, 3, U 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 0.0, 33.0, 66.0, 99.0 +RANGE 0, 50, 1, 3, U 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 0.0, 16.33333, 32.66667, 49.0 +RANGE 0, 10, 1, 3, U 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 0.0, 3.0, 6.0, 9.0 +RANGE +1, 10, 1, 3, Q +0, 0, 0, 1, 1, 1, 2, 2, 2 +1.0, 3.66667, 6.33333, 9.0 +RANGE +1, 10, 1, 3, U +0, 0, 0, 1, 1, 1, 2, 2, 2 +1.0, 3.66667, 6.33333, 9.0 +RANGE +1, 11, 1, 3, Q +0, 0, 0, 1, 1, 1, 1, 2, 2, 2 +1.0, 4.0, 7.0, 10.0 +RANGE +1, 11, 1, 3, U +0, 0, 0, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.0, 7.0, 10.0 +RANGE +1, 12, 1, 3, Q +0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.33333, 7.66667, 11.0 +RANGE +1, 12, 1, 3, U +0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.33333, 7.66667, 11.0 +RANGE +1, 13, 1, 3, Q +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.66667, 8.33333, 12.0 +RANGE +1, 13, 1, 3, U +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.66667, 8.33333, 12.0 +RANGE +1, 14, 1, 3, Q +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.0, 9.0, 13.0 +RANGE +1, 14, 1, 3, U +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.0, 9.0, 13.0 +RANGE +1, 15, 1, 3, Q +0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.33333, 9.66667, 14.0 +RANGE +1, 15, 1, 3, U +0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.33333, 9.66667, 14.0 +VECTOR +Q3[3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0] +1, 0, 0, 1, 0, 0, 1, 0, 0 +1.0, 1.66667, 3.0 +VECTOR +U3[3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0] +2, 0, 0, 2, 0, 0, 2, 0, 0 +1.0, 1.66667, 2.33333, 3.0 +VECTOR +Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0] +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.66667, 8.33333, 12.0 +VECTOR +U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0] +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 +1.0, 4.66667, 8.33333, 12.0 +VECTOR +Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0] +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.0, 9.0, 13.0 +VECTOR +U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0] +0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.0, 9.0, 13.0 +VECTOR +Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0] +0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.33333, 9.66667, 14.0 +VECTOR +U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0] +0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.33333, 9.66667, 14.0 +VECTOR +Q3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0] +0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.66667, 10.33333, 15.0 +VECTOR +U3[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0] +0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 +1.0, 5.66667, 10.33333, 15.0 +VECTOR +Q3[15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0] +2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 0, 0, 0, 0 +1.0, 5.66667, 10.33333, 15.0 +VECTOR +U3[15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0] +2, 1, 2, 2, 1, 0, 2, 2, 1, 1, 1, 0, 0, 0, 0 +1.0, 5.66667, 10.33333, 15.0 +VECTOR +Q3[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0] +0, 0, 0, 0, 1, 1, 2, 2, 2, 2 +0.0, 1.0, 3.0, 4.0 +VECTOR +U3[0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0] +0, 0, 0, 0, 1, 1, 2, 2, 2, 2 +0.0, 1.33333, 2.66667, 4.0 diff --git a/tests/tests_do.py b/tests/tests_do.py index 3cfb500..95a2c26 100644 --- a/tests/tests_do.py +++ b/tests/tests_do.py @@ -1,3 +1,4 @@ +import json from sklearn.preprocessing import KBinsDiscretizer with open("datasets/tests.txt") as f: @@ -5,27 +6,37 @@ with open("datasets/tests.txt") as f: data = [x.strip() for x in data if x[0] != "#"] -for i in range(0, len(data), 3): - print("Experiment:", data[i]) - from_, to_, step_, n_bins_, strategy_ = data[i].split(",") +for i in range(0, len(data), 4): + experiment_type = data[i] + print("Experiment:", data[i + 1]) + if experiment_type == "RANGE": + range_data = data[i + 1] + from_, to_, step_, n_bins_, strategy_ = range_data.split(",") + X = [[float(x)] for x in range(int(from_), int(to_), int(step_))] + else: + strategy_ = data[i + 1][0] + n_bins_ = data[i + 1][1] + vector = data[i + 1][2:] + X = [[float(x)] for x in json.loads(vector)] + strategy = "quantile" if strategy_.strip() == "Q" else "uniform" disc = KBinsDiscretizer( n_bins=int(n_bins_), encode="ordinal", strategy=strategy, ) - X = [[float(x)] for x in range(int(from_), int(to_), int(step_))] - # result = disc.fit_transform(X) + expected_data = data[i + 2] + cuts_data = data[i + 3] disc.fit(X) result = disc.transform(X) result = [int(x) for x in result.flatten()] - expected = [int(x) for x in data[i + 1].split(",")] + expected = [int(x) for x in expected_data.split(",")] assert len(result) == len(expected) for j in range(len(result)): if result[j] != expected[j]: print("Error at", j, "Expected=", expected[j], "Result=", result[j]) expected_cuts = disc.bin_edges_[0] - computed_cuts = [float(x) for x in data[i + 2].split(",")] + computed_cuts = [float(x) for x in cuts_data.split(",")] assert len(expected_cuts) == len(computed_cuts) for j in range(len(expected_cuts)): if round(expected_cuts[j], 5) != computed_cuts[j]: diff --git a/tests/tests_generate.ipynb b/tests/tests_generate.ipynb index 376c76d..467ce2f 100644 --- a/tests/tests_generate.ipynb +++ b/tests/tests_generate.ipynb @@ -15,7 +15,7 @@ "metadata": {}, "outputs": [], "source": [ - "experiments = [\n", + "experiments_range = [\n", " [0, 100, 1, 4, \"Q\"],\n", " [0, 50, 1, 4, \"Q\"],\n", " [0, 100, 1, 3, \"Q\"],\n", @@ -25,7 +25,29 @@ " [0, 50, 1, 4, \"U\"],\n", " [0, 100, 1, 3, \"U\"],\n", " [0, 50, 1, 3, \"U\"],\n", + "# \n", " [0, 10, 1, 3, \"U\"],\n", + " [1, 10, 1, 3, \"Q\"],\n", + " [1, 10, 1, 3, \"U\"],\n", + " [1, 11, 1, 3, \"Q\"],\n", + " [1, 11, 1, 3, \"U\"],\n", + " [1, 12, 1, 3, \"Q\"],\n", + " [1, 12, 1, 3, \"U\"],\n", + " [1, 13, 1, 3, \"Q\"],\n", + " [1, 13, 1, 3, \"U\"],\n", + " [1, 14, 1, 3, \"Q\"],\n", + " [1, 14, 1, 3, \"U\"],\n", + " [1, 15, 1, 3, \"Q\"],\n", + " [1, 15, 1, 3, \"U\"]\n", + "]\n", + "experiments_vectors = [\n", + " (3, [3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0]),\n", + " (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]),\n", + " (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]),\n", + " (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0]),\n", + " (3, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]),\n", + " (3, [15.0, 8.0, 12.0, 14.0, 6.0, 1.0, 13.0, 11.0, 10.0, 9.0, 7.0, 4.0, 3.0, 5.0, 2.0]),\n", + " (3, [0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0])\n", "]" ] }, @@ -33,31 +55,57 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/rmontanana/miniconda3/lib/python3.11/site-packages/sklearn/preprocessing/_discretization.py:307: UserWarning: Bins whose width are too small (i.e., <= 1e-8) in feature 0 are removed. Consider decreasing the number of bins.\n", + " warnings.warn(\n" + ] + } + ], "source": [ + "def write_lists(file, data, cuts):\n", + " sep = \"\"\n", + " for res in data:\n", + " file.write(f\"{sep}{int(res):d}\")\n", + " sep= \", \"\n", + " file.write(\"\\n\")\n", + " sep = \"\"\n", + " for res in cuts:\n", + " file.write(sep + str(round(res,5)))\n", + " sep = \", \"\n", + " file.write(\"\\n\")\n", + "\n", "with open(\"datasets/tests.txt\", \"w\") as file:\n", " file.write(\"#\\n\")\n", " file.write(\"# from, to, step, #bins, Q/U\\n\")\n", " file.write(\"# discretized data\\n\")\n", " file.write(\"# cut points\\n\")\n", " file.write(\"#\\n\")\n", - " for experiment in experiments:\n", + " for experiment in experiments_range:\n", + " file.write(\"RANGE\\n\")\n", " (from_, to_, step_, bins_, strategy) = experiment\n", " disc = KBinsDiscretizer(n_bins=bins_, encode='ordinal', strategy='quantile' if strategy.strip() == \"Q\" else 'uniform')\n", " data = [[x] for x in range(from_, to_, step_)]\n", " disc.fit(data)\n", " result = disc.transform(data)\n", " file.write(f\"{from_}, {to_}, {step_}, {bins_}, {strategy}\\n\")\n", - " sep = \"\"\n", - " for res in result:\n", - " file.write(f\"{sep}{int(res):d}\")\n", - " sep= \", \"\n", - " file.write(\"\\n\")\n", - " sep = \"\"\n", - " for res in disc.bin_edges_[0]:\n", - " file.write(sep + str(round(res,5)))\n", - " sep = \", \"\n", - " file.write(\"\\n\")" + " write_lists(file, result, disc.bin_edges_[0])\n", + " for n_bins, experiment in experiments_vectors:\n", + " for strategy in [\"Q\", \"U\"]:\n", + " file.write(\"VECTOR\\n\")\n", + " file.write(f\"{strategy}{n_bins}{experiment}\\n\")\n", + " disc = KBinsDiscretizer(\n", + " n_bins=n_bins,\n", + " encode=\"ordinal\",\n", + " \n", + " strategy=\"quantile\" if strategy.strip() == \"Q\" else \"uniform\",\n", + " )\n", + " data = [[x] for x in experiment]\n", + " result = disc.fit_transform(data)\n", + " write_lists(file, result, disc.bin_edges_[0])" ] } ],