From 793b2d3cd57bd53945efdd0ebd911c9c0f16075a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
Date: Tue, 30 Apr 2024 02:11:14 +0200
Subject: [PATCH] Refactor TestUtils to allow partial and shuffle dataset load

---
 .vscode/launch.json    |   2 +-
 tests/TestBoostAODE.cc |   8 +-
 tests/TestUtils.cc     | 165 ++++++++++++++++++++++-------------
 tests/TestUtils.h      |  67 +++++++++++------
 4 files changed, 139 insertions(+), 103 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 73b7f7d..0d2525f 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -16,7 +16,7 @@
             "name": "test",
             "program": "${workspaceFolder}/build_debug/tests/TestBayesNet",
             "args": [
-                "\"Test Cannot build dataset with wrong data vector\""
+                "\"Bisection Best\""
            ],
             "cwd": "${workspaceFolder}/build_debug/tests"
         },
diff --git a/tests/TestBoostAODE.cc b/tests/TestBoostAODE.cc
index abad4ae..f6bf5fa 100644
--- a/tests/TestBoostAODE.cc
+++ b/tests/TestBoostAODE.cc
@@ -140,7 +140,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
 TEST_CASE("Bisection Best", "[BoostAODE]")
 {
     auto clf = bayesnet::BoostAODE();
-    auto raw = RawDatasets("mfeat-factors", true, 500);
+    auto raw = RawDatasets("mfeat-factors", true, 300, true);
     clf.setHyperparameters({
         {"bisection", true},
         {"maxTolerance", 3},
@@ -149,8 +149,8 @@
         {"convergence_best", true},
     });
     clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
-    REQUIRE(clf.getNumberOfNodes() == 217);
-    REQUIRE(clf.getNumberOfEdges() == 431);
+    REQUIRE(clf.getNumberOfNodes() == 434);
+    REQUIRE(clf.getNumberOfEdges() == 862);
     REQUIRE(clf.getNotes().size() == 3);
     REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
     REQUIRE(clf.getNotes()[1] == "Used features in train: 16 of 216");
@@ -162,7 +162,7 @@
 }
 TEST_CASE("Bisection Best vs Last", "[BoostAODE]")
 {
-    auto raw = RawDatasets("mfeat-factors", true, 1500);
+    auto raw = RawDatasets("mfeat-factors", true, 500);
     auto clf = bayesnet::BoostAODE(true);
     auto hyperparameters = nlohmann::json{
         {"select_features", "IWSS"},
diff --git a/tests/TestUtils.cc b/tests/TestUtils.cc
index 82fb073..3439145 100644
--- a/tests/TestUtils.cc
+++ b/tests/TestUtils.cc
@@ -4,6 +4,7 @@
 // SPDX-License-Identifier: MIT
 // ***************************************************************
 
+#include <random>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
 
@@ -15,97 +16,109 @@ public:
     }
 };
 
-pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
+class ShuffleArffFiles : public ArffFiles {
+public:
+    ShuffleArffFiles(int num_lines = 0, bool shuffle = false) : ArffFiles(), num_lines(num_lines), shuffle(shuffle) {}
+    void load(const std::string& file_name, bool class_last = true)
+    {
+        ArffFiles::load(file_name, class_last);
+        if (num_lines > 0) {
+            if (num_lines > getY().size()) {
+                throw std::invalid_argument("num_lines must be less than the number of lines in the file");
+            }
+            auto indices = std::vector<int>(num_lines);
+            std::iota(indices.begin(), indices.end(), 0);
+            if (shuffle) {
+                std::mt19937 g{ 173 };
+                std::shuffle(indices.begin(), indices.end(), g);
+            }
+            auto XX = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(num_lines));
+            auto yy = std::vector<int>(num_lines);
+            for (int i = 0; i < num_lines; i++) {
+                yy[i] = getY()[indices[i]];
+                for (int j = 0; j < attributes.size(); j++) {
+                    XX[j][i] = X[j][indices[i]];
+                }
+            }
+            X = XX;
+            y = yy;
+        }
+    }
+private:
+    int num_lines;
+    bool shuffle;
+};
+
+RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_lines_, bool shuffle_)
 {
-    std::vector<mdlp::labels_t> Xd;
+    num_lines = num_lines_;
+    shuffle = shuffle_;
+    discretize = discretize_;
+    // Xt can be either discretized or not
+    // Xv is always discretized
+    loadDataset(file_name);
+    auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
+    dataset = torch::cat({ Xt, yresized }, 0);
+    nSamples = dataset.size(1);
+    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
+    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
+    classNumStates = discretize ? states.at(className).size() : 0;
+    auto fold = folding::StratifiedKFold(5, yt, 271);
+    auto [train, test] = fold.getFold(0);
+    auto train_t = torch::tensor(train);
+    auto test_t = torch::tensor(test);
+    // Get train and validation sets
+    X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), train_t });
+    y_train = dataset.index({ -1, train_t });
+    X_test = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), test_t });
+    y_test = dataset.index({ -1, test_t });
+    std::cout << to_string();
+}
+
+map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
+{
     map<std::string, int> maxes;
     auto fimdlp = mdlp::CPPFImdlp();
     for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
+        fimdlp.fit(X[i], yv);
         mdlp::labels_t& xd = fimdlp.transform(X[i]);
         maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
-        Xd.push_back(xd);
+        Xv.push_back(xd);
     }
-    return { Xd, maxes };
+    return maxes;
 }
 
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+void RawDatasets::loadDataset(const std::string& name)
 {
-    std::vector<mdlp::labels_t> Xd;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        Xd.push_back(xd);
-    }
-    return Xd;
-}
-
-bool file_exists(const std::string& name)
-{
-    if (FILE* file = fopen(name.c_str(), "r")) {
-        fclose(file);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", class_last);
+    auto handler = ShuffleArffFiles(num_lines, shuffle);
+    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", true);
     // Get Dataset X, y
     std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
+    yv = handler.getY();
     // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    torch::Tensor Xd;
-    auto states = map<std::string, std::vector<int>>();
-    if (discretize_dataset) {
-        auto Xr = discretizeDataset(X, y);
-        Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-            Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
-        }
-        states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    } else {
-        Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
-        for (int i = 0; i < features.size(); ++i) {
-            Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
-        }
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-
-tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff");
-    // Get Dataset X, y
-    std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
+    className = handler.getClassName();
     auto attributes = handler.getAttributes();
     transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
     // Discretize Dataset
-    std::vector<mdlp::labels_t> Xd;
-    map<std::string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<std::string, std::vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = std::vector<int>(maxes[feature]);
+    auto maxValues = discretizeDataset(X);
+    maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
+    if (discretize) {
+        // discretize the tensor as well
+        Xt = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, torch::kInt32);
+        for (int i = 0; i < features.size(); ++i) {
+            states[features[i]] = std::vector<int>(maxValues[features[i]]);
+            iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
+            Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
+        }
+        states[className] = std::vector<int>(maxValues[className]);
+        iota(begin(states.at(className)), end(states.at(className)), 0);
+    } else {
+        Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
+        for (int i = 0; i < features.size(); ++i) {
+            Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
+        }
     }
-    states[className] = std::vector<int>(maxes[className]);
-    return { Xd, y, features, className, states };
+    yt = torch::tensor(yv, torch::kInt32);
 }
+
diff --git a/tests/TestUtils.h b/tests/TestUtils.h
index f77684f..c3a17c3 100644
--- a/tests/TestUtils.h
+++ b/tests/TestUtils.h
@@ -13,37 +13,60 @@
 #include <tuple>
 #include <ArffFiles.h>
 #include <CPPFImdlp.h>
+#include <folding.hpp>
 
-bool file_exists(const std::string& name);
-std::pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features);
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
-std::tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name);
-std::tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset);
 
 class RawDatasets {
 public:
-    RawDatasets(const std::string& file_name, bool discretize)
-    {
-        // Xt can be either discretized or not
-        tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
-        // Xv is always discretized
-        tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
-        auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
-        dataset = torch::cat({ Xt, yresized }, 0);
-        nSamples = dataset.size(1);
-        weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
-        weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-        classNumStates = discretize ? statest.at(classNamet).size() : 0;
-    }
+    RawDatasets(const std::string& file_name, bool discretize_, int num_lines_ = 0, bool shuffle_ = false);
     torch::Tensor Xt, yt, dataset, weights;
+    torch::Tensor X_train, y_train, X_test, y_test;
     std::vector<std::vector<int>> Xv;
-    std::vector<double> weightsv;
     std::vector<int> yv;
-    std::vector<std::string> featurest, featuresv;
-    map<std::string, std::vector<int>> statest, statesv;
-    std::string classNamet, classNamev;
+    std::vector<double> weightsv;
+    std::vector<std::string> features;
+    std::string className;
+    map<std::string, std::vector<int>> states;
     int nSamples, classNumStates;
     double epsilon = 1e-5;
+    bool discretize;
+    int num_lines = 0;
+    bool shuffle = false;
+private:
+    std::string to_string()
+    {
+        std::string features_ = "";
+        for (auto& f : features) {
+            features_ += f + " ";
+        }
+        std::string states_ = "";
+        for (auto& s : states) {
+            states_ += s.first + " ";
+            for (auto& v : s.second) {
+                states_ += std::to_string(v) + " ";
+            }
+            states_ += "\n";
+        }
+        return "Xt dimensions: " + std::to_string(Xt.size(0)) + " " + std::to_string(Xt.size(1)) + "\n"
+            + "Xv dimensions: " + std::to_string(Xv.size()) + " " + std::to_string(Xv[0].size()) + "\n"
+            + "yt dimensions: " + std::to_string(yt.size(0)) + "\n"
+            + "yv dimensions: " + std::to_string(yv.size()) + "\n"
+            + "X_train dimensions: " + std::to_string(X_train.size(0)) + " " + std::to_string(X_train.size(1)) + "\n"
+            + "X_test dimensions: " + std::to_string(X_test.size(0)) + " " + std::to_string(X_test.size(1)) + "\n"
+            + "y_train dimensions: " + std::to_string(y_train.size(0)) + "\n"
+            + "y_test dimensions: " + std::to_string(y_test.size(0)) + "\n"
+            + "features: " + std::to_string(features.size()) + "\n"
+            + features_ + "\n"
+            + "className: " + className + "\n"
+            + "states: " + std::to_string(states.size()) + "\n"
+            + "nSamples: " + std::to_string(nSamples) + "\n"
+            + "classNumStates: " + std::to_string(classNumStates) + "\n"
+            + "states: " + states_ + "\n";
+    }
+    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
+    void loadDataset(const std::string& name);
 };
 #endif //TEST_UTILS_H
\ No newline at end of file
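
Note (reviewer commentary, not part of the patch): the new ShuffleArffFiles loader keeps X (stored column-major, one vector per attribute) aligned with y by shuffling an index vector and gathering both through it, rather than shuffling rows in place; the fixed seed 173 keeps the REQUIRE counts in the tests deterministic. A self-contained sketch of the same technique, with illustrative names:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <utility>
    #include <vector>

    // Gather the first n rows of a column-major dataset in a reproducible
    // random order. Like the loader in the patch, this only permutes the
    // first n rows; it does not sample from the rest of the file.
    std::pair<std::vector<std::vector<float>>, std::vector<int>>
    takeShuffled(const std::vector<std::vector<float>>& X, const std::vector<int>& y, int n, bool shuffle)
    {
        auto indices = std::vector<int>(n);
        std::iota(indices.begin(), indices.end(), 0);
        if (shuffle) {
            std::mt19937 g{ 173 }; // fixed seed -> reproducible order
            std::shuffle(indices.begin(), indices.end(), g);
        }
        auto XX = std::vector<std::vector<float>>(X.size(), std::vector<float>(n));
        auto yy = std::vector<int>(n);
        for (int i = 0; i < n; i++) {
            yy[i] = y[indices[i]];
            for (size_t j = 0; j < X.size(); j++) {
                XX[j][i] = X[j][indices[i]];
            }
        }
        return { XX, yy };
    }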
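With the refactor, RawDatasets also pre-splits the data with folding::StratifiedKFold(5, yt, 271) and exposes X_train/y_train/X_test/y_test directly, so tests no longer build their own split. A minimal usage sketch follows; the dataset name and row count are taken from the tests above, while the score() call is an assumption about the classifier API, not a line from this patch:

    // First 300 rows of mfeat-factors, shuffled with the fixed seed,
    // discretized, and already split 4/5 train - 1/5 test by the fixture.
    auto raw = RawDatasets("mfeat-factors", true, 300, true);
    auto clf = bayesnet::BoostAODE();
    clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
    auto accuracy = clf.score(raw.X_test, raw.y_test); // assumed score() API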