From 793b2d3cd57bd53945efdd0ebd911c9c0f16075a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
Date: Tue, 30 Apr 2024 02:11:14 +0200
Subject: [PATCH] Refactor TestUtils to allow partial and shuffle dataset load

---
 .vscode/launch.json    |   2 +-
 tests/TestBoostAODE.cc |   8 +-
 tests/TestUtils.cc     | 165 ++++++++++++++++++++++-------------
 tests/TestUtils.h      |  67 +++++++++++------
 4 files changed, 139 insertions(+), 103 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 73b7f7d..0d2525f 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -16,7 +16,7 @@
             "name": "test",
             "program": "${workspaceFolder}/build_debug/tests/TestBayesNet",
             "args": [
-                "\"Test Cannot build dataset with wrong data vector\""
+                "\"Bisection Best\""
            ],
             "cwd": "${workspaceFolder}/build_debug/tests"
         },
diff --git a/tests/TestBoostAODE.cc b/tests/TestBoostAODE.cc
index abad4ae..f6bf5fa 100644
--- a/tests/TestBoostAODE.cc
+++ b/tests/TestBoostAODE.cc
@@ -140,7 +140,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
 TEST_CASE("Bisection Best", "[BoostAODE]")
 {
     auto clf = bayesnet::BoostAODE();
-    auto raw = RawDatasets("mfeat-factors", true, 500);
+    auto raw = RawDatasets("mfeat-factors", true, 300, true);
     clf.setHyperparameters({
         {"bisection", true},
         {"maxTolerance", 3},
@@ -149,8 +149,8 @@
         {"convergence_best", true},
     });
     clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
-    REQUIRE(clf.getNumberOfNodes() == 217);
-    REQUIRE(clf.getNumberOfEdges() == 431);
+    REQUIRE(clf.getNumberOfNodes() == 434);
+    REQUIRE(clf.getNumberOfEdges() == 862);
     REQUIRE(clf.getNotes().size() == 3);
     REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
     REQUIRE(clf.getNotes()[1] == "Used features in train: 16 of 216");
@@ -162,7 +162,7 @@
 }
 TEST_CASE("Bisection Best vs Last", "[BoostAODE]")
 {
-    auto raw = RawDatasets("mfeat-factors", true, 1500);
+    auto raw = RawDatasets("mfeat-factors", true, 500);
     auto clf = bayesnet::BoostAODE(true);
     auto hyperparameters = nlohmann::json{
         {"select_features", "IWSS"},
diff --git a/tests/TestUtils.cc b/tests/TestUtils.cc
index 82fb073..3439145 100644
--- a/tests/TestUtils.cc
+++ b/tests/TestUtils.cc
@@ -4,6 +4,7 @@
 // SPDX-License-Identifier: MIT
 // ***************************************************************
 
+#include <random>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
 
@@ -15,97 +16,109 @@ public:
     }
 };
 
-pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
+class ShuffleArffFiles : public ArffFiles {
+public:
+    ShuffleArffFiles(int num_lines = 0, bool shuffle = false) : ArffFiles(), num_lines(num_lines), shuffle(shuffle) {}
+    void load(const std::string& file_name, bool class_last = true)
+    {
+        ArffFiles::load(file_name, class_last);
+        if (num_lines > 0) {
+            if (num_lines > getY().size()) {
+                throw std::invalid_argument("num_lines must be less than the number of lines in the file");
+            }
+            auto indices = std::vector<int>(num_lines);
+            std::iota(indices.begin(), indices.end(), 0);
+            if (shuffle) {
+                std::mt19937 g{ 173 };
+                std::shuffle(indices.begin(), indices.end(), g);
+            }
+            auto XX = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(num_lines));
+            auto yy = std::vector<int>(num_lines);
+            for (int i = 0; i < num_lines; i++) {
+                yy[i] = getY()[indices[i]];
+                for (int j = 0; j < attributes.size(); j++) {
+                    XX[j][i] = X[j][indices[i]];
+                }
+            }
+            X = XX;
+            y = yy;
+        }
+    }
+private:
+    int num_lines;
+    bool shuffle;
+};
+
+RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_lines_, bool shuffle_)
 {
-    std::vector<mdlp::labels_t> Xd;
+    num_lines = num_lines_;
+    shuffle = shuffle_;
+    discretize = discretize_;
+    // Xt can be either discretized or not
+    // Xv is always discretized
+    loadDataset(file_name);
+    auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
+    dataset = torch::cat({ Xt, yresized }, 0);
+    nSamples = dataset.size(1);
+    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
+    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
+    classNumStates = discretize ? states.at(className).size() : 0;
+    auto fold = folding::StratifiedKFold(5, yt, 271);
+    auto [train, test] = fold.getFold(0);
+    auto train_t = torch::tensor(train);
+    auto test_t = torch::tensor(test);
+    // Get train and validation sets
+    X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), train_t });
+    y_train = dataset.index({ -1, train_t });
+    X_test = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), test_t });
+    y_test = dataset.index({ -1, test_t });
+    std::cout << to_string();
+}
+
+map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
+{
     map<std::string, int> maxes;
     auto fimdlp = mdlp::CPPFImdlp();
     for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
+        fimdlp.fit(X[i], yv);
         mdlp::labels_t& xd = fimdlp.transform(X[i]);
         maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
-        Xd.push_back(xd);
+        Xv.push_back(xd);
     }
-    return { Xd, maxes };
+    return maxes;
 }
 
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+void RawDatasets::loadDataset(const std::string& name)
 {
-    std::vector<mdlp::labels_t> Xd;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        Xd.push_back(xd);
-    }
-    return Xd;
-}
-
-bool file_exists(const std::string& name)
-{
-    if (FILE* file = fopen(name.c_str(), "r")) {
-        fclose(file);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", class_last);
+    auto handler = ShuffleArffFiles(num_lines, shuffle);
+    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", true);
     // Get Dataset X, y
     std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
+    yv = handler.getY();
     // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    torch::Tensor Xd;
-    auto states = map<std::string, std::vector<int>>();
-    if (discretize_dataset) {
-        auto Xr = discretizeDataset(X, y);
-        Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-            Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
-        }
-        states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    } else {
-        Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
-        for (int i = 0; i < features.size(); ++i) {
-            Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
-        }
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-
-tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff");
-    // Get Dataset X, y
-    std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
+    className = handler.getClassName();
     auto attributes = handler.getAttributes();
     transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
     // Discretize Dataset
-    std::vector<mdlp::labels_t> Xd;
-    map<std::string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<std::string, std::vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = std::vector<int>(maxes[feature]);
+    auto maxValues = discretizeDataset(X);
+    maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
+    if (discretize) {
+        // discretize the tensor as well
+        Xt = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, torch::kInt32);
+        for (int i = 0; i < features.size(); ++i) {
+            states[features[i]] = std::vector<int>(maxValues[features[i]]);
+            iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
+            Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
+        }
+        states[className] = std::vector<int>(maxValues[className]);
+        iota(begin(states.at(className)), end(states.at(className)), 0);
+    } else {
+        Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
+        for (int i = 0; i < features.size(); ++i) {
+            Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
+        }
     }
-    states[className] = std::vector<int>(maxes[className]);
-    return { Xd, y, features, className, states };
+    yt = torch::tensor(yv, torch::kInt32);
 }
+
diff --git a/tests/TestUtils.h b/tests/TestUtils.h
index f77684f..c3a17c3 100644
--- a/tests/TestUtils.h
+++ b/tests/TestUtils.h
@@ -13,37 +13,60 @@
 #include <tuple>
 #include <ArffFiles.h>
 #include <CPPFImdlp.h>
+#include <folding.hpp>
 
-bool file_exists(const std::string& name);
-std::pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features);
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
-std::tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name);
-std::tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset);
 
 class RawDatasets {
 public:
-    RawDatasets(const std::string& file_name, bool discretize)
-    {
-        // Xt can be either discretized or not
-        tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
-        // Xv is always discretized
-        tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
-        auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
-        dataset = torch::cat({ Xt, yresized }, 0);
-        nSamples = dataset.size(1);
-        weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
-        weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-        classNumStates = discretize ? statest.at(classNamet).size() : 0;
-    }
+    RawDatasets(const std::string& file_name, bool discretize_, int num_lines_ = 0, bool shuffle_ = false);
     torch::Tensor Xt, yt, dataset, weights;
+    torch::Tensor X_train, y_train, X_test, y_test;
     std::vector<std::vector<int>> Xv;
-    std::vector<double> weightsv;
     std::vector<int> yv;
-    std::vector<std::string> featurest, featuresv;
-    map<std::string, std::vector<int>> statest, statesv;
-    std::string classNamet, classNamev;
+    std::vector<double> weightsv;
+    std::vector<std::string> features;
+    std::string className;
+    map<std::string, std::vector<int>> states;
     int nSamples, classNumStates;
     double epsilon = 1e-5;
+    bool discretize;
+    int num_lines = 0;
+    bool shuffle = false;
+private:
+    std::string to_string()
+    {
+        std::string features_ = "";
+        for (auto& f : features) {
+            features_ += f + " ";
+        }
+        std::string states_ = "";
+        for (auto& s : states) {
+            states_ += s.first + " ";
+            for (auto& v : s.second) {
+                states_ += std::to_string(v) + " ";
+            }
+            states_ += "\n";
+        }
+        return "Xt dimensions: " + std::to_string(Xt.size(0)) + " " + std::to_string(Xt.size(1)) + "\n"
+            + "Xv dimensions: " + std::to_string(Xv.size()) + " " + std::to_string(Xv[0].size()) + "\n"
+            + "yt dimensions: " + std::to_string(yt.size(0)) + "\n"
+            + "yv dimensions: " + std::to_string(yv.size()) + "\n"
+            + "X_train dimensions: " + std::to_string(X_train.size(0)) + " " + std::to_string(X_train.size(1)) + "\n"
+            + "X_test dimensions: " + std::to_string(X_test.size(0)) + " " + std::to_string(X_test.size(1)) + "\n"
+            + "y_train dimensions: " + std::to_string(y_train.size(0)) + "\n"
+            + "y_test dimensions: " + std::to_string(y_test.size(0)) + "\n"
+            + "features: " + std::to_string(features.size()) + "\n"
+            + features_ + "\n"
+            + "className: " + className + "\n"
+            + "states: " + std::to_string(states.size()) + "\n"
+            + "nSamples: " + std::to_string(nSamples) + "\n"
+            + "classNumStates: " + std::to_string(classNumStates) + "\n"
+            + "states: " + states_ + "\n";
+    }
+    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
+    void loadDataset(const std::string& name);
 };
 #endif //TEST_UTILS_H
\ No newline at end of file
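
Note (reviewer commentary, not part of the patch): the new ShuffleArffFiles loader keeps X (stored column-major, one vector per attribute) aligned with y by shuffling an index vector and gathering both through it, rather than shuffling rows in place; the fixed seed 173 keeps the REQUIRE counts in the tests deterministic. A self-contained sketch of the same technique, with illustrative names:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <utility>
    #include <vector>

    // Gather the first n rows of a column-major dataset in a reproducible
    // random order. Like the loader in the patch, this only permutes the
    // first n rows; it does not sample from the rest of the file.
    std::pair<std::vector<std::vector<float>>, std::vector<int>>
    takeShuffled(const std::vector<std::vector<float>>& X, const std::vector<int>& y, int n, bool shuffle)
    {
        auto indices = std::vector<int>(n);
        std::iota(indices.begin(), indices.end(), 0);
        if (shuffle) {
            std::mt19937 g{ 173 }; // fixed seed -> reproducible order
            std::shuffle(indices.begin(), indices.end(), g);
        }
        auto XX = std::vector<std::vector<float>>(X.size(), std::vector<float>(n));
        auto yy = std::vector<int>(n);
        for (int i = 0; i < n; i++) {
            yy[i] = y[indices[i]];
            for (size_t j = 0; j < X.size(); j++) {
                XX[j][i] = X[j][indices[i]];
            }
        }
        return { XX, yy };
    }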
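With the refactor, RawDatasets also pre-splits the data with folding::StratifiedKFold(5, yt, 271) and exposes X_train/y_train/X_test/y_test directly, so tests no longer build their own split. A minimal usage sketch follows; the dataset name and row count are taken from the tests above, while the score() call is an assumption about the classifier API, not a line from this patch:

    // First 300 rows of mfeat-factors, shuffled with the fixed seed,
    // discretized, and already split 4/5 train - 1/5 test by the fixture.
    auto raw = RawDatasets("mfeat-factors", true, 300, true);
    auto clf = bayesnet::BoostAODE();
    clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
    auto accuracy = clf.score(raw.X_test, raw.y_test); // assumed score() API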