Refactor TestUtils to allow partial and shuffled dataset loading

Ricardo Montañana Gómez 2024-04-30 02:11:14 +02:00
parent ae469b8146
commit 793b2d3cd5
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
4 changed files with 139 additions and 103 deletions

.vscode/launch.json

@@ -16,7 +16,7 @@
             "name": "test",
             "program": "${workspaceFolder}/build_debug/tests/TestBayesNet",
             "args": [
-                "\"Test Cannot build dataset with wrong data vector\""
+                "\"Bisection Best\""
             ],
             "cwd": "${workspaceFolder}/build_debug/tests"
         },

tests/TestBoostAODE.cc

@@ -140,7 +140,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
 TEST_CASE("Bisection Best", "[BoostAODE]")
 {
     auto clf = bayesnet::BoostAODE();
-    auto raw = RawDatasets("mfeat-factors", true, 500);
+    auto raw = RawDatasets("mfeat-factors", true, 300, true);
     clf.setHyperparameters({
         {"bisection", true},
         {"maxTolerance", 3},
@@ -149,8 +149,8 @@ TEST_CASE("Bisection Best", "[BoostAODE]")
         {"convergence_best", true},
     });
     clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
-    REQUIRE(clf.getNumberOfNodes() == 217);
-    REQUIRE(clf.getNumberOfEdges() == 431);
+    REQUIRE(clf.getNumberOfNodes() == 434);
+    REQUIRE(clf.getNumberOfEdges() == 862);
     REQUIRE(clf.getNotes().size() == 3);
     REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
     REQUIRE(clf.getNotes()[1] == "Used features in train: 16 of 216");
@@ -162,7 +162,7 @@ TEST_CASE("Bisection Best", "[BoostAODE]")
 }
 TEST_CASE("Bisection Best vs Last", "[BoostAODE]")
 {
-    auto raw = RawDatasets("mfeat-factors", true, 1500);
+    auto raw = RawDatasets("mfeat-factors", true, 500);
     auto clf = bayesnet::BoostAODE(true);
     auto hyperparameters = nlohmann::json{
         {"select_features", "IWSS"},

tests/TestUtils.cc

@@ -4,6 +4,7 @@
 // SPDX-License-Identifier: MIT
 // ***************************************************************
+#include <random>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
@@ -15,97 +16,109 @@ public:
     }
 };
-pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
+class ShuffleArffFiles : public ArffFiles {
+public:
+    ShuffleArffFiles(int num_lines = 0, bool shuffle = false) : ArffFiles(), num_lines(num_lines), shuffle(shuffle) {}
+    void load(const std::string& file_name, bool class_last = true)
+    {
+        ArffFiles::load(file_name, class_last);
+        if (num_lines > 0) {
+            if (num_lines > getY().size()) {
+                throw std::invalid_argument("num_lines must be less than the number of lines in the file");
+            }
+            auto indices = std::vector<int>(num_lines);
+            std::iota(indices.begin(), indices.end(), 0);
+            if (shuffle) {
+                std::mt19937 g{ 173 };
+                std::shuffle(indices.begin(), indices.end(), g);
+            }
+            auto XX = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(num_lines));
+            auto yy = std::vector<int>(num_lines);
+            for (int i = 0; i < num_lines; i++) {
+                yy[i] = getY()[indices[i]];
+                for (int j = 0; j < attributes.size(); j++) {
+                    XX[j][i] = X[j][indices[i]];
+                }
+            }
+            X = XX;
+            y = yy;
+        }
+    }
+private:
+    int num_lines;
+    bool shuffle;
+};
+RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_lines_, bool shuffle_)
 {
-    std::vector<mdlp::labels_t> Xd;
+    num_lines = num_lines_;
+    shuffle = shuffle_;
+    discretize = discretize_;
+    // Xt can be either discretized or not
+    // Xv is always discretized
+    loadDataset(file_name);
+    auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
+    dataset = torch::cat({ Xt, yresized }, 0);
+    nSamples = dataset.size(1);
+    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
+    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
+    classNumStates = discretize ? states.at(className).size() : 0;
+    auto fold = folding::StratifiedKFold(5, yt, 271);
+    auto [train, test] = fold.getFold(0);
+    auto train_t = torch::tensor(train);
+    auto test_t = torch::tensor(test);
+    // Get train and validation sets
+    X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), train_t });
+    y_train = dataset.index({ -1, train_t });
+    X_test = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), test_t });
+    y_test = dataset.index({ -1, test_t });
+    std::cout << to_string();
+}
+map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
+{
     map<std::string, int> maxes;
     auto fimdlp = mdlp::CPPFImdlp();
     for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
+        fimdlp.fit(X[i], yv);
         mdlp::labels_t& xd = fimdlp.transform(X[i]);
         maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
-        Xd.push_back(xd);
+        Xv.push_back(xd);
     }
-    return { Xd, maxes };
+    return maxes;
 }
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+void RawDatasets::loadDataset(const std::string& name)
 {
-    std::vector<mdlp::labels_t> Xd;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        Xd.push_back(xd);
-    }
-    return Xd;
-}
-bool file_exists(const std::string& name)
-{
-    if (FILE* file = fopen(name.c_str(), "r")) {
-        fclose(file);
-        return true;
-    } else {
-        return false;
-    }
-}
-tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", class_last);
+    auto handler = ShuffleArffFiles(num_lines, shuffle);
+    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", true);
     // Get Dataset X, y
     std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
+    yv = handler.getY();
     // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    torch::Tensor Xd;
-    auto states = map<std::string, std::vector<int>>();
-    if (discretize_dataset) {
-        auto Xr = discretizeDataset(X, y);
-        Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-            Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
-        }
-        states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    } else {
-        Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
-        for (int i = 0; i < features.size(); ++i) {
-            Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
-        }
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff");
-    // Get Dataset X, y
-    std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
+    className = handler.getClassName();
     auto attributes = handler.getAttributes();
     transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    // Discretize Dataset
-    std::vector<mdlp::labels_t> Xd;
-    map<std::string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<std::string, std::vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = std::vector<int>(maxes[feature]);
+    auto maxValues = discretizeDataset(X);
+    maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
+    if (discretize) {
+        // discretize the tensor as well
+        Xt = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, torch::kInt32);
+        for (int i = 0; i < features.size(); ++i) {
+            states[features[i]] = std::vector<int>(maxValues[features[i]]);
+            iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
+            Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
+        }
+        states[className] = std::vector<int>(maxValues[className]);
+        iota(begin(states.at(className)), end(states.at(className)), 0);
+    } else {
+        Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
+        for (int i = 0; i < features.size(); ++i) {
+            Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
+        }
     }
-    states[className] = std::vector<int>(maxes[className]);
-    return { Xd, y, features, className, states };
+    yt = torch::tensor(yv, torch::kInt32);
 }
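
ShuffleArffFiles reorders samples through an index vector so the feature matrix (one row per feature, one column per sample) and the labels stay aligned. A self-contained sketch of the same technique with hypothetical names, outside the ArffFiles hierarchy:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    // Keep the first n samples of X (features x samples) and y, optionally
    // shuffling them; the fixed seed gives every test run the same order.
    void subsample(std::vector<std::vector<float>>& X, std::vector<int>& y, int n, bool shuffle)
    {
        std::vector<int> indices(n);
        std::iota(indices.begin(), indices.end(), 0);
        if (shuffle) {
            std::mt19937 g{ 173 }; // same constant seed as ShuffleArffFiles
            std::shuffle(indices.begin(), indices.end(), g);
        }
        auto XX = std::vector<std::vector<float>>(X.size(), std::vector<float>(n));
        auto yy = std::vector<int>(n);
        for (int i = 0; i < n; i++) {
            yy[i] = y[indices[i]];
            for (size_t j = 0; j < X.size(); j++) {
                XX[j][i] = X[j][indices[i]];
            }
        }
        X = std::move(XX);
        y = std::move(yy);
    }

Note that only the first num_lines indices are generated and shuffled, so the subset is always the file's first num_lines rows; shuffling changes their order, not which rows are selected.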

tests/TestUtils.h

@@ -13,37 +13,60 @@
 #include <tuple>
 #include <ArffFiles.h>
 #include <CPPFImdlp.h>
+#include <folding.hpp>
-bool file_exists(const std::string& name);
-std::pair<vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<string> features);
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
-std::tuple<vector<vector<int>>, std::vector<int>, std::vector<string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name);
-std::tuple<torch::Tensor, torch::Tensor, std::vector<string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset);
 class RawDatasets {
 public:
-    RawDatasets(const std::string& file_name, bool discretize)
-    {
-        // Xt can be either discretized or not
-        tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
-        // Xv is always discretized
-        tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
-        auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
-        dataset = torch::cat({ Xt, yresized }, 0);
-        nSamples = dataset.size(1);
-        weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
-        weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-        classNumStates = discretize ? statest.at(classNamet).size() : 0;
-    }
+    RawDatasets(const std::string& file_name, bool discretize_, int num_lines_ = 0, bool shuffle_ = false);
     torch::Tensor Xt, yt, dataset, weights;
+    torch::Tensor X_train, y_train, X_test, y_test;
     std::vector<vector<int>> Xv;
-    std::vector<double> weightsv;
     std::vector<int> yv;
-    std::vector<string> featurest, featuresv;
-    map<std::string, std::vector<int>> statest, statesv;
-    std::string classNamet, classNamev;
+    std::vector<double> weightsv;
+    std::vector<string> features;
+    std::string className;
+    map<std::string, std::vector<int>> states;
     int nSamples, classNumStates;
     double epsilon = 1e-5;
+    bool discretize;
+    int num_lines = 0;
+    bool shuffle = false;
+private:
+    std::string to_string()
+    {
+        std::string features_ = "";
+        for (auto& f : features) {
+            features_ += f + " ";
+        }
+        std::string states_ = "";
+        for (auto& s : states) {
+            states_ += s.first + " ";
+            for (auto& v : s.second) {
+                states_ += std::to_string(v) + " ";
+            }
+            states_ += "\n";
+        }
+        return "Xt dimensions: " + std::to_string(Xt.size(0)) + " " + std::to_string(Xt.size(1)) + "\n"
+            "Xv dimensions: " + std::to_string(Xv.size()) + " " + std::to_string(Xv[0].size()) + "\n"
+            + "yt dimensions: " + std::to_string(yt.size(0)) + "\n"
+            + "yv dimensions: " + std::to_string(yv.size()) + "\n"
+            + "X_train dimensions: " + std::to_string(X_train.size(0)) + " " + std::to_string(X_train.size(1)) + "\n"
+            + "X_test dimensions: " + std::to_string(X_test.size(0)) + " " + std::to_string(X_test.size(1)) + "\n"
+            + "y_train dimensions: " + std::to_string(y_train.size(0)) + "\n"
+            + "y_test dimensions: " + std::to_string(y_test.size(0)) + "\n"
+            + "features: " + std::to_string(features.size()) + "\n"
+            + features_ + "\n"
+            + "className: " + className + "\n"
+            + "states: " + std::to_string(states.size()) + "\n"
+            + "nSamples: " + std::to_string(nSamples) + "\n"
+            + "classNumStates: " + std::to_string(classNumStates) + "\n"
+            + "states: " + states_ + "\n";
+    }
+    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
+    void loadDataset(const std::string& name);
 };
 #endif //TEST_UTILS_H
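
Because both random sources are fixed (seed 173 for the shuffle, 271 for the stratified fold), two loads of the same file must produce identical tensors. A hypothetical Catch2 check, not part of this commit, that would pin that property down:

    auto a = RawDatasets("mfeat-factors", true, 300, true);
    auto b = RawDatasets("mfeat-factors", true, 300, true);
    REQUIRE(torch::equal(a.Xt, b.Xt));           // same samples, same order
    REQUIRE(torch::equal(a.y_train, b.y_train)); // same stratified split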