2024-04-11 16:02:49 +00:00
|
|
|
// ***************************************************************
|
|
|
|
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
|
|
// SPDX-FileType: SOURCE
|
|
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
// ***************************************************************
|
|
|
|
|
2024-04-30 00:11:14 +00:00
|
|
|
#include <random>
|
2023-10-04 21:19:23 +00:00
|
|
|
#include "TestUtils.h"
|
2024-03-08 21:20:54 +00:00
|
|
|
#include "bayesnet/config.h"
|
2023-10-04 21:19:23 +00:00
|
|
|
|
|
|
|
class Paths {
|
|
|
|
public:
|
2023-11-08 17:45:35 +00:00
|
|
|
static std::string datasets()
|
2023-10-04 21:19:23 +00:00
|
|
|
{
|
2024-01-07 18:58:22 +00:00
|
|
|
return { data_path.begin(), data_path.end() };
|
2023-10-04 21:19:23 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2024-04-30 00:11:14 +00:00
|
|
|
class ShuffleArffFiles : public ArffFiles {
|
|
|
|
public:
|
2024-04-30 09:02:23 +00:00
|
|
|
ShuffleArffFiles(int num_samples = 0, bool shuffle = false) : ArffFiles(), num_samples(num_samples), shuffle(shuffle) {}
|
2024-04-30 00:11:14 +00:00
|
|
|
void load(const std::string& file_name, bool class_last = true)
|
|
|
|
{
|
|
|
|
ArffFiles::load(file_name, class_last);
|
2024-04-30 09:02:23 +00:00
|
|
|
if (num_samples > 0) {
|
|
|
|
if (num_samples > getY().size()) {
|
2024-04-30 00:11:14 +00:00
|
|
|
throw std::invalid_argument("num_lines must be less than the number of lines in the file");
|
|
|
|
}
|
2024-04-30 09:02:23 +00:00
|
|
|
auto indices = std::vector<int>(num_samples);
|
2024-04-30 00:11:14 +00:00
|
|
|
std::iota(indices.begin(), indices.end(), 0);
|
|
|
|
if (shuffle) {
|
|
|
|
std::mt19937 g{ 173 };
|
|
|
|
std::shuffle(indices.begin(), indices.end(), g);
|
|
|
|
}
|
2024-04-30 09:02:23 +00:00
|
|
|
auto XX = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(num_samples));
|
|
|
|
auto yy = std::vector<int>(num_samples);
|
|
|
|
for (int i = 0; i < num_samples; i++) {
|
2024-04-30 00:11:14 +00:00
|
|
|
yy[i] = getY()[indices[i]];
|
|
|
|
for (int j = 0; j < attributes.size(); j++) {
|
|
|
|
XX[j][i] = X[j][indices[i]];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
X = XX;
|
|
|
|
y = yy;
|
|
|
|
}
|
2023-10-04 21:19:23 +00:00
|
|
|
}
|
2024-04-30 00:11:14 +00:00
|
|
|
private:
|
2024-04-30 09:02:23 +00:00
|
|
|
int num_samples;
|
2024-04-30 00:11:14 +00:00
|
|
|
bool shuffle;
|
|
|
|
};
|
|
|
|
|
2024-04-30 09:02:23 +00:00
|
|
|
/// Loads the dataset `file_name` and prepares tensors, weights and a
/// train/test split.
/// @param file_name    dataset name, forwarded to loadDataset.
/// @param discretize_  stored in `discretize`; decides whether Xt holds
///                     discretized or raw values (Xv is always discretized).
/// @param num_samples_ stored in `num_samples` and used by loadDataset.
/// @param shuffle_     stored in `shuffle` and used by loadDataset.
/// @param class_last   forwarded to loadDataset.
/// @param debug        when true, the result of to_string() is written to std::cout.
RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
{
    // loadDataset reads these members, so they must be set before the call.
    discretize = discretize_;
    shuffle = shuffle_;
    num_samples = num_samples_;
    loadDataset(file_name, class_last);

    // Turn yt into a single row and stack it below Xt: the label ends up as
    // the last row of `dataset`, one column per sample.
    const auto labelRow = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
    dataset = torch::cat({ Xt, labelRow }, 0);
    nSamples = dataset.size(1);

    // Uniform weights, both as tensor and as std::vector.
    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);

    if (discretize) {
        classNumStates = states.at(className).size();
    } else {
        classNumStates = 0;
    }

    // Take fold 0 of a 5-fold stratified split (271 is presumably the seed -- verify
    // against folding::StratifiedKFold).
    auto splitter = folding::StratifiedKFold(5, yt, 271);
    auto [trainIdx, testIdx] = splitter.getFold(0);
    const auto trainColumns = torch::tensor(trainIdx);
    const auto testColumns = torch::tensor(testIdx);

    // Every row except the last one is a feature; the last row is the label.
    const auto featureRows = torch::indexing::Slice(0, dataset.size(0) - 1);
    X_train = dataset.index({ featureRows, trainColumns });
    y_train = dataset.index({ -1, trainColumns });
    X_test = dataset.index({ featureRows, testColumns });
    y_test = dataset.index({ -1, testColumns });

    if (debug) {
        std::cout << to_string();
    }
}
|
|
|
|
|
2024-04-30 00:11:14 +00:00
|
|
|
/// Discretizes every row of `X` against the labels in `yv`.
/// For each entry of X the discretizer is fitted and applied, the resulting
/// labels are appended to Xv, and the number of distinct bins (largest label
/// plus one) is recorded under the matching name in `features`.
/// @param X one vector of samples per feature, in the same order as `features`.
/// @return map from feature name to (max discretized value + 1).
map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
{
    map<std::string, int> binCounts;
    auto discretizer = mdlp::CPPFImdlp();
    int column = 0;
    for (auto& samples : X) {
        discretizer.fit(samples, yv);
        // transform hands back a reference; it is copied into Xv below.
        mdlp::labels_t& discretized = discretizer.transform(samples);
        const auto largest = std::max_element(discretized.begin(), discretized.end());
        binCounts[features[column]] = *largest + 1;
        Xv.push_back(discretized);
        ++column;
    }
    return binCounts;
}
|
|
|
|
|
2024-04-30 09:02:23 +00:00
|
|
|
void RawDatasets::loadDataset(const std::string& name, bool class_last)
|
2023-10-04 21:19:23 +00:00
|
|
|
{
|
2024-04-30 09:02:23 +00:00
|
|
|
auto handler = ShuffleArffFiles(num_samples, shuffle);
|
|
|
|
handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", class_last);
|
2023-10-04 21:19:23 +00:00
|
|
|
// Get Dataset X, y
|
2023-11-08 17:45:35 +00:00
|
|
|
std::vector<mdlp::samples_t>& X = handler.getX();
|
2024-04-30 00:11:14 +00:00
|
|
|
yv = handler.getY();
|
2023-10-04 21:19:23 +00:00
|
|
|
// Get className & Features
|
2024-04-30 00:11:14 +00:00
|
|
|
className = handler.getClassName();
|
2023-10-04 21:19:23 +00:00
|
|
|
auto attributes = handler.getAttributes();
|
|
|
|
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
|
2024-04-30 00:11:14 +00:00
|
|
|
// Discretize Dataset
|
|
|
|
auto maxValues = discretizeDataset(X);
|
|
|
|
maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
|
|
|
|
if (discretize) {
|
|
|
|
// discretize the tensor as well
|
|
|
|
Xt = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, torch::kInt32);
|
2023-10-04 21:19:23 +00:00
|
|
|
for (int i = 0; i < features.size(); ++i) {
|
2024-04-30 00:11:14 +00:00
|
|
|
states[features[i]] = std::vector<int>(maxValues[features[i]]);
|
|
|
|
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
|
|
|
|
Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
|
2023-10-04 21:19:23 +00:00
|
|
|
}
|
2024-04-30 00:11:14 +00:00
|
|
|
states[className] = std::vector<int>(maxValues[className]);
|
2023-10-04 21:19:23 +00:00
|
|
|
iota(begin(states.at(className)), end(states.at(className)), 0);
|
|
|
|
} else {
|
2024-04-30 00:11:14 +00:00
|
|
|
Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
|
2023-10-04 21:19:23 +00:00
|
|
|
for (int i = 0; i < features.size(); ++i) {
|
2024-04-30 00:11:14 +00:00
|
|
|
Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
|
2023-10-04 21:19:23 +00:00
|
|
|
}
|
|
|
|
}
|
2024-04-30 00:11:14 +00:00
|
|
|
yt = torch::tensor(yv, torch::kInt32);
|
2023-10-04 21:19:23 +00:00
|
|
|
}
|
|
|
|
|