97 lines
3.9 KiB
C++
97 lines
3.9 KiB
C++
// ***************************************************************
|
|
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
// SPDX-FileType: SOURCE
|
|
// SPDX-License-Identifier: MIT
|
|
// ***************************************************************
|
|
|
|
#ifndef TEST_UTILS_H
|
|
#define TEST_UTILS_H
|
|
#include <torch/torch.h>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <map>
|
|
#include <tuple>
|
|
#include <ArffFiles.hpp>
|
|
#include <fimdlp/CPPFImdlp.h>
|
|
#include <folding.hpp>
|
|
#include <bayesnet/network/Network.h>
|
|
|
|
|
|
class RawDatasets {
|
|
public:
|
|
RawDatasets(const std::string& file_name, bool discretize_, int num_samples_ = 0, bool shuffle_ = false, bool class_last = true, bool debug = false);
|
|
torch::Tensor Xt, yt, dataset, weights;
|
|
torch::Tensor X_train, y_train, X_test, y_test;
|
|
std::vector<vector<int>> Xv;
|
|
std::vector<int> yv;
|
|
std::vector<double> weightsv;
|
|
std::vector<string> features;
|
|
std::string className;
|
|
map<std::string, std::vector<int>> states;
|
|
//catalog holds the mapping between dataset names and their corresponding indices of numeric features (-1) means all are numeric
|
|
//and an empty vector means none are numeric
|
|
map<std::string, std::vector<int>> catalog;
|
|
int nSamples, classNumStates;
|
|
double epsilon = 1e-5;
|
|
bool discretize;
|
|
int num_samples = 0;
|
|
bool shuffle = false;
|
|
bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::ORIGINAL;
|
|
private:
|
|
std::string to_string()
|
|
{
|
|
std::string features_ = "";
|
|
for (auto& f : features) {
|
|
features_ += f + " ";
|
|
}
|
|
std::string states_ = "";
|
|
for (auto& s : states) {
|
|
states_ += s.first + " ";
|
|
for (auto& v : s.second) {
|
|
states_ += std::to_string(v) + " ";
|
|
}
|
|
states_ += "\n";
|
|
}
|
|
return "Xt dimensions: " + std::to_string(Xt.size(0)) + " " + std::to_string(Xt.size(1)) + "\n"
|
|
"Xv dimensions: " + std::to_string(Xv.size()) + " " + std::to_string(Xv[0].size()) + "\n"
|
|
+ "yt dimensions: " + std::to_string(yt.size(0)) + "\n"
|
|
+ "yv dimensions: " + std::to_string(yv.size()) + "\n"
|
|
+ "X_train dimensions: " + std::to_string(X_train.size(0)) + " " + std::to_string(X_train.size(1)) + "\n"
|
|
+ "X_test dimensions: " + std::to_string(X_test.size(0)) + " " + std::to_string(X_test.size(1)) + "\n"
|
|
+ "y_train dimensions: " + std::to_string(y_train.size(0)) + "\n"
|
|
+ "y_test dimensions: " + std::to_string(y_test.size(0)) + "\n"
|
|
+ "features: " + std::to_string(features.size()) + "\n"
|
|
+ features_ + "\n"
|
|
+ "className: " + className + "\n"
|
|
+ "states: " + std::to_string(states.size()) + "\n"
|
|
+ "nSamples: " + std::to_string(nSamples) + "\n"
|
|
+ "classNumStates: " + std::to_string(classNumStates) + "\n"
|
|
+ "states: " + states_ + "\n";
|
|
}
|
|
std::string trim(const std::string& str)
|
|
{
|
|
std::string result = str;
|
|
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
|
|
return !std::isspace(ch);
|
|
}));
|
|
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
|
|
return !std::isspace(ch);
|
|
}).base(), result.end());
|
|
return result;
|
|
}
|
|
std::vector<std::string> split(const std::string& text, char delimiter)
|
|
{
|
|
std::vector<std::string> result;
|
|
std::stringstream ss(text);
|
|
std::string token;
|
|
while (std::getline(ss, token, delimiter)) {
|
|
result.push_back(trim(token));
|
|
}
|
|
return result;
|
|
}
|
|
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X, const std::vector<bool>& is_numeric);
|
|
void loadDataset(const std::string& name, bool class_last);
|
|
map<std::string, std::vector<int>> loadCatalog();
|
|
};
|
|
|
|
#endif //TEST_UTILS_H
|