// BayesNet/sample/sample.cc — sample driver exercising the BayesNet classifiers
// C++ standard library
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
// Third-party libraries
#include <torch/torch.h>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
// Project headers
#include "ArffFiles.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "Folding.h"
#include "Models.h"
#include "modelRegister.h"

// Default relative location of the sample .arff data files.
const std::string PATH = "../../data/";

pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
2023-06-30 00:46:06 +00:00
{
2023-11-08 17:45:35 +00:00
std::vector<mdlp::labels_t>Xd;
map<std::string, int> maxes;
2023-06-30 19:24:12 +00:00
2023-06-30 00:46:06 +00:00
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
2023-06-30 19:24:12 +00:00
mdlp::labels_t& xd = fimdlp.transform(X[i]);
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
2023-06-30 19:24:12 +00:00
Xd.push_back(xd);
2023-06-30 00:46:06 +00:00
}
return { Xd, maxes };
2023-06-30 00:46:06 +00:00
}
2023-12-04 18:12:44 +00:00
// Check whether a file exists, i.e. can be opened for reading.
// RAII via std::ifstream: the stream closes itself, so there is no
// fopen/fclose pair to leak on an early return.
bool file_exists(const std::string& name)
{
    std::ifstream file(name);
    return file.good();
}
pair<std::vector<std::vector<int>>, std::vector<int>> extract_indices(std::vector<int> indices, std::vector<std::vector<int>> X, std::vector<int> y)
2023-07-25 23:39:01 +00:00
{
2023-11-08 17:45:35 +00:00
std::vector<std::vector<int>> Xr; // nxm
std::vector<int> yr;
2023-07-25 23:39:01 +00:00
for (int col = 0; col < X.size(); ++col) {
2023-11-08 17:45:35 +00:00
Xr.push_back(std::vector<int>());
2023-07-25 23:39:01 +00:00
}
for (auto index : indices) {
for (int col = 0; col < X.size(); ++col) {
Xr[col].push_back(X[col][index]);
}
yr.push_back(y[index]);
}
return { Xr, yr };
}
int main(int argc, char** argv)
{
2023-11-08 17:45:35 +00:00
map<std::string, bool> datasets = {
2023-09-21 21:04:11 +00:00
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
2023-11-08 17:45:35 +00:00
auto valid_datasets = std::vector<std::string>();
2023-09-21 21:04:11 +00:00
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
2023-11-08 17:45:35 +00:00
[](const pair<std::string, bool>& pair) { return pair.first; });
2023-09-21 21:04:11 +00:00
argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
2023-12-04 18:12:44 +00:00
.action([valid_datasets](const std::string& value) {
2023-09-21 21:04:11 +00:00
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
program.add_argument("-p", "--path")
.help(" folder where the data files are located, default")
2023-11-08 17:45:35 +00:00
.default_value(std::string{ PATH }
2023-09-21 21:04:11 +00:00
);
program.add_argument("-m", "--model")
2023-12-04 18:12:44 +00:00
.help("Model to use " + platform::Models::instance()->tostring())
.action([](const std::string& value) {
2023-11-08 17:45:35 +00:00
static const std::vector<std::string> choices = platform::Models::instance()->getNames();
2023-09-21 21:04:11 +00:00
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
2023-12-04 18:12:44 +00:00
throw runtime_error("Model must be one of " + platform::Models::instance()->tostring());
2023-09-21 21:04:11 +00:00
}
);
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
2023-12-04 18:12:44 +00:00
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) {
2023-09-21 21:04:11 +00:00
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt;
2023-11-08 17:45:35 +00:00
std::string model_name, file_name, path, complete_file_name;
2023-09-21 21:04:11 +00:00
int nFolds, seed;
try {
program.parse_args(argc, argv);
2023-11-08 17:45:35 +00:00
file_name = program.get<std::string>("dataset");
path = program.get<std::string>("path");
model_name = program.get<std::string>("model");
2023-09-21 21:04:11 +00:00
complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors");
nFolds = program.get<int>("folds");
seed = program.get<int>("seed");
dump_cpt = program.get<bool>("dumpcpt");
class_last = datasets[file_name];
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
}
catch (const exception& err) {
2023-11-08 17:45:35 +00:00
cerr << err.what() << std::endl;
2023-09-21 21:04:11 +00:00
cerr << program;
exit(1);
}
/*
* Begin Processing
*/
2023-09-21 21:04:11 +00:00
auto handler = ArffFiles();
handler.load(complete_file_name, class_last);
// Get Dataset X, y
2023-11-08 17:45:35 +00:00
std::vector<mdlp::samples_t>& X = handler.getX();
2023-09-21 21:04:11 +00:00
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
2023-11-08 17:45:35 +00:00
std::vector<std::string> features;
2023-09-21 21:04:11 +00:00
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features),
2023-11-08 17:45:35 +00:00
[](const pair<std::string, std::string>& item) { return item.first; });
2023-09-21 21:04:11 +00:00
// Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
2023-11-08 17:45:35 +00:00
map<std::string, std::vector<int>> states;
2023-09-21 21:04:11 +00:00
for (auto feature : features) {
2023-11-08 17:45:35 +00:00
states[feature] = std::vector<int>(maxes[feature]);
2023-09-21 21:04:11 +00:00
}
2023-11-08 17:45:35 +00:00
states[className] = std::vector<int>(maxes[className]);
2023-09-21 21:04:11 +00:00
auto clf = platform::Models::instance()->create(model_name);
clf->fit(Xd, y, features, className, states);
if (dump_cpt) {
2023-11-08 17:45:35 +00:00
std::cout << "--- CPT Tables ---" << std::endl;
2023-09-21 21:04:11 +00:00
clf->dump_cpt();
}
auto lines = clf->show();
for (auto line : lines) {
2023-11-08 17:45:35 +00:00
std::cout << line << std::endl;
2023-09-21 21:04:11 +00:00
}
2023-11-08 17:45:35 +00:00
std::cout << "--- Topological Order ---" << std::endl;
2023-09-21 21:04:11 +00:00
auto order = clf->topological_order();
for (auto name : order) {
2023-11-08 17:45:35 +00:00
std::cout << name << ", ";
2023-09-21 21:04:11 +00:00
}
2023-11-08 17:45:35 +00:00
std::cout << "end." << std::endl;
2023-09-21 21:04:11 +00:00
auto score = clf->score(Xd, y);
2023-11-08 17:45:35 +00:00
std::cout << "Score: " << score << std::endl;
2023-09-21 21:04:11 +00:00
auto graph = clf->graph();
auto dot_file = model_name + "_" + file_name;
ofstream file(dot_file + ".dot");
file << graph;
file.close();
2023-11-08 17:45:35 +00:00
std::cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << std::endl;
std::cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << std::endl;
2023-12-04 18:12:44 +00:00
std::string stratified_string = stratified ? " Stratified" : "";
std::cout << nFolds << " Folds" << stratified_string << " Cross validation" << std::endl;
2023-11-08 17:45:35 +00:00
std::cout << "==========================================" << std::endl;
2023-09-21 21:04:11 +00:00
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
torch::Tensor yt = torch::tensor(y, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
}
float total_score = 0, total_score_train = 0, score_train, score_test;
platform::Fold* fold;
if (stratified)
fold = new platform::StratifiedKFold(nFolds, y, seed);
else
fold = new platform::KFold(nFolds, y.size(), seed);
for (auto i = 0; i < nFolds; ++i) {
auto [train, test] = fold->getFold(i);
2023-11-08 17:45:35 +00:00
std::cout << "Fold: " << i + 1 << std::endl;
2023-09-21 21:04:11 +00:00
if (tensors) {
auto ttrain = torch::tensor(train, torch::kInt64);
auto ttest = torch::tensor(test, torch::kInt64);
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
torch::Tensor ytraint = yt.index({ ttrain });
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
torch::Tensor ytestt = yt.index({ ttest });
clf->fit(Xtraint, ytraint, features, className, states);
auto temp = clf->predict(Xtraint);
score_train = clf->score(Xtraint, ytraint);
score_test = clf->score(Xtestt, ytestt);
} else {
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
auto [Xtest, ytest] = extract_indices(test, Xd, y);
clf->fit(Xtrain, ytrain, features, className, states);
score_train = clf->score(Xtrain, ytrain);
score_test = clf->score(Xtest, ytest);
}
if (dump_cpt) {
2023-11-08 17:45:35 +00:00
std::cout << "--- CPT Tables ---" << std::endl;
2023-09-21 21:04:11 +00:00
clf->dump_cpt();
}
total_score_train += score_train;
total_score += score_test;
2023-11-08 17:45:35 +00:00
std::cout << "Score Train: " << score_train << std::endl;
std::cout << "Score Test : " << score_test << std::endl;
std::cout << "-------------------------------------------------------------------------------" << std::endl;
2023-09-21 21:04:11 +00:00
}
2023-11-08 17:45:35 +00:00
std::cout << "**********************************************************************************" << std::endl;
std::cout << "Average Score Train: " << total_score_train / nFolds << std::endl;
std::cout << "Average Score Test : " << total_score / nFolds << std::endl;return 0;
2023-06-29 20:00:41 +00:00
}