From 7bfafe555f7ebc522cef4fa148a144979cc5edb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Thu, 21 Sep 2023 23:04:11 +0200 Subject: [PATCH] Begin BestResults build --- Makefile | 7 +- sample/sample.cc | 344 +++++++++++++++++------------------ src/Platform/BestResults.cc | 68 +++++++ src/Platform/BestResults.h | 20 ++ src/Platform/CMakeLists.txt | 4 +- src/Platform/best.cc | 39 ++-- src/Platform/platformUtils.h | 1 - 7 files changed, 285 insertions(+), 198 deletions(-) create mode 100644 src/Platform/BestResults.cc create mode 100644 src/Platform/BestResults.h diff --git a/Makefile b/Makefile index 099cd8a..c782d1d 100644 --- a/Makefile +++ b/Makefile @@ -19,13 +19,14 @@ copy: ## Copy binary files to selected folder @cp build/src/Platform/main $(dest) @cp build/src/Platform/list $(dest) @cp build/src/Platform/manage $(dest) + @cp build/src/Platform/best $(dest) @echo ">>> Done" dependency: ## Create a dependency graph diagram of the project (build/dependency.png) cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png build: ## Build the main and BayesNetSample - cmake --build build -t main -t BayesNetSample -t manage -t list -j 32 + cmake --build build -t main -t BayesNetSample -t manage -t list -t best -j 32 clean: ## Clean the debug info @echo ">>> Cleaning Debug BayesNet ..."; @@ -40,7 +41,7 @@ debug: ## Build a debug version of the project @if [ -d ./build ]; then rm -rf ./build; fi @mkdir build; cmake -S . -B build -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON; \ - cmake --build build -t main -t BayesNetSample -t manage -t list unit_tests -j 32; + cmake --build build -t main -t BayesNetSample -t manage -t list -t best -t unit_tests -j 32; @echo ">>> Done"; release: ## Build a Release version of the project @@ -48,7 +49,7 @@ release: ## Build a Release version of the project @if [ -d ./build ]; then rm -rf ./build; fi @mkdir build; cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \ - cmake --build build -t main -t BayesNetSample -t manage -t list -j 32; + cmake --build build -t main -t BayesNetSample -t manage -t list -t best -j 32; @echo ">>> Done"; test: ## Run tests diff --git a/sample/sample.cc b/sample/sample.cc index 7e9d569..89c491c 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -104,180 +104,180 @@ int main(int argc, char** argv) for (int i = 0; i < 10; i++) { cout << weights_.index({ i }).item() << endl; } - // map datasets = { - // {"diabetes", true}, - // {"ecoli", true}, - // {"glass", true}, - // {"iris", true}, - // {"kdd_JapaneseVowels", false}, - // {"letter", true}, - // {"liver-disorders", true}, - // {"mfeat-factors", true}, - // }; - // auto valid_datasets = vector(); - // transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), - // [](const pair& pair) { return pair.first; }); - // argparse::ArgumentParser program("BayesNetSample"); - // program.add_argument("-d", "--dataset") - // .help("Dataset file name") - // .action([valid_datasets](const std::string& value) { - // if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { - // return value; - // } - // throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); - // } - // ); - // program.add_argument("-p", "--path") - // .help(" folder where the data files are located, default") - // .default_value(string{ PATH } - // ); - // program.add_argument("-m", "--model") - // .help("Model to use " + platform::Models::instance()->toString()) - // .action([](const std::string& value) { - // static const vector choices = platform::Models::instance()->getNames(); - // if (find(choices.begin(), choices.end(), value) != choices.end()) { - // return value; - // } - // throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); - // } - // ); - // program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); - // program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); - // program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); - // program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true); - // program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { - // try { - // auto k = stoi(value); - // if (k < 2) { - // throw runtime_error("Number of folds must be greater than 1"); - // } - // return k; - // } - // catch (const runtime_error& err) { - // throw runtime_error(err.what()); - // } - // catch (...) { - // throw runtime_error("Number of folds must be an integer"); - // }}); - // program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); - // bool class_last, stratified, tensors, dump_cpt; - // string model_name, file_name, path, complete_file_name; - // int nFolds, seed; - // try { - // program.parse_args(argc, argv); - // file_name = program.get("dataset"); - // path = program.get("path"); - // model_name = program.get("model"); - // complete_file_name = path + file_name + ".arff"; - // stratified = program.get("stratified"); - // tensors = program.get("tensors"); - // nFolds = program.get("folds"); - // seed = program.get("seed"); - // dump_cpt = program.get("dumpcpt"); - // class_last = datasets[file_name]; - // if (!file_exists(complete_file_name)) { - // throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); - // } - // } - // catch (const exception& err) { - // cerr << err.what() << endl; - // cerr << program; - // exit(1); - // } + map datasets = { + {"diabetes", true}, + {"ecoli", true}, + {"glass", true}, + {"iris", true}, + {"kdd_JapaneseVowels", false}, + {"letter", true}, + {"liver-disorders", true}, + {"mfeat-factors", true}, + }; + auto valid_datasets = vector(); + transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), + [](const pair& pair) { return pair.first; }); + argparse::ArgumentParser program("BayesNetSample"); + program.add_argument("-d", "--dataset") + .help("Dataset file name") + .action([valid_datasets](const std::string& value) { + if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { + return value; + } + throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); + } + ); + program.add_argument("-p", "--path") + .help(" folder where the data files are located, default") + .default_value(string{ PATH } + ); + program.add_argument("-m", "--model") + .help("Model to use " + platform::Models::instance()->toString()) + .action([](const std::string& value) { + static const vector choices = platform::Models::instance()->getNames(); + if (find(choices.begin(), choices.end(), value) != choices.end()) { + return value; + } + throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); + } + ); + program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); + program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); + program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); + program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true); + program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { + try { + auto k = stoi(value); + if (k < 2) { + throw runtime_error("Number of folds must be greater than 1"); + } + return k; + } + catch (const runtime_error& err) { + throw runtime_error(err.what()); + } + catch (...) { + throw runtime_error("Number of folds must be an integer"); + }}); + program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); + bool class_last, stratified, tensors, dump_cpt; + string model_name, file_name, path, complete_file_name; + int nFolds, seed; + try { + program.parse_args(argc, argv); + file_name = program.get("dataset"); + path = program.get("path"); + model_name = program.get("model"); + complete_file_name = path + file_name + ".arff"; + stratified = program.get("stratified"); + tensors = program.get("tensors"); + nFolds = program.get("folds"); + seed = program.get("seed"); + dump_cpt = program.get("dumpcpt"); + class_last = datasets[file_name]; + if (!file_exists(complete_file_name)) { + throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); + } + } + catch (const exception& err) { + cerr << err.what() << endl; + cerr << program; + exit(1); + } /* * Begin Processing */ - // auto handler = ArffFiles(); - // handler.load(complete_file_name, class_last); - // // Get Dataset X, y - // vector& X = handler.getX(); - // mdlp::labels_t& y = handler.getY(); - // // Get className & Features - // auto className = handler.getClassName(); - // vector features; - // auto attributes = handler.getAttributes(); - // transform(attributes.begin(), attributes.end(), back_inserter(features), - // [](const pair& item) { return item.first; }); - // // Discretize Dataset - // auto [Xd, maxes] = discretize(X, y, features); - // maxes[className] = *max_element(y.begin(), y.end()) + 1; - // map> states; - // for (auto feature : features) { - // states[feature] = vector(maxes[feature]); - // } - // states[className] = vector(maxes[className]); - // auto clf = platform::Models::instance()->create(model_name); - // clf->fit(Xd, y, features, className, states); - // if (dump_cpt) { - // cout << "--- CPT Tables ---" << endl; - // clf->dump_cpt(); - // } - // auto lines = clf->show(); - // for (auto line : lines) { - // cout << line << endl; - // } - // cout << "--- Topological Order ---" << endl; - // auto order = clf->topological_order(); - // for (auto name : order) { - // cout << name << ", "; - // } - // cout << "end." << endl; - // auto score = clf->score(Xd, y); - // cout << "Score: " << score << endl; - // auto graph = clf->graph(); - // auto dot_file = model_name + "_" + file_name; - // ofstream file(dot_file + ".dot"); - // file << graph; - // file.close(); - // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; - // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; - // string stratified_string = stratified ? " Stratified" : ""; - // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; - // cout << "==========================================" << endl; - // torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - // torch::Tensor yt = torch::tensor(y, torch::kInt32); - // for (int i = 0; i < features.size(); ++i) { - // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - // } - // float total_score = 0, total_score_train = 0, score_train, score_test; - // platform::Fold* fold; - // if (stratified) - // fold = new platform::StratifiedKFold(nFolds, y, seed); - // else - // fold = new platform::KFold(nFolds, y.size(), seed); - // for (auto i = 0; i < nFolds; ++i) { - // auto [train, test] = fold->getFold(i); - // cout << "Fold: " << i + 1 << endl; - // if (tensors) { - // auto ttrain = torch::tensor(train, torch::kInt64); - // auto ttest = torch::tensor(test, torch::kInt64); - // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - // torch::Tensor ytraint = yt.index({ ttrain }); - // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - // torch::Tensor ytestt = yt.index({ ttest }); - // clf->fit(Xtraint, ytraint, features, className, states); - // auto temp = clf->predict(Xtraint); - // score_train = clf->score(Xtraint, ytraint); - // score_test = clf->score(Xtestt, ytestt); - // } else { - // auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - // auto [Xtest, ytest] = extract_indices(test, Xd, y); - // clf->fit(Xtrain, ytrain, features, className, states); - // score_train = clf->score(Xtrain, ytrain); - // score_test = clf->score(Xtest, ytest); - // } - // if (dump_cpt) { - // cout << "--- CPT Tables ---" << endl; - // clf->dump_cpt(); - // } - // total_score_train += score_train; - // total_score += score_test; - // cout << "Score Train: " << score_train << endl; - // cout << "Score Test : " << score_test << endl; - // cout << "-------------------------------------------------------------------------------" << endl; - // } - // cout << "**********************************************************************************" << endl; - // cout << "Average Score Train: " << total_score_train / nFolds << endl; - // cout << "Average Score Test : " << total_score / nFolds << endl;return 0; + auto handler = ArffFiles(); + handler.load(complete_file_name, class_last); + // Get Dataset X, y + vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features + auto className = handler.getClassName(); + vector features; + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), + [](const pair& item) { return item.first; }); + // Discretize Dataset + auto [Xd, maxes] = discretize(X, y, features); + maxes[className] = *max_element(y.begin(), y.end()) + 1; + map> states; + for (auto feature : features) { + states[feature] = vector(maxes[feature]); + } + states[className] = vector(maxes[className]); + auto clf = platform::Models::instance()->create(model_name); + clf->fit(Xd, y, features, className, states); + if (dump_cpt) { + cout << "--- CPT Tables ---" << endl; + clf->dump_cpt(); + } + auto lines = clf->show(); + for (auto line : lines) { + cout << line << endl; + } + cout << "--- Topological Order ---" << endl; + auto order = clf->topological_order(); + for (auto name : order) { + cout << name << ", "; + } + cout << "end." << endl; + auto score = clf->score(Xd, y); + cout << "Score: " << score << endl; + auto graph = clf->graph(); + auto dot_file = model_name + "_" + file_name; + ofstream file(dot_file + ".dot"); + file << graph; + file.close(); + cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; + cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; + string stratified_string = stratified ? " Stratified" : ""; + cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; + cout << "==========================================" << endl; + torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + torch::Tensor yt = torch::tensor(y, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + } + float total_score = 0, total_score_train = 0, score_train, score_test; + platform::Fold* fold; + if (stratified) + fold = new platform::StratifiedKFold(nFolds, y, seed); + else + fold = new platform::KFold(nFolds, y.size(), seed); + for (auto i = 0; i < nFolds; ++i) { + auto [train, test] = fold->getFold(i); + cout << "Fold: " << i + 1 << endl; + if (tensors) { + auto ttrain = torch::tensor(train, torch::kInt64); + auto ttest = torch::tensor(test, torch::kInt64); + torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + torch::Tensor ytraint = yt.index({ ttrain }); + torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + torch::Tensor ytestt = yt.index({ ttest }); + clf->fit(Xtraint, ytraint, features, className, states); + auto temp = clf->predict(Xtraint); + score_train = clf->score(Xtraint, ytraint); + score_test = clf->score(Xtestt, ytestt); + } else { + auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + auto [Xtest, ytest] = extract_indices(test, Xd, y); + clf->fit(Xtrain, ytrain, features, className, states); + score_train = clf->score(Xtrain, ytrain); + score_test = clf->score(Xtest, ytest); + } + if (dump_cpt) { + cout << "--- CPT Tables ---" << endl; + clf->dump_cpt(); + } + total_score_train += score_train; + total_score += score_test; + cout << "Score Train: " << score_train << endl; + cout << "Score Test : " << score_test << endl; + cout << "-------------------------------------------------------------------------------" << endl; + } + cout << "**********************************************************************************" << endl; + cout << "Average Score Train: " << total_score_train / nFolds << endl; + cout << "Average Score Test : " << total_score / nFolds << endl;return 0; } \ No newline at end of file diff --git a/src/Platform/BestResults.cc b/src/Platform/BestResults.cc new file mode 100644 index 0000000..5c06eb6 --- /dev/null +++ b/src/Platform/BestResults.cc @@ -0,0 +1,68 @@ +#include +#include +#include +#include "platformUtils.h" +#include "BestResults.h" +#include "Results.h" +#include "Colors.h" + +namespace platform { + + void BestResults::build() + { + auto files = loadFiles(); + if (files.size() == 0) { + throw runtime_error("No result files were found!"); + } + json bests; + for (const auto& file : files) { + auto result = Result(path, file); + auto data = result.load(); + for (auto const& item : data.at("results")) { + bool update = false; + if (bests.contains(item.at("dataset").get())) { + if (item.at("score").get() > bests["dataset"].at(0).get()) { + update = true; + } + } else { + update = true; + } + if (update) { + bests[item.at("dataset").get()] = { item.at("score").get(), item.at("hyperparameters"), file }; + } + } + } + string bestFileName = path + "/" + bestResultFile(); + if (file_exists(bestFileName)) { + cout << Colors::MAGENTA() << "File " << bestFileName << " already exists and it shall be overwritten." << Colors::RESET(); + } + ofstream file(bestFileName); + file << bests; + file.close(); + } + + string BestResults::bestResultFile() + { + return "best_results_" + score + "_" + model + ".json"; + } + + vector BestResults::loadFiles() + { + vector files; + using std::filesystem::directory_iterator; + for (const auto& file : directory_iterator(path)) { + auto fileName = file.path().filename().string(); + if (fileName.find(".json") != string::npos && fileName.find("results_") == 0 + && fileName.find("_" + score + "_") != string::npos + && fileName.find("_" + model + "_") != string::npos) { + files.push_back(fileName); + } + } + return files; + } + + void BestResults::report() + { + + } +} \ No newline at end of file diff --git a/src/Platform/BestResults.h b/src/Platform/BestResults.h new file mode 100644 index 0000000..05c04f7 --- /dev/null +++ b/src/Platform/BestResults.h @@ -0,0 +1,20 @@ +#ifndef BESTRESULTS_H +#define BESTRESULTS_H +#include +using namespace std; + +namespace platform { + class BestResults { + public: + explicit BestResults(const string& path, const string& score, const string& model) : path(path), score(score), model(model) {} + void build(); + void report(); + private: + vector loadFiles(); + string bestResultFile(); + string path; + string score; + string model; + }; +} +#endif //BESTRESULTS_H \ No newline at end of file diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 2b899ea..c87fb7f 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -8,11 +8,13 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include) add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc ReportConsole.cc ReportBase.cc) add_executable(manage manage.cc Results.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc platformUtils.cc) add_executable(list list.cc platformUtils Datasets.cc) -add_executable(best list.cc platformUtils Datasets.cc) +add_executable(best best.cc BestResults.cc Results.cc ReportBase.cc ReportExcel.cc platformUtils.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) + target_link_libraries(best "${TORCH_LIBRARIES}" libxlsxwriter.so stdc++fs) else() target_link_libraries(manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) + target_link_libraries(best "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}") endif() target_link_libraries(list ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/best.cc b/src/Platform/best.cc index 585cb17..c4bd9fc 100644 --- a/src/Platform/best.cc +++ b/src/Platform/best.cc @@ -1,31 +1,23 @@ #include #include -#include "platformUtils.h" #include "Paths.h" -#include "Results.h" +#include "BestResults.h" using namespace std; argparse::ArgumentParser manageArguments(int argc, char** argv) { argparse::ArgumentParser program("best"); - program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>(); program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)"); program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied"); - program.add_argument("--complete").help("Show only results with all datasets").default_value(false).implicit_value(true); - program.add_argument("--partial").help("Show only partial results").default_value(false).implicit_value(true); - program.add_argument("--compare").help("Compare with best results").default_value(false).implicit_value(true); + program.add_argument("--build").help("build best score results file").default_value(false).implicit_value(true); + program.add_argument("--report").help("report of best score results file").default_value(false).implicit_value(true); try { program.parse_args(argc, argv); - auto number = program.get("number"); - if (number < 0) { - throw runtime_error("Number of results must be greater than or equal to 0"); - } auto model = program.get("model"); auto score = program.get("score"); - auto complete = program.get("complete"); - auto partial = program.get("partial"); - auto compare = program.get("compare"); + auto build = program.get("build"); + auto report = program.get("report"); } catch (const exception& err) { cerr << err.what() << endl; @@ -38,15 +30,20 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) int main(int argc, char** argv) { auto program = manageArguments(argc, argv); - auto number = program.get("number"); auto model = program.get("model"); auto score = program.get("score"); - auto complete = program.get("complete"); - auto partial = program.get("partial"); - auto compare = program.get("compare"); - if (complete) - partial = false; - auto results = platform::Results(platform::Paths::results(), number, model, score, complete, partial, compare); - results.manage(); + auto build = program.get("build"); + auto report = program.get("report"); + if (!report && !build) { + cout << "Either build, report or both, have to be selected to do anything!" << endl; + exit(1); + } + auto results = platform::BestResults(platform::Paths::results(), model, score); + if (build) { + results.build(); + } + if (report) { + results.report(); + } return 0; } diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h index 2b4ca54..213e28a 100644 --- a/src/Platform/platformUtils.h +++ b/src/Platform/platformUtils.h @@ -8,7 +8,6 @@ #include "ArffFiles.h" #include "CPPFImdlp.h" using namespace std; -const string PATH = "../../data/"; bool file_exists(const std::string& name); vector split(const string& text, char delimiter);