From 85cb4472835becac54243852cb54dd822be99637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 16:21:38 +0200 Subject: [PATCH 01/12] Add Dataset, Models and DotEnv --- src/Platform/CMakeLists.txt | 2 +- src/Platform/Datasets.cc | 10 ------ src/Platform/DotEnv.h | 62 ++++++++++++++++++++++++++++++++ src/Platform/Models.cc | 8 +++++ src/Platform/Models.h | 33 +++++++++++++++++ src/Platform/main.cc | 68 ++++++++++++++++++++++------------- src/Platform/platformUtils.cc | 11 ++++++ src/Platform/platformUtils.h | 1 + 8 files changed, 160 insertions(+), 35 deletions(-) create mode 100644 src/Platform/DotEnv.h create mode 100644 src/Platform/Models.cc create mode 100644 src/Platform/Models.h diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index f1fea17..7de4c29 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/json/include) -add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc) +add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 0c09c59..0e1b169 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -2,16 +2,6 @@ #include "platformUtils.h" #include "ArffFiles.h" namespace platform { - vector split(string text, char delimiter) - { - vector result; - stringstream ss(text); - string token; - while (getline(ss, token, delimiter)) { - result.push_back(token); - } - return result; - } void Datasets::load() { string line; diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h new file mode 100644 index 0000000..af6eda2 --- /dev/null +++ b/src/Platform/DotEnv.h @@ -0,0 +1,62 @@ +#ifndef DOTENV_H +#define DOTENV_H +#include +#include +#include +#include +#include "platformUtils.h" +namespace platform { + class DotEnv { + private: + std::map env; + std::string trim(const std::string& str) + { + std::string result = str; + result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) { + return !std::isspace(ch); + })); + result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) { + return !std::isspace(ch); + }).base(), result.end()); + return result; + } + public: + DotEnv() + { + std::ifstream file(".env"); + if (!file.is_open()) { + std::cerr << "File .env not found" << std::endl; + exit(1); + } + std::string line; + while (std::getline(file, line)) { + line = trim(line); + if (line.empty() || line[0] == '#') { + continue; + } + std::istringstream iss(line); + std::string key, value; + if (std::getline(iss, key, '=') && std::getline(iss, value)) { + env[key] = value; + } + } + } + std::string get(const std::string& key) + { + return env[key]; + } + std::vector getSeeds() + { + auto seeds = std::vector(); + auto seeds_str = env["seeds"]; + seeds_str = trim(seeds_str); + seeds_str = seeds_str.substr(1, seeds_str.size() - 2); + auto seeds_str_split = split(seeds_str, ','); + for (auto seed_str : seeds_str_split) { + seeds.push_back(stoi(seed_str)); + } + return seeds; + } + }; +} +#endif \ No newline at end of file diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc new file mode 100644 index 0000000..b6aaa66 --- /dev/null +++ b/src/Platform/Models.cc @@ -0,0 +1,8 @@ +#include "Models.h" +namespace platform { + using namespace std; + map Models::classifiers = map({ + { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, + { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } + }); +} \ No newline at end of file diff --git a/src/Platform/Models.h b/src/Platform/Models.h new file mode 100644 index 0000000..2851036 --- /dev/null +++ b/src/Platform/Models.h @@ -0,0 +1,33 @@ +#ifndef MODELS_H +#define MODELS_H +#include +#include "BaseClassifier.h" +#include "AODE.h" +#include "TAN.h" +#include "KDB.h" +#include "SPODE.h" +namespace platform { + class Models { + private: + static map classifiers; + public: + static bayesnet::BaseClassifier* get(string name) { return classifiers[name]; } + static vector getNames() + { + vector names; + for (auto& [name, classifier] : classifiers) { + names.push_back(name); + } + return names; + } + static string toString() + { + string names = ""; + for (auto& [name, classifier] : classifiers) { + names += name + ", "; + } + return "{" + names.substr(0, names.size() - 2) + "}"; + } + }; +} +#endif \ No newline at end of file diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 9cce8ad..0faa5b3 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -3,33 +3,37 @@ #include "platformUtils.h" #include "Experiment.h" #include "Datasets.h" +#include "DotEnv.h" +#include "Models.h" using namespace std; const string PATH_RESULTS = "results"; +const string PATH_DATASETS = "datasets"; argparse::ArgumentParser manageArguments(int argc, char** argv) { + auto env = platform::DotEnv(); argparse::ArgumentParser program("BayesNetSample"); program.add_argument("-d", "--dataset").default_value("").help("Dataset file name"); program.add_argument("-p", "--path") .help("folder where the data files are located, default") - .default_value(string{ PATH } + .default_value(string{ PATH_DATASETS } ); program.add_argument("-m", "--model") - .help("Model to use {AODE, KDB, SPODE, TAN}") + .help("Model to use " + platform::Models::toString()) .action([](const std::string& value) { - static const vector choices = { "AODE", "KDB", "SPODE", "TAN" }; + static const vector choices = platform::Models::getNames(); if (find(choices.begin(), choices.end(), value) != choices.end()) { return value; } - throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}"); + throw runtime_error("Model must be one of " + platform::Models::toString()); } ); - program.add_argument("--title").required().help("Experiment title"); - program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); - program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); - program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { + program.add_argument("--title").default_value("").help("Experiment title"); + program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)stoi(env.get("discretize"))).implicit_value(true); + program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true); + program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const string& value) { try { auto k = stoi(value); if (k < 2) { @@ -43,9 +47,11 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) catch (...) { throw runtime_error("Number of folds must be an integer"); }}); - program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); + auto seed_values = env.getSeeds(); + program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); bool class_last, discretize_dataset, stratified; - int n_folds, seed; + int n_folds; + vector seeds; string model_name, file_name, path, complete_file_name, title; try { program.parse_args(argc, argv); @@ -55,10 +61,13 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) discretize_dataset = program.get("discretize"); stratified = program.get("stratified"); n_folds = program.get("folds"); - seed = program.get("seed"); + seeds = program.get>("seeds"); complete_file_name = path + file_name + ".arff"; class_last = false;//datasets[file_name]; title = program.get("title"); + if (title == "" && file_name == "") { + throw runtime_error("title is mandatory if dataset is not provided"); + } } catch (const exception& err) { cerr << err.what() << endl; @@ -71,25 +80,30 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) int main(int argc, char** argv) { auto program = manageArguments(argc, argv); + bool saveResults = false; auto file_name = program.get("dataset"); auto path = program.get("path"); auto model_name = program.get("model"); auto discretize_dataset = program.get("discretize"); auto stratified = program.get("stratified"); auto n_folds = program.get("folds"); - auto seed = program.get("seed"); + auto seeds = program.get>("seeds"); vector filesToProcess; auto datasets = platform::Datasets(path, true, platform::ARFF); + auto title = program.get("title"); if (file_name != "") { if (!datasets.isDataset(file_name)) { cerr << "Dataset " << file_name << " not found" << endl; exit(1); } + if (title == "") { + title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds"; + } filesToProcess.push_back(file_name); } else { filesToProcess = platform::Datasets(path, true, platform::ARFF).getNames(); + saveResults = true; } - auto title = program.get("title"); /* * Begin Processing @@ -97,7 +111,10 @@ int main(int argc, char** argv) auto experiment = platform::Experiment(); experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("1.0.0"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet"); - experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy"); + experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); + for (auto seed : seeds) { + experiment.addRandomSeed(seed); + } platform::Timer timer; cout << "*** Starting experiment: " << title << " ***" << endl; timer.start(); @@ -109,16 +126,19 @@ int main(int argc, char** argv) auto samples = datasets.getNSamples(fileName); auto className = datasets.getClassName(fileName); cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; - Fold* fold; - if (stratified) - fold = new StratifiedKFold(n_folds, y, seed); - else - fold = new KFold(n_folds, samples, seed); - auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); - result.setDataset(fileName); - experiment.setModelVersion(result.getModelVersion()); - experiment.addResult(result); - delete fold; + for (auto seed : seeds) { + cout << "(" << seed << ") " << flush; + Fold* fold; + if (stratified) + fold = new StratifiedKFold(n_folds, y, seed); + else + fold = new KFold(n_folds, samples, seed); + auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); + result.setDataset(fileName); + experiment.setModelVersion(result.getModelVersion()); + experiment.addResult(result); + delete fold; + } } experiment.setDuration(timer.getDuration()); experiment.save(PATH_RESULTS); diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc index ea8fad3..f318831 100644 --- a/src/Platform/platformUtils.cc +++ b/src/Platform/platformUtils.cc @@ -2,6 +2,17 @@ using namespace torch; +vector split(string text, char delimiter) +{ + vector result; + stringstream ss(text); + string token; + while (getline(ss, token, delimiter)) { + result.push_back(token); + } + return result; +} + pair, map> discretize(vector& X, mdlp::labels_t& y, vector features) { vector Xd; diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h index abc69bd..9515bbf 100644 --- a/src/Platform/platformUtils.h +++ b/src/Platform/platformUtils.h @@ -11,6 +11,7 @@ using namespace std; const string PATH = "../../data/"; bool file_exists(const std::string& name); +vector split(string text, char delimiter); pair, map> discretize(vector& X, mdlp::labels_t& y, vector features); vector discretizeDataset(vector& X, mdlp::labels_t& y); pair>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector& features, string className); From b9e76beccea5e6a095275fa6ecb1e27e390949ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 16:31:36 +0200 Subject: [PATCH 02/12] Add show experiment --- src/Platform/Experiment.cc | 15 ++++++++++++--- src/Platform/Experiment.h | 18 +++++++++++++----- src/Platform/main.cc | 8 ++++++-- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index c48e2be..b8c65b8 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -43,7 +43,7 @@ namespace platform { result["discretized"] = discretized; result["stratified"] = stratified; result["folds"] = nfolds; - result["seeds"] = random_seeds; + result["seeds"] = randomSeeds; result["duration"] = duration; result["results"] = json::array(); for (auto& r : results) { @@ -65,6 +65,10 @@ namespace platform { j["test_time_std"] = r.getTestTimeStd(); j["time"] = r.getTestTime() + r.getTrainTime(); j["time_std"] = r.getTestTimeStd() + r.getTrainTimeStd(); + j["scores_train"] = r.getScoresTrain(); + j["scores_test"] = r.getScoresTest(); + j["times_train"] = r.getTimesTrain(); + j["times_test"] = r.getTimesTest(); j["nodes"] = r.getNodes(); j["leaves"] = r.getLeaves(); j["depth"] = r.getDepth(); @@ -79,6 +83,11 @@ namespace platform { file << data; file.close(); } + void Experiment::show() + { + json data = build_json(); + cout << data.dump(4) << endl; + } Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector features, string className, map> states) { auto classifiers = map({ @@ -101,7 +110,6 @@ namespace platform { cout << "doing Fold: " << flush; for (int i = 0; i < k; i++) { bayesnet::BaseClassifier* model = classifiers[model_name]; - result.setModelVersion(model->getVersion()); train_timer.start(); auto [train, test] = fold->getFold(i); auto train_t = torch::tensor(train); @@ -122,8 +130,9 @@ namespace platform { test_time[i] = test_timer.getDuration(); accuracy_train[i] = accuracy_train_value; accuracy_test[i] = accuracy_test_value; + } - cout << "end." << endl; + cout << "end. " << flush; result.setScoreTest(torch::mean(accuracy_test).item()).setScoreTrain(torch::mean(accuracy_train).item()); result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index 8e0a677..58639d7 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -33,6 +33,7 @@ namespace platform { int samples, features, classes; double score_train, score_test, score_train_std, score_test_std, train_time, train_time_std, test_time, test_time_std; float nodes, leaves, depth; + vector scores_train, scores_test, times_train, times_test; public: Result() = default; Result& setDataset(string dataset) { this->dataset = dataset; return *this; } @@ -51,7 +52,10 @@ namespace platform { Result& setNodes(float nodes) { this->nodes = nodes; return *this; } Result& setLeaves(float leaves) { this->leaves = leaves; return *this; } Result& setDepth(float depth) { this->depth = depth; return *this; } - Result& setModelVersion(string model_version) { this->model_version = model_version; return *this; } + Result& addScoreTrain(double score) { scores_train.push_back(score); return *this; } + Result& addScoreTest(double score) { scores_test.push_back(score); return *this; } + Result& addTimeTrain(double time) { times_train.push_back(time); return *this; } + Result& addTimeTest(double time) { times_test.push_back(time); return *this; } const float get_score_train() const { return score_train; } float get_score_test() { return score_test; } const string& getDataset() const { return dataset; } @@ -70,14 +74,17 @@ namespace platform { const float getNodes() const { return nodes; } const float getLeaves() const { return leaves; } const float getDepth() const { return depth; } - const string& getModelVersion() const { return model_version; } + const vector& getScoresTrain() const { return scores_train; } + const vector& getScoresTest() const { return scores_test; } + const vector& getTimesTrain() const { return times_train; } + const vector& getTimesTest() const { return times_test; } }; class Experiment { private: string title, model, platform, score_name, model_version, language_version, language; bool discretized, stratified; vector results; - vector random_seeds; + vector randomSeeds; int nfolds; float duration; json build_json(); @@ -94,11 +101,12 @@ namespace platform { Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; } Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; } Experiment& addResult(Result result) { results.push_back(result); return *this; } - Experiment& addRandomSeed(int random_seed) { random_seeds.push_back(random_seed); return *this; } + Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; } Experiment& setDuration(float duration) { this->duration = duration; return *this; } string get_file_name(); void save(string path); - void show() { cout << "Showing experiment..." << "Score Test: " << results[0].get_score_test() << " Score Train: " << results[0].get_score_train() << endl; } + Result cross_validation(const string& path, const string& fileName); + void show(); }; Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector features, string className, map> states); } diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 0faa5b3..b7be5b5 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -135,13 +135,17 @@ int main(int argc, char** argv) fold = new KFold(n_folds, samples, seed); auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); result.setDataset(fileName); - experiment.setModelVersion(result.getModelVersion()); + experiment.setModelVersion("-FIXME-"); experiment.addResult(result); delete fold; } + cout << endl; } experiment.setDuration(timer.getDuration()); - experiment.save(PATH_RESULTS); + if (saveResults) + experiment.save(PATH_RESULTS); + else + experiment.show(); cout << "Done!" << endl; return 0; } From adc0ca238f4d5b46081158f36b195f0f685352c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 16:44:07 +0200 Subject: [PATCH 03/12] Refactor cross_validation --- src/Platform/Experiment.cc | 84 +++++++++++++++++++++++--------------- src/Platform/Experiment.h | 4 +- src/Platform/main.cc | 16 ++------ 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index b8c65b8..22a3c84 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -88,7 +88,7 @@ namespace platform { json data = build_json(); cout << data.dump(4) << endl; } - Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector features, string className, map> states) + Result Experiment::cross_validation(string model_name, torch::Tensor& Xt, torch::Tensor& y, vector features, string className, map> states) { auto classifiers = map({ { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, @@ -98,41 +98,57 @@ namespace platform { auto result = Result(); auto [values, counts] = at::_unique(y); result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0)); - auto k = fold->getNumberOfFolds(); - auto accuracy_test = torch::zeros({ k }, torch::kFloat64); - auto accuracy_train = torch::zeros({ k }, torch::kFloat64); - auto train_time = torch::zeros({ k }, torch::kFloat64); - auto test_time = torch::zeros({ k }, torch::kFloat64); - auto nodes = torch::zeros({ k }, torch::kFloat64); - auto edges = torch::zeros({ k }, torch::kFloat64); - auto num_states = torch::zeros({ k }, torch::kFloat64); + int nResults = nfolds * static_cast(randomSeeds.size()); + auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); + auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); + auto train_time = torch::zeros({ nResults }, torch::kFloat64); + auto test_time = torch::zeros({ nResults }, torch::kFloat64); + auto nodes = torch::zeros({ nResults }, torch::kFloat64); + auto edges = torch::zeros({ nResults }, torch::kFloat64); + auto num_states = torch::zeros({ nResults }, torch::kFloat64); Timer train_timer, test_timer; - cout << "doing Fold: " << flush; - for (int i = 0; i < k; i++) { - bayesnet::BaseClassifier* model = classifiers[model_name]; - train_timer.start(); - auto [train, test] = fold->getFold(i); - auto train_t = torch::tensor(train); - auto test_t = torch::tensor(test); - auto X_train = Xt.index({ "...", train_t }); - auto y_train = y.index({ train_t }); - auto X_test = Xt.index({ "...", test_t }); - auto y_test = y.index({ test_t }); - cout << i + 1 << ", " << flush; - model->fit(X_train, y_train, features, className, states); - nodes[i] = model->getNumberOfNodes(); - edges[i] = model->getNumberOfEdges(); - num_states[i] = model->getNumberOfStates(); - train_time[i] = train_timer.getDuration(); - auto accuracy_train_value = model->score(X_train, y_train); - test_timer.start(); - auto accuracy_test_value = model->score(X_test, y_test); - test_time[i] = test_timer.getDuration(); - accuracy_train[i] = accuracy_train_value; - accuracy_test[i] = accuracy_test_value; - + int item = 0; + for (auto seed : randomSeeds) { + cout << "(" << seed << ") " << flush; + Fold* fold; + if (stratified) + fold = new StratifiedKFold(nfolds, y, seed); + else + fold = new KFold(nfolds, y.size(0), seed); + cout << "doing Fold: " << flush; + for (int nfold = 0; nfold < nfolds; nfold++) { + bayesnet::BaseClassifier* clf = classifiers[model]; + setModelVersion(clf->getVersion()); + train_timer.start(); + auto [train, test] = fold->getFold(nfold); + auto train_t = torch::tensor(train); + auto test_t = torch::tensor(test); + auto X_train = Xt.index({ "...", train_t }); + auto y_train = y.index({ train_t }); + auto X_test = Xt.index({ "...", test_t }); + auto y_test = y.index({ test_t }); + cout << nfold + 1 << ", " << flush; + clf->fit(X_train, y_train, features, className, states); + nodes[item] = clf->getNumberOfNodes(); + edges[item] = clf->getNumberOfEdges(); + num_states[item] = clf->getNumberOfStates(); + train_time[item] = train_timer.getDuration(); + auto accuracy_train_value = clf->score(X_train, y_train); + test_timer.start(); + auto accuracy_test_value = clf->score(X_test, y_test); + test_time[item] = test_timer.getDuration(); + accuracy_train[item] = accuracy_train_value; + accuracy_test[item] = accuracy_test_value; + // Store results and times in vector + result.addScoreTrain(accuracy_train_value); + result.addScoreTest(accuracy_test_value); + result.addTimeTrain(train_time[item].item()); + result.addTimeTest(test_time[item].item()); + item++; + } + cout << "end. " << flush; + delete fold; } - cout << "end. " << flush; result.setScoreTest(torch::mean(accuracy_test).item()).setScoreTrain(torch::mean(accuracy_train).item()); result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index 58639d7..a45113d 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -105,9 +105,9 @@ namespace platform { Experiment& setDuration(float duration) { this->duration = duration; return *this; } string get_file_name(); void save(string path); - Result cross_validation(const string& path, const string& fileName); + //Result cross_validation(const string& path, const string& fileName); + Result cross_validation(string model_name, torch::Tensor& X, torch::Tensor& y, vector features, string className, map> states); void show(); }; - Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector features, string className, map> states); } #endif \ No newline at end of file diff --git a/src/Platform/main.cc b/src/Platform/main.cc index b7be5b5..1c6897d 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -126,19 +126,9 @@ int main(int argc, char** argv) auto samples = datasets.getNSamples(fileName); auto className = datasets.getClassName(fileName); cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; - for (auto seed : seeds) { - cout << "(" << seed << ") " << flush; - Fold* fold; - if (stratified) - fold = new StratifiedKFold(n_folds, y, seed); - else - fold = new KFold(n_folds, samples, seed); - auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); - result.setDataset(fileName); - experiment.setModelVersion("-FIXME-"); - experiment.addResult(result); - delete fold; - } + auto result = experiment.cross_validation(model_name, X, y, features, className, states); + result.setDataset(fileName); + experiment.addResult(result); cout << endl; } experiment.setDuration(timer.getDuration()); From c4f3e6f19a5fd2612954fcd09e3e4ed8645f0dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 16:49:06 +0200 Subject: [PATCH 04/12] Refactor crossvalidation to remove unneeded params --- src/Platform/Experiment.cc | 20 +++++++++++++++----- src/Platform/Experiment.h | 3 +-- src/Platform/main.cc | 8 +------- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 22a3c84..8592019 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -1,4 +1,5 @@ #include "Experiment.h" +#include "Datasets.h" namespace platform { using json = nlohmann::json; @@ -88,16 +89,25 @@ namespace platform { json data = build_json(); cout << data.dump(4) << endl; } - Result Experiment::cross_validation(string model_name, torch::Tensor& Xt, torch::Tensor& y, vector features, string className, map> states) + Result Experiment::cross_validation(const string& path, const string& fileName) { auto classifiers = map({ { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } } ); + auto datasets = platform::Datasets(path, true, platform::ARFF); + // Get dataset + auto [X, y] = datasets.getTensors(fileName); + auto states = datasets.getStates(fileName); + auto features = datasets.getFeatures(fileName); + auto samples = datasets.getNSamples(fileName); + auto className = datasets.getClassName(fileName); + cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; + // Prepare Result auto result = Result(); - auto [values, counts] = at::_unique(y); - result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0)); + auto [values, counts] = at::_unique(y);; + result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0)); int nResults = nfolds * static_cast(randomSeeds.size()); auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); @@ -123,9 +133,9 @@ namespace platform { auto [train, test] = fold->getFold(nfold); auto train_t = torch::tensor(train); auto test_t = torch::tensor(test); - auto X_train = Xt.index({ "...", train_t }); + auto X_train = X.index({ "...", train_t }); auto y_train = y.index({ train_t }); - auto X_test = Xt.index({ "...", test_t }); + auto X_test = X.index({ "...", test_t }); auto y_test = y.index({ test_t }); cout << nfold + 1 << ", " << flush; clf->fit(X_train, y_train, features, className, states); diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index a45113d..84b1627 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -105,8 +105,7 @@ namespace platform { Experiment& setDuration(float duration) { this->duration = duration; return *this; } string get_file_name(); void save(string path); - //Result cross_validation(const string& path, const string& fileName); - Result cross_validation(string model_name, torch::Tensor& X, torch::Tensor& y, vector features, string className, map> states); + Result cross_validation(const string& path, const string& fileName); void show(); }; } diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 1c6897d..d7d040d 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -120,13 +120,7 @@ int main(int argc, char** argv) timer.start(); for (auto fileName : filesToProcess) { cout << "- " << setw(20) << left << fileName << " " << right << flush; - auto [X, y] = datasets.getTensors(fileName); - auto states = datasets.getStates(fileName); - auto features = datasets.getFeatures(fileName); - auto samples = datasets.getNSamples(fileName); - auto className = datasets.getClassName(fileName); - cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; - auto result = experiment.cross_validation(model_name, X, y, features, className, states); + auto result = experiment.cross_validation(path, fileName); result.setDataset(fileName); experiment.addResult(result); cout << endl; From 07d572a98c813fd274fb9143d674c9dc41c216cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 17:27:43 +0200 Subject: [PATCH 05/12] Add Model factory --- src/BayesNet/AODE.h | 1 + src/BayesNet/KDB.h | 1 + src/BayesNet/SPODE.h | 1 + src/BayesNet/TAN.h | 1 + src/Platform/Experiment.cc | 12 ++++++------ src/Platform/Models.cc | 28 ++++++++++++++++++++++++---- src/Platform/Models.h | 18 +++++------------- 7 files changed, 39 insertions(+), 23 deletions(-) diff --git a/src/BayesNet/AODE.h b/src/BayesNet/AODE.h index 84386d3..bc859e7 100644 --- a/src/BayesNet/AODE.h +++ b/src/BayesNet/AODE.h @@ -8,6 +8,7 @@ namespace bayesnet { void train() override; public: AODE(); + virtual ~AODE() {}; vector graph(string title = "AODE") override; }; } diff --git a/src/BayesNet/KDB.h b/src/BayesNet/KDB.h index 9683955..b0790da 100644 --- a/src/BayesNet/KDB.h +++ b/src/BayesNet/KDB.h @@ -14,6 +14,7 @@ namespace bayesnet { void train() override; public: KDB(int k, float theta = 0.03); + virtual ~KDB() {}; vector graph(string name = "KDB") override; }; } diff --git a/src/BayesNet/SPODE.h b/src/BayesNet/SPODE.h index 668bbca..05bf3a5 100644 --- a/src/BayesNet/SPODE.h +++ b/src/BayesNet/SPODE.h @@ -9,6 +9,7 @@ namespace bayesnet { void train() override; public: SPODE(int root); + virtual ~SPODE() {}; vector graph(string name = "SPODE") override; }; } diff --git a/src/BayesNet/TAN.h b/src/BayesNet/TAN.h index 11e7421..ce9b10a 100644 --- a/src/BayesNet/TAN.h +++ b/src/BayesNet/TAN.h @@ -10,6 +10,7 @@ namespace bayesnet { void train() override; public: TAN(); + virtual ~TAN() {}; vector graph(string name = "TAN") override; }; } diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 8592019..08f09f8 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -1,5 +1,6 @@ #include "Experiment.h" #include "Datasets.h" +#include "Models.h" namespace platform { using json = nlohmann::json; @@ -91,12 +92,12 @@ namespace platform { } Result Experiment::cross_validation(const string& path, const string& fileName) { + auto datasets = platform::Datasets(path, true, platform::ARFF); auto classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } + { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, + { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } } ); - auto datasets = platform::Datasets(path, true, platform::ARFF); // Get dataset auto [X, y] = datasets.getTensors(fileName); auto states = datasets.getStates(fileName); @@ -119,15 +120,14 @@ namespace platform { Timer train_timer, test_timer; int item = 0; for (auto seed : randomSeeds) { - cout << "(" << seed << ") " << flush; + cout << "(" << seed << ") doing Fold: " << flush; Fold* fold; if (stratified) fold = new StratifiedKFold(nfolds, y, seed); else fold = new KFold(nfolds, y.size(0), seed); - cout << "doing Fold: " << flush; for (int nfold = 0; nfold < nfolds; nfold++) { - bayesnet::BaseClassifier* clf = classifiers[model]; + auto clf = Models::createInstance(model); setModelVersion(clf->getVersion()); train_timer.start(); auto [train, test] = fold->getFold(nfold); diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc index b6aaa66..7bed6c3 100644 --- a/src/Platform/Models.cc +++ b/src/Platform/Models.cc @@ -1,8 +1,28 @@ #include "Models.h" namespace platform { using namespace std; - map Models::classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - }); + // map Models::classifiers = map({ + // { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, + // { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } + // }); + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + shared_ptr Models::createInstance(const string& name) + { + bayesnet::BaseClassifier* instance = nullptr; + if (name == "AODE") { + instance = new bayesnet::AODE(); + } else if (name == "KDB") { + instance = new bayesnet::KDB(2); + } else if (name == "SPODE") { + instance = new bayesnet::SPODE(2); + } else if (name == "TAN") { + instance = new bayesnet::TAN(); + } else { + throw runtime_error("Model " + name + " not found"); + } + if (instance != nullptr) + return shared_ptr(instance); + else + return nullptr; + } } \ No newline at end of file diff --git a/src/Platform/Models.h b/src/Platform/Models.h index 2851036..379e00f 100644 --- a/src/Platform/Models.h +++ b/src/Platform/Models.h @@ -8,25 +8,17 @@ #include "SPODE.h" namespace platform { class Models { - private: - static map classifiers; public: - static bayesnet::BaseClassifier* get(string name) { return classifiers[name]; } + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + static shared_ptr createInstance(const string& name); static vector getNames() { - vector names; - for (auto& [name, classifier] : classifiers) { - names.push_back(name); - } - return names; + return { "aaaaaAODE", "KDB", "SPODE", "TAN" }; } static string toString() { - string names = ""; - for (auto& [name, classifier] : classifiers) { - names += name + ", "; - } - return "{" + names.substr(0, names.size() - 2) + "}"; + return "{aaaaae34223AODE, KDB, SPODE, TAN}"; + //return "{" + names.substr(0, names.size() - 2) + "}"; } }; } From cb54f61a694e3ef2f5de357b2c5b2abb36ab3e4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 18:22:15 +0200 Subject: [PATCH 06/12] Refactor Models to be a singleton factory Add Registrar of models --- src/BayesNet/SPODE.h | 1 + src/Platform/Experiment.cc | 7 +--- src/Platform/Models.cc | 77 ++++++++++++++++++++++++++++++-------- src/Platform/Models.h | 27 ++++++++----- src/Platform/main.cc | 18 +++++++-- 5 files changed, 95 insertions(+), 35 deletions(-) diff --git a/src/BayesNet/SPODE.h b/src/BayesNet/SPODE.h index 05bf3a5..0f422a7 100644 --- a/src/BayesNet/SPODE.h +++ b/src/BayesNet/SPODE.h @@ -1,6 +1,7 @@ #ifndef SPODE_H #define SPODE_H #include "Classifier.h" + namespace bayesnet { class SPODE : public Classifier { private: diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 08f09f8..ab62ab2 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -93,11 +93,6 @@ namespace platform { Result Experiment::cross_validation(const string& path, const string& fileName) { auto datasets = platform::Datasets(path, true, platform::ARFF); - auto classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - } - ); // Get dataset auto [X, y] = datasets.getTensors(fileName); auto states = datasets.getStates(fileName); @@ -127,7 +122,7 @@ namespace platform { else fold = new KFold(nfolds, y.size(0), seed); for (int nfold = 0; nfold < nfolds; nfold++) { - auto clf = Models::createInstance(model); + auto clf = Models::instance()->create(model); setModelVersion(clf->getVersion()); train_timer.start(); auto [train, test] = fold->getFold(nfold); diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc index 7bed6c3..aa23a2d 100644 --- a/src/Platform/Models.cc +++ b/src/Platform/Models.cc @@ -1,28 +1,73 @@ #include "Models.h" namespace platform { using namespace std; - // map Models::classifiers = map({ - // { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - // { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - // }); // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory - shared_ptr Models::createInstance(const string& name) + // shared_ptr Models::createInstance(const string& name) + // { + // bayesnet::BaseClassifier* instance = nullptr; + // if (name == "AODE") { + // instance = new bayesnet::AODE(); + // } else if (name == "KDB") { + // instance = new bayesnet::KDB(2); + // } else if (name == "SPODE") { + // instance = new bayesnet::SPODE(2); + // } else if (name == "TAN") { + // instance = new bayesnet::TAN(); + // } else { + // throw runtime_error("Model " + name + " not found"); + // } + // if (instance != nullptr) + // return shared_ptr(instance); + // else + // return nullptr; + // } + Models* Models::factory = nullptr;; + Models* Models::instance() + { + //manages singleton + if (factory == nullptr) + factory = new Models(); + return factory; + } + void Models::registerFactoryFunction(const string& name, + function classFactoryFunction) + { + // register the class factory function + functionRegistry[name] = classFactoryFunction; + } + shared_ptr Models::create(const string& name) { bayesnet::BaseClassifier* instance = nullptr; - if (name == "AODE") { - instance = new bayesnet::AODE(); - } else if (name == "KDB") { - instance = new bayesnet::KDB(2); - } else if (name == "SPODE") { - instance = new bayesnet::SPODE(2); - } else if (name == "TAN") { - instance = new bayesnet::TAN(); - } else { - throw runtime_error("Model " + name + " not found"); - } + + // find name in the registry and call factory method. + auto it = functionRegistry.find(name); + if (it != functionRegistry.end()) + instance = it->second(); + // wrap instance in a shared ptr and return if (instance != nullptr) return shared_ptr(instance); else return nullptr; } + vector Models::getNames() + { + vector names; + transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names), + [](const pair>& pair) { return pair.first; }); + return names; + } + string Models::toString() + { + string result = ""; + for (auto& pair : functionRegistry) { + result += pair.first + ", "; + } + return "{" + result.substr(0, result.size() - 2) + "}"; + } + + Registrar::Registrar(const string& name, function classFactoryFunction) + { + // register the class factory function + Models::instance()->registerFactoryFunction(name, classFactoryFunction); + } } \ No newline at end of file diff --git a/src/Platform/Models.h b/src/Platform/Models.h index 379e00f..0bb8d51 100644 --- a/src/Platform/Models.h +++ b/src/Platform/Models.h @@ -8,18 +8,25 @@ #include "SPODE.h" namespace platform { class Models { + private: + map> functionRegistry; + static Models* factory; //singleton + Models() {}; public: + Models(Models&) = delete; + void operator=(const Models&) = delete; // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory - static shared_ptr createInstance(const string& name); - static vector getNames() - { - return { "aaaaaAODE", "KDB", "SPODE", "TAN" }; - } - static string toString() - { - return "{aaaaae34223AODE, KDB, SPODE, TAN}"; - //return "{" + names.substr(0, names.size() - 2) + "}"; - } + static Models* instance(); + shared_ptr create(const string& name); + void registerFactoryFunction(const string& name, + function classFactoryFunction); + vector getNames(); + string toString(); + + }; + class Registrar { + public: + Registrar(const string& className, function classFactoryFunction); }; } #endif \ No newline at end of file diff --git a/src/Platform/main.cc b/src/Platform/main.cc index d7d040d..3a4b238 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -21,13 +21,13 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) .default_value(string{ PATH_DATASETS } ); program.add_argument("-m", "--model") - .help("Model to use " + platform::Models::toString()) + .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { - static const vector choices = platform::Models::getNames(); + static const vector choices = platform::Models::instance()->getNames(); if (find(choices.begin(), choices.end(), value) != choices.end()) { return value; } - throw runtime_error("Model must be one of " + platform::Models::toString()); + throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); } ); program.add_argument("--title").default_value("").help("Experiment title"); @@ -76,9 +76,21 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) } return program; } +void registerModels() +{ + static platform::Registrar registrarT("TAN", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();}); + static platform::Registrar registrarS("SPODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);}); + static platform::Registrar registrarK("KDB", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);}); + static platform::Registrar registrarA("AODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();}); +} int main(int argc, char** argv) { + registerModels(); auto program = manageArguments(argc, argv); bool saveResults = false; auto file_name = program.get("dataset"); From 7222119dfbfc0644dae5786489f972bc031a5126 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 19:00:39 +0200 Subject: [PATCH 07/12] Refactor experiment crossvalidation --- sample/CMakeLists.txt | 2 +- sample/sample.cc | 19 +++++++------------ src/Platform/Experiment.cc | 14 ++++++++++++++ src/Platform/Experiment.h | 1 + src/Platform/main.cc | 21 +++------------------ src/Platform/modelRegister.h | 11 +++++++++++ 6 files changed, 37 insertions(+), 31 deletions(-) create mode 100644 src/Platform/modelRegister.h diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 4f9d087..000a88b 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -3,5 +3,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) -add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc) +add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc index f515405..502dcfc 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -4,16 +4,12 @@ #include #include #include -#include "BaseClassifier.h" #include "ArffFiles.h" -#include "Network.h" #include "BayesMetrics.h" #include "CPPFImdlp.h" -#include "KDB.h" -#include "SPODE.h" -#include "AODE.h" -#include "TAN.h" #include "Folding.h" +#include "Models.h" +#include "modelRegister.h" using namespace std; @@ -73,9 +69,8 @@ int main(int argc, char** argv) {"mfeat-factors", true}, }; auto valid_datasets = vector(); - for (auto dataset : datasets) { - valid_datasets.push_back(dataset.first); - } + transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), + [](const pair& pair) { return pair.first; }); argparse::ArgumentParser program("BayesNetSample"); program.add_argument("-d", "--dataset") .help("Dataset file name") @@ -91,13 +86,13 @@ int main(int argc, char** argv) .default_value(string{ PATH } ); program.add_argument("-m", "--model") - .help("Model to use {AODE, KDB, SPODE, TAN}") + .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { - static const vector choices = { "AODE", "KDB", "SPODE", "TAN" }; + static const vector choices = platform::Models::instance()->getNames(); if (find(choices.begin(), choices.end(), value) != choices.end()) { return value; } - throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}"); + throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); } ); program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index ab62ab2..71e8cf4 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -85,11 +85,25 @@ namespace platform { file << data; file.close(); } + void Experiment::show() { json data = build_json(); cout << data.dump(4) << endl; } + + void Experiment::go(vector filesToProcess, const string& path) + { + cout << "*** Starting experiment: " << title << " ***" << endl; + for (auto fileName : filesToProcess) { + cout << "- " << setw(20) << left << fileName << " " << right << flush; + auto result = cross_validation(path, fileName); + result.setDataset(fileName); + addResult(result); + cout << endl; + } + } + Result Experiment::cross_validation(const string& path, const string& fileName) { auto datasets = platform::Datasets(path, true, platform::ARFF); diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index 84b1627..951ac4a 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -106,6 +106,7 @@ namespace platform { string get_file_name(); void save(string path); Result cross_validation(const string& path, const string& fileName); + void go(vector filesToProcess, const string& path); void show(); }; } diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 3a4b238..29c8505 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -5,7 +5,7 @@ #include "Datasets.h" #include "DotEnv.h" #include "Models.h" - +#include "modelRegister.h" using namespace std; const string PATH_RESULTS = "results"; @@ -78,19 +78,11 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) } void registerModels() { - static platform::Registrar registrarT("TAN", - [](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();}); - static platform::Registrar registrarS("SPODE", - [](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);}); - static platform::Registrar registrarK("KDB", - [](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);}); - static platform::Registrar registrarA("AODE", - [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();}); + } int main(int argc, char** argv) { - registerModels(); auto program = manageArguments(argc, argv); bool saveResults = false; auto file_name = program.get("dataset"); @@ -128,15 +120,8 @@ int main(int argc, char** argv) experiment.addRandomSeed(seed); } platform::Timer timer; - cout << "*** Starting experiment: " << title << " ***" << endl; timer.start(); - for (auto fileName : filesToProcess) { - cout << "- " << setw(20) << left << fileName << " " << right << flush; - auto result = experiment.cross_validation(path, fileName); - result.setDataset(fileName); - experiment.addResult(result); - cout << endl; - } + experiment.go(filesToProcess, path); experiment.setDuration(timer.getDuration()); if (saveResults) experiment.save(PATH_RESULTS); diff --git a/src/Platform/modelRegister.h b/src/Platform/modelRegister.h new file mode 100644 index 0000000..a4188bc --- /dev/null +++ b/src/Platform/modelRegister.h @@ -0,0 +1,11 @@ +#ifndef MODEL_REGISTER_H +#define MODEL_REGISTER_H +static platform::Registrar registrarT("TAN", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();}); +static platform::Registrar registrarS("SPODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);}); +static platform::Registrar registrarK("KDB", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);}); +static platform::Registrar registrarA("AODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();}); +#endif \ No newline at end of file From 9a0449c12d8a03a66ade2ae5b24412a16247c47d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 19:38:42 +0200 Subject: [PATCH 08/12] Fix some lint warnings --- sample/sample.cc | 13 ++++--------- src/BayesNet/BayesMetrics.cc | 3 +-- src/BayesNet/BayesMetrics.h | 2 +- src/BayesNet/KDB.h | 2 +- src/BayesNet/Mst.h | 4 ++-- src/BayesNet/Network.h | 6 +++--- src/BayesNet/SPODE.h | 2 +- src/Platform/Models.cc | 19 ------------------- src/Platform/main.cc | 12 ++++-------- 9 files changed, 17 insertions(+), 46 deletions(-) diff --git a/sample/sample.cc b/sample/sample.cc index 502dcfc..2d7efa4 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -148,9 +148,9 @@ int main(int argc, char** argv) // Get className & Features auto className = handler.getClassName(); vector features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), + [](const pair& item) { return item.first; }); // Discretize Dataset auto [Xd, maxes] = discretize(X, y, features); maxes[className] = *max_element(y.begin(), y.end()) + 1; @@ -159,12 +159,7 @@ int main(int argc, char** argv) states[feature] = vector(maxes[feature]); } states[className] = vector(maxes[className]); - auto classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - } - ); - bayesnet::BaseClassifier* clf = classifiers[model_name]; + auto clf = platform::Models::instance()->create(model_name); clf->fit(Xd, y, features, className, states); auto score = clf->score(Xd, y); auto lines = clf->show(); diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 6671995..be15f07 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -12,8 +12,8 @@ namespace bayesnet { : features(features) , className(className) , classNumStates(classNumStates) + , samples(torch::zeros({ static_cast(vsamples[0].size()), static_cast(vsamples.size() + 1) }, torch::kInt32)) { - samples = torch::zeros({ static_cast(vsamples[0].size()), static_cast(vsamples.size() + 1) }, torch::kInt32); for (int i = 0; i < vsamples.size(); ++i) { samples.index_put_({ "...", i }, torch::tensor(vsamples[i], torch::kInt32)); } @@ -123,7 +123,6 @@ namespace bayesnet { */ vector> Metrics::maximumSpanningTree(vector features, Tensor& weights, int root) { - auto result = vector>(); auto mst = MST(features, weights, root); return mst.maximumSpanningTree(); } diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index f8557a0..427b1d4 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -11,7 +11,7 @@ namespace bayesnet { Tensor samples; vector features; string className; - int classNumStates; + int classNumStates = 0; public: Metrics() = default; Metrics(Tensor&, vector&, string&, int); diff --git a/src/BayesNet/KDB.h b/src/BayesNet/KDB.h index b0790da..e3f257f 100644 --- a/src/BayesNet/KDB.h +++ b/src/BayesNet/KDB.h @@ -13,7 +13,7 @@ namespace bayesnet { protected: void train() override; public: - KDB(int k, float theta = 0.03); + explicit KDB(int k, float theta = 0.03); virtual ~KDB() {}; vector graph(string name = "KDB") override; }; diff --git a/src/BayesNet/Mst.h b/src/BayesNet/Mst.h index 15b0dbb..71a46a5 100644 --- a/src/BayesNet/Mst.h +++ b/src/BayesNet/Mst.h @@ -10,7 +10,7 @@ namespace bayesnet { private: Tensor weights; vector features; - int root; + int root = 0; public: MST() = default; MST(vector& features, Tensor& weights, int root); @@ -23,7 +23,7 @@ namespace bayesnet { vector >> T; // vector for mst vector parent; public: - Graph(int V); + explicit Graph(int V); void addEdge(int u, int v, float wt); int find_set(int i); void union_set(int u, int v); diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h index c8d832d..ffa4f67 100644 --- a/src/BayesNet/Network.h +++ b/src/BayesNet/Network.h @@ -27,9 +27,9 @@ namespace bayesnet { void completeFit(); public: Network(); - Network(float, int); - Network(float); - Network(Network&); + explicit Network(float, int); + explicit Network(float); + explicit Network(Network&); torch::Tensor& getSamples(); float getmaxThreads(); void addNode(string, int); diff --git a/src/BayesNet/SPODE.h b/src/BayesNet/SPODE.h index 0f422a7..30f0b46 100644 --- a/src/BayesNet/SPODE.h +++ b/src/BayesNet/SPODE.h @@ -9,7 +9,7 @@ namespace bayesnet { protected: void train() override; public: - SPODE(int root); + explicit SPODE(int root); virtual ~SPODE() {}; vector graph(string name = "SPODE") override; }; diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc index aa23a2d..df1b517 100644 --- a/src/Platform/Models.cc +++ b/src/Platform/Models.cc @@ -2,25 +2,6 @@ namespace platform { using namespace std; // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory - // shared_ptr Models::createInstance(const string& name) - // { - // bayesnet::BaseClassifier* instance = nullptr; - // if (name == "AODE") { - // instance = new bayesnet::AODE(); - // } else if (name == "KDB") { - // instance = new bayesnet::KDB(2); - // } else if (name == "SPODE") { - // instance = new bayesnet::SPODE(2); - // } else if (name == "TAN") { - // instance = new bayesnet::TAN(); - // } else { - // throw runtime_error("Model " + name + " not found"); - // } - // if (instance != nullptr) - // return shared_ptr(instance); - // else - // return nullptr; - // } Models* Models::factory = nullptr;; Models* Models::instance() { diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 29c8505..d9dfb40 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -76,10 +76,6 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) } return program; } -void registerModels() -{ - -} int main(int argc, char** argv) { @@ -92,7 +88,7 @@ int main(int argc, char** argv) auto stratified = program.get("stratified"); auto n_folds = program.get("folds"); auto seeds = program.get>("seeds"); - vector filesToProcess; + vector filesToTest; auto datasets = platform::Datasets(path, true, platform::ARFF); auto title = program.get("title"); if (file_name != "") { @@ -103,9 +99,9 @@ int main(int argc, char** argv) if (title == "") { title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds"; } - filesToProcess.push_back(file_name); + filesToTest.push_back(file_name); } else { - filesToProcess = platform::Datasets(path, true, platform::ARFF).getNames(); + filesToTest = platform::Datasets(path, true, platform::ARFF).getNames(); saveResults = true; } @@ -121,7 +117,7 @@ int main(int argc, char** argv) } platform::Timer timer; timer.start(); - experiment.go(filesToProcess, path); + experiment.go(filesToTest, path); experiment.setDuration(timer.getDuration()); if (saveResults) experiment.save(PATH_RESULTS); From 5efa3beaeeefad8226aada0f19e21fe874163ee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 20:20:38 +0200 Subject: [PATCH 09/12] Fix some lint warnings --- src/BayesNet/Network.cc | 10 +++++----- src/BayesNet/Network.h | 6 +++--- src/Platform/DotEnv.h | 6 +++--- src/Platform/Models.cc | 2 +- src/Platform/main.cc | 23 +++++++++-------------- src/Platform/platformUtils.cc | 18 ++++++++---------- src/Platform/platformUtils.h | 8 ++++---- 7 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index eb3ffeb..1c8abeb 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -20,7 +20,7 @@ namespace bayesnet { { return samples; } - void Network::addNode(string name, int numStates) + void Network::addNode(const string& name, int numStates) { if (find(features.begin(), features.end(), name) == features.end()) { features.push_back(name); @@ -69,7 +69,7 @@ namespace bayesnet { recStack.erase(nodeId); // remove node from recursion stack before function ends return false; } - void Network::addEdge(const string parent, const string child) + void Network::addEdge(const string& parent, const string& child) { if (nodes.find(parent) == nodes.end()) { throw invalid_argument("Parent node " + parent + " does not exist"); @@ -105,8 +105,8 @@ namespace bayesnet { for (int i = 0; i < featureNames.size(); ++i) { auto column = torch::flatten(X.index({ "...", i })); auto k = vector(); - for (auto i = 0; i < X.size(0); ++i) { - k.push_back(column[i].item()); + for (auto z = 0; z < X.size(0); ++z) { + k.push_back(column[z].item()); } dataset[featureNames[i]] = k; } @@ -280,7 +280,7 @@ namespace bayesnet { } return result; } - vector Network::graph(string title) + vector Network::graph(const string& title) { auto output = vector(); auto prefix = "digraph BayesNet {\nlabel=>& getNodes(); vector getFeatures(); int getStates(); @@ -48,7 +48,7 @@ namespace bayesnet { vector> predict_proba(const vector>&); double score(const vector>&, const vector&); vector show(); - vector graph(string title); // Returns a vector of strings representing the graph in graphviz format + vector graph(const string& title); // Returns a vector of strings representing the graph in graphviz format inline string version() { return "0.1.0"; } }; } diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h index af6eda2..a7e3e36 100644 --- a/src/Platform/DotEnv.h +++ b/src/Platform/DotEnv.h @@ -52,9 +52,9 @@ namespace platform { seeds_str = trim(seeds_str); seeds_str = seeds_str.substr(1, seeds_str.size() - 2); auto seeds_str_split = split(seeds_str, ','); - for (auto seed_str : seeds_str_split) { - seeds.push_back(stoi(seed_str)); - } + transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) { + return stoi(str); + }); return seeds; } }; diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc index df1b517..1a66156 100644 --- a/src/Platform/Models.cc +++ b/src/Platform/Models.cc @@ -40,7 +40,7 @@ namespace platform { string Models::toString() { string result = ""; - for (auto& pair : functionRegistry) { + for (const auto& pair : functionRegistry) { result += pair.first + ", "; } return "{" + result.substr(0, result.size() - 2) + "}"; diff --git a/src/Platform/main.cc b/src/Platform/main.cc index d9dfb40..55c0cfe 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -49,22 +49,17 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) }}); auto seed_values = env.getSeeds(); program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); - bool class_last, discretize_dataset, stratified; - int n_folds; - vector seeds; - string model_name, file_name, path, complete_file_name, title; try { program.parse_args(argc, argv); - file_name = program.get("dataset"); - path = program.get("path"); - model_name = program.get("model"); - discretize_dataset = program.get("discretize"); - stratified = program.get("stratified"); - n_folds = program.get("folds"); - seeds = program.get>("seeds"); - complete_file_name = path + file_name + ".arff"; - class_last = false;//datasets[file_name]; - title = program.get("title"); + auto file_name = program.get("dataset"); + auto path = program.get("path"); + auto model_name = program.get("model"); + auto discretize_dataset = program.get("discretize"); + auto stratified = program.get("stratified"); + auto n_folds = program.get("folds"); + auto seeds = program.get>("seeds"); + auto complete_file_name = path + file_name + ".arff"; + auto title = program.get("title"); if (title == "" && file_name == "") { throw runtime_error("title is mandatory if dataset is not provided"); } diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc index f318831..6fca9d9 100644 --- a/src/Platform/platformUtils.cc +++ b/src/Platform/platformUtils.cc @@ -2,7 +2,7 @@ using namespace torch; -vector split(string text, char delimiter) +vector split(const string& text, char delimiter) { vector result; stringstream ss(text); @@ -39,7 +39,7 @@ vector discretizeDataset(vector& X, mdlp::label return Xd; } -bool file_exists(const std::string& name) +bool file_exists(const string& name) { if (FILE* file = fopen(name.c_str(), "r")) { fclose(file); @@ -49,7 +49,7 @@ bool file_exists(const std::string& name) } } -tuple, string, map>> loadDataset(string path, string name, bool class_last, bool discretize_dataset) +tuple, string, map>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset) { auto handler = ArffFiles(); handler.load(path + static_cast(name) + ".arff", class_last); @@ -59,9 +59,8 @@ tuple, string, map>> loadData // Get className & Features auto className = handler.getClassName(); vector features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); Tensor Xd; auto states = map>(); if (discretize_dataset) { @@ -83,7 +82,7 @@ tuple, string, map>> loadData return { Xd, torch::tensor(y, torch::kInt32), features, className, states }; } -tuple>, vector, vector, string, map>> loadFile(string name) +tuple>, vector, vector, string, map>> loadFile(const string& name) { auto handler = ArffFiles(); handler.load(PATH + static_cast(name) + ".arff"); @@ -93,9 +92,8 @@ tuple>, vector, vector, string, map features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); // Discretize Dataset vector Xd; map maxes; diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h index 9515bbf..2b4ca54 100644 --- a/src/Platform/platformUtils.h +++ b/src/Platform/platformUtils.h @@ -11,11 +11,11 @@ using namespace std; const string PATH = "../../data/"; bool file_exists(const std::string& name); -vector split(string text, char delimiter); +vector split(const string& text, char delimiter); pair, map> discretize(vector& X, mdlp::labels_t& y, vector features); vector discretizeDataset(vector& X, mdlp::labels_t& y); -pair>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector& features, string className); -tuple>, vector, vector, string, map>> loadFile(string name); -tuple, string, map>> loadDataset(string path, string name, bool class_last, bool discretize_dataset); +pair>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector& features, const string& className); +tuple>, vector, vector, string, map>> loadFile(const string& name); +tuple, string, map>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset); map> get_states(vector& features, string className, map& maxes); #endif //PLATFORM_UTILS_H From 8b2ed26ab786e69a2c0c22f73e70a16267de5c36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 29 Jul 2023 20:37:51 +0200 Subject: [PATCH 10/12] Fix some lint warnings --- src/BayesNet/TAN.cc | 2 +- src/Platform/Datasets.cc | 21 +++------------------ src/Platform/Datasets.h | 8 ++++---- src/Platform/Experiment.cc | 13 ++++++------- src/Platform/Experiment.h | 34 +++++++++++++++++----------------- src/Platform/Folding.cc | 3 +-- src/Platform/Folding.h | 4 ++-- 7 files changed, 34 insertions(+), 51 deletions(-) diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index 9c8dfff..51f0c1b 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -18,7 +18,7 @@ namespace bayesnet { auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset); mi.push_back({ i, mi_value }); } - sort(mi.begin(), mi.end(), [](auto& left, auto& right) {return left.second < right.second;}); + sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;}); auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class auto weights = metrics.conditionalEdge(); diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 0e1b169..1fef8d0 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -4,9 +4,9 @@ namespace platform { void Datasets::load() { - string line; ifstream catalog(path + "/all.txt"); if (catalog.is_open()) { + string line; while (getline(catalog, line)) { vector tokens = split(line, ','); string name = tokens[0]; @@ -83,23 +83,8 @@ namespace platform { { return datasets.find(name) != datasets.end(); } - Dataset::Dataset(Dataset& dataset) + Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType) { - path = dataset.path; - name = dataset.name; - className = dataset.className; - n_samples = dataset.n_samples; - n_features = dataset.n_features; - features = dataset.features; - states = dataset.states; - loaded = dataset.loaded; - discretize = dataset.discretize; - X = dataset.X; - y = dataset.y; - Xv = dataset.Xv; - Xd = dataset.Xd; - yv = dataset.yv; - fileType = dataset.fileType; } string Dataset::getName() { @@ -168,9 +153,9 @@ namespace platform { } void Dataset::load_csv() { - string line; ifstream file(path + "/" + name + ".csv"); if (file.is_open()) { + string line; getline(file, line); vector tokens = split(line, ','); features = vector(tokens.begin(), tokens.end() - 1); diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h index f6a4c5b..7b68c71 100644 --- a/src/Platform/Datasets.h +++ b/src/Platform/Datasets.h @@ -13,7 +13,7 @@ namespace platform { string name; fileType_t fileType; string className; - int n_samples, n_features; + int n_samples{ 0 }, n_features{ 0 }; vector features; map> states; bool loaded; @@ -27,8 +27,8 @@ namespace platform { void load_arff(); void computeStates(); public: - Dataset(string path, string name, string className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; - Dataset(Dataset&); + Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; + explicit Dataset(const Dataset&); string getName(); string getClassName(); vector getFeatures(); @@ -49,7 +49,7 @@ namespace platform { bool discretize; void load(); // Loads the list of datasets public: - Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; + Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; vector getNames(); vector getFeatures(string name); int getNSamples(string name); diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 71e8cf4..97e7289 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -48,7 +48,7 @@ namespace platform { result["seeds"] = randomSeeds; result["duration"] = duration; result["results"] = json::array(); - for (auto& r : results) { + for (const auto& r : results) { json j; j["dataset"] = r.getDataset(); j["hyperparameters"] = r.getHyperparameters(); @@ -78,7 +78,7 @@ namespace platform { } return result; } - void Experiment::save(string path) + void Experiment::save(const string& path) { json data = build_json(); ofstream file(path + "/" + get_file_name()); @@ -97,14 +97,12 @@ namespace platform { cout << "*** Starting experiment: " << title << " ***" << endl; for (auto fileName : filesToProcess) { cout << "- " << setw(20) << left << fileName << " " << right << flush; - auto result = cross_validation(path, fileName); - result.setDataset(fileName); - addResult(result); + cross_validation(path, fileName); cout << endl; } } - Result Experiment::cross_validation(const string& path, const string& fileName) + void Experiment::cross_validation(const string& path, const string& fileName) { auto datasets = platform::Datasets(path, true, platform::ARFF); // Get dataset @@ -172,6 +170,7 @@ namespace platform { result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); result.setNodes(torch::mean(nodes).item()).setLeaves(torch::mean(edges).item()).setDepth(torch::mean(num_states).item()); - return result; + result.setDataset(fileName); + addResult(result); } } \ No newline at end of file diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index 951ac4a..4305316 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -30,14 +30,14 @@ namespace platform { class Result { private: string dataset, hyperparameters, model_version; - int samples, features, classes; - double score_train, score_test, score_train_std, score_test_std, train_time, train_time_std, test_time, test_time_std; - float nodes, leaves, depth; + int samples{ 0 }, features{ 0 }, classes{ 0 }; + double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 }; + float nodes{ 0 }, leaves{ 0 }, depth{ 0 }; vector scores_train, scores_test, times_train, times_test; public: Result() = default; - Result& setDataset(string dataset) { this->dataset = dataset; return *this; } - Result& setHyperparameters(string hyperparameters) { this->hyperparameters = hyperparameters; return *this; } + Result& setDataset(const string& dataset) { this->dataset = dataset; return *this; } + Result& setHyperparameters(const string& hyperparameters) { this->hyperparameters = hyperparameters; return *this; } Result& setSamples(int samples) { this->samples = samples; return *this; } Result& setFeatures(int features) { this->features = features; return *this; } Result& setClasses(int classes) { this->classes = classes; return *this; } @@ -82,21 +82,21 @@ namespace platform { class Experiment { private: string title, model, platform, score_name, model_version, language_version, language; - bool discretized, stratified; + bool discretized{ false }, stratified{ false }; vector results; vector randomSeeds; - int nfolds; - float duration; + int nfolds{ 0 }; + float duration{ 0 }; json build_json(); public: Experiment() = default; - Experiment& setTitle(string title) { this->title = title; return *this; } - Experiment& setModel(string model) { this->model = model; return *this; } - Experiment& setPlatform(string platform) { this->platform = platform; return *this; } - Experiment& setScoreName(string score_name) { this->score_name = score_name; return *this; } - Experiment& setModelVersion(string model_version) { this->model_version = model_version; return *this; } - Experiment& setLanguage(string language) { this->language = language; return *this; } - Experiment& setLanguageVersion(string language_version) { this->language_version = language_version; return *this; } + Experiment& setTitle(const string& title) { this->title = title; return *this; } + Experiment& setModel(const string& model) { this->model = model; return *this; } + Experiment& setPlatform(const string& platform) { this->platform = platform; return *this; } + Experiment& setScoreName(const string& score_name) { this->score_name = score_name; return *this; } + Experiment& setModelVersion(const string& model_version) { this->model_version = model_version; return *this; } + Experiment& setLanguage(const string& language) { this->language = language; return *this; } + Experiment& setLanguageVersion(const string& language_version) { this->language_version = language_version; return *this; } Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; } Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; } Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; } @@ -104,8 +104,8 @@ namespace platform { Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; } Experiment& setDuration(float duration) { this->duration = duration; return *this; } string get_file_name(); - void save(string path); - Result cross_validation(const string& path, const string& fileName); + void save(const string& path); + void cross_validation(const string& path, const string& fileName); void go(vector filesToProcess, const string& path); void show(); }; diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc index ec7c4b5..7c59bce 100644 --- a/src/Platform/Folding.cc +++ b/src/Platform/Folding.cc @@ -7,9 +7,8 @@ Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed) random_seed = default_random_engine(seed == -1 ? rd() : seed); srand(seed == -1 ? time(0) : seed); } -KFold::KFold(int k, int n, int seed) : Fold(k, n, seed) +KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector(n)) { - indices = vector(n); iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1 shuffle(indices.begin(), indices.end(), random_seed); } diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h index d7736d0..eaf0c4b 100644 --- a/src/Platform/Folding.h +++ b/src/Platform/Folding.h @@ -22,7 +22,7 @@ private: vector indices; public: KFold(int k, int n, int seed = -1); - pair, vector> getFold(int nFold); + pair, vector> getFold(int nFold) override; }; class StratifiedKFold : public Fold { private: @@ -32,6 +32,6 @@ private: public: StratifiedKFold(int k, const vector& y, int seed = -1); StratifiedKFold(int k, torch::Tensor& y, int seed = -1); - pair, vector> getFold(int nFold); + pair, vector> getFold(int nFold) override; }; #endif \ No newline at end of file From b882569169065d70e05d7df28b929e1dd9aeba81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 30 Jul 2023 00:04:18 +0200 Subject: [PATCH 11/12] Fix some more lint warnings --- src/BayesNet/Mst.cc | 8 ++++---- src/BayesNet/Node.cc | 19 ++++++------------- src/BayesNet/Node.h | 4 ++-- src/Platform/Datasets.cc | 11 ++++------- src/Platform/Datasets.h | 4 ++-- 5 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/BayesNet/Mst.cc b/src/BayesNet/Mst.cc index b86812b..5e94dc3 100644 --- a/src/BayesNet/Mst.cc +++ b/src/BayesNet/Mst.cc @@ -7,7 +7,7 @@ namespace bayesnet { using namespace std; - Graph::Graph(int V) + Graph::Graph(int V) : V(V) { parent = vector(V); for (int i = 0; i < V; i++) @@ -34,10 +34,10 @@ namespace bayesnet { } void Graph::kruskal_algorithm() { - int i, uSt, vEd; // sort the edges ordered on decreasing weight - sort(G.begin(), G.end(), [](auto& left, auto& right) {return left.first > right.first;}); - for (i = 0; i < G.size(); i++) { + sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;}); + for (int i = 0; i < G.size(); i++) { + int uSt, vEd; uSt = find_set(G[i].second.first); vEd = find_set(G[i].second.second); if (uSt != vEd) { diff --git a/src/BayesNet/Node.cc b/src/BayesNet/Node.cc index d33fecf..095cff7 100644 --- a/src/BayesNet/Node.cc +++ b/src/BayesNet/Node.cc @@ -88,18 +88,15 @@ namespace bayesnet { { // Get dimensions of the CPT dimensions.push_back(numStates); - for (auto father : getParents()) { - dimensions.push_back(father->getNumStates()); - } + transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); }); + // Create a tensor of zeros with the dimensions of the CPT cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing; // Fill table with counts for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) { torch::List> coordinates; coordinates.push_back(torch::tensor(dataset[name][n_sample])); - for (auto father : getParents()) { - coordinates.push_back(torch::tensor(dataset[father->getName()][n_sample])); - } + transform(parents.begin(), parents.end(), back_inserter(coordinates), [&dataset, &n_sample](const auto& parent) { return torch::tensor(dataset[parent->getName()][n_sample]); }); // Increment the count of the corresponding coordinate cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1); } @@ -111,19 +108,15 @@ namespace bayesnet { torch::List> coordinates; // following predetermined order of indices in the cpTable (see Node.h) coordinates.push_back(torch::tensor(evidence[name])); - for (auto parent : getParents()) { - coordinates.push_back(torch::tensor(evidence[parent->getName()])); - } + transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return torch::tensor(evidence[parent->getName()]); }); return cpTable.index({ coordinates }).item(); } - vector Node::graph(string className) + vector Node::graph(const string& className) { auto output = vector(); auto suffix = name == className ? ", fontcolor=red, fillcolor=lightblue, style=filled " : ""; output.push_back(name + " [shape=circle" + suffix + "] \n"); - for (auto& child : children) { - output.push_back(name + " -> " + child->getName()); - } + transform(children.begin(), children.end(), back_inserter(output), [this](const auto& child) { return name + " -> " + child->getName(); }); return output; } } \ No newline at end of file diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 5c5932a..3a5bbe6 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -16,7 +16,7 @@ namespace bayesnet { vector dimensions; // dimensions of the cpTable public: vector> combinations(const vector&); - Node(const std::string&, int); + Node(const string&, int); void clear(); void addParent(Node*); void addChild(Node*); @@ -30,7 +30,7 @@ namespace bayesnet { int getNumStates() const; void setNumStates(int); unsigned minFill(); - vector graph(string clasName); // Returns a vector of strings representing the graph in graphviz format + vector graph(const string& clasName); // Returns a vector of strings representing the graph in graphviz format float getFactorValue(map&); }; } diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 1fef8d0..11b83ac 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -21,9 +21,7 @@ namespace platform { vector Datasets::getNames() { vector result; - for (auto& d : datasets) { - result.push_back(d.first); - } + transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; }); return result; } vector Datasets::getFeatures(string name) @@ -79,7 +77,7 @@ namespace platform { } return datasets[name]->getTensors(); } - bool Datasets::isDataset(string name) + bool Datasets::isDataset(const string& name) { return datasets.find(name) != datasets.end(); } @@ -193,9 +191,8 @@ namespace platform { yv = arff.getY(); // Get className & Features className = arff.getClassName(); - for (auto feature : arff.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = arff.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; }); } void Dataset::load() { diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h index 7b68c71..4ccd1f0 100644 --- a/src/Platform/Datasets.h +++ b/src/Platform/Datasets.h @@ -49,7 +49,7 @@ namespace platform { bool discretize; void load(); // Loads the list of datasets public: - Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; + explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; vector getNames(); vector getFeatures(string name); int getNSamples(string name); @@ -58,7 +58,7 @@ namespace platform { pair>&, vector&> getVectors(string name); pair>&, vector&> getVectorsDiscretized(string name); pair getTensors(string name); - bool isDataset(string name); + bool isDataset(const string& name); }; }; From 4ebc9c201341f2b49de82bc7bc97c69126991f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 30 Jul 2023 00:16:58 +0200 Subject: [PATCH 12/12] Complete fixing the linter warnings --- src/BayesNet/Classifier.cc | 1 - src/BayesNet/Ensemble.cc | 6 +++--- src/BayesNet/Mst.cc | 3 +-- src/BayesNet/Network.cc | 9 ++------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index d77c2c8..545525e 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -125,7 +125,6 @@ namespace bayesnet { } void Classifier::addNodes() { - auto test = model.getEdges(); // Add all nodes to the network for (auto feature : features) { model.addNode(feature, states[feature].size()); diff --git a/src/BayesNet/Ensemble.cc b/src/BayesNet/Ensemble.cc index 8aa2518..dce0d3d 100644 --- a/src/BayesNet/Ensemble.cc +++ b/src/BayesNet/Ensemble.cc @@ -148,10 +148,10 @@ namespace bayesnet { } int Ensemble::getNumberOfStates() { - int states = 0; + int nstates = 0; for (auto i = 0; i < n_models; ++i) { - states += models[i]->getNumberOfStates(); + nstates += models[i]->getNumberOfStates(); } - return states; + return nstates; } } \ No newline at end of file diff --git a/src/BayesNet/Mst.cc b/src/BayesNet/Mst.cc index 5e94dc3..3a48d05 100644 --- a/src/BayesNet/Mst.cc +++ b/src/BayesNet/Mst.cc @@ -7,9 +7,8 @@ namespace bayesnet { using namespace std; - Graph::Graph(int V) : V(V) + Graph::Graph(int V) : V(V), parent(vector(V)) { - parent = vector(V); for (int i = 0; i < V; i++) parent[i] = i; G.clear(); diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index 1c8abeb..35b3cc5 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -8,7 +8,7 @@ namespace bayesnet { Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getmaxThreads()), fitted(other.fitted) { - for (auto& pair : other.nodes) { + for (const auto& pair : other.nodes) { nodes[pair.first] = std::make_unique(*pair.second); } } @@ -145,9 +145,6 @@ namespace bayesnet { while (nextNodeIndex < nodes.size()) { unique_lock lock(mtx); cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; }); - if (nextNodeIndex >= nodes.size()) { - break; // No more work remaining - } threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() { while (true) { unique_lock lock(mtx); @@ -262,9 +259,7 @@ namespace bayesnet { // Normalize result double sum = accumulate(result.begin(), result.end(), 0.0); - for (double& value : result) { - value /= sum; - } + transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; }); return result; } vector Network::show()