From 5a7c8f18182b48a4c0c4a2e06db6f0d00e9f90dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 5 Sep 2023 13:39:43 +0200 Subject: [PATCH] Add status to classifier and Experiment --- sample/sample.cc | 386 +++++++++++++++++++--------------- src/BayesNet/BaseClassifier.h | 2 + src/BayesNet/BoostAODE.cc | 17 +- src/BayesNet/Classifier.h | 2 + src/Platform/Experiment.cc | 29 ++- 5 files changed, 256 insertions(+), 180 deletions(-) diff --git a/sample/sample.cc b/sample/sample.cc index 9d7175f..7e9d569 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -58,180 +58,226 @@ pair>, vector> extract_indices(vector indices, vect int main(int argc, char** argv) { - map datasets = { - {"diabetes", true}, - {"ecoli", true}, - {"glass", true}, - {"iris", true}, - {"kdd_JapaneseVowels", false}, - {"letter", true}, - {"liver-disorders", true}, - {"mfeat-factors", true}, - }; - auto valid_datasets = vector(); - transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), - [](const pair& pair) { return pair.first; }); - argparse::ArgumentParser program("BayesNetSample"); - program.add_argument("-d", "--dataset") - .help("Dataset file name") - .action([valid_datasets](const std::string& value) { - if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { - return value; - } - throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); - } - ); - program.add_argument("-p", "--path") - .help(" folder where the data files are located, default") - .default_value(string{ PATH } - ); - program.add_argument("-m", "--model") - .help("Model to use " + platform::Models::instance()->toString()) - .action([](const std::string& value) { - static const vector choices = platform::Models::instance()->getNames(); - if (find(choices.begin(), choices.end(), value) != choices.end()) { - return value; - } - throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); - } - ); - program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); - program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); - program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); - program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true); - program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { - try { - auto k = stoi(value); - if (k < 2) { - throw runtime_error("Number of folds must be greater than 1"); - } - return k; - } - catch (const runtime_error& err) { - throw runtime_error(err.what()); - } - catch (...) { - throw runtime_error("Number of folds must be an integer"); - }}); - program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); - bool class_last, stratified, tensors, dump_cpt; - string model_name, file_name, path, complete_file_name; - int nFolds, seed; - try { - program.parse_args(argc, argv); - file_name = program.get("dataset"); - path = program.get("path"); - model_name = program.get("model"); - complete_file_name = path + file_name + ".arff"; - stratified = program.get("stratified"); - tensors = program.get("tensors"); - nFolds = program.get("folds"); - seed = program.get("seed"); - dump_cpt = program.get("dumpcpt"); - class_last = datasets[file_name]; - if (!file_exists(complete_file_name)) { - throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); - } + torch::Tensor weights_ = torch::full({ 10 }, 1.0 / 10, torch::kFloat64); + torch::Tensor y_ = torch::tensor({ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, torch::kInt32); + torch::Tensor ypred = torch::tensor({ 1, 1, 1, 0, 0, 1, 1, 1, 1, 0 }, torch::kInt32); + cout << "Initial weights_: " << endl; + for (int i = 0; i < 10; i++) { + cout << weights_.index({ i }).item() << ", "; } - catch (const exception& err) { - cerr << err.what() << endl; - cerr << program; - exit(1); + cout << "end." << endl; + cout << "y_: " << endl; + for (int i = 0; i < 10; i++) { + cout << y_.index({ i }).item() << ", "; } + cout << "end." << endl; + cout << "ypred: " << endl; + for (int i = 0; i < 10; i++) { + cout << ypred.index({ i }).item() << ", "; + } + cout << "end." << endl; + auto mask_wrong = ypred != y_; + auto mask_right = ypred == y_; + auto masked_weights = weights_ * mask_wrong.to(weights_.dtype()); + double epsilon_t = masked_weights.sum().item(); + cout << "epsilon_t: " << epsilon_t << endl; + double wt = (1 - epsilon_t) / epsilon_t; + cout << "wt: " << wt << endl; + double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt); + cout << "alpha_t: " << alpha_t << endl; + // Step 3.2: Update weights for next classifier + // Step 3.2.1: Update weights of wrong samples + cout << "exp(alpha_t): " << exp(alpha_t) << endl; + cout << "exp(-alpha_t): " << exp(-alpha_t) << endl; + weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_; + // Step 3.2.2: Update weights of right samples + weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_; + // Step 3.3: Normalise the weights + double totalWeights = torch::sum(weights_).item(); + cout << "totalWeights: " << totalWeights << endl; + cout << "Before normalization: " << endl; + for (int i = 0; i < 10; i++) { + cout << weights_.index({ i }).item() << endl; + } + weights_ = weights_ / totalWeights; + cout << "After normalization: " << endl; + for (int i = 0; i < 10; i++) { + cout << weights_.index({ i }).item() << endl; + } + // map datasets = { + // {"diabetes", true}, + // {"ecoli", true}, + // {"glass", true}, + // {"iris", true}, + // {"kdd_JapaneseVowels", false}, + // {"letter", true}, + // {"liver-disorders", true}, + // {"mfeat-factors", true}, + // }; + // auto valid_datasets = vector(); + // transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), + // [](const pair& pair) { return pair.first; }); + // argparse::ArgumentParser program("BayesNetSample"); + // program.add_argument("-d", "--dataset") + // .help("Dataset file name") + // .action([valid_datasets](const std::string& value) { + // if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { + // return value; + // } + // throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); + // } + // ); + // program.add_argument("-p", "--path") + // .help(" folder where the data files are located, default") + // .default_value(string{ PATH } + // ); + // program.add_argument("-m", "--model") + // .help("Model to use " + platform::Models::instance()->toString()) + // .action([](const std::string& value) { + // static const vector choices = platform::Models::instance()->getNames(); + // if (find(choices.begin(), choices.end(), value) != choices.end()) { + // return value; + // } + // throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); + // } + // ); + // program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); + // program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true); + // program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); + // program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true); + // program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { + // try { + // auto k = stoi(value); + // if (k < 2) { + // throw runtime_error("Number of folds must be greater than 1"); + // } + // return k; + // } + // catch (const runtime_error& err) { + // throw runtime_error(err.what()); + // } + // catch (...) { + // throw runtime_error("Number of folds must be an integer"); + // }}); + // program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); + // bool class_last, stratified, tensors, dump_cpt; + // string model_name, file_name, path, complete_file_name; + // int nFolds, seed; + // try { + // program.parse_args(argc, argv); + // file_name = program.get("dataset"); + // path = program.get("path"); + // model_name = program.get("model"); + // complete_file_name = path + file_name + ".arff"; + // stratified = program.get("stratified"); + // tensors = program.get("tensors"); + // nFolds = program.get("folds"); + // seed = program.get("seed"); + // dump_cpt = program.get("dumpcpt"); + // class_last = datasets[file_name]; + // if (!file_exists(complete_file_name)) { + // throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist"); + // } + // } + // catch (const exception& err) { + // cerr << err.what() << endl; + // cerr << program; + // exit(1); + // } /* * Begin Processing */ - auto handler = ArffFiles(); - handler.load(complete_file_name, class_last); - // Get Dataset X, y - vector& X = handler.getX(); - mdlp::labels_t& y = handler.getY(); - // Get className & Features - auto className = handler.getClassName(); - vector features; - auto attributes = handler.getAttributes(); - transform(attributes.begin(), attributes.end(), back_inserter(features), - [](const pair& item) { return item.first; }); - // Discretize Dataset - auto [Xd, maxes] = discretize(X, y, features); - maxes[className] = *max_element(y.begin(), y.end()) + 1; - map> states; - for (auto feature : features) { - states[feature] = vector(maxes[feature]); - } - states[className] = vector(maxes[className]); - auto clf = platform::Models::instance()->create(model_name); - clf->fit(Xd, y, features, className, states); - if (dump_cpt) { - cout << "--- CPT Tables ---" << endl; - clf->dump_cpt(); - } - auto lines = clf->show(); - for (auto line : lines) { - cout << line << endl; - } - cout << "--- Topological Order ---" << endl; - auto order = clf->topological_order(); - for (auto name : order) { - cout << name << ", "; - } - cout << "end." << endl; - auto score = clf->score(Xd, y); - cout << "Score: " << score << endl; - auto graph = clf->graph(); - auto dot_file = model_name + "_" + file_name; - ofstream file(dot_file + ".dot"); - file << graph; - file.close(); - cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; - cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; - string stratified_string = stratified ? " Stratified" : ""; - cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; - cout << "==========================================" << endl; - torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - torch::Tensor yt = torch::tensor(y, torch::kInt32); - for (int i = 0; i < features.size(); ++i) { - Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - } - float total_score = 0, total_score_train = 0, score_train, score_test; - platform::Fold* fold; - if (stratified) - fold = new platform::StratifiedKFold(nFolds, y, seed); - else - fold = new platform::KFold(nFolds, y.size(), seed); - for (auto i = 0; i < nFolds; ++i) { - auto [train, test] = fold->getFold(i); - cout << "Fold: " << i + 1 << endl; - if (tensors) { - auto ttrain = torch::tensor(train, torch::kInt64); - auto ttest = torch::tensor(test, torch::kInt64); - torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - torch::Tensor ytraint = yt.index({ ttrain }); - torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - torch::Tensor ytestt = yt.index({ ttest }); - clf->fit(Xtraint, ytraint, features, className, states); - auto temp = clf->predict(Xtraint); - score_train = clf->score(Xtraint, ytraint); - score_test = clf->score(Xtestt, ytestt); - } else { - auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - auto [Xtest, ytest] = extract_indices(test, Xd, y); - clf->fit(Xtrain, ytrain, features, className, states); - score_train = clf->score(Xtrain, ytrain); - score_test = clf->score(Xtest, ytest); - } - if (dump_cpt) { - cout << "--- CPT Tables ---" << endl; - clf->dump_cpt(); - } - total_score_train += score_train; - total_score += score_test; - cout << "Score Train: " << score_train << endl; - cout << "Score Test : " << score_test << endl; - cout << "-------------------------------------------------------------------------------" << endl; - } - cout << "**********************************************************************************" << endl; - cout << "Average Score Train: " << total_score_train / nFolds << endl; - cout << "Average Score Test : " << total_score / nFolds << endl;return 0; + // auto handler = ArffFiles(); + // handler.load(complete_file_name, class_last); + // // Get Dataset X, y + // vector& X = handler.getX(); + // mdlp::labels_t& y = handler.getY(); + // // Get className & Features + // auto className = handler.getClassName(); + // vector features; + // auto attributes = handler.getAttributes(); + // transform(attributes.begin(), attributes.end(), back_inserter(features), + // [](const pair& item) { return item.first; }); + // // Discretize Dataset + // auto [Xd, maxes] = discretize(X, y, features); + // maxes[className] = *max_element(y.begin(), y.end()) + 1; + // map> states; + // for (auto feature : features) { + // states[feature] = vector(maxes[feature]); + // } + // states[className] = vector(maxes[className]); + // auto clf = platform::Models::instance()->create(model_name); + // clf->fit(Xd, y, features, className, states); + // if (dump_cpt) { + // cout << "--- CPT Tables ---" << endl; + // clf->dump_cpt(); + // } + // auto lines = clf->show(); + // for (auto line : lines) { + // cout << line << endl; + // } + // cout << "--- Topological Order ---" << endl; + // auto order = clf->topological_order(); + // for (auto name : order) { + // cout << name << ", "; + // } + // cout << "end." << endl; + // auto score = clf->score(Xd, y); + // cout << "Score: " << score << endl; + // auto graph = clf->graph(); + // auto dot_file = model_name + "_" + file_name; + // ofstream file(dot_file + ".dot"); + // file << graph; + // file.close(); + // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; + // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; + // string stratified_string = stratified ? " Stratified" : ""; + // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; + // cout << "==========================================" << endl; + // torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + // torch::Tensor yt = torch::tensor(y, torch::kInt32); + // for (int i = 0; i < features.size(); ++i) { + // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + // } + // float total_score = 0, total_score_train = 0, score_train, score_test; + // platform::Fold* fold; + // if (stratified) + // fold = new platform::StratifiedKFold(nFolds, y, seed); + // else + // fold = new platform::KFold(nFolds, y.size(), seed); + // for (auto i = 0; i < nFolds; ++i) { + // auto [train, test] = fold->getFold(i); + // cout << "Fold: " << i + 1 << endl; + // if (tensors) { + // auto ttrain = torch::tensor(train, torch::kInt64); + // auto ttest = torch::tensor(test, torch::kInt64); + // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + // torch::Tensor ytraint = yt.index({ ttrain }); + // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + // torch::Tensor ytestt = yt.index({ ttest }); + // clf->fit(Xtraint, ytraint, features, className, states); + // auto temp = clf->predict(Xtraint); + // score_train = clf->score(Xtraint, ytraint); + // score_test = clf->score(Xtestt, ytestt); + // } else { + // auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + // auto [Xtest, ytest] = extract_indices(test, Xd, y); + // clf->fit(Xtrain, ytrain, features, className, states); + // score_train = clf->score(Xtrain, ytrain); + // score_test = clf->score(Xtest, ytest); + // } + // if (dump_cpt) { + // cout << "--- CPT Tables ---" << endl; + // clf->dump_cpt(); + // } + // total_score_train += score_train; + // total_score += score_test; + // cout << "Score Train: " << score_train << endl; + // cout << "Score Test : " << score_test << endl; + // cout << "-------------------------------------------------------------------------------" << endl; + // } + // cout << "**********************************************************************************" << endl; + // cout << "Average Score Train: " << total_score_train / nFolds << endl; + // cout << "Average Score Test : " << total_score / nFolds << endl;return 0; } \ No newline at end of file diff --git a/src/BayesNet/BaseClassifier.h b/src/BayesNet/BaseClassifier.h index 8cdb038..5337e78 100644 --- a/src/BayesNet/BaseClassifier.h +++ b/src/BayesNet/BaseClassifier.h @@ -5,6 +5,7 @@ #include namespace bayesnet { using namespace std; + enum status_t { NORMAL, WARNING, ERROR }; class BaseClassifier { protected: virtual void trainModel(const torch::Tensor& weights) = 0; @@ -18,6 +19,7 @@ namespace bayesnet { virtual ~BaseClassifier() = default; torch::Tensor virtual predict(torch::Tensor& X) = 0; vector virtual predict(vector>& X) = 0; + status_t virtual getStatus() const = 0; float virtual score(vector>& X, vector& y) = 0; float virtual score(torch::Tensor& X, torch::Tensor& y) = 0; int virtual getNumberOfNodes()const = 0; diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index 25e7602..61d013d 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,6 +1,7 @@ #include "BoostAODE.h" #include #include "BayesMetrics.h" +#include "Colors.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -64,22 +65,26 @@ namespace bayesnet { auto ypred = model->predict(X_); // Step 3.1: Compute the classifier amout of say auto mask_wrong = ypred != y_; + auto mask_right = ypred == y_; auto masked_weights = weights_ * mask_wrong.to(weights_.dtype()); - double wrongWeights = masked_weights.sum().item(); - double significance = wrongWeights == 0 ? 1 : 0.5 * log((1 - wrongWeights) / wrongWeights); + double epsilon_t = masked_weights.sum().item(); + double wt = (1 - epsilon_t) / epsilon_t; + double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt); // Step 3.2: Update weights for next classifier // Step 3.2.1: Update weights of wrong samples - weights_ += mask_wrong.to(weights_.dtype()) * exp(significance) * weights_; + weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_; + // Step 3.2.2: Update weights of right samples + weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_; // Step 3.3: Normalise the weights double totalWeights = torch::sum(weights_).item(); weights_ = weights_ / totalWeights; // Step 3.4: Store classifier and its accuracy to weigh its future vote models.push_back(std::move(model)); - significanceModels.push_back(significance); - exitCondition = n_models == maxModels && repeatSparent; + significanceModels.push_back(alpha_t); + exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5; } if (featuresUsed.size() != features.size()) { - cout << "Warning: BoostAODE did not use all the features" << endl; + status = WARNING; } weights.copy_(weights_); } diff --git a/src/BayesNet/Classifier.h b/src/BayesNet/Classifier.h index 6be3300..011987b 100644 --- a/src/BayesNet/Classifier.h +++ b/src/BayesNet/Classifier.h @@ -21,6 +21,7 @@ namespace bayesnet { string className; map> states; Tensor dataset; // (n+1)xm tensor + status_t status = NORMAL; void checkFitParameters(); virtual void buildModel(const torch::Tensor& weights) = 0; void trainModel(const torch::Tensor& weights) override; @@ -37,6 +38,7 @@ namespace bayesnet { int getNumberOfEdges() const override; int getNumberOfStates() const override; Tensor predict(Tensor& X) override; + status_t getStatus() const override { return status; } vector predict(vector>& X) override; float score(Tensor& X, Tensor& y) override; float score(vector>& X, vector& y) override; diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 7fff706..c1d1048 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -111,6 +111,26 @@ namespace platform { } } + string getColor(bayesnet::status_t status) + { + switch (status) { + case bayesnet::NORMAL: + return Colors::GREEN(); + case bayesnet::WARNING: + return Colors::YELLOW(); + case bayesnet::ERROR: + return Colors::RED(); + default: + return Colors::RESET(); + } + } + + void showProgress(int fold, const string& color, const string& phase) + { + string prefix = phase == "a" ? "" : "\b\b\b\b"; + cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush; + + } void Experiment::cross_validation(const string& path, const string& fileName) { auto datasets = platform::Datasets(path, discretized, platform::ARFF); @@ -159,23 +179,24 @@ namespace platform { auto y_train = y.index({ train_t }); auto X_test = X.index({ "...", test_t }); auto y_test = y.index({ test_t }); - cout << nfold + 1 << "(a)" << flush; + showProgress(nfold + 1, getColor(clf->getStatus()), "a"); // Train model clf->fit(X_train, y_train, features, className, states); - cout << "\b\bb)" << flush; + showProgress(nfold + 1, getColor(clf->getStatus()), "b"); nodes[item] = clf->getNumberOfNodes(); edges[item] = clf->getNumberOfEdges(); num_states[item] = clf->getNumberOfStates(); train_time[item] = train_timer.getDuration(); + // Score train auto accuracy_train_value = clf->score(X_train, y_train); - cout << "\b\bc)" << flush; // Test model + showProgress(nfold + 1, getColor(clf->getStatus()), "c"); test_timer.start(); auto accuracy_test_value = clf->score(X_test, y_test); - cout << "\b\b\b, " << flush; test_time[item] = test_timer.getDuration(); accuracy_train[item] = accuracy_train_value; accuracy_test[item] = accuracy_test_value; + cout << "\b\b\b, " << flush; // Store results and times in vector result.addScoreTrain(accuracy_train_value); result.addScoreTest(accuracy_test_value);