From 2f2ed00ca195296468feb4bd2eb7179afa480415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sun, 14 Jul 2024 12:48:33 +0200 Subject: [PATCH] Add roc-auc-ovr as score to b_main --- src/CMakeLists.txt | 2 +- src/commands/b_main.cpp | 12 ++++++--- src/common/DotEnv.h | 28 ++++++++++----------- src/common/Utils.h | 10 ++++++++ src/main/Experiment.cpp | 46 +++++++++++++++-------------------- src/main/Experiment.h | 3 ++- src/main/PartialResult.h | 6 ----- src/main/RocAuc.cpp | 22 +---------------- src/main/Scores.cpp | 41 ++++++++++++++++++++++++++++++- src/main/Scores.h | 6 ++++- src/reports/ReportConsole.cpp | 9 ++----- 11 files changed, 104 insertions(+), 81 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13070ad..1e87c96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,7 +47,7 @@ add_executable(b_list commands/b_list.cpp target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") # b_main -set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp RocAuc.cpp) +set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp) list(TRANSFORM main_sources PREPEND main/) add_executable(b_main commands/b_main.cpp ${main_sources} common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp diff --git a/src/commands/b_main.cpp b/src/commands/b_main.cpp index 32c2436..e4b0ea1 100644 --- a/src/commands/b_main.cpp +++ b/src/commands/b_main.cpp @@ -58,6 +58,11 @@ void manageArguments(argparse::ArgumentParser& program) for (auto choice : valid_choices) { smooth_arg.choices(choice); } + auto& score_arg = program.add_argument("-s", "--score").help("Score to use. Valid values: " + env.valid_values("score")).default_value(env.get("score")); + valid_choices = env.valid_tokens("score"); + for (auto choice : valid_choices) { + score_arg.choices(choice); + } program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true); program.add_argument("--graph").help("generate graphviz dot files with the model").default_value(false).implicit_value(true); program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true); @@ -79,14 +84,14 @@ void manageArguments(argparse::ArgumentParser& program) throw std::runtime_error("Number of folds must be an integer"); }}); auto seed_values = env.getSeeds(); - program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); + program.add_argument("--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); } int main(int argc, char** argv) { argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() }); manageArguments(program); - std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat; + std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat, score; json hyperparameters_json; bool discretize_dataset, stratified, saveResults, quiet, no_train_score, generate_fold_files, graph; std::vector seeds; @@ -106,6 +111,7 @@ int main(int argc, char** argv) quiet = program.get("quiet"); graph = program.get("graph"); n_folds = program.get("folds"); + score = program.get("score"); seeds = program.get>("seeds"); auto hyperparameters = program.get("hyperparameters"); hyperparameters_json = json::parse(hyperparameters); @@ -195,7 +201,7 @@ int main(int argc, char** argv) experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1"); experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); - experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); + experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score); experiment.setHyperparameters(test_hyperparams); for (auto seed : seeds) { experiment.addRandomSeed(seed); diff --git a/src/common/DotEnv.h b/src/common/DotEnv.h index a576a75..0935423 100644 --- a/src/common/DotEnv.h +++ b/src/common/DotEnv.h @@ -19,24 +19,24 @@ namespace platform { { valid = { - {"source_data", {"Arff", "Tanveer", "Surcov", "Test"}}, + {"depth", {"any"}}, + {"discretize", {"0", "1"}}, + {"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}}, {"experiment", {"discretiz", "odte", "covid", "Test"}}, {"fit_features", {"0", "1"}}, - {"discretize", {"0", "1"}}, - {"ignore_nan", {"0", "1"}}, - {"stratified", {"0", "1"}}, - {"score", {"accuracy"}}, {"framework", {"bulma", "bootstrap"}}, - {"margin", {"0.1", "0.2", "0.3"}}, - {"n_folds", {"5", "10"}}, - {"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}}, - {"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}}, - {"platform", {"any"}}, - {"model", {"any"}}, - {"seeds", {"any"}}, - {"nodes", {"any"}}, + {"ignore_nan", {"0", "1"}}, {"leaves", {"any"}}, - {"depth", {"any"}}, + {"margin", {"0.1", "0.2", "0.3"}}, + {"model", {"any"}}, + {"n_folds", {"5", "10"}}, + {"nodes", {"any"}}, + {"platform", {"any"}}, + {"stratified", {"0", "1"}}, + {"score", {"accuracy", "roc-auc-ovr"}}, + {"seeds", {"any"}}, + {"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}}, + {"source_data", {"Arff", "Tanveer", "Surcov", "Test"}}, }; if (create) { // For testing purposes diff --git a/src/common/Utils.h b/src/common/Utils.h index 692ed11..b62b783 100644 --- a/src/common/Utils.h +++ b/src/common/Utils.h @@ -4,7 +4,17 @@ #include #include #include +#include namespace platform { + template + std::vector tensorToVector(const torch::Tensor& tensor) + { + torch::Tensor contig_tensor = tensor.contiguous(); + auto num_elements = contig_tensor.numel(); + const T* tensor_data = contig_tensor.data_ptr(); + std::vector result(tensor_data, tensor_data + num_elements); + return result; + } static std::string trim(const std::string& str) { std::string result = str; diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index 4e98cb8..10997d0 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -3,7 +3,6 @@ #include "common/Paths.h" #include "Models.h" #include "Scores.h" -#include "RocAuc.h" #include "Experiment.h" namespace platform { using json = nlohmann::ordered_json; @@ -86,7 +85,14 @@ namespace platform { return Colors::RESET(); } } - + score_t Experiment::parse_score() const + { + if (result.getScoreName() == "accuracy") + return score_t::ACCURACY; + if (result.getScoreName() == "roc-auc-ovr") + return score_t::ROC_AUC_OVR; + throw std::runtime_error("Unknown score: " + result.getScoreName()); + } void showProgress(int fold, const std::string& color, const std::string& phase) { std::string prefix = phase == "-" ? "" : "\b\b\b\b"; @@ -159,10 +165,8 @@ namespace platform { // Initialize results std::vectors // int nResults = nfolds * static_cast(randomSeeds.size()); - auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); - auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); - auto auc_test = torch::zeros({ nResults }, torch::kFloat64); - auto auc_train = torch::zeros({ nResults }, torch::kFloat64); + auto score_test = torch::zeros({ nResults }, torch::kFloat64); + auto score_train = torch::zeros({ nResults }, torch::kFloat64); auto train_time = torch::zeros({ nResults }, torch::kFloat64); auto test_time = torch::zeros({ nResults }, torch::kFloat64); auto nodes = torch::zeros({ nResults }, torch::kFloat64); @@ -178,6 +182,7 @@ namespace platform { // // Loop over random seeds // + auto score = parse_score(); for (auto seed : randomSeeds) { if (!quiet) { string prefix = " "; @@ -227,17 +232,14 @@ namespace platform { edges[item] = clf->getNumberOfEdges(); num_states[item] = clf->getNumberOfStates(); train_time[item] = train_timer.getDuration(); - double accuracy_train_value = 0.0; + double score_train_value = 0.0; // // Score train // - double auc_train_value = 0; if (!no_train_score) { - auto roc_auc = RocAuc(); auto y_proba_train = clf->predict_proba(X_train); Scores scores(y_train, y_proba_train, num_classes, labels); - accuracy_train_value = scores.accuracy(); - auc_train_value = roc_auc.compute(y_proba_train, y_train); + score_train_value = score == score_t::ACCURACY ? scores.accuracy() : scores.auc(); confusion_matrices_train.push_back(scores.get_confusion_matrix_json(true)); } // @@ -249,24 +251,18 @@ namespace platform { // auto y_predict = clf->predict(X_test); auto y_proba_test = clf->predict_proba(X_test); Scores scores(y_test, y_proba_test, num_classes, labels); - auto accuracy_test_value = scores.accuracy(); - auto roc_auc = RocAuc(); - double auc_test_value = roc_auc.compute(y_proba_test, y_test); + auto score_test_value = score == score_t::ACCURACY ? scores.accuracy() : scores.auc(); test_time[item] = test_timer.getDuration(); - auc_train[item] = auc_train_value; - auc_test[item] = auc_test_value; - accuracy_train[item] = accuracy_train_value; - accuracy_test[item] = accuracy_test_value; + score_train[item] = score_train_value; + score_test[item] = score_test_value; confusion_matrices.push_back(scores.get_confusion_matrix_json(true)); if (!quiet) std::cout << "\b\b\b, " << flush; // // Store results and times in std::vector // - partial_result.addAucTrain(auc_train_value); - partial_result.addAucTest(auc_test_value); - partial_result.addScoreTrain(accuracy_train_value); - partial_result.addScoreTest(accuracy_test_value); + partial_result.addScoreTrain(score_train_value); + partial_result.addScoreTest(score_test_value); partial_result.addTimeTrain(train_time[item].item()); partial_result.addTimeTest(test_time[item].item()); item++; @@ -286,10 +282,8 @@ namespace platform { // Store result totals in Result // partial_result.setGraph(graphs); - partial_result.setScoreTest(torch::mean(accuracy_test).item()).setScoreTrain(torch::mean(accuracy_train).item()); - partial_result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); - partial_result.setAucTest(torch::mean(auc_test).item()).setAucTrain(torch::mean(auc_train).item()); - partial_result.setAucTestStd(torch::std(auc_test).item()).setAucTrainStd(torch::std(auc_train).item()); + partial_result.setScoreTest(torch::mean(score_test).item()).setScoreTrain(torch::mean(score_train).item()); + partial_result.setScoreTestStd(torch::std(score_test).item()).setScoreTrainStd(torch::std(score_train).item()); partial_result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); partial_result.setTestTimeStd(torch::std(test_time).item()).setTrainTimeStd(torch::std(train_time).item()); partial_result.setNodes(torch::mean(nodes).item()).setLeaves(torch::mean(edges).item()).setDepth(torch::mean(num_states).item()); diff --git a/src/main/Experiment.h b/src/main/Experiment.h index d036032..0aed891 100644 --- a/src/main/Experiment.h +++ b/src/main/Experiment.h @@ -11,7 +11,7 @@ namespace platform { using json = nlohmann::ordered_json; - + enum class score_t { NONE, ACCURACY, ROC_AUC_OVR }; class Experiment { public: Experiment() = default; @@ -55,6 +55,7 @@ namespace platform { void saveGraph(); void report(bool classification_report = false); private: + score_t parse_score() const; Result result; bool discretized{ false }, stratified{ false }; std::vector results; diff --git a/src/main/PartialResult.h b/src/main/PartialResult.h index 2e9e75f..d5e7667 100644 --- a/src/main/PartialResult.h +++ b/src/main/PartialResult.h @@ -44,10 +44,6 @@ namespace platform { PartialResult& setScoreTrainStd(double score_std) { data["score_train_std"] = score_std; return *this; } PartialResult& setScoreTest(double score) { data["score"] = score; return *this; } PartialResult& setScoreTestStd(double score_std) { data["score_std"] = score_std; return *this; } - PartialResult& setAucTrain(double score) { data["auc_train"] = score; return *this; } - PartialResult& setAucTrainStd(double score_std) { data["auc_train_std"] = score_std; return *this; } - PartialResult& setAucTest(double score) { data["auc"] = score; return *this; } - PartialResult& setAucTestStd(double score_std) { data["auc_std"] = score_std; return *this; } PartialResult& setTrainTime(double train_time) { data["train_time"] = train_time; @@ -75,8 +71,6 @@ namespace platform { PartialResult& setNodes(float nodes) { data["nodes"] = nodes; return *this; } PartialResult& setLeaves(float leaves) { data["leaves"] = leaves; return *this; } PartialResult& setDepth(float depth) { data["depth"] = depth; return *this; } - PartialResult& addAucTrain(double score) { data["aucs_train"].push_back(score); return *this; } - PartialResult& addAucTest(double score) { data["aucs_test"].push_back(score); return *this; } PartialResult& addScoreTrain(double score) { data["scores_train"].push_back(score); return *this; } PartialResult& addScoreTest(double score) { data["scores_test"].push_back(score); return *this; } PartialResult& addTimeTrain(double time) { data["times_train"].push_back(time); return *this; } diff --git a/src/main/RocAuc.cpp b/src/main/RocAuc.cpp index 7c301f8..03a1c31 100644 --- a/src/main/RocAuc.cpp +++ b/src/main/RocAuc.cpp @@ -4,27 +4,7 @@ #include #include "RocAuc.h" namespace platform { - std::vector tensorToVector(const torch::Tensor& tensor) - { - // Ensure the tensor is of type kInt32 - if (tensor.dtype() != torch::kInt32) { - throw std::runtime_error("Tensor must be of type kInt32"); - } - - // Ensure the tensor is contiguous - torch::Tensor contig_tensor = tensor.contiguous(); - - // Get the number of elements in the tensor - auto num_elements = contig_tensor.numel(); - - // Get a pointer to the tensor data - const int32_t* tensor_data = contig_tensor.data_ptr(); - - // Create a std::vector and copy the data - std::vector result(tensor_data, tensor_data + num_elements); - - return result; - } + double RocAuc::compute(const torch::Tensor& y_proba, const torch::Tensor& labels) { size_t nClasses = y_proba.size(1); diff --git a/src/main/Scores.cpp b/src/main/Scores.cpp index 2f7e226..951ca5f 100644 --- a/src/main/Scores.cpp +++ b/src/main/Scores.cpp @@ -1,8 +1,9 @@ #include #include "Scores.h" +#include "common/Utils.h" // tensorToVector #include "common/Colors.h" namespace platform { - Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector labels) : num_classes(num_classes), labels(labels) + Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector labels) : num_classes(num_classes), labels(labels), y_test(y_test), y_proba(y_proba) { if (labels.size() == 0) { init_default_labels(); @@ -41,6 +42,44 @@ namespace platform { } compute_accuracy_value(); } + float Scores::auc() + { + size_t nSamples = y_test.numel(); + if (nSamples == 0) return 0; + // In binary classification problem there's no need to calculate the average of the AUCs + auto nClasses = num_classes; + if (num_classes == 2) + nClasses = 1; + auto y_testv = tensorToVector(y_test); + std::vector aucScores(nClasses, 0.0); + std::vector> scoresAndLabels; + for (size_t classIdx = 0; classIdx < nClasses; ++classIdx) { + scoresAndLabels.clear(); + for (size_t i = 0; i < nSamples; ++i) { + scoresAndLabels.emplace_back(y_proba[i][classIdx].item(), y_testv[i] == classIdx ? 1 : 0); + } + std::sort(scoresAndLabels.begin(), scoresAndLabels.end(), std::greater<>()); + std::vector tpr, fpr; + double tp = 0, fp = 0; + double totalPos = std::count(y_testv.begin(), y_testv.end(), classIdx); + double totalNeg = nSamples - totalPos; + for (const auto& [score, label] : scoresAndLabels) { + if (label == 1) { + tp += 1; + } else { + fp += 1; + } + tpr.push_back(tp / totalPos); + fpr.push_back(fp / totalNeg); + } + double auc = 0.0; + for (size_t i = 1; i < tpr.size(); ++i) { + auc += 0.5 * (fpr[i] - fpr[i - 1]) * (tpr[i] + tpr[i - 1]); + } + aucScores[classIdx] = auc; + } + return std::accumulate(aucScores.begin(), aucScores.end(), 0.0) / nClasses; + } Scores Scores::create_aggregate(const json& data, const std::string key) { auto scores = Scores(data[key][0]); diff --git a/src/main/Scores.h b/src/main/Scores.h index d0f32f0..a8ecc5b 100644 --- a/src/main/Scores.h +++ b/src/main/Scores.h @@ -9,10 +9,11 @@ namespace platform { using json = nlohmann::ordered_json; class Scores { public: - Scores(torch::Tensor& y_test, torch::Tensor& y_pred, int num_classes, std::vector labels = {}); + Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector labels = {}); explicit Scores(const json& confusion_matrix_); static Scores create_aggregate(const json& data, const std::string key); float accuracy(); + float auc(); float f1_score(int num_class); float f1_weighted(); float f1_macro(); @@ -34,6 +35,9 @@ namespace platform { int total; std::vector labels; torch::Tensor confusion_matrix; // Rows ar actual, columns are predicted + torch::Tensor null_t; // Covenient null tensor needed when confusion_matrix constructor is used + torch::Tensor& y_test = null_t; // for ROC AUC + torch::Tensor& y_proba = null_t; // for ROC AUC int label_len = 16; int dlen = 9; int ndec = 7; diff --git a/src/reports/ReportConsole.cpp b/src/reports/ReportConsole.cpp index 9d4b410..ccc5a87 100644 --- a/src/reports/ReportConsole.cpp +++ b/src/reports/ReportConsole.cpp @@ -65,9 +65,9 @@ namespace platform { maxHyper = std::max(maxHyper, (int)r["hyperparameters"].dump().size()); maxDataset = std::max(maxDataset, (int)r["dataset"].get().size()); } - std::vector header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", nodes_label, leaves_label, depth_label, "Score", "ROC-AUC ovr", "Time", "Hyperparameters" }; + std::vector header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", nodes_label, leaves_label, depth_label, "Score", "Time", "Hyperparameters" }; sheader << Colors::GREEN(); - std::vector header_lengths = { 3, maxDataset, 6, 5, 3, 9, 9, 9, 15, 15, 20, maxHyper }; + std::vector header_lengths = { 3, maxDataset, 6, 5, 3, 9, 9, 9, 15, 20, maxHyper }; for (int i = 0; i < header_labels.size(); i++) { sheader << std::setw(header_lengths[i]) << std::left << header_labels[i] << " "; } @@ -99,7 +99,6 @@ namespace platform { line << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["score"].get() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["score_std"].get(); const std::string status = compareResult(r["dataset"].get(), r["score"].get()); line << status; - line << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["auc"].get() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["auc_std"].get() << " "; line << std::setw(12) << std::right << std::setprecision(6) << std::fixed << r["time"].get() << "±" << std::setw(7) << std::setprecision(4) << std::fixed << r["time_std"].get() << " "; line << r["hyperparameters"].dump(); line << std::endl; @@ -129,10 +128,6 @@ namespace platform { vbody.push_back(line.str()); sbody << line.str(); line.str(""); line << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12)); vbody.push_back(line.str()); sbody << line.str(); - line.str(""); line << headerLine(fVector("Train auc : ", lastResult["aucs_train"], 14, 12)); - vbody.push_back(line.str()); sbody << line.str(); - line.str(""); line << headerLine(fVector("Test auc : ", lastResult["aucs_test"], 14, 12)); - vbody.push_back(line.str()); sbody << line.str(); line.str(""); line << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3)); vbody.push_back(line.str()); sbody << line.str(); line.str(""); line << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3));