From 84adf13a79e29f25f8a1d2e060fd1eb242221afe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Fri, 12 Jul 2024 17:23:03 +0200 Subject: [PATCH] Add AUC computing in Experiment and store in result --- lib/json | 2 +- src/CMakeLists.txt | 2 +- src/main/Experiment.cpp | 23 +++++++++-- src/main/PartialResult.h | 6 +++ src/main/RocAuc.cpp | 84 ++++++++++++++++++++++++++++++++++++++++ src/main/RocAuc.h | 21 ++++++++++ src/main/Scores.cpp | 3 +- 7 files changed, 134 insertions(+), 7 deletions(-) create mode 100644 src/main/RocAuc.cpp create mode 100644 src/main/RocAuc.h diff --git a/lib/json b/lib/json index 8c391e0..960b763 160000 --- a/lib/json +++ b/lib/json @@ -1 +1 @@ -Subproject commit 8c391e04fe4195d8be862c97f38cfe10e2a3472e +Subproject commit 960b763ecd144f156d05ec61f577b04107290137 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1e87c96..13070ad 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,7 +47,7 @@ add_executable(b_list commands/b_list.cpp target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") # b_main -set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp) +set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp RocAuc.cpp) list(TRANSFORM main_sources PREPEND main/) add_executable(b_main commands/b_main.cpp ${main_sources} common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index aa6f6f1..4e98cb8 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -3,6 +3,7 @@ #include "common/Paths.h" #include "Models.h" #include "Scores.h" +#include "RocAuc.h" #include "Experiment.h" namespace platform { using json = nlohmann::ordered_json; @@ -160,6 +161,8 @@ namespace platform { int nResults = nfolds * static_cast(randomSeeds.size()); auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); + auto auc_test = torch::zeros({ nResults }, torch::kFloat64); + auto auc_train = torch::zeros({ nResults }, torch::kFloat64); auto train_time = torch::zeros({ nResults }, torch::kFloat64); auto test_time = torch::zeros({ nResults }, torch::kFloat64); auto nodes = torch::zeros({ nResults }, torch::kFloat64); @@ -228,10 +231,13 @@ namespace platform { // // Score train // + double auc_train_value = 0; if (!no_train_score) { - auto y_predict = clf->predict(X_train); - Scores scores(y_train, y_predict, num_classes, labels); + auto roc_auc = RocAuc(); + auto y_proba_train = clf->predict_proba(X_train); + Scores scores(y_train, y_proba_train, num_classes, labels); accuracy_train_value = scores.accuracy(); + auc_train_value = roc_auc.compute(y_proba_train, y_train); confusion_matrices_train.push_back(scores.get_confusion_matrix_json(true)); } // @@ -240,10 +246,15 @@ namespace platform { if (!quiet) showProgress(nfold + 1, getColor(clf->getStatus()), "c"); test_timer.start(); - auto y_predict = clf->predict(X_test); - Scores scores(y_test, y_predict, num_classes, labels); + // auto y_predict = clf->predict(X_test); + auto y_proba_test = clf->predict_proba(X_test); + Scores scores(y_test, y_proba_test, num_classes, labels); auto accuracy_test_value = scores.accuracy(); + auto roc_auc = RocAuc(); + double auc_test_value = roc_auc.compute(y_proba_test, y_test); test_time[item] = test_timer.getDuration(); + auc_train[item] = auc_train_value; + auc_test[item] = auc_test_value; accuracy_train[item] = accuracy_train_value; accuracy_test[item] = accuracy_test_value; confusion_matrices.push_back(scores.get_confusion_matrix_json(true)); @@ -252,6 +263,8 @@ namespace platform { // // Store results and times in std::vector // + partial_result.addAucTrain(auc_train_value); + partial_result.addAucTest(auc_test_value); partial_result.addScoreTrain(accuracy_train_value); partial_result.addScoreTest(accuracy_test_value); partial_result.addTimeTrain(train_time[item].item()); @@ -275,6 +288,8 @@ namespace platform { partial_result.setGraph(graphs); partial_result.setScoreTest(torch::mean(accuracy_test).item()).setScoreTrain(torch::mean(accuracy_train).item()); partial_result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); + partial_result.setAucTest(torch::mean(auc_test).item()).setAucTrain(torch::mean(auc_train).item()); + partial_result.setAucTestStd(torch::std(auc_test).item()).setAucTrainStd(torch::std(auc_train).item()); partial_result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); partial_result.setTestTimeStd(torch::std(test_time).item()).setTrainTimeStd(torch::std(train_time).item()); partial_result.setNodes(torch::mean(nodes).item()).setLeaves(torch::mean(edges).item()).setDepth(torch::mean(num_states).item()); diff --git a/src/main/PartialResult.h b/src/main/PartialResult.h index d5e7667..2e9e75f 100644 --- a/src/main/PartialResult.h +++ b/src/main/PartialResult.h @@ -44,6 +44,10 @@ namespace platform { PartialResult& setScoreTrainStd(double score_std) { data["score_train_std"] = score_std; return *this; } PartialResult& setScoreTest(double score) { data["score"] = score; return *this; } PartialResult& setScoreTestStd(double score_std) { data["score_std"] = score_std; return *this; } + PartialResult& setAucTrain(double score) { data["auc_train"] = score; return *this; } + PartialResult& setAucTrainStd(double score_std) { data["auc_train_std"] = score_std; return *this; } + PartialResult& setAucTest(double score) { data["auc"] = score; return *this; } + PartialResult& setAucTestStd(double score_std) { data["auc_std"] = score_std; return *this; } PartialResult& setTrainTime(double train_time) { data["train_time"] = train_time; @@ -71,6 +75,8 @@ namespace platform { PartialResult& setNodes(float nodes) { data["nodes"] = nodes; return *this; } PartialResult& setLeaves(float leaves) { data["leaves"] = leaves; return *this; } PartialResult& setDepth(float depth) { data["depth"] = depth; return *this; } + PartialResult& addAucTrain(double score) { data["aucs_train"].push_back(score); return *this; } + PartialResult& addAucTest(double score) { data["aucs_test"].push_back(score); return *this; } PartialResult& addScoreTrain(double score) { data["scores_train"].push_back(score); return *this; } PartialResult& addScoreTest(double score) { data["scores_test"].push_back(score); return *this; } PartialResult& addTimeTrain(double time) { data["times_train"].push_back(time); return *this; } diff --git a/src/main/RocAuc.cpp b/src/main/RocAuc.cpp new file mode 100644 index 0000000..d396d76 --- /dev/null +++ b/src/main/RocAuc.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include +#include +#include "common/Colors.h" +#include "RocAuc.h" +namespace platform { + std::vector tensorToVector(const torch::Tensor& tensor) + { + // Ensure the tensor is of type kInt32 + if (tensor.dtype() != torch::kInt32) { + throw std::runtime_error("Tensor must be of type kInt32"); + } + + // Ensure the tensor is contiguous + torch::Tensor contig_tensor = tensor.contiguous(); + + // Get the number of elements in the tensor + auto num_elements = contig_tensor.numel(); + + // Get a pointer to the tensor data + const int32_t* tensor_data = contig_tensor.data_ptr(); + + // Create a std::vector and copy the data + std::vector result(tensor_data, tensor_data + num_elements); + + return result; + } + double RocAuc::compute(const torch::Tensor& y_proba, const torch::Tensor& labels) + { + size_t nClasses = y_proba.size(1); + size_t nSamples = y_proba.size(0); + assert(nSamples = y_test.size(0)); + y_test = tensorToVector(labels); + std::vector aucScores(nClasses, 0.0); + for (size_t classIdx = 0; classIdx < nClasses; ++classIdx) { + scoresAndLabels.clear(); + for (size_t i = 0; i < nSamples; ++i) { + scoresAndLabels.emplace_back(y_proba[i][classIdx].item(), y_test[i] == classIdx ? 1 : 0); + } + aucScores[classIdx] = compute_common(nSamples, classIdx); + } + return std::accumulate(aucScores.begin(), aucScores.end(), 0.0) / nClasses; + } + double RocAuc::compute(const std::vector>& y_proba, const std::vector& labels) + { + y_test = labels; + size_t nClasses = y_proba[0].size(); + size_t nSamples = y_proba.size(); + std::vector aucScores(nClasses, 0.0); + for (size_t classIdx = 0; classIdx < nClasses; ++classIdx) { + scoresAndLabels.clear(); + for (size_t i = 0; i < nSamples; ++i) { + scoresAndLabels.emplace_back(y_proba[i][classIdx], labels[i] == classIdx ? 1 : 0); + } + aucScores[classIdx] = compute_common(nSamples, classIdx); + } + return std::accumulate(aucScores.begin(), aucScores.end(), 0.0) / nClasses; + } + double RocAuc::compute_common(size_t nSamples, size_t classIdx) + { + std::sort(scoresAndLabels.begin(), scoresAndLabels.end(), std::greater<>()); + std::vector tpr, fpr; + double tp = 0, fp = 0; + double totalPos = std::count(y_test.begin(), y_test.end(), classIdx); + double totalNeg = nSamples - totalPos; + + for (const auto& [score, label] : scoresAndLabels) { + if (label == 1) { + tp += 1; + } else { + fp += 1; + } + tpr.push_back(tp / totalPos); + fpr.push_back(fp / totalNeg); + } + double auc = 0.0; + for (size_t i = 1; i < tpr.size(); ++i) { + auc += 0.5 * (fpr[i] - fpr[i - 1]) * (tpr[i] + tpr[i - 1]); + } + return auc; + } +} \ No newline at end of file diff --git a/src/main/RocAuc.h b/src/main/RocAuc.h new file mode 100644 index 0000000..cd614a4 --- /dev/null +++ b/src/main/RocAuc.h @@ -0,0 +1,21 @@ +#ifndef ROCAUC_H +#define ROCAUC_H +#include +#include +#include +#include + +namespace platform { + using json = nlohmann::ordered_json; + class RocAuc { + public: + RocAuc() = default; + double compute(const std::vector>& y_proba, const std::vector& y_test); + double compute(const torch::Tensor& y_proba, const torch::Tensor& y_test); + private: + double compute_common(size_t nSamples, size_t classIdx); + std::vector> scoresAndLabels; + std::vector y_test; + }; +} +#endif \ No newline at end of file diff --git a/src/main/Scores.cpp b/src/main/Scores.cpp index 9a4935d..2f7e226 100644 --- a/src/main/Scores.cpp +++ b/src/main/Scores.cpp @@ -2,12 +2,13 @@ #include "Scores.h" #include "common/Colors.h" namespace platform { - Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_pred, int num_classes, std::vector labels) : num_classes(num_classes), labels(labels) + Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector labels) : num_classes(num_classes), labels(labels) { if (labels.size() == 0) { init_default_labels(); } total = y_test.size(0); + auto y_pred = y_proba.argmax(1); accuracy_value = (y_pred == y_test).sum().item() / total; init_confusion_matrix(); for (int i = 0; i < total; i++) {