Add roc-auc-ovr as score to b_main

2024-07-14 12:48:33 +02:00
parent 28f6a0d7a7
commit 2f2ed00ca1
11 changed files with 104 additions and 81 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -47,7 +47,7 @@ add_executable(b_list commands/b_list.cpp
 target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}")

 # b_main
-set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp RocAuc.cpp)
+set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp)
 list(TRANSFORM main_sources PREPEND main/)
 add_executable(b_main commands/b_main.cpp ${main_sources} 
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
--- a/src/commands/b_main.cpp
+++ b/src/commands/b_main.cpp
@@ -58,6 +58,11 @@ void manageArguments(argparse::ArgumentParser& program)
    for (auto choice : valid_choices) {
        smooth_arg.choices(choice);
    }
+    auto& score_arg = program.add_argument("-s", "--score").help("Score to use. Valid values: " + env.valid_values("score")).default_value(env.get("score"));
+    valid_choices = env.valid_tokens("score");
+    for (auto choice : valid_choices) {
+        score_arg.choices(choice);
+    }
    program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true);
    program.add_argument("--graph").help("generate graphviz dot files with the model").default_value(false).implicit_value(true);
    program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true);
@@ -79,14 +84,14 @@ void manageArguments(argparse::ArgumentParser& program)
            throw std::runtime_error("Number of folds must be an integer");
        }});
        auto seed_values = env.getSeeds();
-        program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
+        program.add_argument("--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
 }

 int main(int argc, char** argv)
 {
    argparse::ArgumentParser program("b_main", { platform_project_version.begin(), platform_project_version.end() });
    manageArguments(program);
-    std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat;
+    std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat, score;
    json hyperparameters_json;
    bool discretize_dataset, stratified, saveResults, quiet, no_train_score, generate_fold_files, graph;
    std::vector<int> seeds;
@@ -106,6 +111,7 @@ int main(int argc, char** argv)
        quiet = program.get<bool>("quiet");
        graph = program.get<bool>("graph");
        n_folds = program.get<int>("folds");
+        score = program.get<std::string>("score");
        seeds = program.get<std::vector<int>>("seeds");
        auto hyperparameters = program.get<std::string>("hyperparameters");
        hyperparameters_json = json::parse(hyperparameters);
@@ -195,7 +201,7 @@ int main(int argc, char** argv)
    experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1");
    experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat);
    experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
-    experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
+    experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score);
    experiment.setHyperparameters(test_hyperparams);
    for (auto seed : seeds) {
        experiment.addRandomSeed(seed);
--- a/src/common/DotEnv.h
+++ b/src/common/DotEnv.h
@@ -19,24 +19,24 @@ namespace platform {
        {
            valid =
            {
-                {"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
+                {"depth", {"any"}},
+                {"discretize", {"0", "1"}},
+                {"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
                {"experiment", {"discretiz", "odte", "covid", "Test"}},
                {"fit_features", {"0", "1"}},
-                {"discretize", {"0", "1"}},
-                {"ignore_nan", {"0", "1"}},
-                {"stratified", {"0", "1"}},
-                {"score", {"accuracy"}},
                {"framework", {"bulma", "bootstrap"}},
-                {"margin", {"0.1", "0.2", "0.3"}},
-                {"n_folds", {"5", "10"}},
-                {"discretize_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q", "bin5q", "bin5u", "bin6q", "bin6u", "bin7q", "bin7u", "bin8q", "bin8u", "bin9q", "bin9u", "bin10q", "bin10u"}},
-                {"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
-                {"platform", {"any"}},
-                {"model", {"any"}},
-                {"seeds", {"any"}},
-                {"nodes", {"any"}},
+                {"ignore_nan", {"0", "1"}},
                {"leaves", {"any"}},
-                {"depth", {"any"}},
+                {"margin", {"0.1", "0.2", "0.3"}},
+                {"model", {"any"}},
+                {"n_folds", {"5", "10"}},
+                {"nodes", {"any"}},
+                {"platform", {"any"}},
+                {"stratified", {"0", "1"}},
+                {"score", {"accuracy", "roc-auc-ovr"}},
+                {"seeds", {"any"}},
+                {"smooth_strat", {"ORIGINAL", "LAPLACE", "CESTNIK"}},
+                {"source_data", {"Arff", "Tanveer", "Surcov", "Test"}},
            };
            if (create) {
                // For testing purposes
--- a/src/common/Utils.h
+++ b/src/common/Utils.h
@@ -4,7 +4,17 @@
 #include <string>
 #include <vector>
 #include <algorithm>
+#include <torch/torch.h>
 namespace platform {
+    template <typename T>
+    std::vector<T> tensorToVector(const torch::Tensor& tensor)
+    {
+        // Return a std::vector<T> holding a copy of every element of the tensor.
+        // The copy reads one linear range, so it goes through a contiguous handle of the tensor.
+        const torch::Tensor dense = tensor.contiguous();
+        const T* first = dense.data_ptr<T>();
+        return std::vector<T>(first, first + dense.numel());
+    }
    static std::string trim(const std::string& str)
    {
        std::string result = str;
--- a/src/main/Experiment.cpp
+++ b/src/main/Experiment.cpp
@@ -3,7 +3,6 @@
 #include "common/Paths.h"
 #include "Models.h"
 #include "Scores.h"
-#include "RocAuc.h"
 #include "Experiment.h"
 namespace platform {
    using json = nlohmann::ordered_json;
@@ -86,7 +85,14 @@ namespace platform {
                return Colors::RESET();
        }
    }
-
+    score_t Experiment::parse_score() const
+    {
+        // Map the score name held by the result onto score_t; any other name is rejected.
+        const auto score_name = result.getScoreName();
+        if (score_name == "roc-auc-ovr") return score_t::ROC_AUC_OVR;
+        if (score_name == "accuracy") return score_t::ACCURACY;
+        throw std::runtime_error("Unknown score: " + score_name);
+    }
    void showProgress(int fold, const std::string& color, const std::string& phase)
    {
        std::string prefix = phase == "-" ? "" : "\b\b\b\b";
@@ -159,10 +165,8 @@ namespace platform {
        // Initialize results std::vectors
        //
        int nResults = nfolds * static_cast<int>(randomSeeds.size());
-        auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64);
-        auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64);
-        auto auc_test = torch::zeros({ nResults }, torch::kFloat64);
-        auto auc_train = torch::zeros({ nResults }, torch::kFloat64);
+        auto score_test = torch::zeros({ nResults }, torch::kFloat64);
+        auto score_train = torch::zeros({ nResults }, torch::kFloat64);
        auto train_time = torch::zeros({ nResults }, torch::kFloat64);
        auto test_time = torch::zeros({ nResults }, torch::kFloat64);
        auto nodes = torch::zeros({ nResults }, torch::kFloat64);
@@ -178,6 +182,7 @@ namespace platform {
        //
        // Loop over random seeds
        //
+        auto score = parse_score();
        for (auto seed : randomSeeds) {
            if (!quiet) {
                string prefix = " ";
@@ -227,17 +232,14 @@ namespace platform {
                edges[item] = clf->getNumberOfEdges();
                num_states[item] = clf->getNumberOfStates();
                train_time[item] = train_timer.getDuration();
-                double accuracy_train_value = 0.0;
+                double score_train_value = 0.0;
                //
                // Score train
                //
-                double auc_train_value = 0;
                if (!no_train_score) {
-                    auto roc_auc = RocAuc();
                    auto y_proba_train = clf->predict_proba(X_train);
                    Scores scores(y_train, y_proba_train, num_classes, labels);
-                    accuracy_train_value = scores.accuracy();
-                    auc_train_value = roc_auc.compute(y_proba_train, y_train);
+                    score_train_value = score == score_t::ACCURACY ? scores.accuracy() : scores.auc();
                    confusion_matrices_train.push_back(scores.get_confusion_matrix_json(true));
                }
                //
@@ -249,24 +251,18 @@ namespace platform {
                // auto y_predict = clf->predict(X_test);
                auto y_proba_test = clf->predict_proba(X_test);
                Scores scores(y_test, y_proba_test, num_classes, labels);
-                auto accuracy_test_value = scores.accuracy();
-                auto roc_auc = RocAuc();
-                double auc_test_value = roc_auc.compute(y_proba_test, y_test);
+                auto score_test_value = score == score_t::ACCURACY ? scores.accuracy() : scores.auc();
                test_time[item] = test_timer.getDuration();
-                auc_train[item] = auc_train_value;
-                auc_test[item] = auc_test_value;
-                accuracy_train[item] = accuracy_train_value;
-                accuracy_test[item] = accuracy_test_value;
+                score_train[item] = score_train_value;
+                score_test[item] = score_test_value;
                confusion_matrices.push_back(scores.get_confusion_matrix_json(true));
                if (!quiet)
                    std::cout << "\b\b\b, " << flush;
                //
                // Store results and times in std::vector
                //
-                partial_result.addAucTrain(auc_train_value);
-                partial_result.addAucTest(auc_test_value);
-                partial_result.addScoreTrain(accuracy_train_value);
-                partial_result.addScoreTest(accuracy_test_value);
+                partial_result.addScoreTrain(score_train_value);
+                partial_result.addScoreTest(score_test_value);
                partial_result.addTimeTrain(train_time[item].item<double>());
                partial_result.addTimeTest(test_time[item].item<double>());
                item++;
@@ -286,10 +282,8 @@ namespace platform {
        // Store result totals in Result
        //
        partial_result.setGraph(graphs);
-        partial_result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
-        partial_result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
-        partial_result.setAucTest(torch::mean(auc_test).item<double>()).setAucTrain(torch::mean(auc_train).item<double>());
-        partial_result.setAucTestStd(torch::std(auc_test).item<double>()).setAucTrainStd(torch::std(auc_train).item<double>());
+        partial_result.setScoreTest(torch::mean(score_test).item<double>()).setScoreTrain(torch::mean(score_train).item<double>());
+        partial_result.setScoreTestStd(torch::std(score_test).item<double>()).setScoreTrainStd(torch::std(score_train).item<double>());
        partial_result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
        partial_result.setTestTimeStd(torch::std(test_time).item<double>()).setTrainTimeStd(torch::std(train_time).item<double>());
        partial_result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(edges).item<double>()).setDepth(torch::mean(num_states).item<double>());
--- a/src/main/Experiment.h
+++ b/src/main/Experiment.h
@@ -11,7 +11,7 @@

 namespace platform {
    using json = nlohmann::ordered_json;
-
+    enum class score_t { NONE, ACCURACY, ROC_AUC_OVR }; // metrics returned by Experiment::parse_score(), which never yields NONE
    class Experiment {
    public:
        Experiment() = default;
@@ -55,6 +55,7 @@ namespace platform {
        void saveGraph();
        void report(bool classification_report = false);
    private:
+        score_t parse_score() const;
        Result result;
        bool discretized{ false }, stratified{ false };
        std::vector<PartialResult> results;
--- a/src/main/PartialResult.h
+++ b/src/main/PartialResult.h
@@ -44,10 +44,6 @@ namespace platform {
        PartialResult& setScoreTrainStd(double score_std) { data["score_train_std"] = score_std; return *this; }
        PartialResult& setScoreTest(double score) { data["score"] = score; return *this; }
        PartialResult& setScoreTestStd(double score_std) { data["score_std"] = score_std; return *this; }
-        PartialResult& setAucTrain(double score) { data["auc_train"] = score; return *this; }
-        PartialResult& setAucTrainStd(double score_std) { data["auc_train_std"] = score_std; return *this; }
-        PartialResult& setAucTest(double score) { data["auc"] = score; return *this; }
-        PartialResult& setAucTestStd(double score_std) { data["auc_std"] = score_std; return *this; }
        PartialResult& setTrainTime(double train_time)
        {
            data["train_time"] = train_time;
@@ -75,8 +71,6 @@ namespace platform {
        PartialResult& setNodes(float nodes) { data["nodes"] = nodes; return *this; }
        PartialResult& setLeaves(float leaves) { data["leaves"] = leaves; return *this; }
        PartialResult& setDepth(float depth) { data["depth"] = depth; return *this; }
-        PartialResult& addAucTrain(double score) { data["aucs_train"].push_back(score); return *this; }
-        PartialResult& addAucTest(double score) { data["aucs_test"].push_back(score); return *this; }
        PartialResult& addScoreTrain(double score) { data["scores_train"].push_back(score); return *this; }
        PartialResult& addScoreTest(double score) { data["scores_test"].push_back(score); return *this; }
        PartialResult& addTimeTrain(double time) { data["times_train"].push_back(time); return *this; }
--- a/src/main/RocAuc.cpp
+++ b/src/main/RocAuc.cpp
@@ -4,27 +4,7 @@
 #include <utility>
 #include "RocAuc.h"
 namespace platform {
-    std::vector<int> tensorToVector(const torch::Tensor& tensor)
-    {
-        // Ensure the tensor is of type kInt32
-        if (tensor.dtype() != torch::kInt32) {
-            throw std::runtime_error("Tensor must be of type kInt32");
-        }
-
-        // Ensure the tensor is contiguous
-        torch::Tensor contig_tensor = tensor.contiguous();
-
-        // Get the number of elements in the tensor
-        auto num_elements = contig_tensor.numel();
-
-        // Get a pointer to the tensor data
-        const int32_t* tensor_data = contig_tensor.data_ptr<int32_t>();
-
-        // Create a std::vector<int> and copy the data
-        std::vector<int> result(tensor_data, tensor_data + num_elements);
-
-        return result;
-    }
+    
    double RocAuc::compute(const torch::Tensor& y_proba, const torch::Tensor& labels)
    {
        size_t nClasses = y_proba.size(1);
--- a/src/main/Scores.cpp
+++ b/src/main/Scores.cpp
@@ -1,8 +1,9 @@
 #include <sstream>
 #include "Scores.h"
+#include "common/Utils.h" // tensorToVector
 #include "common/Colors.h"
 namespace platform {
-    Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector<std::string> labels) : num_classes(num_classes), labels(labels)
+    Scores::Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector<std::string> labels) : num_classes(num_classes), labels(labels), y_test(y_test), y_proba(y_proba)
    {
        if (labels.size() == 0) {
            init_default_labels();
@@ -41,6 +42,44 @@ namespace platform {
        }
        compute_accuracy_value();
    }
+    float Scores::auc()
+    {
+        // One-vs-rest ROC AUC averaged over the classes; with two classes a single curve is computed.
+        // Equal probabilities form one threshold step (their order must not change the area) and a class
+        // with no positive or no negative sample is left out of the average instead of producing NaN.
+        const size_t nSamples = y_test.numel();
+        if (nSamples == 0) return 0;
+        const size_t nClasses = num_classes == 2 ? 1 : num_classes;
+        const auto y_testv = tensorToVector<int>(y_test);
+        std::vector<std::pair<double, int>> scoresAndLabels;
+        scoresAndLabels.reserve(nSamples);
+        double aucSum = 0.0;
+        size_t nValid = 0;
+        for (size_t classIdx = 0; classIdx < nClasses; ++classIdx) {
+            const int target = static_cast<int>(classIdx);
+            scoresAndLabels.clear();
+            for (size_t i = 0; i < nSamples; ++i) {
+                scoresAndLabels.emplace_back(y_proba[i][classIdx].item<float>(), y_testv[i] == target ? 1 : 0);
+            }
+            const double totalPos = std::count(y_testv.begin(), y_testv.end(), target);
+            const double totalNeg = nSamples - totalPos;
+            if (totalPos == 0 || totalNeg == 0) continue; // TPR or FPR would be 0/0
+            std::sort(scoresAndLabels.begin(), scoresAndLabels.end(), std::greater<>());
+            double tp = 0, fp = 0, prevTpr = 0, prevFpr = 0, auc = 0.0;
+            for (size_t i = 0; i < nSamples; ++i) {
+                if (scoresAndLabels[i].second == 1) tp += 1; else fp += 1;
+                // Only emit a ROC point once the whole run of equal scores has been consumed
+                if (i + 1 < nSamples && scoresAndLabels[i + 1].first == scoresAndLabels[i].first) continue;
+                const double tpr = tp / totalPos, fpr = fp / totalNeg;
+                auc += 0.5 * (fpr - prevFpr) * (tpr + prevTpr);
+                prevTpr = tpr;
+                prevFpr = fpr;
+            }
+            aucSum += auc;
+            ++nValid;
+        }
+        return nValid == 0 ? 0 : aucSum / nValid;
+    }
    Scores Scores::create_aggregate(const json& data, const std::string key)
    {
        auto scores = Scores(data[key][0]);
--- a/src/main/Scores.h
+++ b/src/main/Scores.h
@@ -9,10 +9,11 @@ namespace platform {
    using json = nlohmann::ordered_json;
    class Scores {
    public:
-        Scores(torch::Tensor& y_test, torch::Tensor& y_pred, int num_classes, std::vector<std::string> labels = {});
+        Scores(torch::Tensor& y_test, torch::Tensor& y_proba, int num_classes, std::vector<std::string> labels = {});
        explicit Scores(const json& confusion_matrix_);
        static Scores create_aggregate(const json& data, const std::string key);
        float accuracy();
+        float auc();
        float f1_score(int num_class);
        float f1_weighted();
        float f1_macro();
@@ -34,6 +35,9 @@ namespace platform {
        int total;
        std::vector<std::string> labels;
        torch::Tensor confusion_matrix; // Rows ar actual, columns are predicted
+        torch::Tensor null_t; // Convenient null tensor needed when confusion_matrix constructor is used
+        torch::Tensor& y_test = null_t; // for ROC AUC
+        torch::Tensor& y_proba = null_t; // for ROC AUC
        int label_len = 16;
        int dlen = 9;
        int ndec = 7;
--- a/src/reports/ReportConsole.cpp
+++ b/src/reports/ReportConsole.cpp
@@ -65,9 +65,9 @@ namespace platform {
            maxHyper = std::max(maxHyper, (int)r["hyperparameters"].dump().size());
            maxDataset = std::max(maxDataset, (int)r["dataset"].get<std::string>().size());
        }
-        std::vector<std::string> header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", nodes_label, leaves_label, depth_label, "Score", "ROC-AUC ovr", "Time", "Hyperparameters" };
+        std::vector<std::string> header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", nodes_label, leaves_label, depth_label, "Score", "Time", "Hyperparameters" };
        sheader << Colors::GREEN();
-        std::vector<int> header_lengths = { 3, maxDataset, 6, 5, 3, 9, 9, 9, 15, 15, 20, maxHyper };
+        std::vector<int> header_lengths = { 3, maxDataset, 6, 5, 3, 9, 9, 9, 15, 20, maxHyper };
        for (int i = 0; i < header_labels.size(); i++) {
            sheader << std::setw(header_lengths[i]) << std::left << header_labels[i] << " ";
        }
@@ -99,7 +99,6 @@ namespace platform {
            line << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["score"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["score_std"].get<double>();
            const std::string status = compareResult(r["dataset"].get<std::string>(), r["score"].get<double>());
            line << status;
-            line << std::setw(8) << std::right << std::setprecision(6) << std::fixed << r["auc"].get<double>() << "±" << std::setw(6) << std::setprecision(4) << std::fixed << r["auc_std"].get<double>() << " ";
            line << std::setw(12) << std::right << std::setprecision(6) << std::fixed << r["time"].get<double>() << "±" << std::setw(7) << std::setprecision(4) << std::fixed << r["time_std"].get<double>() << " ";
            line << r["hyperparameters"].dump();
            line << std::endl;
@@ -129,10 +128,6 @@ namespace platform {
            vbody.push_back(line.str()); sbody << line.str();
            line.str(""); line << headerLine(fVector("Test  scores: ", lastResult["scores_test"], 14, 12));
            vbody.push_back(line.str()); sbody << line.str();
-            line.str(""); line << headerLine(fVector("Train auc   : ", lastResult["aucs_train"], 14, 12));
-            vbody.push_back(line.str()); sbody << line.str();
-            line.str(""); line << headerLine(fVector("Test  auc   : ", lastResult["aucs_test"], 14, 12));
-            vbody.push_back(line.str()); sbody << line.str();
            line.str(""); line << headerLine(fVector("Train  times: ", lastResult["times_train"], 10, 3));
            vbody.push_back(line.str()); sbody << line.str();
            line.str(""); line << headerLine(fVector("Test   times: ", lastResult["times_test"], 10, 3));