Continue with grid experiment

2025-01-17 10:39:56 +01:00
parent 9a9a9fb17a
commit c1d5dd74e3
12 changed files with 238 additions and 85 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -29,11 +29,13 @@ add_executable(
 target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}")

 # b_grid
-set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp)
+set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
 list(TRANSFORM grid_sources PREPEND grid/)
 add_executable(b_grid commands/b_grid.cpp ${grid_sources} 
    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
-    main/HyperParameters.cpp main/Models.cpp 
+    main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp
+    reports/ReportConsole.cpp reports/ReportBase.cpp 
+    results/Result.cpp
 )
 target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy)

--- a/src/commands/b_grid.cpp
+++ b/src/commands/b_grid.cpp
@@ -36,22 +36,22 @@ void add_experiment_args(argparse::ArgumentParser& program)
 {
    auto env = platform::DotEnv();
    auto datasets = platform::Datasets(false, platform::Paths::datasets());
-    // auto& group = program.add_mutually_exclusive_group(true);
-    // group.add_argument("-d", "--dataset")
-    //     .help("Dataset file name: " + datasets.toString())
-    //     .default_value("all")
-    //     .action([](const std::string& value) {
-    //     auto datasets = platform::Datasets(false, platform::Paths::datasets());
-    //     static std::vector<std::string> choices_datasets(datasets.getNames());
-    //     choices_datasets.push_back("all");
-    //     if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) {
-    //         return value;
-    //     }
-    //     throw std::runtime_error("Dataset must be one of: " + datasets.toString());
-    //         }
-    //     );
-    // group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector<std::string>());
-    // group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test.");
+    auto& group = program.add_mutually_exclusive_group(true);
+    group.add_argument("-d", "--dataset")
+        .help("Dataset file name: " + datasets.toString())
+        .default_value("all")
+        .action([](const std::string& value) {
+        auto datasets = platform::Datasets(false, platform::Paths::datasets());
+        static std::vector<std::string> choices_datasets(datasets.getNames());
+        choices_datasets.push_back("all");
+        if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) {
+            return value;
+        }
+        throw std::runtime_error("Dataset must be one of: " + datasets.toString());
+            }
+        );
+    group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector<std::string>());
+    group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test.");
    program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment");
    program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name." \
        "Mutually exclusive with hyperparameters. This file should contain hyperparameters for each dataset in json format.");
@@ -83,11 +83,6 @@ void add_experiment_args(argparse::ArgumentParser& program)
    for (auto choice : valid_choices) {
        score_arg.choices(choice);
    }
-    program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true);
-    program.add_argument("--graph").help("generate graphviz dot files with the model").default_value(false).implicit_value(true);
-    program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true);
-    program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
-    program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true);
    program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
    program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
        try {
@@ -307,19 +302,10 @@ void search(argparse::ArgumentParser& program)
 void experiment(argparse::ArgumentParser& program)
 {
    struct platform::ConfigGrid config;
-    config.model = program.get<std::string>("model");
-    config.score = program.get<std::string>("score");
-    config.discretize = program.get<bool>("discretize");
-    config.stratified = program.get<bool>("stratified");
-    config.smooth_strategy = program.get<std::string>("smooth-strat");
-    config.n_folds = program.get<int>("folds");
-    config.quiet = program.get<bool>("quiet");
-    config.seeds = program.get<std::vector<int>>("seeds");

    auto env = platform::DotEnv();
    config.platform = env.get("platform");
-    platform::Paths::createPath(platform::Paths::grid());
-    auto grid_experiment = platform::GridExperiment(config);
+    auto grid_experiment = platform::GridExperiment(program, config);
    platform::Timer timer;
    timer.start();
    struct platform::ConfigMPI mpi_config;
@@ -333,6 +319,7 @@ void experiment(argparse::ArgumentParser& program)
    grid_experiment.go(mpi_config);
    if (mpi_config.rank == mpi_config.manager) {
        auto results = grid_experiment.getResults();
+        //build_experiment_result(results);
        std::cout << "****** RESULTS ********" << std::endl;
        std::cout << results.dump(4) << std::endl;
        // list_results(results, config.model);
--- a/src/commands/b_results.cpp
+++ b/src/commands/b_results.cpp
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
    int n_errors = 0;
    std::vector<std::string> files_with_errors;
    for (const auto& file_name : result_files) {
-        std::vector<std::string> errors = validator.validate(file_name);
+        std::vector<std::string> errors = validator.validate_file(file_name);
        if (!errors.empty()) {
            n_errors++;
            std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;
--- a/src/grid/GridBase.cpp
+++ b/src/grid/GridBase.cpp
@@ -9,6 +9,10 @@ namespace platform {
    GridBase::GridBase(struct ConfigGrid& config)
    {
        this->config = config;
+        // Only the configuration is stored here; the smooth_strategy string is
+        // resolved into smooth_type later, in validate_config().
+
+    }
+    void GridBase::validate_config()
+    {
        if (config.smooth_strategy == "ORIGINAL")
            smooth_type = bayesnet::Smoothing_t::ORIGINAL;
        else if (config.smooth_strategy == "LAPLACE")
@@ -116,7 +120,7 @@ namespace platform {
        * Each task is a json object with the data needed by the process
        *
        * The overall process consists in these steps:
-           * 0. Create the MPI result type & tasks
+           * 0. Validate config, create the MPI result type & tasks
           * 0.1 Create the MPI result type
           * 0.2 Manager creates the tasks
           * 1. Manager will broadcast the tasks to all the processes
@@ -138,6 +142,7 @@ namespace platform {
        //
        // 0.1 Create the MPI result type
        //
+        validate_config();
        Task_Result result;
        int tasks_size;
        MPI_Datatype MPI_Result;
--- a/src/grid/GridBase.h
+++ b/src/grid/GridBase.h
@@ -20,6 +20,7 @@ namespace platform {
        explicit GridBase(struct ConfigGrid& config);
        ~GridBase() = default;
        void go(struct ConfigMPI& config_mpi);
+        void validate_config();
    protected:
        virtual json build_tasks(Datasets& datasets) = 0;
        virtual void save(json& results) = 0;
--- a/src/grid/GridExperiment.cpp
+++ b/src/grid/GridExperiment.cpp
@@ -8,8 +8,116 @@
 #include "GridExperiment.h"

 namespace platform {
-    GridExperiment::GridExperiment(struct ConfigGrid& config) : GridBase(config)
+    // Builds a grid experiment from the parsed command line.
+    //
+    // Reads the dataset selection (dataset / datasets / datasets-file), model,
+    // discretization, smoothing, folds, score, seeds and hyperparameter options from
+    // `program`, fills `filesToTest` with the datasets to process, copies the run
+    // settings into `this->config` and configures the `experiment` member.
+    //
+    // Throws runtime_error when inline hyperparameters and a hyperparameters file are
+    // both supplied, or when every dataset is to be tested and no title was given.
+    // Throws std::invalid_argument when the datasets file cannot be opened.
+    // Calls exit(1) when a requested dataset is not known.
+    //
+    // The base class is listed first in the initializer list because it is always
+    // constructed before the members, whatever order is written.
+    GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : GridBase(config), arguments(program)
    {
+        // Dataset selection
+        auto file_name = program.get<std::string>("dataset");
+        auto file_names = program.get<std::vector<std::string>>("datasets");
+        auto datasets_file = program.get<std::string>("datasets-file");
+        // Model and evaluation settings
+        auto model_name = program.get<std::string>("model");
+        bool discretize_dataset = program.get<bool>("discretize");
+        auto discretize_algo = program.get<std::string>("discretize-algo");
+        auto smooth_strat = program.get<std::string>("smooth-strat");
+        bool stratified = program.get<bool>("stratified");
+        int n_folds = program.get<int>("folds");
+        auto score = program.get<std::string>("score");
+        auto seeds = program.get<std::vector<int>>("seeds");
+        // Hyperparameters: inline json, explicit file or best-results file
+        auto hyperparameters = program.get<std::string>("hyperparameters");
+        json hyperparameters_json = json::parse(hyperparameters);
+        auto hyperparameters_file = program.get<std::string>("hyper-file");
+        bool hyper_best = program.get<bool>("hyper-best");
+        if (hyper_best) {
+            // Build the best results file_name; the inline hyperparameters are ignored
+            // because a non-empty file name takes precedence below
+            hyperparameters_file = platform::Paths::results() + platform::Paths::bestResultsFile(score, model_name);
+        } else if (hyperparameters_file != "" && hyperparameters != "{}") {
+            throw runtime_error("hyperparameters and hyper_file are mutually exclusive");
+        }
+        auto title = program.get<std::string>("title");
+        // "dataset" defaults to "all", so it also reads "all" when the selection was made
+        // with datasets or datasets-file; those cases get an automatic title below and
+        // must not be rejected here.
+        const bool all_datasets_requested = datasets_file == "" && file_names.empty() && file_name == "all";
+        if (title == "" && all_datasets_requested) {
+            throw runtime_error("title is mandatory if all datasets are to be tested");
+        }
+        auto datasets = platform::Datasets(false, platform::Paths::datasets());
+        if (datasets_file != "") {
+            // One dataset name per line; empty lines and lines starting with '#' are skipped
+            ifstream catalog(datasets_file);
+            if (catalog.is_open()) {
+                std::string line;
+                while (getline(catalog, line)) {
+                    if (line.empty() || line[0] == '#') {
+                        continue;
+                    }
+                    if (!datasets.isDataset(line)) {
+                        cerr << "Dataset " << line << " not found" << std::endl;
+                        exit(1);
+                    }
+                    filesToTest.push_back(line);
+                }
+                catalog.close();
+                if (title == "") {
+                    title = "Test " + to_string(filesToTest.size()) + " datasets (" + datasets_file + ") "\
+                        + model_name + " " + to_string(n_folds) + " folds";
+                }
+            } else {
+                throw std::invalid_argument("Unable to open catalog file. [" + datasets_file + "]");
+            }
+        } else {
+            if (file_names.size() > 0) {
+                for (auto file : file_names) {
+                    if (!datasets.isDataset(file)) {
+                        cerr << "Dataset " << file << " not found" << std::endl;
+                        exit(1);
+                    }
+                }
+                filesToTest = file_names;
+                if (title == "") {
+                    title = "Test " + to_string(file_names.size()) + " datasets " + model_name + " " + to_string(n_folds) + " folds";
+                }
+            } else {
+                if (file_name != "all") {
+                    if (!datasets.isDataset(file_name)) {
+                        cerr << "Dataset " << file_name << " not found" << std::endl;
+                        exit(1);
+                    }
+                    if (title == "") {
+                        title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds";
+                    }
+                    filesToTest.push_back(file_name);
+                } else {
+                    filesToTest = datasets.getNames();
+                }
+            }
+        }
+
+        // A hyperparameters file (explicit or best-results) wins over the inline json
+        platform::HyperParameters test_hyperparams;
+        if (hyperparameters_file != "") {
+            test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file, hyper_best);
+        } else {
+            test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json);
+        }
+        this->config.model = model_name;
+        this->config.score = score;
+        this->config.discretize = discretize_dataset;
+        this->config.stratified = stratified;
+        this->config.smooth_strategy = smooth_strat;
+        this->config.n_folds = n_folds;
+        this->config.seeds = seeds;
+        auto env = platform::DotEnv();
+        experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1");
+        experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat);
+        experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
+        experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score);
+        experiment.setHyperparameters(test_hyperparams);
+        for (auto seed : seeds) {
+            experiment.addRandomSeed(seed);
+        }
    }
    json GridExperiment::getResults()
    {
@@ -25,9 +133,7 @@ namespace platform {
        *    // this index is relative to the list of used datasets in the actual run not to the whole datasets list
        *   "seed": # of seed to use,
        *   "fold": # of fold to process
-        *   "hyperpameters": json object with the hyperparameters to use
        * }
-        * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold
        */
        auto tasks = json::array();
        auto all_datasets = datasets.getNames();
@@ -41,7 +147,6 @@ namespace platform {
                        { "idx_dataset", idx_dataset},
                        { "seed", seed },
                        { "fold", n_fold},
-                        { "hyperparameters", json::object() }
                    };
                    tasks.push_back(task);
                }
@@ -53,10 +158,12 @@ namespace platform {
    std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
    {
        // Load datasets
-        auto datasets_names = datasets.getNames();
-        datasets_names.clear();
-        datasets_names.push_back("iris");
-        return datasets_names;
+        // The datasets to process are the ones selected on the command line and
+        // collected in filesToTest by the constructor; the `datasets` argument is
+        // not consulted here.
+        return filesToTest;
    }
    json GridExperiment::initializeResults()
    {
@@ -83,17 +190,60 @@ namespace platform {
    }
    void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
    {
-        results = json::object();
-        for (const auto& result : all_results.items()) {
+        // Turns the per-task results grouped by dataset in `all_results` into an array
+        // with one summary object per dataset (mean/std of score and time, mean of
+        // nodes, leaves and depth, plus dataset dimensions and hyperparameters).
+        // The array is stored in `results` and in `computed_results`.
+        // `model` is not used here.
+        results = json::array();
+        auto datasets = Datasets(false, Paths::datasets());
+        for (const auto& result_item : all_results.items()) {
            // each result has the results of all the outer folds as each one were a different task
-            auto dataset = result.key();
-            results[dataset] = json::array();
-            for (int fold = 0; fold < result.value().size(); ++fold) {
-                results[dataset].push_back(json::object());
-            }
-            for (const auto& result_fold : result.value()) {
-                results[dataset][result_fold["fold"].get<int>()] = result_fold;
+            auto dataset_name = result_item.key();
+            // Bind by reference: the task results are only read, no copy is needed
+            const auto& data = result_item.value();
+            auto result = json::object();
+            // Create the key explicitly so it is an array even when there are no tasks
+            result["scores_test"] = json::array();
+            int data_size = data.size();
+            auto score = torch::zeros({ data_size }, torch::kFloat64);
+            auto time_t = torch::zeros({ data_size }, torch::kFloat64);
+            auto nodes = torch::zeros({ data_size }, torch::kFloat64);
+            auto leaves = torch::zeros({ data_size }, torch::kFloat64);
+            auto depth = torch::zeros({ data_size }, torch::kFloat64);
+            for (int fold = 0; fold < data_size; ++fold) {
+                const auto& task = data[fold];
+                result["scores_test"].push_back(task.at("score"));
+                score[fold] = task.at("score").get<double>();
+                time_t[fold] = task.at("time").get<double>();
+                nodes[fold] = task.at("nodes").get<double>();
+                leaves[fold] = task.at("leaves").get<double>();
+                depth[fold] = task.at("depth").get<double>();
            }
+            // torch::std uses the unbiased estimator by default, which is NaN for a
+            // single value (and NaN is written as null in json); report 0 instead.
+            auto deviation = [data_size](const torch::Tensor& values) {
+                return data_size > 1 ? torch::std(values).item<double>() : 0.0;
+            };
+            double score_mean = torch::mean(score).item<double>();
+            double score_std = deviation(score);
+            double time_mean = torch::mean(time_t).item<double>();
+            double time_std = deviation(time_t);
+            double nodes_mean = torch::mean(nodes).item<double>();
+            double leaves_mean = torch::mean(leaves).item<double>();
+            double depth_mean = torch::mean(depth).item<double>();
+            // The dataset is loaded to report its dimensions
+            auto& dataset = datasets.getDataset(dataset_name);
+            dataset.load();
+            result["samples"] = dataset.getNSamples();
+            result["features"] = dataset.getNFeatures();
+            result["classes"] = dataset.getNClasses();
+            result["hyperparameters"] = experiment.getHyperParameters().get(dataset_name);
+            result["score"] = score_mean;
+            result["score_std"] = score_std;
+            result["time"] = time_mean;
+            result["time_std"] = time_std;
+            result["nodes"] = nodes_mean;
+            result["leaves"] = leaves_mean;
+            result["depth"] = depth_mean;
+            result["dataset"] = dataset_name;
+            // Fixed data
+            result["scores_train"] = json::array();
+            result["times_train"] = json::array();
+            result["times_test"] = json::array();
+            result["train_time"] = 0.0;
+            result["train_time_std"] = 0.0;
+            result["test_time"] = 0.0;
+            result["test_time_std"] = 0.0;
+            result["score_train"] = 0.0;
+            result["score_train_std"] = 0.0;
+            result["confusion_matrices"] = json::array();
+            results.push_back(result);
        }
        computed_results = results;
    }
@@ -164,7 +314,7 @@ namespace platform {
        //
        auto clf = Models::instance()->create(config.model);
        auto valid = clf->getValidHyperparameters();
-        auto hyperparameters = platform::HyperParameters(datasets.getNames(), task["hyperparameters"]);
+        auto hyperparameters = experiment.getHyperParameters();
        hyperparameters.check(valid, dataset_name);
        clf->setHyperparameters(hyperparameters.get(dataset_name));
        //
--- a/src/grid/GridExperiment.h
+++ b/src/grid/GridExperiment.h
@@ -3,8 +3,11 @@
 #include <string>
 #include <map>
 #include <mpi.h>
+#include <argparse/argparse.hpp>
 #include <nlohmann/json.hpp>
 #include "common/Datasets.h"
+#include "common/DotEnv.h"
+#include "main/Experiment.h"
 #include "main/HyperParameters.h"
 #include "GridData.h"
 #include "GridBase.h"
@@ -15,11 +18,14 @@ namespace platform {
    using json = nlohmann::ordered_json;
    class GridExperiment : public GridBase {
    public:
-        explicit GridExperiment(struct ConfigGrid& config);
+        explicit GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config);
        ~GridExperiment() = default;
        json getResults();
    private:
+        argparse::ArgumentParser& arguments;
+        Experiment experiment;
        json computed_results;
+        std::vector<std::string> filesToTest;
        void save(json& results);
        json initializeResults();
        json build_tasks(Datasets& datasets);
--- a/src/main/Experiment.cpp
+++ b/src/main/Experiment.cpp
@@ -9,6 +9,7 @@ namespace platform {

    void Experiment::saveResult()
    {
+        // Validate before writing: Result::check() throws std::runtime_error when the
+        // result has validation errors, in which case save() below is not reached.
+        // NOTE(review): assumes `result` is a platform::Result - confirm in Experiment.h
+        result.check();
        result.save();
        std::cout << "Result saved in " << Paths::results() << result.getFilename() << std::endl;
    }
--- a/src/main/Experiment.h
+++ b/src/main/Experiment.h
@@ -48,6 +48,7 @@ namespace platform {
        Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); result.addSeed(randomSeed); return *this; }
        Experiment& setDuration(float duration) { this->result.setDuration(duration); return *this; }
        Experiment& setHyperparameters(const HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; }
+        // Returns a mutable reference to the hyperparameters held by this experiment.
+        HyperParameters& getHyperParameters() { return hyperparameters; }
        void cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files, bool graph);
        void go(std::vector<std::string> filesToProcess, bool quiet, bool no_train_score, bool generate_fold_files, bool graph);
        void saveResult();
--- a/src/results/JsonValidator.h
+++ b/src/results/JsonValidator.h
@@ -11,47 +11,33 @@ namespace platform {
    public:
        JsonValidator(const json& schema) : schema(schema) {}

-        std::vector<std::string> validate(const std::string& fileName)
+        std::vector<std::string> validate_file(const std::string& fileName)
+        {
+            // Loads fileName and validates its content against the schema.
+            // Problems opening or parsing the file are returned as a single error
+            // message instead of being thrown, so a caller that checks many files
+            // can report the bad one and continue with the rest.
+            json data;
+            try {
+                data = load_json_file(fileName);
+            }
+            catch (const json::parse_error& e) {
+                return { "Error: JSON parsing failed: " + std::string(e.what()) };
+            }
+            catch (const std::runtime_error& e) {
+                // Raised by load_json_file when the file cannot be opened
+                return { std::string(e.what()) };
+            }
+            return validate(data);
+        }
+        std::vector<std::string> validate(const json& data)
        {
+            // Runs validateObject over `data` with the schema given at construction and
+            // returns the error messages it collected (empty when none were added).
-            std::ifstream file(fileName);
-            if (!file.is_open()) {
-                return { "Error: Unable to open file." };
-            }
-
-            json data;
-            try {
-                file >> data;
-            }
-            catch (const json::parse_error& e) {
-                return { "Error: JSON parsing failed: " + std::string(e.what()) };
-            }
-
            std::vector<std::string> errors;
-
            // Validate the top-level object
            validateObject("", schema, data, errors);
-
            return errors;
        }
-
-        void fix_it(const std::string& fileName)
+        json load_json_file(const std::string& fileName)
        {
+            // Reads fileName and returns its parsed JSON content.
+            // Throws std::runtime_error if the file cannot be opened; errors raised while
+            // parsing are not caught here and reach the caller.
            std::ifstream file(fileName);
            if (!file.is_open()) {
-                std::cerr << "Error: Unable to open file for fixing." << std::endl;
-                return;
+                throw std::runtime_error("Error: Unable to open file " + fileName);
            }
-
            json data;
-            try {
-                file >> data;
-            }
-            catch (const json::parse_error& e) {
-                std::cerr << "Error: JSON parsing failed: " << e.what() << std::endl;
-                return;
-            }
+            file >> data;
            file.close();
-
+            return data;
+        }
+        void fix_it(const std::string& fileName)
+        {
+            // Load JSON file
+            auto data = load_json_file(fileName);
            // Fix fields
            for (const auto& [key, value] : schema["properties"].items()) {
                if (!data.contains(key)) {
@@ -77,7 +63,6 @@ namespace platform {
                std::cerr << "Error: Unable to open file for writing." << std::endl;
                return;
            }
-
            outFile << data.dump(4);
            outFile.close();
        }
--- a/src/results/Result.cpp
+++ b/src/results/Result.cpp
@@ -8,6 +8,8 @@
 #include "common/Paths.h"
 #include "common/Symbols.h"
 #include "Result.h"
+#include "JsonValidator.h"
+#include "SchemaV1_0.h"

 namespace platform {
    std::string get_actual_date()
@@ -62,7 +64,19 @@ namespace platform {
    {
        return data;
    }
-
+    void Result::check()
+    {
+        // Stamp the schema version on the result, then validate it against schema 1.0.
+        // Throws std::runtime_error listing every validation error found.
+        data["schema_version"] = "1.0";
+        platform::JsonValidator checker(platform::SchemaV1_0::schema);
+        const std::vector<std::string> problems = checker.validate(data);
+        if (problems.empty()) {
+            return;
+        }
+        // One " - <error>" line per problem, after the heading
+        std::string report = "* Result file has validation errors:\n";
+        for (const auto& problem : problems) {
+            report.append(" - ").append(problem).append("\n");
+        }
+        throw std::runtime_error(report);
+    }
    void Result::save()
    {
        std::ofstream file(Paths::results() + getFilename());
--- a/src/results/Result.h
+++ b/src/results/Result.h
@@ -16,6 +16,7 @@ namespace platform {
        Result();
        Result& load(const std::string& path, const std::string& filename);
        void save();
+        void check();
        // Getters
        json getJson();
        std::string to_string(int maxModel, int maxTitle) const;