diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c298fb8..f9908eb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,11 +29,13 @@ add_executable( target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}") # b_grid -set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp) +set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp ) list(TRANSFORM grid_sources PREPEND grid/) add_executable(b_grid commands/b_grid.cpp ${grid_sources} common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp - main/HyperParameters.cpp main/Models.cpp + main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp + reports/ReportConsole.cpp reports/ReportBase.cpp + results/Result.cpp ) target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) diff --git a/src/commands/b_grid.cpp b/src/commands/b_grid.cpp index aec0d58..51ddc00 100644 --- a/src/commands/b_grid.cpp +++ b/src/commands/b_grid.cpp @@ -36,22 +36,22 @@ void add_experiment_args(argparse::ArgumentParser& program) { auto env = platform::DotEnv(); auto datasets = platform::Datasets(false, platform::Paths::datasets()); - // auto& group = program.add_mutually_exclusive_group(true); - // group.add_argument("-d", "--dataset") - // .help("Dataset file name: " + datasets.toString()) - // .default_value("all") - // .action([](const std::string& value) { - // auto datasets = platform::Datasets(false, platform::Paths::datasets()); - // static std::vector choices_datasets(datasets.getNames()); - // choices_datasets.push_back("all"); - // if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) { - // return value; - // } - // throw std::runtime_error("Dataset must be one 
of: " + datasets.toString()); - // } - // ); - // group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector()); - // group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test."); + auto& group = program.add_mutually_exclusive_group(true); + group.add_argument("-d", "--dataset") + .help("Dataset file name: " + datasets.toString()) + .default_value("all") + .action([](const std::string& value) { + auto datasets = platform::Datasets(false, platform::Paths::datasets()); + static std::vector choices_datasets(datasets.getNames()); + choices_datasets.push_back("all"); + if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) { + return value; + } + throw std::runtime_error("Dataset must be one of: " + datasets.toString()); + } + ); + group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector()); + group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test."); program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment"); program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name." \ "Mutually exclusive with hyperparameters. 
This file should contain hyperparameters for each dataset in json format."); @@ -83,11 +83,6 @@ void add_experiment_args(argparse::ArgumentParser& program) for (auto choice : valid_choices) { score_arg.choices(choice); } - program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true); - program.add_argument("--graph").help("generate graphviz dot files with the model").default_value(false).implicit_value(true); - program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true); - program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true); - program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true); program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true); program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) { try { @@ -307,19 +302,10 @@ void search(argparse::ArgumentParser& program) void experiment(argparse::ArgumentParser& program) { struct platform::ConfigGrid config; - config.model = program.get("model"); - config.score = program.get("score"); - config.discretize = program.get("discretize"); - config.stratified = program.get("stratified"); - config.smooth_strategy = program.get("smooth-strat"); - config.n_folds = program.get("folds"); - config.quiet = program.get("quiet"); - config.seeds = program.get>("seeds"); auto env = platform::DotEnv(); config.platform = env.get("platform"); - platform::Paths::createPath(platform::Paths::grid()); - auto grid_experiment = platform::GridExperiment(config); + auto grid_experiment = platform::GridExperiment(program, config); platform::Timer timer; timer.start(); struct 
platform::ConfigMPI mpi_config; @@ -333,6 +319,7 @@ void experiment(argparse::ArgumentParser& program) grid_experiment.go(mpi_config); if (mpi_config.rank == mpi_config.manager) { auto results = grid_experiment.getResults(); + //build_experiment_result(results); std::cout << "****** RESULTS ********" << std::endl; std::cout << results.dump(4) << std::endl; // list_results(results, config.model); diff --git a/src/commands/b_results.cpp b/src/commands/b_results.cpp index 7eebd5c..ac370e7 100644 --- a/src/commands/b_results.cpp +++ b/src/commands/b_results.cpp @@ -74,7 +74,7 @@ int main(int argc, char* argv[]) int n_errors = 0; std::vector files_with_errors; for (const auto& file_name : result_files) { - std::vector errors = validator.validate(file_name); + std::vector errors = validator.validate_file(file_name); if (!errors.empty()) { n_errors++; std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl; diff --git a/src/grid/GridBase.cpp b/src/grid/GridBase.cpp index fbd65ca..8b3bbbe 100644 --- a/src/grid/GridBase.cpp +++ b/src/grid/GridBase.cpp @@ -9,6 +9,10 @@ namespace platform { GridBase::GridBase(struct ConfigGrid& config) { this->config = config; + + } + void GridBase::validate_config() + { if (config.smooth_strategy == "ORIGINAL") smooth_type = bayesnet::Smoothing_t::ORIGINAL; else if (config.smooth_strategy == "LAPLACE") @@ -116,7 +120,7 @@ namespace platform { * Each task is a json object with the data needed by the process * * The overall process consists in these steps: - * 0. Create the MPI result type & tasks + * 0. Validate config, create the MPI result type & tasks * 0.1 Create the MPI result type * 0.2 Manager creates the tasks * 1. 
Manager will broadcast the tasks to all the processes @@ -138,6 +142,7 @@ namespace platform { // // 0.1 Create the MPI result type // + validate_config(); Task_Result result; int tasks_size; MPI_Datatype MPI_Result; diff --git a/src/grid/GridBase.h b/src/grid/GridBase.h index 79d5c87..bd65c5e 100644 --- a/src/grid/GridBase.h +++ b/src/grid/GridBase.h @@ -20,6 +20,7 @@ namespace platform { explicit GridBase(struct ConfigGrid& config); ~GridBase() = default; void go(struct ConfigMPI& config_mpi); + void validate_config(); protected: virtual json build_tasks(Datasets& datasets) = 0; virtual void save(json& results) = 0; diff --git a/src/grid/GridExperiment.cpp b/src/grid/GridExperiment.cpp index 72b47b2..9a3c6be 100644 --- a/src/grid/GridExperiment.cpp +++ b/src/grid/GridExperiment.cpp @@ -8,8 +8,116 @@ #include "GridExperiment.h" namespace platform { - GridExperiment::GridExperiment(struct ConfigGrid& config) : GridBase(config) + GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : arguments(program), GridBase(config) { + std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat, score; + json hyperparameters_json; + bool discretize_dataset, stratified, hyper_best; + std::vector seeds; + std::vector file_names; + int n_folds; + file_name = program.get("dataset"); + file_names = program.get>("datasets"); + datasets_file = program.get("datasets-file"); + model_name = program.get("model"); + discretize_dataset = program.get("discretize"); + discretize_algo = program.get("discretize-algo"); + smooth_strat = program.get("smooth-strat"); + stratified = program.get("stratified"); + n_folds = program.get("folds"); + score = program.get("score"); + seeds = program.get>("seeds"); + auto hyperparameters = program.get("hyperparameters"); + hyperparameters_json = json::parse(hyperparameters); + hyperparameters_file = program.get("hyper-file"); + hyper_best = program.get("hyper-best"); + 
if (hyper_best) { + // Build the best results file_name + hyperparameters_file = platform::Paths::results() + platform::Paths::bestResultsFile(score, model_name); + // ignore this parameter + hyperparameters = "{}"; + } else { + if (hyperparameters_file != "" && hyperparameters != "{}") { + throw runtime_error("hyperparameters and hyper_file are mutually exclusive"); + } + } + title = program.get("title"); + if (title == "" && file_name == "all") { + throw runtime_error("title is mandatory if all datasets are to be tested"); + } + auto datasets = platform::Datasets(false, platform::Paths::datasets()); + if (datasets_file != "") { + ifstream catalog(datasets_file); + if (catalog.is_open()) { + std::string line; + while (getline(catalog, line)) { + if (line.empty() || line[0] == '#') { + continue; + } + if (!datasets.isDataset(line)) { + cerr << "Dataset " << line << " not found" << std::endl; + exit(1); + } + filesToTest.push_back(line); + } + catalog.close(); + if (title == "") { + title = "Test " + to_string(filesToTest.size()) + " datasets (" + datasets_file + ") "\ + + model_name + " " + to_string(n_folds) + " folds"; + } + } else { + throw std::invalid_argument("Unable to open catalog file. 
[" + datasets_file + "]"); + } + } else { + if (file_names.size() > 0) { + for (auto file : file_names) { + if (!datasets.isDataset(file)) { + cerr << "Dataset " << file << " not found" << std::endl; + exit(1); + } + } + filesToTest = file_names; + if (title == "") { + title = "Test " + to_string(file_names.size()) + " datasets " + model_name + " " + to_string(n_folds) + " folds"; + } + } else { + if (file_name != "all") { + if (!datasets.isDataset(file_name)) { + cerr << "Dataset " << file_name << " not found" << std::endl; + exit(1); + } + if (title == "") { + title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds"; + } + filesToTest.push_back(file_name); + } else { + filesToTest = datasets.getNames(); + } + } + } + + platform::HyperParameters test_hyperparams; + if (hyperparameters_file != "") { + test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file, hyper_best); + } else { + test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json); + } + this->config.model = model_name; + this->config.score = score; + this->config.discretize = discretize_dataset; + this->config.stratified = stratified; + this->config.smooth_strategy = smooth_strat; + this->config.n_folds = n_folds; + this->config.seeds = seeds; + auto env = platform::DotEnv(); + experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1"); + experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat); + experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); + experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score); + experiment.setHyperparameters(test_hyperparams); + for (auto seed : seeds) { + experiment.addRandomSeed(seed); + } } json GridExperiment::getResults() { @@ -25,9 +133,7 @@ namespace platform { * // this index is relative to the list of used datasets in the actual run not to the whole datasets list 
* "seed": # of seed to use, * "fold": # of fold to process - * "hyperpameters": json object with the hyperparameters to use * } - * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold */ auto tasks = json::array(); auto all_datasets = datasets.getNames(); @@ -41,7 +147,6 @@ namespace platform { { "idx_dataset", idx_dataset}, { "seed", seed }, { "fold", n_fold}, - { "hyperparameters", json::object() } }; tasks.push_back(task); } @@ -53,10 +158,12 @@ namespace platform { std::vector GridExperiment::filterDatasets(Datasets& datasets) const { // Load datasets - auto datasets_names = datasets.getNames(); - datasets_names.clear(); - datasets_names.push_back("iris"); - return datasets_names; + // auto datasets_names = datasets.getNames(); + // datasets_names.clear(); + // datasets_names.push_back("iris"); + // datasets_names.push_back("wine"); + // datasets_names.push_back("balance-scale"); + return filesToTest; } json GridExperiment::initializeResults() { @@ -83,17 +190,60 @@ namespace platform { } void GridExperiment::compile_results(json& results, json& all_results, std::string& model) { - results = json::object(); - for (const auto& result : all_results.items()) { + results = json::array(); + auto datasets = Datasets(false, Paths::datasets()); + for (const auto& result_item : all_results.items()) { // each result has the results of all the outer folds as each one were a different task - auto dataset = result.key(); - results[dataset] = json::array(); - for (int fold = 0; fold < result.value().size(); ++fold) { - results[dataset].push_back(json::object()); - } - for (const auto& result_fold : result.value()) { - results[dataset][result_fold["fold"].get()] = result_fold; + auto dataset_name = result_item.key(); + auto data = result_item.value(); + auto result = json::object(); + int data_size = data.size(); + auto score = torch::zeros({ data_size }, torch::kFloat64); + auto time_t = torch::zeros({ data_size }, 
torch::kFloat64); + auto nodes = torch::zeros({ data_size }, torch::kFloat64); + auto leaves = torch::zeros({ data_size }, torch::kFloat64); + auto depth = torch::zeros({ data_size }, torch::kFloat64); + for (int fold = 0; fold < data_size; ++fold) { + result["scores_test"].push_back(data[fold]["score"]); + score[fold] = data[fold]["score"].get(); + time_t[fold] = data[fold]["time"].get(); + nodes[fold] = data[fold]["nodes"].get(); + leaves[fold] = data[fold]["leaves"].get(); + depth[fold] = data[fold]["depth"].get(); } + double score_mean = torch::mean(score).item(); + double score_std = torch::std(score).item(); + double time_mean = torch::mean(time_t).item(); + double time_std = torch::std(time_t).item(); + double nodes_mean = torch::mean(nodes).item(); + double leaves_mean = torch::mean(leaves).item(); + double depth_mean = torch::mean(depth).item(); + auto& dataset = datasets.getDataset(dataset_name); + dataset.load(); + result["samples"] = dataset.getNSamples(); + result["features"] = dataset.getNFeatures(); + result["classes"] = dataset.getNClasses(); + result["hyperparameters"] = experiment.getHyperParameters().get(dataset_name); + result["score"] = score_mean; + result["score_std"] = score_std; + result["time"] = time_mean; + result["time_std"] = time_std; + result["nodes"] = nodes_mean; + result["leaves"] = leaves_mean; + result["depth"] = depth_mean; + result["dataset"] = dataset_name; + // Fixed data + result["scores_train"] = json::array(); + result["times_train"] = json::array(); + result["times_test"] = json::array(); + result["train_time"] = 0.0; + result["train_time_std"] = 0.0; + result["test_time"] = 0.0; + result["test_time_std"] = 0.0; + result["score_train"] = 0.0; + result["score_train_std"] = 0.0; + result["confusion_matrices"] = json::array(); + results.push_back(result); } computed_results = results; } @@ -164,7 +314,7 @@ namespace platform { // auto clf = Models::instance()->create(config.model); auto valid = 
clf->getValidHyperparameters(); - auto hyperparameters = platform::HyperParameters(datasets.getNames(), task["hyperparameters"]); + auto hyperparameters = experiment.getHyperParameters(); hyperparameters.check(valid, dataset_name); clf->setHyperparameters(hyperparameters.get(dataset_name)); // diff --git a/src/grid/GridExperiment.h b/src/grid/GridExperiment.h index df4851f..2250766 100644 --- a/src/grid/GridExperiment.h +++ b/src/grid/GridExperiment.h @@ -3,8 +3,11 @@ #include #include #include +#include #include #include "common/Datasets.h" +#include "common/DotEnv.h" +#include "main/Experiment.h" #include "main/HyperParameters.h" #include "GridData.h" #include "GridBase.h" @@ -15,11 +18,14 @@ namespace platform { using json = nlohmann::ordered_json; class GridExperiment : public GridBase { public: - explicit GridExperiment(struct ConfigGrid& config); + explicit GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config); ~GridExperiment() = default; json getResults(); private: + argparse::ArgumentParser& arguments; + Experiment experiment; json computed_results; + std::vector filesToTest; void save(json& results); json initializeResults(); json build_tasks(Datasets& datasets); diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index 22395dc..1809c41 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -9,6 +9,7 @@ namespace platform { void Experiment::saveResult() { + result.check(); result.save(); std::cout << "Result saved in " << Paths::results() << result.getFilename() << std::endl; } diff --git a/src/main/Experiment.h b/src/main/Experiment.h index 0aed891..9446c0e 100644 --- a/src/main/Experiment.h +++ b/src/main/Experiment.h @@ -48,6 +48,7 @@ namespace platform { Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); result.addSeed(randomSeed); return *this; } Experiment& setDuration(float duration) { this->result.setDuration(duration); return *this; } Experiment& setHyperparameters(const 
HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; } + HyperParameters& getHyperParameters() { return hyperparameters; } void cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files, bool graph); void go(std::vector filesToProcess, bool quiet, bool no_train_score, bool generate_fold_files, bool graph); void saveResult(); diff --git a/src/results/JsonValidator.h b/src/results/JsonValidator.h index ea3c846..f4eaf78 100644 --- a/src/results/JsonValidator.h +++ b/src/results/JsonValidator.h @@ -11,47 +11,33 @@ namespace platform { public: JsonValidator(const json& schema) : schema(schema) {} - std::vector validate(const std::string& fileName) + std::vector validate_file(const std::string& fileName) + { + auto data = load_json_file(fileName); + return validate(data); + } + std::vector validate(const json& data) { - std::ifstream file(fileName); - if (!file.is_open()) { - return { "Error: Unable to open file." }; - } - - json data; - try { - file >> data; - } - catch (const json::parse_error& e) { - return { "Error: JSON parsing failed: " + std::string(e.what()) }; - } - std::vector errors; - // Validate the top-level object validateObject("", schema, data, errors); - return errors; } - - void fix_it(const std::string& fileName) + json load_json_file(const std::string& fileName) { std::ifstream file(fileName); if (!file.is_open()) { - std::cerr << "Error: Unable to open file for fixing." 
<< std::endl; - return; + throw std::runtime_error("Error: Unable to open file " + fileName); } - json data; - try { - file >> data; - } - catch (const json::parse_error& e) { - std::cerr << "Error: JSON parsing failed: " << e.what() << std::endl; - return; - } + file >> data; file.close(); - + return data; + } + void fix_it(const std::string& fileName) + { + // Load JSON file + auto data = load_json_file(fileName); // Fix fields for (const auto& [key, value] : schema["properties"].items()) { if (!data.contains(key)) { @@ -77,7 +63,7 @@ namespace platform { std::cerr << "Error: Unable to open file for writing." << std::endl; return; } outFile << data.dump(4); outFile.close(); } diff --git a/src/results/Result.cpp b/src/results/Result.cpp index 75a89b7..9cde61a 100644 --- a/src/results/Result.cpp +++ b/src/results/Result.cpp @@ -8,6 +8,8 @@ #include "common/Paths.h" #include "common/Symbols.h" #include "Result.h" +#include "JsonValidator.h" +#include "SchemaV1_0.h" namespace platform { std::string get_actual_date() @@ -62,7 +64,19 @@ namespace platform { { return data; } - + void Result::check() + { + platform::JsonValidator validator(platform::SchemaV1_0::schema); + data["schema_version"] = "1.0"; + std::vector errors = validator.validate(data); + if (!errors.empty()) { + std::string message; + for (const auto& error : errors) { + message += " - " + error + "\n"; + } + throw std::runtime_error("* Result file has validation errors:\n" + message); + } + } void Result::save() { std::ofstream file(Paths::results() + getFilename()); diff --git a/src/results/Result.h b/src/results/Result.h index b72ec53..7d7c2c6 100644 --- a/src/results/Result.h +++ b/src/results/Result.h @@ -16,6 +16,7 @@ namespace platform { Result(); Result& load(const std::string& path, const std::string& filename); void save(); + void check(); // Getters json getJson(); std::string to_string(int maxModel, int maxTitle) const;