diff --git a/src/commands/b_grid.cpp b/src/commands/b_grid.cpp index b6fe208..aec0d58 100644 --- a/src/commands/b_grid.cpp +++ b/src/commands/b_grid.cpp @@ -36,22 +36,22 @@ void add_experiment_args(argparse::ArgumentParser& program) { auto env = platform::DotEnv(); auto datasets = platform::Datasets(false, platform::Paths::datasets()); - auto& group = program.add_mutually_exclusive_group(true); - group.add_argument("-d", "--dataset") - .help("Dataset file name: " + datasets.toString()) - .default_value("all") - .action([](const std::string& value) { - auto datasets = platform::Datasets(false, platform::Paths::datasets()); - static std::vector choices_datasets(datasets.getNames()); - choices_datasets.push_back("all"); - if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) { - return value; - } - throw std::runtime_error("Dataset must be one of: " + datasets.toString()); - } - ); - group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector()); - group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test."); + // auto& group = program.add_mutually_exclusive_group(true); + // group.add_argument("-d", "--dataset") + // .help("Dataset file name: " + datasets.toString()) + // .default_value("all") + // .action([](const std::string& value) { + // auto datasets = platform::Datasets(false, platform::Paths::datasets()); + // static std::vector choices_datasets(datasets.getNames()); + // choices_datasets.push_back("all"); + // if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) { + // return value; + // } + // throw std::runtime_error("Dataset must be one of: " + datasets.toString()); + // } + // ); + // group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector()); + // group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test."); program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment"); program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name." \ "Mutually exclusive with hyperparameters. This file should contain hyperparameters for each dataset in json format."); @@ -261,7 +261,7 @@ void report(argparse::ArgumentParser& program) list_results(results, config.model); } } -void compute(argparse::ArgumentParser& program) +void search(argparse::ArgumentParser& program) { struct platform::ConfigGrid config; config.model = program.get("model"); @@ -298,6 +298,7 @@ void compute(argparse::ArgumentParser& program) grid_search.go(mpi_config); if (mpi_config.rank == mpi_config.manager) { auto results = grid_search.loadResults(); + std::cout << Colors::RESET() << "* Report of the computed hyperparameters" << std::endl; list_results(results, config.model); std::cout << "Process took " << timer.getDurationString() << std::endl; } @@ -331,7 +332,9 @@ void experiment(argparse::ArgumentParser& program) } grid_experiment.go(mpi_config); if (mpi_config.rank == mpi_config.manager) { - // auto results = grid_experiment.loadResults(); + auto results = grid_experiment.getResults(); + std::cout << "****** RESULTS ********" << std::endl; + std::cout << results.dump(4) << std::endl; // list_results(results, config.model); std::cout << "Process took " << timer.getDurationString() << std::endl; } @@ -354,10 +357,10 @@ int main(int argc, char** argv) report_command.add_description("Report the computed hyperparameters of a model."); // grid compute subparser - argparse::ArgumentParser compute_command("compute"); - compute_command.add_description("Compute using mpi the hyperparameters of a model."); - assignModel(compute_command); - add_compute_args(compute_command); + argparse::ArgumentParser search_command("search"); + search_command.add_description("Search using mpi the hyperparameters of a model."); + assignModel(search_command); + add_compute_args(search_command); // grid experiment subparser argparse::ArgumentParser experiment_command("experiment"); @@ -367,7 +370,7 @@ int main(int argc, char** argv) program.add_subparser(dump_command); program.add_subparser(report_command); - program.add_subparser(compute_command); + program.add_subparser(search_command); program.add_subparser(experiment_command); // @@ -376,7 +379,7 @@ int main(int argc, char** argv) try { program.parse_args(argc, argv); bool found = false; - map commands = { {"dump", &dump}, {"report", &report}, {"compute", &compute}, { "experiment",&experiment } }; + map commands = { {"dump", &dump}, {"report", &report}, {"search", &search}, { "experiment",&experiment } }; for (const auto& command : commands) { if (program.is_subcommand_used(command.first)) { std::invoke(command.second, program.at(command.first)); diff --git a/src/grid/GridBase.cpp b/src/grid/GridBase.cpp index 51d839e..fbd65ca 100644 --- a/src/grid/GridBase.cpp +++ b/src/grid/GridBase.cpp @@ -26,39 +26,9 @@ namespace platform { std::string id = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; auto idx = rank % id.size(); return *(colors.begin() + rank % colors.size()) + id[idx]; - }; - json GridBase::build_tasks() + } + void GridBase::shuffle_and_progress_bar(json& tasks) { - /* - * Each task is a json object with the following structure: - * { - * "dataset": "dataset_name", - * "idx_dataset": idx_dataset, // used to identify the dataset in the results - * // this index is relative to the list of used datasets in the actual run not to the whole datasets list - * "seed": # of seed to use, - * "fold": # of fold to process - * } - */ - auto tasks = json::array(); - auto grid = GridData(Paths::grid_input(config.model)); - auto datasets = Datasets(false, Paths::datasets()); - auto all_datasets = datasets.getNames(); - auto datasets_names = filterDatasets(datasets); - for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) { - auto dataset = datasets_names[idx_dataset]; - for (const auto& seed : config.seeds) { - auto combinations = grid.getGrid(dataset); - for (int n_fold = 0; n_fold < config.n_folds; n_fold++) { - json task = { - { "dataset", dataset }, - { "idx_dataset", idx_dataset}, - { "seed", seed }, - { "fold", n_fold}, - }; - tasks.push_back(task); - } - } - } // Shuffle the array so heavy datasets are eas ier spread across the workers std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle std::shuffle(tasks.begin(), tasks.end(), g); @@ -71,7 +41,6 @@ namespace platform { std::cout << (i + 1) % 10; } std::cout << separator << std::endl << separator << std::flush; - return tasks; } void GridBase::summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi) { @@ -135,25 +104,16 @@ namespace platform { total += task["time"].get(); } if (num_tasks > 1) { - std::cout << Colors::MAGENTA() << setw(3) << std::right << num_tasks; - std::cout << setw(max_dataset) << " Total..." << std::string(10, '.'); - std::cout << setw(15) << std::setprecision(7) << std::fixed << total << std::endl; + std::cout << Colors::MAGENTA() << " "; + std::cout << setw(max_dataset) << "Total (" << setw(2) << std::right << num_tasks << ")" << std::string(7, '.'); + std::cout << " " << setw(15) << std::setprecision(7) << std::fixed << total << std::endl; } } } void GridBase::go(struct ConfigMPI& config_mpi) { /* - * Each task is a json object with the following structure: - * { - * "dataset": "dataset_name", - * "idx_dataset": idx_dataset, // used to identify the dataset in the results - * // this index is relative to the list of used datasets in the actual run not to the whole datasets list - * "seed": # of seed to use, - * "fold": # of fold to process - * } - * - * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold + * Each task is a json object with the data needed by the process * * The overall process consists in these steps: * 0. Create the MPI result type & tasks @@ -170,7 +130,7 @@ namespace platform { * 2b.1 Consumers announce to the producer that they are ready to receive a task * 2b.2 Consumers receive the task from the producer and process it * 2b.3 Consumers send the result to the producer - * 3. Manager select the bests scores for each dataset + * 3. Manager compile results for each dataset * 3.1 Loop thru all the results obtained from each outer fold (task) and select the best * 3.2 Save the results * 3.3 Summary of jobs done @@ -201,9 +161,11 @@ namespace platform { // char* msg; json tasks; + auto env = platform::DotEnv(); + auto datasets = Datasets(config.discretize, Paths::datasets(), env.get("discretize_algo")); if (config_mpi.rank == config_mpi.manager) { timer.start(); - tasks = build_tasks(); + tasks = build_tasks(datasets); auto tasks_str = tasks.dump(); tasks_size = tasks_str.size(); msg = new char[tasks_size + 1]; @@ -219,8 +181,7 @@ namespace platform { MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD); tasks = json::parse(msg); delete[] msg; - auto env = platform::DotEnv(); - auto datasets = Datasets(config.discretize, Paths::datasets(), env.get("discretize_algo")); + if (config_mpi.rank == config_mpi.manager) { // @@ -230,10 +191,10 @@ namespace platform { json all_results = producer(datasets_names, tasks, config_mpi, MPI_Result); std::cout << separator << std::endl; // - // 3. Manager select the bests sccores for each dataset + // 3. Manager compile results for each dataset // auto results = initializeResults(); - select_best_results_folds(results, all_results, config.model); + compile_results(results, all_results, config.model); // // 3.2 Save the results // @@ -250,5 +211,61 @@ namespace platform { consumer(datasets, tasks, config, config_mpi, MPI_Result); } } + json GridBase::producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) + { + Task_Result result; + json results; + int num_tasks = tasks.size(); + // + // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results + // + for (int i = 0; i < num_tasks; ++i) { + MPI_Status status; + MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if (status.MPI_TAG == TAG_RESULT) { + //Store result + store_result(names, result, results); + + } + MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD); + } + // + // 2a.2 Producer will send the end message to all the consumers + // + for (int i = 0; i < config_mpi.n_procs - 1; ++i) { + MPI_Status status; + MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if (status.MPI_TAG == TAG_RESULT) { + //Store result + store_result(names, result, results); + } + MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD); + } + return results; + } + void GridBase::consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) + { + Task_Result result; + // + // 2b.1 Consumers announce to the producer that they are ready to receive a task + // + MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD); + int task; + while (true) { + MPI_Status status; + // + // 2b.2 Consumers receive the task from the producer and process it + // + MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status); + if (status.MPI_TAG == TAG_END) { + break; + } + consumer_go(config, config_mpi, tasks, task, datasets, &result); + // + // 2b.3 Consumers send the result to the producer + // + MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD); + } + } } \ No newline at end of file diff --git a/src/grid/GridBase.h b/src/grid/GridBase.h index 9b29196..79d5c87 100644 --- a/src/grid/GridBase.h +++ b/src/grid/GridBase.h @@ -21,16 +21,17 @@ namespace platform { ~GridBase() = default; void go(struct ConfigMPI& config_mpi); protected: + virtual json build_tasks(Datasets& datasets) = 0; virtual void save(json& results) = 0; virtual std::vector filterDatasets(Datasets& datasets) const = 0; virtual json initializeResults() = 0; - virtual json producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) = 0; - virtual void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) = 0; - virtual void select_best_results_folds(json& results, json& all_results, std::string& model) = 0; + virtual void compile_results(json& results, json& all_results, std::string& model) = 0; virtual json store_result(std::vector& names, Task_Result& result, json& results) = 0; virtual void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result) = 0; + void shuffle_and_progress_bar(json& tasks); + json producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); + void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); std::string get_color_rank(int rank); - json build_tasks(); void summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi); struct ConfigGrid config; Timer timer; // used to measure the time of the whole process diff --git a/src/grid/GridExperiment.cpp b/src/grid/GridExperiment.cpp index 7d1df34..72b47b2 100644 --- a/src/grid/GridExperiment.cpp +++ b/src/grid/GridExperiment.cpp @@ -11,176 +11,91 @@ namespace platform { GridExperiment::GridExperiment(struct ConfigGrid& config) : GridBase(config) { } - json GridExperiment::loadResults() + json GridExperiment::getResults() { - std::ifstream file(Paths::grid_output(config.model)); - if (file.is_open()) { - return json::parse(file); + return computed_results; + } + json GridExperiment::build_tasks(Datasets& datasets) + { + /* + * Each task is a json object with the following structure: + * { + * "dataset": "dataset_name", + * "idx_dataset": idx_dataset, // used to identify the dataset in the results + * // this index is relative to the list of used datasets in the actual run not to the whole datasets list + * "seed": # of seed to use, + * "fold": # of fold to process + * "hyperpameters": json object with the hyperparameters to use + * } + * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold + */ + auto tasks = json::array(); + auto all_datasets = datasets.getNames(); + auto datasets_names = filterDatasets(datasets); + for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) { + auto dataset = datasets_names[idx_dataset]; + for (const auto& seed : config.seeds) { + for (int n_fold = 0; n_fold < config.n_folds; n_fold++) { + json task = { + { "dataset", dataset }, + { "idx_dataset", idx_dataset}, + { "seed", seed }, + { "fold", n_fold}, + { "hyperparameters", json::object() } + }; + tasks.push_back(task); + } + } } - return json(); + shuffle_and_progress_bar(tasks); + return tasks; } std::vector GridExperiment::filterDatasets(Datasets& datasets) const { // Load datasets auto datasets_names = datasets.getNames(); - if (config.continue_from != NO_CONTINUE()) { - // Continue previous execution: - if (std::find(datasets_names.begin(), datasets_names.end(), config.continue_from) == datasets_names.end()) { - throw std::invalid_argument("Dataset " + config.continue_from + " not found"); - } - // Remove datasets already processed - std::vector::iterator it = datasets_names.begin(); - while (it != datasets_names.end()) { - if (*it != config.continue_from) { - it = datasets_names.erase(it); - } else { - if (config.only) - ++it; - else - break; - } - } - } - // Exclude datasets - for (const auto& name : config.excluded) { - auto dataset = name.get(); - auto it = std::find(datasets_names.begin(), datasets_names.end(), dataset); - if (it == datasets_names.end()) { - throw std::invalid_argument("Dataset " + dataset + " already excluded or doesn't exist!"); - } - datasets_names.erase(it); - } + datasets_names.clear(); + datasets_names.push_back("iris"); return datasets_names; } json GridExperiment::initializeResults() { - // Load previous results if continue is set json results; - if (config.continue_from != NO_CONTINUE()) { - if (!config.quiet) - std::cout << Colors::RESET() << "* Loading previous results" << std::endl; - try { - std::ifstream file(Paths::grid_output(config.model)); - if (file.is_open()) { - results = json::parse(file); - results = results["results"]; - } - } - catch (const std::exception& e) { - std::cerr << "* There were no previous results" << std::endl; - std::cerr << "* Initizalizing new results" << std::endl; - results = json(); - } - } return results; } void GridExperiment::save(json& results) { - std::ofstream file(Paths::grid_output(config.model)); - json output = { - { "model", config.model }, - { "score", config.score }, - { "discretize", config.discretize }, - { "stratified", config.stratified }, - { "n_folds", config.n_folds }, - { "seeds", config.seeds }, - { "date", get_date() + " " + get_time()}, - { "nested", config.nested}, - { "platform", config.platform }, - { "duration", timer.getDurationString(true)}, - { "results", results } - - }; - file << output.dump(4); + // std::ofstream file(Paths::grid_output(config.model)); + // json output = { + // { "model", config.model }, + // { "score", config.score }, + // { "discretize", config.discretize }, + // { "stratified", config.stratified }, + // { "n_folds", config.n_folds }, + // { "seeds", config.seeds }, + // { "date", get_date() + " " + get_time()}, + // { "nested", config.nested}, + // { "platform", config.platform }, + // { "duration", timer.getDurationString(true)}, + // { "results", results } + // }; + // file << output.dump(4); } - // - // - // - json GridExperiment::producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) + void GridExperiment::compile_results(json& results, json& all_results, std::string& model) { - Task_Result result; - json results; - int num_tasks = tasks.size(); - // - // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results - // - for (int i = 0; i < num_tasks; ++i) { - MPI_Status status; - MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_RESULT) { - //Store result - store_result(names, result, results); - - } - MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD); - } - // - // 2a.2 Producer will send the end message to all the consumers - // - for (int i = 0; i < config_mpi.n_procs - 1; ++i) { - MPI_Status status; - MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_RESULT) { - //Store result - store_result(names, result, results); - } - MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD); - } - return results; - } - void GridExperiment::consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) - { - Task_Result result; - // - // 2b.1 Consumers announce to the producer that they are ready to receive a task - // - MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD); - int task; - while (true) { - MPI_Status status; - // - // 2b.2 Consumers receive the task from the producer and process it - // - MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_END) { - break; - } - consumer_go(config, config_mpi, tasks, task, datasets, &result); - // - // 2b.3 Consumers send the result to the producer - // - MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD); - } - } - void GridExperiment::select_best_results_folds(json& results, json& all_results, std::string& model) - { - Timer timer; - auto grid = GridData(Paths::grid_input(model)); - // - // Select the best result of the computed outer folds - // + results = json::object(); for (const auto& result : all_results.items()) { // each result has the results of all the outer folds as each one were a different task - double best_score = 0.0; - json best; - for (const auto& result_fold : result.value()) { - double score = result_fold["score"].get(); - if (score > best_score) { - best_score = score; - best = result_fold; - } - } auto dataset = result.key(); - auto combinations = grid.getGrid(dataset); - json json_best = { - { "score", best_score }, - { "hyperparameters", combinations[best["combination"].get()] }, - { "date", get_date() + " " + get_time() }, - { "grid", grid.getInputGrid(dataset) }, - { "duration", timer.translate2String(best["time"].get()) } - }; - results[dataset] = json_best; + results[dataset] = json::array(); + for (int fold = 0; fold < result.value().size(); ++fold) { + results[dataset].push_back(json::object()); + } + for (const auto& result_fold : result.value()) { + results[dataset][result_fold["fold"].get()] = result_fold; + } } + computed_results = results; } json GridExperiment::store_result(std::vector& names, Task_Result& result, json& results) { @@ -190,6 +105,9 @@ namespace platform { { "fold", result.n_fold }, { "time", result.time }, { "dataset", result.idx_dataset }, + { "nodes", result.nodes }, + { "leaves", result.leaves }, + { "depth", result.depth }, { "process", result.process }, { "task", result.task } }; @@ -209,7 +127,6 @@ namespace platform { timer.start(); json task = tasks[n_task]; auto model = config.model; - auto grid = GridData(Paths::grid_input(model)); auto dataset_name = task["dataset"].get(); auto idx_dataset = task["idx_dataset"].get(); auto seed = task["seed"].get(); @@ -226,7 +143,6 @@ namespace platform { // Generate the hyperparameters combinations // auto& dataset = datasets.getDataset(dataset_name); - auto combinations = grid.getGrid(dataset_name); dataset.load(); auto [X, y] = dataset.getTensors(); auto features = dataset.getFeatures(); @@ -242,72 +158,35 @@ namespace platform { auto [train, test] = fold->getFold(n_fold); auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test); auto states = dataset.getStates(); // Get the states of the features Once they are discretized - float best_fold_score = 0.0; - int best_idx_combination = -1; - json best_fold_hyper; - for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) { - auto hyperparam_line = combinations[idx_combination]; - auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line); - folding::Fold* nested_fold; - if (config.stratified) - nested_fold = new folding::StratifiedKFold(config.nested, y_train, seed); - else - nested_fold = new folding::KFold(config.nested, y_train.size(0), seed); - double score = 0.0; - for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) { - // - // Nested level fold - // - auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold); - auto train_nested_t = torch::tensor(train_nested); - auto test_nested_t = torch::tensor(test_nested); - auto X_nested_train = X_train.index({ "...", train_nested_t }); - auto y_nested_train = y_train.index({ train_nested_t }); - auto X_nested_test = X_train.index({ "...", test_nested_t }); - auto y_nested_test = y_train.index({ test_nested_t }); - // - // Build Classifier with selected hyperparameters - // - auto clf = Models::instance()->create(config.model); - auto valid = clf->getValidHyperparameters(); - hyperparameters.check(valid, dataset_name); - clf->setHyperparameters(hyperparameters.get(dataset_name)); - // - // Train model - // - clf->fit(X_nested_train, y_nested_train, features, className, states, smooth); - // - // Test model - // - score += clf->score(X_nested_test, y_nested_test); - } - delete nested_fold; - score /= config.nested; - if (score > best_fold_score) { - best_fold_score = score; - best_idx_combination = idx_combination; - best_fold_hyper = hyperparam_line; - } - } - delete fold; + // - // Build Classifier with the best hyperparameters to obtain the best score + // Build Classifier with selected hyperparameters // - auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper); auto clf = Models::instance()->create(config.model); auto valid = clf->getValidHyperparameters(); + auto hyperparameters = platform::HyperParameters(datasets.getNames(), task["hyperparameters"]); hyperparameters.check(valid, dataset_name); - clf->setHyperparameters(best_fold_hyper); + clf->setHyperparameters(hyperparameters.get(dataset_name)); + // + // Train model + // clf->fit(X_train, y_train, features, className, states, smooth); - best_fold_score = clf->score(X_test, y_test); + // + // Test model + // + double score = clf->score(X_test, y_test); + delete fold; // // Return the result // result->idx_dataset = task["idx_dataset"].get(); - result->idx_combination = best_idx_combination; - result->score = best_fold_score; + result->idx_combination = 0; + result->score = score; result->n_fold = n_fold; result->time = timer.getDuration(); + result->nodes = clf->getNumberOfNodes(); + result->leaves = clf->getNumberOfEdges(); + result->depth = clf->getNumberOfStates(); result->process = config_mpi.rank; result->task = n_task; // diff --git a/src/grid/GridExperiment.h b/src/grid/GridExperiment.h index 61efe10..df4851f 100644 --- a/src/grid/GridExperiment.h +++ b/src/grid/GridExperiment.h @@ -17,15 +17,14 @@ namespace platform { public: explicit GridExperiment(struct ConfigGrid& config); ~GridExperiment() = default; - json loadResults(); - static inline std::string NO_CONTINUE() { return "NO_CONTINUE"; } + json getResults(); private: + json computed_results; void save(json& results); json initializeResults(); + json build_tasks(Datasets& datasets); std::vector filterDatasets(Datasets& datasets) const; - json producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); - void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); - void select_best_results_folds(json& results, json& all_results, std::string& model); + void compile_results(json& results, json& all_results, std::string& model); json store_result(std::vector& names, Task_Result& result, json& results); void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result); }; diff --git a/src/grid/GridSearch.cpp b/src/grid/GridSearch.cpp index cb71154..e303bca 100644 --- a/src/grid/GridSearch.cpp +++ b/src/grid/GridSearch.cpp @@ -19,6 +19,41 @@ namespace platform { } return json(); } + json GridSearch::build_tasks(Datasets& datasets) + { + /* + * Each task is a json object with the following structure: + * { + * "dataset": "dataset_name", + * "idx_dataset": idx_dataset, // used to identify the dataset in the results + * // this index is relative to the list of used datasets in the actual run not to the whole datasets list + * "seed": # of seed to use, + * "fold": # of fold to process + * } + * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold + */ + auto tasks = json::array(); + auto grid = GridData(Paths::grid_input(config.model)); + auto all_datasets = datasets.getNames(); + auto datasets_names = filterDatasets(datasets); + for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) { + auto dataset = datasets_names[idx_dataset]; + for (const auto& seed : config.seeds) { + auto combinations = grid.getGrid(dataset); + for (int n_fold = 0; n_fold < config.n_folds; n_fold++) { + json task = { + { "dataset", dataset }, + { "idx_dataset", idx_dataset}, + { "seed", seed }, + { "fold", n_fold}, + }; + tasks.push_back(task); + } + } + } + shuffle_and_progress_bar(tasks); + return tasks; + } std::vector GridSearch::filterDatasets(Datasets& datasets) const { // Load datasets @@ -93,66 +128,7 @@ namespace platform { }; file << output.dump(4); } - // - // - // - json GridSearch::producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) - { - Task_Result result; - json results; - int num_tasks = tasks.size(); - // - // 2a.1 Producer will loop to send all the tasks to the consumers and receive the results - // - for (int i = 0; i < num_tasks; ++i) { - MPI_Status status; - MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_RESULT) { - //Store result - store_result(names, result, results); - - } - MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_TASK, MPI_COMM_WORLD); - } - // - // 2a.2 Producer will send the end message to all the consumers - // - for (int i = 0; i < config_mpi.n_procs - 1; ++i) { - MPI_Status status; - MPI_Recv(&result, 1, MPI_Result, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_RESULT) { - //Store result - store_result(names, result, results); - } - MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD); - } - return results; - } - void GridSearch::consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) - { - Task_Result result; - // - // 2b.1 Consumers announce to the producer that they are ready to receive a task - // - MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_QUERY, MPI_COMM_WORLD); - int task; - while (true) { - MPI_Status status; - // - // 2b.2 Consumers receive the task from the producer and process it - // - MPI_Recv(&task, 1, MPI_INT, config_mpi.manager, MPI_ANY_TAG, MPI_COMM_WORLD, &status); - if (status.MPI_TAG == TAG_END) { - break; - } - consumer_go(config, config_mpi, tasks, task, datasets, &result); - // - // 2b.3 Consumers send the result to the producer - // - MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD); - } - } - void GridSearch::select_best_results_folds(json& results, json& all_results, std::string& model) + void GridSearch::compile_results(json& results, json& all_results, std::string& model) { Timer timer; auto grid = GridData(Paths::grid_input(model)); diff --git a/src/grid/GridSearch.h b/src/grid/GridSearch.h index f6ca4bc..e05f9a7 100644 --- a/src/grid/GridSearch.h +++ b/src/grid/GridSearch.h @@ -24,10 +24,9 @@ namespace platform { private: void save(json& results); json initializeResults(); + json build_tasks(Datasets& datasets); std::vector filterDatasets(Datasets& datasets) const; - json producer(std::vector& names, json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); - void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result); - void select_best_results_folds(json& results, json& all_results, std::string& model); + void compile_results(json& results, json& all_results, std::string& model); json store_result(std::vector& names, Task_Result& result, json& results); void consumer_go(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result); };