From 343269d48c7e73a25867446c1c8fb2352f72c5d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Thu, 28 Dec 2023 23:21:50 +0100 Subject: [PATCH] Fix syntax errors --- src/Platform/GridSearch.cc | 147 +++++++++++++++++++++++++++++++------ src/Platform/GridSearch.h | 1 + 2 files changed, 124 insertions(+), 24 deletions(-) diff --git a/src/Platform/GridSearch.cc b/src/Platform/GridSearch.cc index c2ef440..4b6741e 100644 --- a/src/Platform/GridSearch.cc +++ b/src/Platform/GridSearch.cc @@ -28,6 +28,11 @@ namespace platform { oss << std::put_time(timeinfo, "%H:%M:%S"); return oss.str(); } + std::string get_color_rank(int rank) + { + auto colors = { Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), Colors::CYAN() }; + return *(colors.begin() + rank % colors.size()); + } GridSearch::GridSearch(struct ConfigGrid& config) : config(config) { } @@ -104,20 +109,16 @@ namespace platform { auto datasets = Datasets(false, Paths::datasets()); auto all_datasets = datasets.getNames(); auto datasets_names = processDatasets(datasets); - for (const auto& dataset : datasets_names) { + for (int idx_dataset = 0; idx_dataset < all_datasets.size(); ++idx_dataset) { + auto dataset = all_datasets[idx_dataset]; for (const auto& seed : config.seeds) { auto combinations = grid.getGrid(dataset); for (int n_fold = 0; n_fold < config.n_folds; n_fold++) { - auto it = find(all_datasets.begin(), all_datasets.end(), dataset); - if (it == all_datasets.end()) { - throw std::invalid_argument("Dataset " + dataset + " not found"); - } - auto idx_dataset = std::distance(all_datasets.begin(), it); json task = { { "dataset", dataset }, { "idx_dataset", idx_dataset}, { "seed", seed }, - { "fold", n_fold} + { "fold", n_fold}, }; tasks.push_back(task); } @@ -134,8 +135,96 @@ namespace platform { std::cout << "|" << std::endl << "|" << std::flush; return tasks; } - void process_task_mpi(struct ConfigMPI& config_mpi, int task, Task_Result* result) + void 
process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result) { + // initialize + Timer timer; + timer.start(); + json task = tasks[n_task]; + auto model = config.model; + auto grid = GridData(Paths::grid_input(model)); + auto dataset = task["dataset"].get(); + auto idx_dataset = task["idx_dataset"].get(); + auto seed = task["seed"].get(); + auto n_fold = task["fold"].get(); + bool stratified = config.stratified; + // Generate the hyperparameters combinations + auto combinations = grid.getGrid(dataset); + auto [X, y] = datasets.getTensors(dataset); + auto states = datasets.getStates(dataset); + auto features = datasets.getFeatures(dataset); + auto className = datasets.getClassName(dataset); + // + // Start working on task + // + Fold* fold; + if (stratified) + fold = new StratifiedKFold(config.n_folds, y, seed); + else + fold = new KFold(config.n_folds, y.size(0), seed); + auto [train, test] = fold->getFold(n_fold); + auto train_t = torch::tensor(train); + auto test_t = torch::tensor(test); + auto X_train = X.index({ "...", train_t }); + auto y_train = y.index({ train_t }); + auto X_test = X.index({ "...", test_t }); + auto y_test = y.index({ test_t }); + auto num = 0; + double best_fold_score = 0.0; + int best_idx_combination = -1; + json best_fold_hyper; + for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) { + auto hyperparam_line = combinations[idx_combination]; + auto hyperparameters = platform::HyperParameters(datasets.getNames(), hyperparam_line); + Fold* nested_fold; + if (config.stratified) + nested_fold = new StratifiedKFold(config.nested, y_train, seed); + else + nested_fold = new KFold(config.nested, y_train.size(0), seed); + double score = 0.0; + for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) { + // Nested level fold + auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold); + auto 
train_nested_t = torch::tensor(train_nested); + auto test_nested_t = torch::tensor(test_nested); + auto X_nested_train = X_train.index({ "...", train_nested_t }); + auto y_nested_train = y_train.index({ train_nested_t }); + auto X_nested_test = X_train.index({ "...", test_nested_t }); + auto y_nested_test = y_train.index({ test_nested_t }); + // Build Classifier with selected hyperparameters + auto clf = Models::instance()->create(config.model); + auto valid = clf->getValidHyperparameters(); + hyperparameters.check(valid, dataset); + clf->setHyperparameters(hyperparameters.get(dataset)); + // Train model + clf->fit(X_nested_train, y_nested_train, features, className, states); + // Test model + score += clf->score(X_nested_test, y_nested_test); + } + delete nested_fold; + score /= config.nested; + if (score > best_fold_score) { + best_fold_score = score; + best_idx_combination = idx_combination; + best_fold_hyper = hyperparam_line; + } + } + delete fold; + // Build Classifier with the best hyperparameters to obtain the best score + auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper); + auto clf = Models::instance()->create(config.model); + auto valid = clf->getValidHyperparameters(); + hyperparameters.check(valid, dataset); + clf->setHyperparameters(best_fold_hyper); + clf->fit(X_train, y_train, features, className, states); + best_fold_score = clf->score(X_test, y_test); + // Return the result + result->idx_dataset = task["idx_dataset"].get(); + result->idx_combination = best_idx_combination; + result->score = best_fold_score; + result->time = timer.getDuration(); + // Update progress bar + std::cout << get_color_rank(config_mpi.rank) << "*" << std::flush; } std::pair GridSearch::part_range_mpi(int n_tasks, int nprocs, int rank) { @@ -155,14 +244,10 @@ namespace platform { } return { start, end }; } - std::string get_color_rank(int rank) - { - auto colors = { Colors::RED(), Colors::GREEN(), Colors::BLUE(), Colors::MAGENTA(), 
Colors::CYAN() }; - return *(colors.begin() + rank % colors.size()); - } - void producer(json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) + json producer(json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) { Task_Result result; + json results; int num_tasks = tasks.size(); for (int i = 0; i < num_tasks; ++i) { MPI_Status status; @@ -183,8 +268,17 @@ namespace platform { } MPI_Send(&i, 1, MPI_INT, status.MPI_SOURCE, TAG_END, MPI_COMM_WORLD); } + return results; } - void consumer(json& tasks, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) + json select_best_results_folds(json& all_results) + { + json results; + // + // Select the best result of the computed outer folds + // + return results; + } + void consumer(Datasets& datasets, json& tasks, struct ConfigGrid& config, struct ConfigMPI& config_mpi, MPI_Datatype& MPI_Result) { Task_Result result; // Anounce to the producer @@ -197,7 +291,7 @@ namespace platform { break; } // Process task - process_task_mpi(config_mpi, task, &result); + process_task_mpi_consumer(config, config_mpi, tasks, task, datasets, &result); // Send result to producer MPI_Send(&result, 1, MPI_Result, config_mpi.manager, TAG_RESULT, MPI_COMM_WORLD); } @@ -236,21 +330,23 @@ namespace platform { Task_Result result; int tasks_size; MPI_Datatype MPI_Result; - MPI_Datatype type[3] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_DOUBLE }; - int blocklen[3] = { 1, 1, 1 }; - MPI_Aint disp[3]; + MPI_Datatype type[4] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_DOUBLE, MPI_DOUBLE }; + int blocklen[4] = { 1, 1, 1, 1 }; + MPI_Aint disp[4]; disp[0] = offsetof(Task_Result, idx_dataset); disp[1] = offsetof(Task_Result, idx_combination); disp[2] = offsetof(Task_Result, score); - MPI_Type_create_struct(3, blocklen, disp, type, &MPI_Result); + disp[3] = offsetof(Task_Result, time); + MPI_Type_create_struct(4, blocklen, disp, type, &MPI_Result); MPI_Type_commit(&MPI_Result); // // 0.2 Manager creates the tasks // char* msg; + json tasks; if 
(config_mpi.rank == config_mpi.manager) { timer.start(); - auto tasks = build_tasks_mpi(); + tasks = build_tasks_mpi(); auto tasks_str = tasks.dump(); tasks_size = tasks_str.size(); msg = new char[tasks_size + 1]; @@ -264,15 +360,18 @@ namespace platform { msg = new char[tasks_size + 1]; } MPI_Bcast(msg, tasks_size + 1, MPI_CHAR, config_mpi.manager, MPI_COMM_WORLD); - json tasks = json::parse(msg); + tasks = json::parse(msg); delete[] msg; // // 2. All Workers will receive the tasks and start the process // + auto datasets = Datasets(config.discretize, Paths::datasets()); if (config_mpi.rank == config_mpi.manager) { - producer(tasks, config_mpi, MPI_Result); + auto all_results = producer(tasks, config_mpi, MPI_Result); + auto results = select_best_results_folds(all_results); + save(results); } else { - consumer(tasks, config_mpi, MPI_Result); + consumer(datasets, tasks, config, config_mpi, MPI_Result); } } void GridSearch::go_mpi(struct ConfigMPI& config_mpi) diff --git a/src/Platform/GridSearch.h b/src/Platform/GridSearch.h index a9e2f6e..8004eca 100644 --- a/src/Platform/GridSearch.h +++ b/src/Platform/GridSearch.h @@ -34,6 +34,7 @@ namespace platform { uint idx_dataset; uint idx_combination; double score; + double time; } Task_Result; const int TAG_QUERY = 1; const int TAG_RESULT = 2;