From 017cb8a0dc8dbd469bea211c48f5a8b4fd7d1fc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Wed, 18 Dec 2024 11:17:04 +0100 Subject: [PATCH] Fix smoothing problem in gridsearch --- src/commands/b_grid.cpp | 6 +++++ src/grid/GridSearch.cpp | 53 ++++++++++++++++++++++++++++++++++------- src/grid/GridSearch.h | 4 ++++ 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/src/commands/b_grid.cpp b/src/commands/b_grid.cpp index 6303aec..e816dfa 100644 --- a/src/commands/b_grid.cpp +++ b/src/commands/b_grid.cpp @@ -40,6 +40,11 @@ void add_compute_args(argparse::ArgumentParser& program) program.add_argument("--continue").help("Continue computing from that dataset").default_value(platform::GridSearch::NO_CONTINUE()); program.add_argument("--only").help("Used with continue to compute that dataset only").default_value(false).implicit_value(true); program.add_argument("--exclude").default_value("[]").help("Datasets to exclude in json format, e.g. [\"dataset1\", \"dataset2\"]"); + auto valid_choices = env.valid_tokens("smooth_strat"); + auto& smooth_arg = program.add_argument("--smooth-strat").help("Smooth strategy used in Bayes Network node initialization. Valid values: " + env.valid_values("smooth_strat")).default_value(env.get("smooth_strat")); + for (auto choice : valid_choices) { + smooth_arg.choices(choice); + } program.add_argument("--nested").help("Set the double/nested cross validation number of folds").default_value(5).scan<'i', int>().action([](const std::string& value) { try { auto k = stoi(value); @@ -188,6 +193,7 @@ void compute(argparse::ArgumentParser& program) config.score = program.get("score"); config.discretize = program.get("discretize"); config.stratified = program.get("stratified"); + config.smooth_strategy = program.get("smooth-strat"); config.n_folds = program.get("folds"); config.quiet = program.get("quiet"); config.only = program.get("only"); diff --git a/src/grid/GridSearch.cpp b/src/grid/GridSearch.cpp index c0aaaa0..d3a8972 100644 --- a/src/grid/GridSearch.cpp +++ b/src/grid/GridSearch.cpp @@ -19,6 +19,16 @@ namespace platform { } GridSearch::GridSearch(struct ConfigGrid& config) : config(config) { + if (config.smooth_strategy == "ORIGINAL") + smooth_type = bayesnet::Smoothing_t::ORIGINAL; + else if (config.smooth_strategy == "LAPLACE") + smooth_type = bayesnet::Smoothing_t::LAPLACE; + else if (config.smooth_strategy == "CESTNIK") + smooth_type = bayesnet::Smoothing_t::CESTNIK; + else { + std::cerr << "GridSearch: Unknown smoothing strategy: " << config.smooth_strategy << std::endl; + exit(1); + } } json GridSearch::loadResults() { @@ -87,16 +97,21 @@ namespace platform { std::mt19937 g{ 271 }; // Use fixed seed to obtain the same shuffle std::shuffle(tasks.begin(), tasks.end(), g); std::cout << "* Number of tasks: " << tasks.size() << std::endl; - std::cout << separator; + std::cout << separator << std::flush; for (int i = 0; i < tasks.size(); ++i) { - std::cout << (i + 1) % 10; + if ((i + 1) % 10 == 0) + std::cout << separator; + else + std::cout << (i + 1) % 10; } std::cout << separator << std::endl << separator << std::flush; return tasks; } void process_task_mpi_consumer(struct ConfigGrid& config, struct ConfigMPI& config_mpi, json& tasks, int n_task, Datasets& datasets, Task_Result* result) { + // // initialize + // Timer timer; timer.start(); json task = tasks[n_task]; @@ -107,7 +122,16 @@ namespace platform { auto seed = task["seed"].get(); auto n_fold = task["fold"].get(); bool stratified = config.stratified; - // Generate the hyperparamters combinations + bayesnet::Smoothing_t smooth; + if (config.smooth_strategy == "ORIGINAL") + smooth = bayesnet::Smoothing_t::ORIGINAL; + else if (config.smooth_strategy == "LAPLACE") + smooth = bayesnet::Smoothing_t::LAPLACE; + else if (config.smooth_strategy == "CESTNIK") + smooth = bayesnet::Smoothing_t::CESTNIK; + // + // Generate the hyperparameters combinations + // auto& dataset = datasets.getDataset(dataset_name); auto combinations = grid.getGrid(dataset_name); dataset.load(); @@ -125,9 +149,8 @@ namespace platform { auto [train, test] = fold->getFold(n_fold); auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test); auto states = dataset.getStates(); // Get the states of the features Once they are discretized - double best_fold_score = 0.0; + float best_fold_score = 0.0; int best_idx_combination = -1; - bayesnet::Smoothing_t smoothing = bayesnet::Smoothing_t::NONE; json best_fold_hyper; for (int idx_combination = 0; idx_combination < combinations.size(); ++idx_combination) { auto hyperparam_line = combinations[idx_combination]; @@ -139,7 +162,9 @@ namespace platform { nested_fold = new folding::KFold(config.nested, y_train.size(0), seed); double score = 0.0; for (int n_nested_fold = 0; n_nested_fold < config.nested; n_nested_fold++) { + // // Nested level fold + // auto [train_nested, test_nested] = nested_fold->getFold(n_nested_fold); auto train_nested_t = torch::tensor(train_nested); auto test_nested_t = torch::tensor(test_nested); @@ -147,14 +172,20 @@ namespace platform { auto y_nested_train = y_train.index({ train_nested_t }); auto X_nested_test = X_train.index({ "...", test_nested_t }); auto y_nested_test = y_train.index({ test_nested_t }); + // // Build Classifier with selected hyperparameters + // auto clf = Models::instance()->create(config.model); auto valid = clf->getValidHyperparameters(); hyperparameters.check(valid, dataset_name); clf->setHyperparameters(hyperparameters.get(dataset_name)); + // // Train model - clf->fit(X_nested_train, y_nested_train, features, className, states, smoothing); + // + clf->fit(X_nested_train, y_nested_train, features, className, states, smooth); + // // Test model + // score += clf->score(X_nested_test, y_nested_test); } delete nested_fold; @@ -166,21 +197,27 @@ namespace platform { } } delete fold; + // // Build Classifier with the best hyperparameters to obtain the best score + // auto hyperparameters = platform::HyperParameters(datasets.getNames(), best_fold_hyper); auto clf = Models::instance()->create(config.model); auto valid = clf->getValidHyperparameters(); hyperparameters.check(valid, dataset_name); clf->setHyperparameters(best_fold_hyper); - clf->fit(X_train, y_train, features, className, states, smoothing); + clf->fit(X_train, y_train, features, className, states, smooth); best_fold_score = clf->score(X_test, y_test); + // // Return the result + // result->idx_dataset = task["idx_dataset"].get(); result->idx_combination = best_idx_combination; result->score = best_fold_score; result->n_fold = n_fold; result->time = timer.getDuration(); + // // Update progress bar + // std::cout << get_color_rank(config_mpi.rank) << std::flush; } json store_result(std::vector& names, Task_Result& result, json& results) @@ -294,7 +331,7 @@ namespace platform { * "idx_dataset": idx_dataset, // used to identify the dataset in the results * // this index is relative to the list of used datasets in the actual run not to the whole datasets list * "seed": # of seed to use, - * "Fold": # of fold to process + * "fold": # of fold to process * } * * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold diff --git a/src/grid/GridSearch.h b/src/grid/GridSearch.h index c910202..3aedd44 100644 --- a/src/grid/GridSearch.h +++ b/src/grid/GridSearch.h @@ -8,6 +8,8 @@ #include "common/Timer.h" #include "main/HyperParameters.h" #include "GridData.h" +#include "bayesnet/network/Network.h" + namespace platform { using json = nlohmann::ordered_json; @@ -16,6 +18,7 @@ namespace platform { std::string score; std::string continue_from; std::string platform; + std::string smooth_strategy; bool quiet; bool only; // used with continue_from to only compute that dataset bool discretize; @@ -56,6 +59,7 @@ namespace platform { json build_tasks_mpi(int rank); Timer timer; // used to measure the time of the whole process const std::string separator = "|"; + bayesnet::Smoothing_t smooth_type{ bayesnet::Smoothing_t::NONE }; }; } /* namespace platform */ #endif \ No newline at end of file