Continue with grid experiment

This commit is contained in:
2025-01-17 10:39:56 +01:00
parent 9a9a9fb17a
commit c1d5dd74e3
12 changed files with 238 additions and 85 deletions

View File

@@ -29,11 +29,13 @@ add_executable(
target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy "${XLSXWRITER_LIB}")
# b_grid
set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp)
set(grid_sources GridSearch.cpp GridData.cpp GridExperiment.cpp GridBase.cpp )
list(TRANSFORM grid_sources PREPEND grid/)
add_executable(b_grid commands/b_grid.cpp ${grid_sources}
common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
main/HyperParameters.cpp main/Models.cpp
main/HyperParameters.cpp main/Models.cpp main/Experiment.cpp main/Scores.cpp
reports/ReportConsole.cpp reports/ReportBase.cpp
results/Result.cpp
)
target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" fimdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy)

View File

@@ -36,22 +36,22 @@ void add_experiment_args(argparse::ArgumentParser& program)
{
auto env = platform::DotEnv();
auto datasets = platform::Datasets(false, platform::Paths::datasets());
// auto& group = program.add_mutually_exclusive_group(true);
// group.add_argument("-d", "--dataset")
// .help("Dataset file name: " + datasets.toString())
// .default_value("all")
// .action([](const std::string& value) {
// auto datasets = platform::Datasets(false, platform::Paths::datasets());
// static std::vector<std::string> choices_datasets(datasets.getNames());
// choices_datasets.push_back("all");
// if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) {
// return value;
// }
// throw std::runtime_error("Dataset must be one of: " + datasets.toString());
// }
// );
// group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector<std::string>());
// group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test.");
auto& group = program.add_mutually_exclusive_group(true);
group.add_argument("-d", "--dataset")
.help("Dataset file name: " + datasets.toString())
.default_value("all")
.action([](const std::string& value) {
auto datasets = platform::Datasets(false, platform::Paths::datasets());
static std::vector<std::string> choices_datasets(datasets.getNames());
choices_datasets.push_back("all");
if (find(choices_datasets.begin(), choices_datasets.end(), value) != choices_datasets.end()) {
return value;
}
throw std::runtime_error("Dataset must be one of: " + datasets.toString());
}
);
group.add_argument("--datasets").nargs(1, 50).help("Datasets file names 1..50 separated by spaces").default_value(std::vector<std::string>());
group.add_argument("--datasets-file").default_value("").help("Datasets file name. Mutually exclusive with dataset. This file should contain a list of datasets to test.");
program.add_argument("--hyperparameters").default_value("{}").help("Hyperparameters passed to the model in Experiment");
program.add_argument("--hyper-file").default_value("").help("Hyperparameters file name." \
"Mutually exclusive with hyperparameters. This file should contain hyperparameters for each dataset in json format.");
@@ -83,11 +83,6 @@ void add_experiment_args(argparse::ArgumentParser& program)
for (auto choice : valid_choices) {
score_arg.choices(choice);
}
program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true);
program.add_argument("--graph").help("generate graphviz dot files with the model").default_value(false).implicit_value(true);
program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true);
program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true);
program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const std::string& value) {
try {
@@ -307,19 +302,10 @@ void search(argparse::ArgumentParser& program)
void experiment(argparse::ArgumentParser& program)
{
struct platform::ConfigGrid config;
config.model = program.get<std::string>("model");
config.score = program.get<std::string>("score");
config.discretize = program.get<bool>("discretize");
config.stratified = program.get<bool>("stratified");
config.smooth_strategy = program.get<std::string>("smooth-strat");
config.n_folds = program.get<int>("folds");
config.quiet = program.get<bool>("quiet");
config.seeds = program.get<std::vector<int>>("seeds");
auto env = platform::DotEnv();
config.platform = env.get("platform");
platform::Paths::createPath(platform::Paths::grid());
auto grid_experiment = platform::GridExperiment(config);
auto grid_experiment = platform::GridExperiment(program, config);
platform::Timer timer;
timer.start();
struct platform::ConfigMPI mpi_config;
@@ -333,6 +319,7 @@ void experiment(argparse::ArgumentParser& program)
grid_experiment.go(mpi_config);
if (mpi_config.rank == mpi_config.manager) {
auto results = grid_experiment.getResults();
//build_experiment_result(results);
std::cout << "****** RESULTS ********" << std::endl;
std::cout << results.dump(4) << std::endl;
// list_results(results, config.model);

View File

@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
int n_errors = 0;
std::vector<std::string> files_with_errors;
for (const auto& file_name : result_files) {
std::vector<std::string> errors = validator.validate(file_name);
std::vector<std::string> errors = validator.validate_file(file_name);
if (!errors.empty()) {
n_errors++;
std::cout << std::setw(max_length) << std::left << file_name << ": " << errors.size() << " Errors:" << std::endl;

View File

@@ -9,6 +9,10 @@ namespace platform {
// Stores a copy of the caller's grid configuration in this->config.
GridBase::GridBase(struct ConfigGrid& config) : config(config)
{
}
void GridBase::validate_config()
{
if (config.smooth_strategy == "ORIGINAL")
smooth_type = bayesnet::Smoothing_t::ORIGINAL;
else if (config.smooth_strategy == "LAPLACE")
@@ -116,7 +120,7 @@ namespace platform {
* Each task is a json object with the data needed by the process
*
* The overall process consists in these steps:
* 0. Create the MPI result type & tasks
* 0. Validate config, create the MPI result type & tasks
* 0.1 Create the MPI result type
* 0.2 Manager creates the tasks
* 1. Manager will broadcast the tasks to all the processes
@@ -138,6 +142,7 @@ namespace platform {
//
// 0.1 Create the MPI result type
//
validate_config();
Task_Result result;
int tasks_size;
MPI_Datatype MPI_Result;

View File

@@ -20,6 +20,7 @@ namespace platform {
explicit GridBase(struct ConfigGrid& config);
~GridBase() = default;
void go(struct ConfigMPI& config_mpi);
void validate_config();
protected:
virtual json build_tasks(Datasets& datasets) = 0;
virtual void save(json& results) = 0;

View File

@@ -8,8 +8,116 @@
#include "GridExperiment.h"
namespace platform {
GridExperiment::GridExperiment(struct ConfigGrid& config) : GridBase(config)
/**
 * Builds the experiment from the parsed command line.
 *
 * Reads the dataset selection ("dataset", "datasets" or "datasets-file"), the model and
 * validation settings and the hyperparameters source, resolves the list of datasets to
 * test (filesToTest), and fills both the grid config and the Experiment object.
 *
 * @param program parsed command-line arguments; a reference is kept in `arguments`
 * @param config  grid configuration forwarded to GridBase
 * @throws std::runtime_error    if inline hyperparameters and a hyperparameters file are both
 *                               given, or if no title is supplied when every dataset is tested
 * @throws std::invalid_argument if the datasets catalog file cannot be opened
 *
 * An unknown dataset name prints a message to stderr and terminates the process with exit(1).
 */
GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : GridBase(config), arguments(program)
{
    // Dataset selection
    auto file_name = program.get<std::string>("dataset");
    auto file_names = program.get<std::vector<std::string>>("datasets");
    auto datasets_file = program.get<std::string>("datasets-file");
    // Model and validation settings
    auto model_name = program.get<std::string>("model");
    bool discretize_dataset = program.get<bool>("discretize");
    auto discretize_algo = program.get<std::string>("discretize-algo");
    auto smooth_strat = program.get<std::string>("smooth-strat");
    bool stratified = program.get<bool>("stratified");
    int n_folds = program.get<int>("folds");
    auto score = program.get<std::string>("score");
    auto seeds = program.get<std::vector<int>>("seeds");
    // Hyperparameters source
    auto hyperparameters = program.get<std::string>("hyperparameters");
    json hyperparameters_json = json::parse(hyperparameters);
    auto hyperparameters_file = program.get<std::string>("hyper-file");
    bool hyper_best = program.get<bool>("hyper-best");
    if (hyper_best) {
        // The best results file takes over; any inline hyperparameters are ignored
        hyperparameters_file = platform::Paths::results() + platform::Paths::bestResultsFile(score, model_name);
    } else if (!hyperparameters_file.empty() && hyperparameters != "{}") {
        throw std::runtime_error("hyperparameters and hyper_file are mutually exclusive");
    }
    auto title = program.get<std::string>("title");
    // "dataset" keeps its default value "all" when --datasets or --datasets-file is used, so the
    // title is only mandatory when none of them narrowed the selection. Otherwise a default
    // title is generated below.
    const bool test_all_datasets = datasets_file.empty() && file_names.empty() && file_name == "all";
    if (title.empty() && test_all_datasets) {
        throw std::runtime_error("title is mandatory if all datasets are to be tested");
    }
    auto datasets = platform::Datasets(false, platform::Paths::datasets());
    // Aborts the process if the name is not a known dataset
    const auto require_dataset = [&datasets](const std::string& name) {
        if (!datasets.isDataset(name)) {
            std::cerr << "Dataset " << name << " not found" << std::endl;
            exit(1);
        }
    };
    if (!datasets_file.empty()) {
        // One dataset name per line; empty lines and lines starting with '#' are skipped
        std::ifstream catalog(datasets_file);
        if (!catalog.is_open()) {
            throw std::invalid_argument("Unable to open catalog file. [" + datasets_file + "]");
        }
        std::string line;
        while (std::getline(catalog, line)) {
            if (line.empty() || line[0] == '#') {
                continue;
            }
            require_dataset(line);
            filesToTest.push_back(line);
        }
        if (title.empty()) {
            title = "Test " + std::to_string(filesToTest.size()) + " datasets (" + datasets_file + ") "
                + model_name + " " + std::to_string(n_folds) + " folds";
        }
    } else if (!file_names.empty()) {
        for (const auto& name : file_names) {
            require_dataset(name);
        }
        filesToTest = file_names;
        if (title.empty()) {
            title = "Test " + std::to_string(file_names.size()) + " datasets " + model_name + " " + std::to_string(n_folds) + " folds";
        }
    } else if (file_name != "all") {
        require_dataset(file_name);
        if (title.empty()) {
            title = "Test " + file_name + " " + model_name + " " + std::to_string(n_folds) + " folds";
        }
        filesToTest.push_back(file_name);
    } else {
        filesToTest = datasets.getNames();
    }
    platform::HyperParameters test_hyperparams;
    if (!hyperparameters_file.empty()) {
        test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file, hyper_best);
    } else {
        test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json);
    }
    // Grid configuration (GridBase already copied the caller's config)
    // NOTE(review): config.quiet and config.platform are not assigned here; confirm nothing
    // downstream relies on them being set from the command line / environment.
    this->config.model = model_name;
    this->config.score = score;
    this->config.discretize = discretize_dataset;
    this->config.stratified = stratified;
    this->config.smooth_strategy = smooth_strat;
    this->config.n_folds = n_folds;
    this->config.seeds = seeds;
    // Experiment metadata used when reporting the results
    auto env = platform::DotEnv();
    experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1");
    experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat);
    experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
    experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score);
    experiment.setHyperparameters(test_hyperparams);
    for (auto seed : seeds) {
        experiment.addRandomSeed(seed);
    }
}
json GridExperiment::getResults()
{
@@ -25,9 +133,7 @@ namespace platform {
* // this index is relative to the list of used datasets in the actual run not to the whole datasets list
* "seed": # of seed to use,
* "fold": # of fold to process
* "hyperpameters": json object with the hyperparameters to use
* }
* This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold
*/
auto tasks = json::array();
auto all_datasets = datasets.getNames();
@@ -41,7 +147,6 @@ namespace platform {
{ "idx_dataset", idx_dataset},
{ "seed", seed },
{ "fold", n_fold},
{ "hyperparameters", json::object() }
};
tasks.push_back(task);
}
@@ -53,10 +158,12 @@ namespace platform {
/**
 * Returns the names of the datasets that take part in this run.
 *
 * The selection is the list stored in filesToTest, which the constructor fills from the
 * command line. The `datasets` argument is not consulted.
 *
 * @param datasets catalog of available datasets (unused)
 * @return copy of filesToTest
 */
std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
{
    return filesToTest;
}
json GridExperiment::initializeResults()
{
@@ -83,17 +190,60 @@ namespace platform {
}
/**
 * Collapses the per-fold task results into one summary object per dataset.
 *
 * @param results     output; replaced by a json array with one object per dataset
 * @param all_results input; object keyed by dataset name whose values hold the per-fold
 *                    results (each with "score", "time", "nodes", "leaves" and "depth")
 * @param model       not used by this implementation
 *
 * The compiled array is also stored in computed_results.
 */
void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
{
    results = json::array();
    auto datasets = Datasets(false, Paths::datasets());
    for (const auto& result_item : all_results.items()) {
        // each result has the results of all the outer folds as each one were a different task
        auto dataset_name = result_item.key();
        const auto& data = result_item.value();
        const int data_size = static_cast<int>(data.size());
        auto result = json::object();
        // Created up front so the key exists (and stays first) even without fold results
        result["scores_test"] = json::array();
        auto score = torch::zeros({ data_size }, torch::kFloat64);
        auto times = torch::zeros({ data_size }, torch::kFloat64);
        auto nodes = torch::zeros({ data_size }, torch::kFloat64);
        auto leaves = torch::zeros({ data_size }, torch::kFloat64);
        auto depth = torch::zeros({ data_size }, torch::kFloat64);
        int fold = 0;
        for (const auto& item : data) {
            result["scores_test"].push_back(item.at("score"));
            score[fold] = item.at("score").get<double>();
            times[fold] = item.at("time").get<double>();
            nodes[fold] = item.at("nodes").get<double>();
            leaves[fold] = item.at("leaves").get<double>();
            depth[fold] = item.at("depth").get<double>();
            ++fold;
        }
        // The mean of an empty tensor and the (unbiased) std of fewer than two values are NaN;
        // report 0.0 instead so the result stays numeric.
        const auto mean_of = [data_size](const torch::Tensor& values) {
            return data_size > 0 ? torch::mean(values).item<double>() : 0.0;
        };
        const auto std_of = [data_size](const torch::Tensor& values) {
            return data_size > 1 ? torch::std(values).item<double>() : 0.0;
        };
        auto& dataset = datasets.getDataset(dataset_name);
        dataset.load();
        result["samples"] = dataset.getNSamples();
        result["features"] = dataset.getNFeatures();
        result["classes"] = dataset.getNClasses();
        result["hyperparameters"] = experiment.getHyperParameters().get(dataset_name);
        result["score"] = mean_of(score);
        result["score_std"] = std_of(score);
        result["time"] = mean_of(times);
        result["time_std"] = std_of(times);
        result["nodes"] = mean_of(nodes);
        result["leaves"] = mean_of(leaves);
        result["depth"] = mean_of(depth);
        result["dataset"] = dataset_name;
        // Fixed data
        result["scores_train"] = json::array();
        result["times_train"] = json::array();
        result["times_test"] = json::array();
        result["train_time"] = 0.0;
        result["train_time_std"] = 0.0;
        result["test_time"] = 0.0;
        result["test_time_std"] = 0.0;
        result["score_train"] = 0.0;
        result["score_train_std"] = 0.0;
        result["confusion_matrices"] = json::array();
        results.push_back(result);
    }
    computed_results = results;
}
@@ -164,7 +314,7 @@ namespace platform {
//
auto clf = Models::instance()->create(config.model);
auto valid = clf->getValidHyperparameters();
auto hyperparameters = platform::HyperParameters(datasets.getNames(), task["hyperparameters"]);
auto hyperparameters = experiment.getHyperParameters();
hyperparameters.check(valid, dataset_name);
clf->setHyperparameters(hyperparameters.get(dataset_name));
//

View File

@@ -3,8 +3,11 @@
#include <string>
#include <map>
#include <mpi.h>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "common/Datasets.h"
#include "common/DotEnv.h"
#include "main/Experiment.h"
#include "main/HyperParameters.h"
#include "GridData.h"
#include "GridBase.h"
@@ -15,11 +18,14 @@ namespace platform {
using json = nlohmann::ordered_json;
class GridExperiment : public GridBase {
public:
explicit GridExperiment(struct ConfigGrid& config);
explicit GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config);
~GridExperiment() = default;
json getResults();
private:
argparse::ArgumentParser& arguments;
Experiment experiment;
json computed_results;
std::vector<std::string> filesToTest;
void save(json& results);
json initializeResults();
json build_tasks(Datasets& datasets);

View File

@@ -9,6 +9,7 @@ namespace platform {
// Validates the result, writes it to disk and reports where it was stored.
void Experiment::saveResult()
{
    // check() runs before save(), so a failed validation prevents the write
    result.check();
    result.save();
    const auto location = Paths::results() + result.getFilename();
    std::cout << "Result saved in " << location << std::endl;
}

View File

@@ -48,6 +48,7 @@ namespace platform {
/// Appends a seed to randomSeeds and registers it in the result as well.
Experiment& addRandomSeed(int randomSeed)
{
    randomSeeds.push_back(randomSeed);
    result.addSeed(randomSeed);
    return *this;
}
/// Forwards the duration to the result; returns *this for chaining.
Experiment& setDuration(float duration)
{
    result.setDuration(duration);
    return *this;
}
/// Replaces the stored hyperparameters with a copy of the given ones; returns *this for chaining.
Experiment& setHyperparameters(const HyperParameters& hyperparameters_)
{
    hyperparameters = hyperparameters_;
    return *this;
}
/// Gives mutable access to the stored hyperparameters.
HyperParameters& getHyperParameters()
{
    return this->hyperparameters;
}
void cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files, bool graph);
void go(std::vector<std::string> filesToProcess, bool quiet, bool no_train_score, bool generate_fold_files, bool graph);
void saveResult();

View File

@@ -11,47 +11,33 @@ namespace platform {
public:
JsonValidator(const json& schema) : schema(schema) {}
std::vector<std::string> validate(const std::string& fileName)
std::vector<std::string> validate_file(const std::string& fileName)
{
auto data = load_json_file(fileName);
return validate(data);
}
/**
 * Validates an in-memory JSON document against the schema.
 *
 * @param data document to validate
 * @return list of validation errors; empty if the document is valid
 */
std::vector<std::string> validate(const json& data)
{
    std::vector<std::string> errors;
    // Validate the top-level object
    validateObject("", schema, data, errors);
    return errors;
}
void fix_it(const std::string& fileName)
/**
 * Reads and parses a JSON file.
 *
 * @param fileName path of the file to read
 * @return the parsed document
 * @throws std::runtime_error if the file cannot be opened
 *
 * Parsing errors raised while reading the stream are not caught here.
 */
json load_json_file(const std::string& fileName)
{
    std::ifstream file(fileName);
    if (!file.is_open()) {
        throw std::runtime_error("Error: Unable to open file " + fileName);
    }
    json data;
    file >> data;
    // The stream is closed when `file` goes out of scope
    return data;
}
void fix_it(const std::string& fileName)
{
// Load JSON file
auto data = load_json_file(fileName);
// Fix fields
for (const auto& [key, value] : schema["properties"].items()) {
if (!data.contains(key)) {
@@ -77,7 +63,6 @@ namespace platform {
std::cerr << "Error: Unable to open file for writing." << std::endl;
return;
}
outFile << data.dump(4);
outFile.close();
}

View File

@@ -8,6 +8,8 @@
#include "common/Paths.h"
#include "common/Symbols.h"
#include "Result.h"
#include "JsonValidator.h"
#include "SchemaV1_0.h"
namespace platform {
std::string get_actual_date()
@@ -62,7 +64,19 @@ namespace platform {
{
return data;
}
// Stamps the result with schema version "1.0" and validates it against SchemaV1_0.
// Throws std::runtime_error listing every validation error, one per line.
void Result::check()
{
    data["schema_version"] = "1.0";
    platform::JsonValidator validator(platform::SchemaV1_0::schema);
    const auto errors = validator.validate(data);
    if (errors.empty()) {
        return;
    }
    std::string message;
    for (const auto& error : errors) {
        message.append(" - ").append(error).append("\n");
    }
    throw std::runtime_error("* Result file has validation errors:\n" + message);
}
void Result::save()
{
std::ofstream file(Paths::results() + getFilename());

View File

@@ -16,6 +16,7 @@ namespace platform {
Result();
Result& load(const std::string& path, const std::string& filename);
void save();
void check();
// Getters
json getJson();
std::string to_string(int maxModel, int maxTitle) const;