diff --git a/src/commands/b_grid.cpp b/src/commands/b_grid.cpp
index 51ddc00..1a0281a 100644
--- a/src/commands/b_grid.cpp
+++ b/src/commands/b_grid.cpp
@@ -318,12 +318,13 @@ void experiment(argparse::ArgumentParser& program)
     }
     grid_experiment.go(mpi_config);
     if (mpi_config.rank == mpi_config.manager) {
-        auto results = grid_experiment.getResults();
-        //build_experiment_result(results);
-        std::cout << "****** RESULTS ********" << std::endl;
-        std::cout << results.dump(4) << std::endl;
-        // list_results(results, config.model);
-        std::cout << "Process took " << timer.getDurationString() << std::endl;
+        auto experiment = grid_experiment.getExperiment();
+        std::cout << "* Report of the computed hyperparameters" << std::endl;
+        auto duration = timer.getDuration();
+        experiment.setDuration(duration);
+        // experiment.report(grid_experiment.numFiles() == 1);
+        experiment.saveResult();
+        std::cout << "Process took " << duration << std::endl;
     }
     MPI_Finalize();
 }
diff --git a/src/grid/GridBase.cpp b/src/grid/GridBase.cpp
index 8b3bbbe..70ef323 100644
--- a/src/grid/GridBase.cpp
+++ b/src/grid/GridBase.cpp
@@ -46,6 +46,41 @@ namespace platform {
         }
         std::cout << separator << std::endl << separator << std::flush;
     }
+    json GridBase::build_tasks(Datasets& datasets)
+    {
+        /*
+        * Each task is a json object with the following structure:
+        * {
+        *   "dataset": "dataset_name",
+        *   "idx_dataset": idx_dataset, // used to identify the dataset in the results
+        *   // this index is relative to the list of used datasets in the actual run not to the whole datasets list
+        *   "seed": # of seed to use,
+        *   "fold": # of fold to process
+        * }
+        * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold
+        */
+        auto tasks = json::array();
+        auto grid = GridData(Paths::grid_input(config.model));
+        auto all_datasets = datasets.getNames();
+        auto datasets_names = filterDatasets(datasets);
+        for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) {
+            auto dataset = datasets_names[idx_dataset];
+            for (const auto& seed : config.seeds) {
+                auto combinations = grid.getGrid(dataset);
+                for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
+                    json task = {
+                        { "dataset", dataset },
+                        { "idx_dataset", idx_dataset},
+                        { "seed", seed },
+                        { "fold", n_fold},
+                    };
+                    tasks.push_back(task);
+                }
+            }
+        }
+        shuffle_and_progress_bar(tasks);
+        return tasks;
+    }
     void GridBase::summary(json& all_results, json& tasks, struct ConfigMPI& config_mpi)
     {
         // Report the tasks done by each worker, showing dataset number, seed, fold and time spent
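An individual task produced by GridBase::build_tasks() is therefore a small json object; for example (illustrative values only):

    { "dataset": "iris", "idx_dataset": 0, "seed": 271, "fold": 2 }

Each worker later processes, for its task, every hyperparameter combination defined for that dataset, seed and fold.
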
@@ -146,20 +181,21 @@
         Task_Result result;
         int tasks_size;
         MPI_Datatype MPI_Result;
-        MPI_Datatype type[10] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT };
-        int blocklen[10] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
-        MPI_Aint disp[10];
+        MPI_Datatype type[11] = { MPI_UNSIGNED, MPI_UNSIGNED, MPI_INT, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_INT, MPI_INT };
+        int blocklen[11] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+        MPI_Aint disp[11];
         disp[0] = offsetof(Task_Result, idx_dataset);
         disp[1] = offsetof(Task_Result, idx_combination);
         disp[2] = offsetof(Task_Result, n_fold);
         disp[3] = offsetof(Task_Result, score);
         disp[4] = offsetof(Task_Result, time);
-        disp[5] = offsetof(Task_Result, nodes);
-        disp[6] = offsetof(Task_Result, leaves);
-        disp[7] = offsetof(Task_Result, depth);
-        disp[8] = offsetof(Task_Result, process);
-        disp[9] = offsetof(Task_Result, task);
-        MPI_Type_create_struct(10, blocklen, disp, type, &MPI_Result);
+        disp[5] = offsetof(Task_Result, time_train);
+        disp[6] = offsetof(Task_Result, nodes);
+        disp[7] = offsetof(Task_Result, leaves);
+        disp[8] = offsetof(Task_Result, depth);
+        disp[9] = offsetof(Task_Result, process);
+        disp[10] = offsetof(Task_Result, task);
+        MPI_Type_create_struct(11, blocklen, disp, type, &MPI_Result);
         MPI_Type_commit(&MPI_Result);
         //
         // 0.2 Manager creates the tasks
diff --git a/src/grid/GridBase.h b/src/grid/GridBase.h
index bd65c5e..e496bca 100644
--- a/src/grid/GridBase.h
+++ b/src/grid/GridBase.h
@@ -22,7 +22,7 @@ namespace platform {
         void go(struct ConfigMPI& config_mpi);
         void validate_config();
     protected:
-        virtual json build_tasks(Datasets& datasets) = 0;
+        json build_tasks(Datasets& datasets);
         virtual void save(json& results) = 0;
         virtual std::vector<std::string> filterDatasets(Datasets& datasets) const = 0;
         virtual json initializeResults() = 0;
diff --git a/src/grid/GridConfig.h b/src/grid/GridConfig.h
index dbd8675..a9159f0 100644
--- a/src/grid/GridConfig.h
+++ b/src/grid/GridConfig.h
@@ -39,7 +39,8 @@ namespace platform {
         uint idx_combination;
         int n_fold;
         double score; // Experiment: Score test, no score train in this case
-        double time; // Experiment: Time train+test, no time train and/or time test in this case
+        double time; // Experiment: Time test
+        double time_train;
         double nodes; // Experiment specific
         double leaves; // Experiment specific
         double depth; // Experiment specific
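For reference, the 11-entry MPI datatype registered in GridBase.cpp above maps onto the Task_Result struct whose changed fields appear in this GridConfig.h hunk. Assembled from the offsetof() calls and MPI types in the diff, the full layout is presumably:

    // Sketch only: member names and order taken from the diff, not a verbatim copy of GridConfig.h
    struct Task_Result {
        uint idx_dataset;      // MPI_UNSIGNED
        uint idx_combination;  // MPI_UNSIGNED
        int n_fold;            // MPI_INT
        double score;          // MPI_DOUBLE, score on the test fold
        double time;           // MPI_DOUBLE, now test time only
        double time_train;     // MPI_DOUBLE, new field carrying the train time
        double nodes;          // MPI_DOUBLE
        double leaves;         // MPI_DOUBLE
        double depth;          // MPI_DOUBLE
        int process;           // MPI_INT
        int task;              // MPI_INT
    };
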
diff --git a/src/grid/GridExperiment.cpp b/src/grid/GridExperiment.cpp
index 9a3c6be..c273ece 100644
--- a/src/grid/GridExperiment.cpp
+++ b/src/grid/GridExperiment.cpp
@@ -123,46 +123,8 @@
     {
         return computed_results;
     }
-    json GridExperiment::build_tasks(Datasets& datasets)
-    {
-        /*
-        * Each task is a json object with the following structure:
-        * {
-        *   "dataset": "dataset_name",
-        *   "idx_dataset": idx_dataset, // used to identify the dataset in the results
-        *   // this index is relative to the list of used datasets in the actual run not to the whole datasets list
-        *   "seed": # of seed to use,
-        *   "fold": # of fold to process
-        * }
-        */
-        auto tasks = json::array();
-        auto all_datasets = datasets.getNames();
-        auto datasets_names = filterDatasets(datasets);
-        for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) {
-            auto dataset = datasets_names[idx_dataset];
-            for (const auto& seed : config.seeds) {
-                for (int n_fold = 0; n_fold < config.n_folds; n_fold++) {
-                    json task = {
-                        { "dataset", dataset },
-                        { "idx_dataset", idx_dataset},
-                        { "seed", seed },
-                        { "fold", n_fold},
-                    };
-                    tasks.push_back(task);
-                }
-            }
-        }
-        shuffle_and_progress_bar(tasks);
-        return tasks;
-    }
     std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
     {
-        // Load datasets
-        // auto datasets_names = datasets.getNames();
-        // datasets_names.clear();
-        // datasets_names.push_back("iris");
-        // datasets_names.push_back("wine");
-        // datasets_names.push_back("balance-scale");
         return filesToTest;
     }
     json GridExperiment::initializeResults()
@@ -172,25 +134,9 @@
     }
     void GridExperiment::save(json& results)
     {
-        // std::ofstream file(Paths::grid_output(config.model));
-        // json output = {
-        //     { "model", config.model },
-        //     { "score", config.score },
-        //     { "discretize", config.discretize },
-        //     { "stratified", config.stratified },
-        //     { "n_folds", config.n_folds },
-        //     { "seeds", config.seeds },
-        //     { "date", get_date() + " " + get_time()},
-        //     { "nested", config.nested},
-        //     { "platform", config.platform },
-        //     { "duration", timer.getDurationString(true)},
-        //     { "results", results }
-        // };
-        // file << output.dump(4);
     }
     void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
     {
-        results = json::array();
         auto datasets = Datasets(false, Paths::datasets());
         for (const auto& result_item : all_results.items()) {
             // each result has the results of all the outer folds as each one were a different task
@@ -199,52 +145,44 @@
             auto result = json::object();
             int data_size = data.size();
             auto score = torch::zeros({ data_size }, torch::kFloat64);
-            auto time_t = torch::zeros({ data_size }, torch::kFloat64);
+            auto score_train = torch::zeros({ data_size }, torch::kFloat64);
+            auto time_test = torch::zeros({ data_size }, torch::kFloat64);
+            auto time_train = torch::zeros({ data_size }, torch::kFloat64);
             auto nodes = torch::zeros({ data_size }, torch::kFloat64);
             auto leaves = torch::zeros({ data_size }, torch::kFloat64);
             auto depth = torch::zeros({ data_size }, torch::kFloat64);
+            auto& dataset = datasets.getDataset(dataset_name);
+            dataset.load();
+            //
+            // Prepare Result
+            //
+            auto partial_result = PartialResult();
+            partial_result.setSamples(dataset.getNSamples()).setFeatures(dataset.getNFeatures()).setClasses(dataset.getNClasses());
+            partial_result.setHyperparameters(experiment.getHyperParameters().get(dataset_name));
             for (int fold = 0; fold < data_size; ++fold) {
-                result["scores_test"].push_back(data[fold]["score"]);
+                partial_result.addScoreTest(data[fold]["score"]);
+                partial_result.addScoreTrain(0.0);
+                partial_result.addTimeTest(data[fold]["time"]);
+                partial_result.addTimeTrain(data[fold]["time_train"]);
                 score[fold] = data[fold]["score"].get<double>();
-                time_t[fold] = data[fold]["time"].get<double>();
+                time_test[fold] = data[fold]["time"].get<double>();
+                time_train[fold] = data[fold]["time_train"].get<double>();
                 nodes[fold] = data[fold]["nodes"].get<double>();
                 leaves[fold] = data[fold]["leaves"].get<double>();
                 depth[fold] = data[fold]["depth"].get<double>();
             }
-            double score_mean = torch::mean(score).item<double>();
-            double score_std = torch::std(score).item<double>();
-            double time_mean = torch::mean(time_t).item<double>();
-            double time_std = torch::std(time_t).item<double>();
-            double nodes_mean = torch::mean(nodes).item<double>();
-            double leaves_mean = torch::mean(leaves).item<double>();
-            double depth_mean = torch::mean(depth).item<double>();
-            auto& dataset = datasets.getDataset(dataset_name);
-            dataset.load();
-            result["samples"] = dataset.getNSamples();
-            result["features"] = dataset.getNFeatures();
-            result["classes"] = dataset.getNClasses();
-            result["hyperparameters"] = experiment.getHyperParameters().get(dataset_name);
-            result["score"] = score_mean;
-            result["score_std"] = score_std;
-            result["time"] = time_mean;
-            result["time_std"] = time_std;
-            result["nodes"] = nodes_mean;
-            result["leaves"] = leaves_mean;
-            result["depth"] = depth_mean;
-            result["dataset"] = dataset_name;
-            // Fixed data
-            result["scores_train"] = json::array();
-            result["times_train"] = json::array();
-            result["times_test"] = json::array();
-            result["train_time"] = 0.0;
-            result["train_time_std"] = 0.0;
-            result["test_time"] = 0.0;
-            result["test_time_std"] = 0.0;
-            result["score_train"] = 0.0;
-            result["score_train_std"] = 0.0;
-            result["confusion_matrices"] = json::array();
-            results.push_back(result);
+            partial_result.setGraph(std::vector<std::string>());
+            partial_result.setScoreTest(torch::mean(score).item<double>()).setScoreTrain(0.0);
+            partial_result.setScoreTestStd(torch::std(score).item<double>()).setScoreTrainStd(0.0);
+            partial_result.setTrainTime(torch::mean(time_train).item<double>()).setTestTime(torch::mean(time_test).item<double>());
+            partial_result.setTrainTimeStd(torch::std(time_train).item<double>()).setTestTimeStd(torch::std(time_test).item<double>());
+            partial_result.setNodes(torch::mean(nodes).item<double>()).setLeaves(torch::mean(leaves).item<double>()).setDepth(torch::mean(depth).item<double>());
+            partial_result.setDataset(dataset_name).setNotes(std::vector<std::string>());
+            partial_result.setConfusionMatrices(json::array());
+            experiment.addResult(partial_result);
         }
+        auto clf = Models::instance()->create(experiment.getModel());
+        experiment.setModelVersion(clf->getVersion());
         computed_results = results;
     }
     json GridExperiment::store_result(std::vector<std::string>& names, Task_Result& result, json& results)
@@ -254,6 +192,7 @@
             { "combination", result.idx_combination },
             { "fold", result.n_fold },
             { "time", result.time },
+            { "time_train", result.time_train },
             { "dataset", result.idx_dataset },
             { "nodes", result.nodes },
             { "leaves", result.leaves },
@@ -273,8 +212,7 @@
        //
        // initialize
        //
-        Timer timer;
-        timer.start();
+        Timer train_timer, test_timer;
        json task = tasks[n_task];
        auto model = config.model;
        auto dataset_name = task["dataset"].get<std::string>();
@@ -305,6 +243,7 @@
            fold = new folding::StratifiedKFold(config.n_folds, y, seed);
        else
            fold = new folding::KFold(config.n_folds, y.size(0), seed);
+        train_timer.start();
        auto [train, test] = fold->getFold(n_fold);
        auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
        auto states = dataset.getStates(); // Get the states of the features Once they are discretized
@@ -321,11 +260,14 @@
        // Train model
        //
        clf->fit(X_train, y_train, features, className, states, smooth);
+        auto train_time = train_timer.getDuration();
        //
        // Test model
        //
+        test_timer.start();
        double score = clf->score(X_test, y_test);
        delete fold;
+        auto test_time = test_timer.getDuration();
        //
        // Return the result
        //
@@ -333,7 +275,8 @@
        result->idx_combination = 0;
        result->score = score;
        result->n_fold = n_fold;
-        result->time = timer.getDuration();
+        result->time = test_time;
+        result->time_train = train_time;
        result->nodes = clf->getNumberOfNodes();
        result->leaves = clf->getNumberOfEdges();
        result->depth = clf->getNumberOfStates();
diff --git a/src/grid/GridExperiment.h b/src/grid/GridExperiment.h
index 2250766..f03da41 100644
--- a/src/grid/GridExperiment.h
+++ b/src/grid/GridExperiment.h
@@ -21,6 +21,8 @@
        explicit GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config);
        ~GridExperiment() = default;
        json getResults();
+        Experiment& getExperiment() { return experiment; }
+        size_t numFiles() const { return filesToTest.size(); }
    private:
        argparse::ArgumentParser& arguments;
        Experiment experiment;
@@ -28,7 +30,6 @@
        std::vector<std::string> filesToTest;
        void save(json& results);
        json initializeResults();
-        json build_tasks(Datasets& datasets);
        std::vector<std::string> filterDatasets(Datasets& datasets) const;
        void compile_results(json& results, json& all_results, std::string& model);
        json store_result(std::vector<std::string>& names, Task_Result& result, json& results);
"dataset": "dataset_name", - * "idx_dataset": idx_dataset, // used to identify the dataset in the results - * // this index is relative to the list of used datasets in the actual run not to the whole datasets list - * "seed": # of seed to use, - * "fold": # of fold to process - * } - * This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold - */ - auto tasks = json::array(); - auto grid = GridData(Paths::grid_input(config.model)); - auto all_datasets = datasets.getNames(); - auto datasets_names = filterDatasets(datasets); - for (int idx_dataset = 0; idx_dataset < datasets_names.size(); ++idx_dataset) { - auto dataset = datasets_names[idx_dataset]; - for (const auto& seed : config.seeds) { - auto combinations = grid.getGrid(dataset); - for (int n_fold = 0; n_fold < config.n_folds; n_fold++) { - json task = { - { "dataset", dataset }, - { "idx_dataset", idx_dataset}, - { "seed", seed }, - { "fold", n_fold}, - }; - tasks.push_back(task); - } - } - } - shuffle_and_progress_bar(tasks); - return tasks; - } std::vector GridSearch::filterDatasets(Datasets& datasets) const { // Load datasets diff --git a/src/grid/GridSearch.h b/src/grid/GridSearch.h index e05f9a7..6f8ab37 100644 --- a/src/grid/GridSearch.h +++ b/src/grid/GridSearch.h @@ -24,7 +24,6 @@ namespace platform { private: void save(json& results); json initializeResults(); - json build_tasks(Datasets& datasets); std::vector filterDatasets(Datasets& datasets) const; void compile_results(json& results, json& all_results, std::string& model); json store_result(std::vector& names, Task_Result& result, json& results); diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index 1809c41..f33a99f 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -9,6 +9,7 @@ namespace platform { void Experiment::saveResult() { + result.setSchemaVersion("1.0"); result.check(); result.save(); std::cout << "Result saved in " << Paths::results() << result.getFilename() << std::endl; diff --git a/src/main/Experiment.h b/src/main/Experiment.h index 9446c0e..838d80d 100644 --- a/src/main/Experiment.h +++ b/src/main/Experiment.h @@ -20,6 +20,7 @@ namespace platform { Experiment& setTitle(const std::string& title) { this->result.setTitle(title); return *this; } Experiment& setModelVersion(const std::string& model_version) { this->result.setModelVersion(model_version); return *this; } Experiment& setModel(const std::string& model) { this->result.setModel(model); return *this; } + std::string getModel() const { return result.getModel(); } Experiment& setLanguage(const std::string& language) { this->result.setLanguage(language); return *this; } Experiment& setDiscretizationAlgorithm(const std::string& discretization_algo) { diff --git a/src/manage/ManageScreen.cpp b/src/manage/ManageScreen.cpp index 6726e39..140dead 100644 --- a/src/manage/ManageScreen.cpp +++ b/src/manage/ManageScreen.cpp @@ -257,8 +257,9 @@ namespace platform { auto [index_from, index_to] = paginator[static_cast(output_type)].getOffset(); for (int i = index_from; i <= index_to; i++) { auto color = (i % 2) ? Colors::BLUE() : Colors::CYAN(); - std::cout << color << std::setw(3) << std::fixed << std::right << i << " "; - std::cout << results.at(i).to_string(maxModel, maxTitle) << std::endl; + auto color_status = results.at(i).check().size() == 0 ? 
diff --git a/src/reports/ReportConsole.cpp b/src/reports/ReportConsole.cpp
index 7420f58..9317b53 100644
--- a/src/reports/ReportConsole.cpp
+++ b/src/reports/ReportConsole.cpp
@@ -49,7 +49,8 @@ namespace platform {
        oss << "Execution took " << timer.translate2String(data["duration"].get<double>()) << " on " << data["platform"].get<std::string>()
            << " Language: " << data["language"].get<std::string>();
        sheader << headerLine(oss.str());
-        sheader << headerLine("Score is " + data["score_name"].get<std::string>());
+        std::string schema_version = data.find("schema_version") != data.end() ? data["schema_version"].get<std::string>() : "-";
+        sheader << headerLine("Score is " + data["score_name"].get<std::string>() + " Schema version: " + schema_version);
        sheader << std::string(MAXL, '*') << std::endl;
        sheader << std::endl;
    }
@@ -250,7 +251,7 @@
            if (train_data) {
                oss << color_line << std::left << std::setw(maxLine) << output_train[i] << suffix << Colors::BLUE() << " | "
                    << color_line << std::left << std::setw(maxLine)
-                     << output_test[i] << std::endl;
+                    << output_test[i] << std::endl;
            } else {
                oss << color_line << output_test[i] << std::endl;
            }
diff --git a/src/results/Result.cpp b/src/results/Result.cpp
index 9cde61a..c143874 100644
--- a/src/results/Result.cpp
+++ b/src/results/Result.cpp
@@ -64,18 +64,10 @@
    {
        return data;
    }
-    void Result::check()
+    std::vector<std::string> Result::check()
    {
        platform::JsonValidator validator(platform::SchemaV1_0::schema);
-        data["schema_version"] = "1.0";
-        std::vector<std::string> errors = validator.validate(data);
-        if (!errors.empty()) {
-            std::string message;
-            for (const auto& error : errors) {
-                message += " - " + error + "\n";
-            }
-            throw std::runtime_error("* Result file has validation errors:\n" + message);
-        }
+        return validator.validate(data);
    }
    void Result::save()
    {
diff --git a/src/results/Result.h b/src/results/Result.h
index 7d7c2c6..78bfd62 100644
--- a/src/results/Result.h
+++ b/src/results/Result.h
@@ -16,7 +16,7 @@ namespace platform {
        Result();
        Result& load(const std::string& path, const std::string& filename);
        void save();
-        void check();
+        std::vector<std::string> check();
        // Getters
        json getJson();
        std::string to_string(int maxModel, int maxTitle) const;
@@ -29,7 +29,7 @@
        std::string getModel() const { return data["model"].get<std::string>(); };
        std::string getPlatform() const { return data["platform"].get<std::string>(); };
        std::string getScoreName() const { return data["score_name"].get<std::string>(); };
-
+        void setSchemaVersion(const std::string& version) { data["schema_version"] = version; };
        bool isComplete() const { return complete; };
        json getData() const { return data; }
        // Setters