|
|
|
@@ -8,8 +8,116 @@
|
|
|
|
|
#include "GridExperiment.h"
|
|
|
|
|
|
|
|
|
|
namespace platform {
|
|
|
|
|
GridExperiment::GridExperiment(struct ConfigGrid& config) : GridBase(config)
|
|
|
|
|
GridExperiment::GridExperiment(argparse::ArgumentParser& program, struct ConfigGrid& config) : GridBase(config), arguments(program)
{
    // Build the experiment definition from the parsed command-line arguments:
    // selects the datasets to test (single dataset, explicit list, catalog
    // file, or all), loads/validates the hyperparameters source, and fills in
    // both this->config and the experiment metadata.
    //
    // NOTE: the base/member initializers above are listed in actual
    // initialization order (base GridBase first, then member arguments) to
    // avoid a -Wreorder warning; behavior is unchanged.
    std::string file_name, model_name, title, hyperparameters_file, datasets_file, discretize_algo, smooth_strat, score;
    json hyperparameters_json;
    bool discretize_dataset, stratified, hyper_best;
    std::vector<int> seeds;
    std::vector<std::string> file_names;
    int n_folds;
    file_name = program.get<std::string>("dataset");
    file_names = program.get<std::vector<std::string>>("datasets");
    datasets_file = program.get<std::string>("datasets-file");
    model_name = program.get<std::string>("model");
    discretize_dataset = program.get<bool>("discretize");
    discretize_algo = program.get<std::string>("discretize-algo");
    smooth_strat = program.get<std::string>("smooth-strat");
    stratified = program.get<bool>("stratified");
    n_folds = program.get<int>("folds");
    score = program.get<std::string>("score");
    seeds = program.get<std::vector<int>>("seeds");
    auto hyperparameters = program.get<std::string>("hyperparameters");
    hyperparameters_json = json::parse(hyperparameters);
    hyperparameters_file = program.get<std::string>("hyper-file");
    hyper_best = program.get<bool>("hyper-best");
    if (hyper_best) {
        // Build the best results file_name
        hyperparameters_file = platform::Paths::results() + platform::Paths::bestResultsFile(score, model_name);
        // ignore this parameter
        hyperparameters = "{}";
    } else {
        // An inline hyperparameters JSON and a hyperparameters file cannot be
        // combined: it would be ambiguous which one wins.
        if (hyperparameters_file != "" && hyperparameters != "{}") {
            throw runtime_error("hyperparameters and hyper_file are mutually exclusive");
        }
    }
    title = program.get<std::string>("title");
    if (title == "" && file_name == "all") {
        throw runtime_error("title is mandatory if all datasets are to be tested");
    }
    auto datasets = platform::Datasets(false, platform::Paths::datasets());
    if (datasets_file != "") {
        // Catalog file: one dataset name per line; blank lines and lines
        // starting with '#' are ignored.
        ifstream catalog(datasets_file);
        if (catalog.is_open()) {
            std::string line;
            while (getline(catalog, line)) {
                if (line.empty() || line[0] == '#') {
                    continue;
                }
                if (!datasets.isDataset(line)) {
                    cerr << "Dataset " << line << " not found" << std::endl;
                    exit(1);
                }
                filesToTest.push_back(line);
            }
            catalog.close();
            if (title == "") {
                // (fixed: dropped the backslash line-continuation inside the
                // expression; the '+' already continues the statement)
                title = "Test " + to_string(filesToTest.size()) + " datasets (" + datasets_file + ") "
                    + model_name + " " + to_string(n_folds) + " folds";
            }
        } else {
            throw std::invalid_argument("Unable to open catalog file. [" + datasets_file + "]");
        }
    } else {
        if (file_names.size() > 0) {
            // Explicit list of datasets on the command line; validate each.
            for (const auto& file : file_names) { // const& avoids copying each name
                if (!datasets.isDataset(file)) {
                    cerr << "Dataset " << file << " not found" << std::endl;
                    exit(1);
                }
            }
            filesToTest = file_names;
            if (title == "") {
                title = "Test " + to_string(file_names.size()) + " datasets " + model_name + " " + to_string(n_folds) + " folds";
            }
        } else {
            if (file_name != "all") {
                // Single dataset selected.
                if (!datasets.isDataset(file_name)) {
                    cerr << "Dataset " << file_name << " not found" << std::endl;
                    exit(1);
                }
                if (title == "") {
                    title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds";
                }
                filesToTest.push_back(file_name);
            } else {
                // "all": run over every known dataset.
                filesToTest = datasets.getNames();
            }
        }
    }

    // Hyperparameters: file (possibly the generated best-results file) takes
    // precedence over the inline JSON object.
    platform::HyperParameters test_hyperparams;
    if (hyperparameters_file != "") {
        test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_file, hyper_best);
    } else {
        test_hyperparams = platform::HyperParameters(datasets.getNames(), hyperparameters_json);
    }
    this->config.model = model_name;
    this->config.score = score;
    this->config.discretize = discretize_dataset;
    this->config.stratified = stratified;
    this->config.smooth_strategy = smooth_strat;
    this->config.n_folds = n_folds;
    this->config.seeds = seeds;
    auto env = platform::DotEnv();
    experiment.setTitle(title).setLanguage("c++").setLanguageVersion("gcc 14.1.1");
    experiment.setDiscretizationAlgorithm(discretize_algo).setSmoothSrategy(smooth_strat);
    experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
    experiment.setStratified(stratified).setNFolds(n_folds).setScoreName(score);
    experiment.setHyperparameters(test_hyperparams);
    for (auto seed : seeds) {
        experiment.addRandomSeed(seed);
    }
}
|
|
|
|
|
json GridExperiment::getResults()
|
|
|
|
|
{
|
|
|
|
@@ -25,9 +133,7 @@ namespace platform {
|
|
|
|
|
* // this index is relative to the list of used datasets in the actual run not to the whole datasets list
|
|
|
|
|
* "seed": # of seed to use,
|
|
|
|
|
* "fold": # of fold to process
|
|
|
|
|
* "hyperpameters": json object with the hyperparameters to use
|
|
|
|
|
* }
|
|
|
|
|
* This way a task consists in process all combinations of hyperparameters for a dataset, seed and fold
|
|
|
|
|
*/
|
|
|
|
|
auto tasks = json::array();
|
|
|
|
|
auto all_datasets = datasets.getNames();
|
|
|
|
@@ -41,7 +147,6 @@ namespace platform {
|
|
|
|
|
{ "idx_dataset", idx_dataset},
|
|
|
|
|
{ "seed", seed },
|
|
|
|
|
{ "fold", n_fold},
|
|
|
|
|
{ "hyperparameters", json::object() }
|
|
|
|
|
};
|
|
|
|
|
tasks.push_back(task);
|
|
|
|
|
}
|
|
|
|
@@ -53,10 +158,12 @@ namespace platform {
|
|
|
|
|
std::vector<std::string> GridExperiment::filterDatasets(Datasets& datasets) const
{
    // Return the datasets selected on the command line (computed once in the
    // constructor and stored in filesToTest).
    //
    // NOTE(review): the previous version short-circuited with a hard-coded
    // { "iris" } list (plus commented-out variants) — leftover debugging code
    // that made the real `return filesToTest;` unreachable. Removed so the
    // user's dataset selection is honored again.
    return filesToTest;
}
|
|
|
|
|
json GridExperiment::initializeResults()
|
|
|
|
|
{
|
|
|
|
@@ -83,17 +190,60 @@ namespace platform {
|
|
|
|
|
}
|
|
|
|
|
void GridExperiment::compile_results(json& results, json& all_results, std::string& model)
{
    // Aggregate the per-fold task results of each dataset into one summary
    // record per dataset (mean/std of score, time, and model complexity),
    // producing a json array compatible with a regular experiment report.
    //
    // NOTE(review): removed the stale pre-refactor statements that were left
    // interleaved here (results = json::object(); loop over `result`;
    // per-fold pre-fill of results[dataset]) — they conflicted with the
    // current array-based aggregation below and referenced undeclared names.
    results = json::array();
    auto datasets = Datasets(false, Paths::datasets());
    for (const auto& result_item : all_results.items()) {
        // each result has the results of all the outer folds as each one were a different task
        auto dataset_name = result_item.key();
        auto data = result_item.value();
        auto result = json::object();
        int data_size = data.size();
        // One tensor per metric so mean/std can be computed in one call each.
        auto score = torch::zeros({ data_size }, torch::kFloat64);
        auto time_t = torch::zeros({ data_size }, torch::kFloat64);
        auto nodes = torch::zeros({ data_size }, torch::kFloat64);
        auto leaves = torch::zeros({ data_size }, torch::kFloat64);
        auto depth = torch::zeros({ data_size }, torch::kFloat64);
        for (int fold = 0; fold < data_size; ++fold) {
            result["scores_test"].push_back(data[fold]["score"]);
            score[fold] = data[fold]["score"].get<double>();
            time_t[fold] = data[fold]["time"].get<double>();
            nodes[fold] = data[fold]["nodes"].get<double>();
            leaves[fold] = data[fold]["leaves"].get<double>();
            depth[fold] = data[fold]["depth"].get<double>();
        }
        double score_mean = torch::mean(score).item<double>();
        double score_std = torch::std(score).item<double>();
        double time_mean = torch::mean(time_t).item<double>();
        double time_std = torch::std(time_t).item<double>();
        double nodes_mean = torch::mean(nodes).item<double>();
        double leaves_mean = torch::mean(leaves).item<double>();
        double depth_mean = torch::mean(depth).item<double>();
        // Dataset metadata (samples/features/classes) requires loading it.
        auto& dataset = datasets.getDataset(dataset_name);
        dataset.load();
        result["samples"] = dataset.getNSamples();
        result["features"] = dataset.getNFeatures();
        result["classes"] = dataset.getNClasses();
        result["hyperparameters"] = experiment.getHyperParameters().get(dataset_name);
        result["score"] = score_mean;
        result["score_std"] = score_std;
        result["time"] = time_mean;
        result["time_std"] = time_std;
        result["nodes"] = nodes_mean;
        result["leaves"] = leaves_mean;
        result["depth"] = depth_mean;
        result["dataset"] = dataset_name;
        // Fixed data: fields required by the report format that a grid
        // experiment does not measure; filled with neutral values.
        result["scores_train"] = json::array();
        result["times_train"] = json::array();
        result["times_test"] = json::array();
        result["train_time"] = 0.0;
        result["train_time_std"] = 0.0;
        result["test_time"] = 0.0;
        result["test_time_std"] = 0.0;
        result["score_train"] = 0.0;
        result["score_train_std"] = 0.0;
        result["confusion_matrices"] = json::array();
        results.push_back(result);
    }
    // Keep a copy for later retrieval (e.g. getResults()).
    computed_results = results;
}
|
|
|
|
@@ -164,7 +314,7 @@ namespace platform {
|
|
|
|
|
//
|
|
|
|
|
auto clf = Models::instance()->create(config.model);
|
|
|
|
|
auto valid = clf->getValidHyperparameters();
|
|
|
|
|
auto hyperparameters = platform::HyperParameters(datasets.getNames(), task["hyperparameters"]);
|
|
|
|
|
auto hyperparameters = experiment.getHyperParameters();
|
|
|
|
|
hyperparameters.check(valid, dataset_name);
|
|
|
|
|
clf->setHyperparameters(hyperparameters.get(dataset_name));
|
|
|
|
|
//
|
|
|
|
|