diff --git a/lib/argparse b/lib/argparse index f0759fd..eab1d75 160000 --- a/lib/argparse +++ b/lib/argparse @@ -1 +1 @@ -Subproject commit f0759fd982bb4a88785094626ea522cb3a84ec84 +Subproject commit eab1d75e49970857eba1fdef5afb68befa2fa16f diff --git a/lib/libxlsxwriter b/lib/libxlsxwriter index 284b61b..c89c551 160000 --- a/lib/libxlsxwriter +++ b/lib/libxlsxwriter @@ -1 +1 @@ -Subproject commit 284b61ba0b8930ad93003380defc4a0817b75079 +Subproject commit c89c55122116a829fc1442e784b8026be9868239 diff --git a/src/commands/b_main.cpp b/src/commands/b_main.cpp index b11c5ed..5bd8a4e 100644 --- a/src/commands/b_main.cpp +++ b/src/commands/b_main.cpp @@ -47,6 +47,7 @@ void manageArguments(argparse::ArgumentParser& program) ); program.add_argument("--title").default_value("").help("Experiment title"); program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)stoi(env.get("discretize"))).implicit_value(true); + program.add_argument("--generate-fold-files").help("generate fold information in datasets_experiment folder").default_value(false).implicit_value(true); program.add_argument("--no-train-score").help("Don't compute train score").default_value(false).implicit_value(true); program.add_argument("--quiet").help("Don't display detailed progress").default_value(false).implicit_value(true); program.add_argument("--save").help("Save result (always save if no dataset is supplied)").default_value(false).implicit_value(true); @@ -75,7 +76,7 @@ int main(int argc, char** argv) manageArguments(program); std::string file_name, model_name, title, hyperparameters_file, datasets_file; json hyperparameters_json; - bool discretize_dataset, stratified, saveResults, quiet, no_train_score; + bool discretize_dataset, stratified, saveResults, quiet, no_train_score, generate_fold_files; std::vector seeds; std::vector file_names; std::vector filesToTest; @@ -95,6 +96,7 @@ int main(int argc, char** argv) hyperparameters_json = json::parse(hyperparameters); hyperparameters_file = program.get("hyper-file"); no_train_score = program.get("no-train-score"); + generate_fold_files = program.get("generate-fold-files"); if (hyperparameters_file != "" && hyperparameters != "{}") { throw runtime_error("hyperparameters and hyper_file are mutually exclusive"); } @@ -184,7 +186,7 @@ int main(int argc, char** argv) } platform::Timer timer; timer.start(); - experiment.go(filesToTest, quiet, no_train_score); + experiment.go(filesToTest, quiet, no_train_score, generate_fold_files); experiment.setDuration(timer.getDuration()); if (saveResults) { experiment.saveResult(); diff --git a/src/common/Paths.h b/src/common/Paths.h index 6fd61cf..1544b1b 100644 --- a/src/common/Paths.h +++ b/src/common/Paths.h @@ -15,6 +15,12 @@ namespace platform { auto env = platform::DotEnv(); return env.get("source_data"); } + static std::string experiment_file(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold) + { + std::string disc = discretize ? "_disc_" : "_ndisc_"; + std::string strat = stratified ? "strat_" : "nstrat_"; + return "datasets_experiment/" + fileName + disc + strat + std::to_string(seed) + "_" + std::to_string(nfold) + ".json"; + } static void createPath(const std::string& path) { // Create directory if it does not exist diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index 4a5112c..787184e 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -23,7 +23,7 @@ namespace platform { { std::cout << result.getJson().dump(4) << std::endl; } - void Experiment::go(std::vector filesToProcess, bool quiet, bool no_train_score) + void Experiment::go(std::vector filesToProcess, bool quiet, bool no_train_score, bool generate_fold_files) { for (auto fileName : filesToProcess) { if (fileName.size() > max_name) @@ -47,7 +47,7 @@ namespace platform { for (auto fileName : filesToProcess) { if (!quiet) std::cout << " " << setw(3) << right << num++ << " " << setw(max_name) << left << fileName << right << flush; - cross_validation(fileName, quiet, no_train_score); + cross_validation(fileName, quiet, no_train_score, generate_fold_files); if (!quiet) std::cout << std::endl; } @@ -74,7 +74,45 @@ namespace platform { std::cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush; } - void Experiment::cross_validation(const std::string& fileName, bool quiet, bool no_train_score) + void generate_files(const std::string& fileName, bool discretize, bool stratified, int seed, int nfold, torch::Tensor X_train, torch::Tensor y_train, torch::Tensor X_test, torch::Tensor y_test, std::vector& train, std::vector& test) + { + std::string file_name = Paths::experiment_file(fileName, discretize, stratified, seed, nfold); + auto file = std::ofstream(file_name); + json output; + output["seed"] = seed; + output["nfold"] = nfold; + output["X_train"] = json::array(); + auto n = X_train.size(1); + for (int i = 0; i < X_train.size(0); i++) { + if (X_train.dtype() == torch::kFloat32) { + auto xvf_ptr = X_train.index({ i }).data_ptr(); + auto feature = std::vector(xvf_ptr, xvf_ptr + n); + output["X_train"].push_back(feature); + } else { + auto feature = std::vector(X_train.index({ i }).data_ptr(), X_train.index({ i }).data_ptr() + n); + output["X_train"].push_back(feature); + } + } + output["y_train"] = std::vector(y_train.data_ptr(), y_train.data_ptr() + n); + output["X_test"] = json::array(); + n = X_test.size(1); + for (int i = 0; i < X_test.size(0); i++) { + if (X_train.dtype() == torch::kFloat32) { + auto xvf_ptr = X_test.index({ i }).data_ptr(); + auto feature = std::vector(xvf_ptr, xvf_ptr + n); + output["X_test"].push_back(feature); + } else { + auto feature = std::vector(X_test.index({ i }).data_ptr(), X_test.index({ i }).data_ptr() + n); + output["X_test"].push_back(feature); + } + } + output["y_test"] = std::vector(y_test.data_ptr(), y_test.data_ptr() + n); + output["train"] = train; + output["test"] = test; + file << output.dump(4); + file.close(); + } + void Experiment::cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files) { auto datasets = Datasets(discretized, Paths::datasets()); // Get dataset @@ -137,6 +175,8 @@ namespace platform { auto y_train = y.index({ train_t }); auto X_test = X.index({ "...", test_t }); auto y_test = y.index({ test_t }); + if (generate_fold_files) + generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test); if (!quiet) showProgress(nfold + 1, getColor(clf->getStatus()), "a"); // Train model diff --git a/src/main/Experiment.h b/src/main/Experiment.h index 4cbd5a4..3892e82 100644 --- a/src/main/Experiment.h +++ b/src/main/Experiment.h @@ -28,8 +28,8 @@ namespace platform { Experiment& addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); result.addSeed(randomSeed); return *this; } Experiment& setDuration(float duration) { this->result.setDuration(duration); return *this; } Experiment& setHyperparameters(const HyperParameters& hyperparameters_) { this->hyperparameters = hyperparameters_; return *this; } - void cross_validation(const std::string& fileName, bool quiet, bool no_train_score); - void go(std::vector filesToProcess, bool quiet, bool no_train_score); + void cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files); + void go(std::vector filesToProcess, bool quiet, bool no_train_score, bool generate_fold_files); void saveResult(); void show(); void report(bool classification_report = false); diff --git a/src/results/Result.cpp b/src/results/Result.cpp index d058c85..470adbf 100644 --- a/src/results/Result.cpp +++ b/src/results/Result.cpp @@ -64,7 +64,7 @@ namespace platform { void Result::save() { - std::ofstream file(Paths::results() + "/" + getFilename()); + std::ofstream file(Paths::results() + getFilename()); file << data; file.close(); }