From a7ec930fa0a2b2128cf377fa4d119ec0ce1c0134 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Thu, 6 Jun 2024 13:03:57 +0200 Subject: [PATCH] Add numeric features management to Dataset --- CMakeLists.txt | 2 +- lib/libxlsxwriter | 2 +- lib/mdlp | 2 +- src/CMakeLists.txt | 10 +++--- src/common/Dataset.cpp | 24 +++++++++++-- src/common/Dataset.h | 14 ++++++-- src/common/Datasets.cpp | 46 +++++++++++++++++++----- src/common/Datasets.h | 1 + src/common/Discretization.cpp | 55 +++++++++++++++++++++++++++++ src/common/Discretization.h | 33 +++++++++++++++++ src/common/DiscretizationRegister.h | 10 ++++++ src/common/Utils.h | 21 +++++------ src/main/Experiment.cpp | 8 ++++- src/reports/DatasetsConsole.cpp | 14 +++++--- src/reports/DatasetsExcel.cpp | 11 +++--- 15 files changed, 210 insertions(+), 43 deletions(-) create mode 100644 src/common/Discretization.cpp create mode 100644 src/common/Discretization.h create mode 100644 src/common/DiscretizationRegister.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 72010b7..64b0304 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) project(Platform - VERSION 1.0.4 + VERSION 1.1.0 DESCRIPTION "Platform to run Experiments with classifiers." HOMEPAGE_URL "https://github.com/rmontanana/platform" LANGUAGES CXX diff --git a/lib/libxlsxwriter b/lib/libxlsxwriter index c89c551..f483e65 160000 --- a/lib/libxlsxwriter +++ b/lib/libxlsxwriter @@ -1 +1 @@ -Subproject commit c89c55122116a829fc1442e784b8026be9868239 +Subproject commit f483e65f2e8364702c411ca54470482fe54666b2 diff --git a/lib/mdlp b/lib/mdlp index 236d1b2..633aa52 160000 --- a/lib/mdlp +++ b/lib/mdlp @@ -1 +1 @@ -Subproject commit 236d1b2f8be185039493fe7fce04a83e02ed72e5 +Subproject commit 633aa52849a61a5da9f5d6ea9f2401fd0c48ad47 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7049d65..5b881e4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,7 +21,7 @@ include_directories( add_executable( b_best commands/b_best.cpp best/Statistics.cpp best/BestResultsExcel.cpp best/BestResults.cpp - common/Datasets.cpp common/Dataset.cpp + common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp main/Models.cpp main/Scores.cpp reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp results/Result.cpp @@ -32,14 +32,14 @@ target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" mdlp set(grid_sources GridSearch.cpp GridData.cpp) list(TRANSFORM grid_sources PREPEND grid/) add_executable(b_grid commands/b_grid.cpp ${grid_sources} - common/Datasets.cpp common/Dataset.cpp + common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp main/HyperParameters.cpp main/Models.cpp ) target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy) # b_list add_executable(b_list commands/b_list.cpp - common/Datasets.cpp common/Dataset.cpp + common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp main/Models.cpp main/Scores.cpp reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp @@ -50,7 +50,7 @@ target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIB set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp) list(TRANSFORM main_sources PREPEND main/) add_executable(b_main commands/b_main.cpp ${main_sources} - common/Datasets.cpp common/Dataset.cpp + common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp reports/ReportConsole.cpp reports/ReportBase.cpp results/Result.cpp ) @@ -61,7 +61,7 @@ set(manage_sources ManageScreen.cpp CommandParser.cpp ResultsManager.cpp) list(TRANSFORM manage_sources PREPEND manage/) add_executable( b_manage commands/b_manage.cpp ${manage_sources} - common/Datasets.cpp common/Dataset.cpp + common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp main/Scores.cpp diff --git a/src/common/Dataset.cpp b/src/common/Dataset.cpp index 7f9a26f..bbeba35 100644 --- a/src/common/Dataset.cpp +++ b/src/common/Dataset.cpp @@ -2,7 +2,12 @@ #include #include "Dataset.h" namespace platform { - Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType) + Dataset::Dataset(const Dataset& dataset) : + path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), + n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features), + states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), + X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), + fileType(dataset.fileType) { } std::string Dataset::getName() const @@ -180,12 +185,20 @@ namespace platform { } else if (fileType == RDATA) { load_rdata(); } + n_samples = Xv[0].size(); + n_features = Xv.size(); + if (numericFeaturesIdx.at(0) == -1) { + numericFeatures = std::vector(n_features, true); + } else { + numericFeatures = std::vector(n_features, false); + for (auto i : numericFeaturesIdx) { + numericFeatures[i] = true; + } + } if (discretize) { Xd = discretizeDataset(Xv, yv); computeStates(); } - n_samples = Xv[0].size(); - n_features = Xv.size(); loaded = true; } void Dataset::buildTensors() @@ -215,4 +228,9 @@ namespace platform { } return Xd; } + std::pair Dataset::getDiscretizedTrainTestTensors() + { + auto discretizer = Discretization::instance()->create("mdlp"); + return { X_train, X_test }; + } } \ No newline at end of file diff --git a/src/common/Dataset.h b/src/common/Dataset.h index 3e1180e..afd609e 100644 --- a/src/common/Dataset.h +++ b/src/common/Dataset.h @@ -4,14 +4,17 @@ #include #include #include -#include +#include #include "Utils.h" #include "SourceData.h" namespace platform { - class Dataset { public: - Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; + Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector numericFeaturesIdx) : + path(path), name(name), className(className), discretize(discretize), + loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx) + { + }; explicit Dataset(const Dataset&); std::string getName() const; std::string getClassName() const; @@ -20,9 +23,11 @@ namespace platform { std::map> getStates() const; std::pair>&, std::vector&> getVectors(); std::pair>&, std::vector&> getVectorsDiscretized(); + std::pair getDiscretizedTrainTestTensors(); std::pair getTensors(); int getNFeatures() const; int getNSamples() const; + std::vector& getNumericFeatures() { return numericFeatures; } void load(); const bool inline isLoaded() const { return loaded; }; private: @@ -31,12 +36,15 @@ namespace platform { fileType_t fileType; std::string className; int n_samples{ 0 }, n_features{ 0 }; + std::vector numericFeaturesIdx; + std::vector numericFeatures; // true if feature is numeric std::vector features; std::vector labels; std::map> states; bool loaded; bool discretize; torch::Tensor X, y; + torch::Tensor X_train, X_test; std::vector> Xv; std::vector> Xd; std::vector yv; diff --git a/src/common/Datasets.cpp b/src/common/Datasets.cpp index 7e87542..535268e 100644 --- a/src/common/Datasets.cpp +++ b/src/common/Datasets.cpp @@ -1,27 +1,47 @@ #include #include "Datasets.h" +#include + namespace platform { + using json = nlohmann::ordered_json; + const std::string message_dataset_not_loaded = "dataset not loaded."; void Datasets::load() { auto sd = SourceData(sfileType); fileType = sd.getFileType(); path = sd.getPath(); ifstream catalog(path + "all.txt"); + std::vector numericFeaturesIdx; if (catalog.is_open()) { std::string line; while (getline(catalog, line)) { if (line.empty() || line[0] == '#') { continue; } - std::vector tokens = split(line, ','); + std::vector tokens = split(line, ';'); std::string name = tokens[0]; std::string className; + numericFeaturesIdx.clear(); if (tokens.size() == 1) { className = "-1"; + numericFeaturesIdx.push_back(-1); } else { className = tokens[1]; + if (tokens.size() > 2) { + auto numericFeatures = tokens[2]; + if (numericFeatures == "all") { + numericFeaturesIdx.push_back(-1); + } else { + auto features = json::parse(numericFeatures); + for (auto& f : features) { + numericFeaturesIdx.push_back(f); + } + } + } else { + numericFeaturesIdx.push_back(-1); + } } - datasets[name] = make_unique(path, name, className, discretize, fileType); + datasets[name] = make_unique(path, name, className, discretize, fileType, numericFeaturesIdx); } catalog.close(); } else { @@ -39,7 +59,7 @@ namespace platform { if (datasets.at(name)->isLoaded()) { return datasets.at(name)->getFeatures(); } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } std::vector Datasets::getLabels(const std::string& name) const @@ -47,7 +67,7 @@ namespace platform { if (datasets.at(name)->isLoaded()) { return datasets.at(name)->getLabels(); } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } map> Datasets::getStates(const std::string& name) const @@ -55,7 +75,7 @@ namespace platform { if (datasets.at(name)->isLoaded()) { return datasets.at(name)->getStates(); } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } void Datasets::loadDataset(const std::string& name) const @@ -71,7 +91,7 @@ namespace platform { if (datasets.at(name)->isLoaded()) { return datasets.at(name)->getClassName(); } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } int Datasets::getNSamples(const std::string& name) const @@ -79,7 +99,7 @@ namespace platform { if (datasets.at(name)->isLoaded()) { return datasets.at(name)->getNSamples(); } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } int Datasets::getNClasses(const std::string& name) @@ -93,7 +113,15 @@ namespace platform { auto [Xv, yv] = getVectors(name); return *std::max_element(yv.begin(), yv.end()) + 1; } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); + } + } + std::vector& Datasets::getNumericFeatures(const std::string& name) const + { + if (datasets.at(name)->isLoaded()) { + return datasets.at(name)->getNumericFeatures(); + } else { + throw std::invalid_argument(message_dataset_not_loaded); } } std::vector Datasets::getClassesCounts(const std::string& name) const @@ -106,7 +134,7 @@ namespace platform { } return counts; } else { - throw std::invalid_argument("Dataset not loaded."); + throw std::invalid_argument(message_dataset_not_loaded); } } pair>&, std::vector&> Datasets::getVectors(const std::string& name) diff --git a/src/common/Datasets.h b/src/common/Datasets.h index 028a9f1..df63c02 100644 --- a/src/common/Datasets.h +++ b/src/common/Datasets.h @@ -11,6 +11,7 @@ namespace platform { std::vector getLabels(const std::string& name) const; std::string getClassName(const std::string& name) const; int getNClasses(const std::string& name); + std::vector& getNumericFeatures(const std::string& name) const; std::vector getClassesCounts(const std::string& name) const; std::map> getStates(const std::string& name) const; std::pair>&, std::vector&> getVectors(const std::string& name); diff --git a/src/common/Discretization.cpp b/src/common/Discretization.cpp new file mode 100644 index 0000000..234be43 --- /dev/null +++ b/src/common/Discretization.cpp @@ -0,0 +1,55 @@ +#include "Discretization.h" + +namespace platform { + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + Discretization* Discretization::factory = nullptr; + Discretization* Discretization::instance() + { + //manages singleton + if (factory == nullptr) + factory = new Discretization(); + return factory; + } + void Discretization::registerFactoryFunction(const std::string& name, + function classFactoryFunction) + { + // register the class factory function + functionRegistry[name] = classFactoryFunction; + } + std::shared_ptr Discretization::create(const std::string& name) + { + mdlp::Discretizer* instance = nullptr; + + // find name in the registry and call factory method. + auto it = functionRegistry.find(name); + if (it != functionRegistry.end()) + instance = it->second(); + // wrap instance in a shared ptr and return + if (instance != nullptr) + return std::unique_ptr(instance); + else + throw std::runtime_error("Discretizer not found: " + name); + } + std::vector Discretization::getNames() + { + std::vector names; + transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names), + [](const pair>& pair) { return pair.first; }); + return names; + } + std::string Discretization::toString() + { + std::string result = ""; + std::string sep = ""; + for (const auto& pair : functionRegistry) { + result += sep + pair.first; + sep = ", "; + } + return "{" + result + "}"; + } + RegistrarDiscretization::RegistrarDiscretization(const std::string& name, function classFactoryFunction) + { + // register the class factory function + Discretization::instance()->registerFactoryFunction(name, classFactoryFunction); + } +} \ No newline at end of file diff --git a/src/common/Discretization.h b/src/common/Discretization.h new file mode 100644 index 0000000..4a4b061 --- /dev/null +++ b/src/common/Discretization.h @@ -0,0 +1,33 @@ +#ifndef DISCRETIZATION_H +#define DISCRETIZATION_H +#include +#include +#include +#include +#include +#include +#include +#include +namespace platform { + class Discretization { + public: + Discretization(Discretization&) = delete; + void operator=(const Discretization&) = delete; + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + static Discretization* instance(); + std::shared_ptr create(const std::string& name); + void registerFactoryFunction(const std::string& name, + function classFactoryFunction); + std::vector getNames(); + std::string toString(); + private: + map> functionRegistry; + static Discretization* factory; //singleton + Discretization() {}; + }; + class RegistrarDiscretization { + public: + RegistrarDiscretization(const std::string& className, function classFactoryFunction); + }; +} +#endif \ No newline at end of file diff --git a/src/common/DiscretizationRegister.h b/src/common/DiscretizationRegister.h new file mode 100644 index 0000000..79cb912 --- /dev/null +++ b/src/common/DiscretizationRegister.h @@ -0,0 +1,10 @@ +#ifndef DISCRETIZATIONREGISTER_H +#define DISCRETIZATIONREGISTER_H +#include +static platform::RegistrarDiscretization registrarM("mdlp", + [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();}); +static platform::RegistrarDiscretization registrarBU("BinUniform", + [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);}); +static platform::RegistrarDiscretization registrarBQ("BinQuantile", + [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);}); +#endif \ No newline at end of file diff --git a/src/common/Utils.h b/src/common/Utils.h index e13629c..692ed11 100644 --- a/src/common/Utils.h +++ b/src/common/Utils.h @@ -3,17 +3,8 @@ #include #include #include +#include namespace platform { - static std::vector split(const std::string& text, char delimiter) - { - std::vector result; - std::stringstream ss(text); - std::string token; - while (std::getline(ss, token, delimiter)) { - result.push_back(token); - } - return result; - } static std::string trim(const std::string& str) { std::string result = str; @@ -25,5 +16,15 @@ namespace platform { }).base(), result.end()); return result; } + static std::vector split(const std::string& text, char delimiter) + { + std::vector result; + std::stringstream ss(text); + std::string token; + while (std::getline(ss, token, delimiter)) { + result.push_back(trim(token)); + } + return result; + } } #endif \ No newline at end of file diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index 61b04d8..cd6e825 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -115,7 +115,7 @@ namespace platform { } void Experiment::cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files) { - auto datasets = Datasets(discretized, Paths::datasets()); + auto datasets = Datasets(false, Paths::datasets()); // Never discretize here // Get dataset auto [X, y] = datasets.getTensors(fileName); auto states = datasets.getStates(fileName); @@ -176,6 +176,12 @@ namespace platform { auto y_train = y.index({ train_t }); auto X_test = X.index({ "...", test_t }); auto y_test = y.index({ test_t }); + if (discretized) { + // compute states too + // discretizer->fit(X_train, y_train); + // X_train = discretizer->transform(X_train); + // X_test = discretizer->transform(X_test); + } if (generate_fold_files) generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test); if (!quiet) diff --git a/src/reports/DatasetsConsole.cpp b/src/reports/DatasetsConsole.cpp index 06da29f..9c0ddcc 100644 --- a/src/reports/DatasetsConsole.cpp +++ b/src/reports/DatasetsConsole.cpp @@ -1,3 +1,4 @@ +#include #include "common/Colors.h" #include "common/Datasets.h" #include "common/Paths.h" @@ -12,7 +13,7 @@ namespace platform { auto part = temp.substr(0, DatasetsConsole::BALANCE_LENGTH); line += part + "\n"; body.push_back(line); - line = string(name_len + 22, ' '); + line = string(name_len + 28, ' '); temp = temp.substr(DatasetsConsole::BALANCE_LENGTH); } line += temp + "\n"; @@ -26,8 +27,8 @@ namespace platform { std::stringstream sheader; auto datasets_names = datasets.getNames(); int maxName = std::max(size_t(7), (*max_element(datasets_names.begin(), datasets_names.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size()); - std::vector header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", "Balance" }; - std::vector header_lengths = { 3, maxName, 6, 5, 3, DatasetsConsole::BALANCE_LENGTH }; + std::vector header_labels = { " #", "Dataset", "Sampl.", "Feat.", "#Num.", "Cls", "Balance" }; + std::vector header_lengths = { 3, maxName, 6, 5, 5, 3, DatasetsConsole::BALANCE_LENGTH }; sheader << Colors::GREEN(); for (int i = 0; i < header_labels.size(); i++) { sheader << setw(header_lengths[i]) << left << header_labels[i] << " "; @@ -50,7 +51,11 @@ namespace platform { datasets.loadDataset(dataset); auto nSamples = datasets.getNSamples(dataset); line << setw(6) << right << nSamples << " "; - line << setw(5) << right << datasets.getFeatures(dataset).size() << " "; + auto nFeatures = datasets.getFeatures(dataset).size(); + line << setw(5) << right << nFeatures << " "; + auto numericFeatures = datasets.getNumericFeatures(dataset); + auto num = std::count(numericFeatures.begin(), numericFeatures.end(), true); + line << setw(5) << right << num << " "; line << setw(3) << right << datasets.getNClasses(dataset) << " "; std::string sep = ""; oss.str(""); @@ -63,6 +68,7 @@ namespace platform { data[dataset] = json::object(); data[dataset]["samples"] = nSamples; data[dataset]["features"] = datasets.getFeatures(dataset).size(); + data[dataset]["numericFeatures"] = num; data[dataset]["classes"] = datasets.getNClasses(dataset); data[dataset]["balance"] = oss.str(); } diff --git a/src/reports/DatasetsExcel.cpp b/src/reports/DatasetsExcel.cpp index 267f103..667ef38 100644 --- a/src/reports/DatasetsExcel.cpp +++ b/src/reports/DatasetsExcel.cpp @@ -17,11 +17,11 @@ namespace platform { int balanceSize = 75; // Min size of the column worksheet = workbook_add_worksheet(workbook, "Datasets"); // Header - worksheet_merge_range(worksheet, 0, 0, 0, 5, "Datasets", styles["headerFirst"]); + worksheet_merge_range(worksheet, 0, 0, 0, 6, "Datasets", styles["headerFirst"]); // Body header row = 2; int col = 0; - for (const auto& name : { "Nº", "Dataset", "Samples", "Features", "Classes", "Balance" }) { + for (const auto& name : { "Nº", "Dataset", "Samples", "Features", "#Numer.", "Classes", "Balance" }) { writeString(row, col++, name, "bodyHeader"); } // Body @@ -34,12 +34,13 @@ namespace platform { writeString(row, 1, key.c_str(), "text"); writeInt(row, 2, value["samples"], "ints"); writeInt(row, 3, value["features"], "ints"); - writeInt(row, 4, value["classes"], "ints"); - writeString(row, 5, value["balance"].get().c_str(), "text"); + writeInt(row, 4, value["numericFeatures"], "ints"); + writeInt(row, 5, value["classes"], "ints"); + writeString(row, 6, value["balance"].get().c_str(), "text"); } // Format columns worksheet_freeze_panes(worksheet, 3, 2); - std::vector columns_sizes = { 5, datasetNameSize, 10, 10, 10, balanceSize }; + std::vector columns_sizes = { 5, datasetNameSize, 10, 10, 10, 10, balanceSize }; for (int i = 0; i < columns_sizes.size(); ++i) { worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL); }