From f288bbd6fa6348f31f35a6b51e52ffb58c68925e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 10 Oct 2023 11:52:39 +0200 Subject: [PATCH 01/15] Begin adding cfs to BoostAODE --- src/BayesNet/BoostAODE.cc | 13 +++++++++++++ src/BayesNet/BoostAODE.h | 5 ++++- src/Platform/Dataset.cc | 10 ---------- src/Platform/Dataset.h | 2 +- src/Platform/Datasets.cc | 2 +- src/Platform/DotEnv.h | 7 +++++-- src/Platform/Experiment.cc | 5 ++--- src/Platform/Paths.h | 6 ++++++ src/Platform/ReportBase.cc | 3 +-- src/Platform/ReportConsole.cc | 6 ++++-- src/Platform/Utils.h | 19 +++++++++++++++++++ src/Platform/list.cc | 4 +--- src/Platform/main.cc | 5 ++--- 13 files changed, 59 insertions(+), 28 deletions(-) create mode 100644 src/Platform/Utils.h diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index c976408..4ddf30d 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -4,6 +4,7 @@ #include "Colors.h" #include "Folding.h" #include +#include "Paths.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -28,6 +29,9 @@ namespace bayesnet { if (hyperparameters.contains("convergence")) { convergence = hyperparameters["convergence"]; } + if (hyperparameters.contains("cfs")) { + cfs = hyperparameters["cfs"]; + } } void BoostAODE::validationInit() { @@ -58,6 +62,12 @@ namespace bayesnet { } } + void BoostAODE::initializeModels() + { + ifstream file(cfs + ".json"); + if (file.is_open()) { + } + } void BoostAODE::trainModel(const torch::Tensor& weights) { models.clear(); @@ -66,6 +76,9 @@ namespace bayesnet { maxModels = .1 * n > 10 ? .1 * n : n; validationInit(); Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + if (cfs != "") { + initializeModels(); + } bool exitCondition = false; unordered_set featuresUsed; // Variables to control the accuracy finish condition diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 61e2e95..5c99145 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -16,10 +16,13 @@ namespace bayesnet { torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; void validationInit(); - bool repeatSparent = false; + void initializeModels(); + // Hyperparameters + bool repeatSparent = false; // if true, a feature can be selected more than once int maxModels = 0; bool ascending = false; //Process KBest features ascending or descending order bool convergence = false; //if true, stop when the model does not improve + string cfs = ""; // if not empty, use CFS to select features }; } #endif \ No newline at end of file diff --git a/src/Platform/Dataset.cc b/src/Platform/Dataset.cc index 02a36f9..f75fdbc 100644 --- a/src/Platform/Dataset.cc +++ b/src/Platform/Dataset.cc @@ -212,14 +212,4 @@ namespace platform { } return Xd; } - vector Dataset::split(const string& text, char delimiter) - { - vector result; - stringstream ss(text); - string token; - while (getline(ss, token, delimiter)) { - result.push_back(token); - } - return result; - } } \ No newline at end of file diff --git a/src/Platform/Dataset.h b/src/Platform/Dataset.h index fbc577e..21b619e 100644 --- a/src/Platform/Dataset.h +++ b/src/Platform/Dataset.h @@ -5,6 +5,7 @@ #include #include #include "CPPFImdlp.h" +#include "Utils.h" namespace platform { using namespace std; @@ -62,7 +63,6 @@ namespace platform { public: Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), 
loaded(false), fileType(fileType) {}; explicit Dataset(const Dataset&); - static vector split(const string& text, char delimiter); string getName() const; string getClassName() const; vector getFeatures() const; diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 717ccbc..4f53a2b 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -13,7 +13,7 @@ namespace platform { if (line.empty() || line[0] == '#') { continue; } - vector tokens = Dataset::split(line, ','); + vector tokens = split(line, ','); string name = tokens[0]; string className; if (tokens.size() == 1) { diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h index c481310..87ec50e 100644 --- a/src/Platform/DotEnv.h +++ b/src/Platform/DotEnv.h @@ -4,7 +4,10 @@ #include #include #include -#include "Dataset.h" +#include +#include "Utils.h" + +//#include "Dataset.h" namespace platform { class DotEnv { private: @@ -51,7 +54,7 @@ namespace platform { auto seeds_str = env["seeds"]; seeds_str = trim(seeds_str); seeds_str = seeds_str.substr(1, seeds_str.size() - 2); - auto seeds_str_split = Dataset::split(seeds_str, ','); + auto seeds_str_split = split(seeds_str, ','); transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) { return stoi(str); }); diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index dced445..311dbc7 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -3,7 +3,7 @@ #include "Datasets.h" #include "Models.h" #include "ReportConsole.h" -#include "DotEnv.h" +#include "Paths.h" namespace platform { using json = nlohmann::json; string get_date() @@ -134,8 +134,7 @@ namespace platform { } void Experiment::cross_validation(const string& fileName) { - auto env = platform::DotEnv(); - auto datasets = platform::Datasets(discretized, env.get("source_data")); + auto datasets = platform::Datasets(discretized, Paths::datasets()); // Get dataset auto [X, y] = datasets.getTensors(fileName); auto states = datasets.getStates(fileName); diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h index 926568e..a1eb00c 100644 --- a/src/Platform/Paths.h +++ b/src/Platform/Paths.h @@ -1,11 +1,17 @@ #ifndef PATHS_H #define PATHS_H #include +#include "DotEnv.h" namespace platform { class Paths { public: static std::string results() { return "results/"; } static std::string excel() { return "excel/"; } + static std::string datasets() + { + auto env = platform::DotEnv(); + return env.get("source_data"); + } }; } #endif \ No newline at end of file diff --git a/src/Platform/ReportBase.cc b/src/Platform/ReportBase.cc index 5f113a5..acb5581 100644 --- a/src/Platform/ReportBase.cc +++ b/src/Platform/ReportBase.cc @@ -58,8 +58,7 @@ namespace platform { } } else { if (data["score_name"].get() == "accuracy") { - auto env = platform::DotEnv(); - auto dt = Datasets(false, env.get("source_data")); + auto dt = Datasets(false, Paths::datasets()); dt.loadDataset(dataset); auto numClasses = dt.getNClasses(dataset); if (numClasses == 2) { diff --git a/src/Platform/ReportConsole.cc b/src/Platform/ReportConsole.cc index bb08ef3..aaba840 100644 --- a/src/Platform/ReportConsole.cc +++ b/src/Platform/ReportConsole.cc @@ -56,10 +56,12 @@ namespace platform { try { cout << r["hyperparameters"].get(); } - catch (const exception& err) { - cout << r["hyperparameters"]; + catch (...) { + //cout << r["hyperparameters"]; + cout << "Arrggggghhhh!" 
<< endl; } cout << endl; + cout << flush; lastResult = r; totalScore += r["score"].get(); odd = !odd; diff --git a/src/Platform/Utils.h b/src/Platform/Utils.h new file mode 100644 index 0000000..3e24f05 --- /dev/null +++ b/src/Platform/Utils.h @@ -0,0 +1,19 @@ +#ifndef UTILS_H +#define UTILS_H +#include +#include +#include +namespace platform { + //static vector split(const string& text, char delimiter); + static std::vector split(const std::string& text, char delimiter) + { + std::vector result; + std::stringstream ss(text); + std::string token; + while (std::getline(ss, token, delimiter)) { + result.push_back(token); + } + return result; + } +} +#endif \ No newline at end of file diff --git a/src/Platform/list.cc b/src/Platform/list.cc index 8c386a5..581ee5f 100644 --- a/src/Platform/list.cc +++ b/src/Platform/list.cc @@ -3,7 +3,6 @@ #include "Paths.h" #include "Colors.h" #include "Datasets.h" -#include "DotEnv.h" using namespace std; const int BALANCE_LENGTH = 75; @@ -28,8 +27,7 @@ void outputBalance(const string& balance) int main(int argc, char** argv) { - auto env = platform::DotEnv(); - auto data = platform::Datasets(false, env.get("source_data")); + auto data = platform::Datasets(false, platform::Paths::datasets()); locale mylocale(cout.getloc(), new separated); locale::global(mylocale); cout.imbue(mylocale); diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 62470c5..1101e2b 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -82,8 +82,7 @@ int main(int argc, char** argv) auto seeds = program.get>("seeds"); auto hyperparameters = program.get("hyperparameters"); vector filesToTest; - auto env = platform::DotEnv(); - auto datasets = platform::Datasets(discretize_dataset, env.get("source_data")); + auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets()); auto title = program.get("title"); auto saveResults = program.get("save"); if (file_name != "") { @@ -102,7 +101,7 @@ int main(int argc, char** argv) /* * Begin Processing */ - + auto env = platform::DotEnv(); auto experiment = platform::Experiment(); experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); -- 2.45.2 From df9b4c48d26c2fdf0ed199e699ecc8c64e8e1712 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 10 Oct 2023 13:39:11 +0200 Subject: [PATCH 02/15] Begin CFS initialization --- src/BayesNet/BoostAODE.cc | 62 +++++++++++++++++++---------------- src/BayesNet/BoostAODE.h | 1 - src/Platform/ReportConsole.cc | 8 +---- src/Platform/main.cc | 49 +++++++++++++-------------- 4 files changed, 57 insertions(+), 63 deletions(-) diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index 4ddf30d..aeae235 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -11,30 +11,7 @@ namespace bayesnet { void BoostAODE::buildModel(const torch::Tensor& weights) { // Models shall be built in trainModel - } - void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) - { - // Check if hyperparameters are valid - const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence" }; - checkHyperparameters(validKeys, hyperparameters); - if (hyperparameters.contains("repeatSparent")) { - repeatSparent = hyperparameters["repeatSparent"]; - } - if (hyperparameters.contains("maxModels")) { - maxModels = hyperparameters["maxModels"]; - } - if (hyperparameters.contains("ascending")) { - ascending = 
hyperparameters["ascending"]; - } - if (hyperparameters.contains("convergence")) { - convergence = hyperparameters["convergence"]; - } - if (hyperparameters.contains("cfs")) { - cfs = hyperparameters["cfs"]; - } - } - void BoostAODE::validationInit() - { + // Prepare the validation dataset auto y_ = dataset.index({ -1, "..." }); if (convergence) { // Prepare train & validation sets from train data @@ -60,12 +37,43 @@ namespace bayesnet { X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); y_train = y_; } - + if (cfs != "") { + initializeModels(); + } + } + void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) + { + // Check if hyperparameters are valid + const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "cfs" }; + checkHyperparameters(validKeys, hyperparameters); + if (hyperparameters.contains("repeatSparent")) { + repeatSparent = hyperparameters["repeatSparent"]; + } + if (hyperparameters.contains("maxModels")) { + maxModels = hyperparameters["maxModels"]; + } + if (hyperparameters.contains("ascending")) { + ascending = hyperparameters["ascending"]; + } + if (hyperparameters.contains("convergence")) { + convergence = hyperparameters["convergence"]; + } + if (hyperparameters.contains("cfs")) { + cfs = hyperparameters["cfs"]; + } } void BoostAODE::initializeModels() { ifstream file(cfs + ".json"); if (file.is_open()) { + nlohmann::json data; + file >> data; + file.close(); + auto model = "iris"; // has to come in when building object + auto features = data[model]; + cout << "features: " << features.dump() << endl; + } else { + throw runtime_error("File " + cfs + ".json not found"); } } void BoostAODE::trainModel(const torch::Tensor& weights) @@ -74,11 +82,7 @@ namespace bayesnet { n_models = 0; if (maxModels == 0) maxModels = .1 * n > 10 ? .1 * n : n; - validationInit(); Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); - if (cfs != "") { - initializeModels(); - } bool exitCondition = false; unordered_set featuresUsed; // Variables to control the accuracy finish condition diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 5c99145..f3fa5bd 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -15,7 +15,6 @@ namespace bayesnet { private: torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - void validationInit(); void initializeModels(); // Hyperparameters bool repeatSparent = false; // if true, a feature can be selected more than once diff --git a/src/Platform/ReportConsole.cc b/src/Platform/ReportConsole.cc index aaba840..c8e6890 100644 --- a/src/Platform/ReportConsole.cc +++ b/src/Platform/ReportConsole.cc @@ -53,13 +53,7 @@ namespace platform { const string status = compareResult(r["dataset"].get(), r["score"].get()); cout << status; cout << setw(12) << right << setprecision(6) << fixed << r["time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get() << " "; - try { - cout << r["hyperparameters"].get(); - } - catch (...) { - //cout << r["hyperparameters"]; - cout << "Arrggggghhhh!" 
<< endl; - } + cout << r["hyperparameters"].dump(); cout << endl; cout << flush; lastResult = r; diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 1101e2b..ecdf258 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -12,7 +12,7 @@ using namespace std; using json = nlohmann::json; -argparse::ArgumentParser manageArguments(int argc, char** argv) +argparse::ArgumentParser manageArguments() { auto env = platform::DotEnv(); argparse::ArgumentParser program("main"); @@ -48,43 +48,40 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) }}); auto seed_values = env.getSeeds(); program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); + return program; +} + +int main(int argc, char** argv) +{ + string file_name, model_name, title; + json hyperparameters_json; + bool discretize_dataset, stratified, saveResults; + vector seeds; + vector filesToTest; + int n_folds; + auto program = manageArguments(); try { program.parse_args(argc, argv); - auto file_name = program.get("dataset"); - auto model_name = program.get("model"); - auto discretize_dataset = program.get("discretize"); - auto stratified = program.get("stratified"); - auto n_folds = program.get("folds"); - auto seeds = program.get>("seeds"); - auto title = program.get("title"); + file_name = program.get("dataset"); + model_name = program.get("model"); + discretize_dataset = program.get("discretize"); + stratified = program.get("stratified"); + n_folds = program.get("folds"); + seeds = program.get>("seeds"); auto hyperparameters = program.get("hyperparameters"); - auto saveResults = program.get("save"); + hyperparameters_json = json::parse(hyperparameters); + title = program.get("title"); if (title == "" && file_name == "") { throw runtime_error("title is mandatory if dataset is not provided"); } + saveResults = program.get("save"); } catch (const exception& err) { cerr << err.what() << endl; cerr << program; exit(1); } - return program; -} - -int main(int argc, char** argv) -{ - auto program = manageArguments(argc, argv); - auto file_name = program.get("dataset"); - auto model_name = program.get("model"); - auto discretize_dataset = program.get("discretize"); - auto stratified = program.get("stratified"); - auto n_folds = program.get("folds"); - auto seeds = program.get>("seeds"); - auto hyperparameters = program.get("hyperparameters"); - vector filesToTest; auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets()); - auto title = program.get("title"); - auto saveResults = program.get("save"); if (file_name != "") { if (!datasets.isDataset(file_name)) { cerr << "Dataset " << file_name << " not found" << endl; @@ -106,7 +103,7 @@ int main(int argc, char** argv) experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); - experiment.setHyperparameters(json::parse(hyperparameters)); + experiment.setHyperparameters(hyperparameters_json); for (auto seed : seeds) { experiment.addRandomSeed(seed); } -- 2.45.2 From ca833a34f5835cb085c1ead9c9dd4f48edc435c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 10 Oct 2023 18:16:43 +0200 Subject: [PATCH 03/15] try openssl sha256 --- CMakeLists.txt | 2 +- src/BayesNet/BoostAODE.cc | 58 ++++++++++++++++++++++++++++++------- 
src/BayesNet/BoostAODE.h | 2 +- src/BayesNet/CMakeLists.txt | 2 +- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 294f0bf..37c674d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # CMakes modules # -------------- set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) - +find_package(OpenSSL REQUIRED) include(AddGitSubmodule) if (CODE_COVERAGE) enable_testing() diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index aeae235..d3e8901 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,10 +1,12 @@ -#include "BoostAODE.h" #include +#include +#include +#include "BoostAODE.h" #include "BayesMetrics.h" #include "Colors.h" #include "Folding.h" -#include #include "Paths.h" +#include namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -13,6 +15,8 @@ namespace bayesnet { // Models shall be built in trainModel // Prepare the validation dataset auto y_ = dataset.index({ -1, "..." }); + int nSamples = dataset.size(1); + int nFeatures = dataset.size(0) - 1; if (convergence) { // Prepare train & validation sets from train data auto fold = platform::StratifiedKFold(5, y_, 271); @@ -38,7 +42,7 @@ namespace bayesnet { y_train = y_; } if (cfs != "") { - initializeModels(); + initializeModels(nSamples, nFeatures); } } void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) @@ -62,18 +66,52 @@ namespace bayesnet { cfs = hyperparameters["cfs"]; } } - void BoostAODE::initializeModels() + string sha256(const string& input) { - ifstream file(cfs + ".json"); + EVP_MD_CTX* mdctx; + const EVP_MD* md; + unsigned char hash[EVP_MAX_MD_SIZE]; + unsigned int hash_len; + + OpenSSL_add_all_digests(); + md = EVP_get_digestbyname("sha256"); + mdctx = EVP_MD_CTX_new(); + EVP_DigestInit_ex(mdctx, md, nullptr); + EVP_DigestUpdate(mdctx, input.c_str(), input.size()); + EVP_DigestFinal_ex(mdctx, hash, &hash_len); + EVP_MD_CTX_free(mdctx); + stringstream oss; + for (unsigned int i = 0; i < hash_len; i++) { + oss << hex << (int)hash[i]; + } + return oss.str(); + } + + void BoostAODE::initializeModels(int nSamples, int nFeatures) + { + // Read the CFS features + string output = "[", prefix = ""; + bool first = true; + for (const auto& feature : features) { + output += prefix + feature; + if (first) { + prefix = ", "; + first = false; + } + } + output += "]"; + // std::size_t str_hash = std::hash{}(output); + string str_hash = sha256(output); + stringstream oss; + oss << "cfs/" << str_hash << ".json"; + string name = oss.str(); + ifstream file(name); if (file.is_open()) { - nlohmann::json data; - file >> data; + nlohmann::json features = nlohmann::json::parse(file); file.close(); - auto model = "iris"; // has to come in when building object - auto features = data[model]; cout << "features: " << features.dump() << endl; } else { - throw runtime_error("File " + cfs + ".json not found"); + throw runtime_error("File " + name + " not found"); } } void BoostAODE::trainModel(const torch::Tensor& weights) diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index f3fa5bd..3464a7d 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -15,7 +15,7 @@ namespace bayesnet { private: torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - void initializeModels(); + void initializeModels(int nSamples, int nFeatures); // Hyperparameters bool repeatSparent = false; // if true, a feature 
can be selected more than once int maxModels = 0; diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 2a120f3..6ca1238 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -6,4 +6,4 @@ include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto) \ No newline at end of file -- 2.45.2 From e7ded6826792d14abc7aeeaa1ca877c8e0382b4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 10 Oct 2023 23:00:38 +0200 Subject: [PATCH 04/15] First cfs working version --- src/BayesNet/BoostAODE.cc | 33 +++++++++++++++++++++------------ src/BayesNet/BoostAODE.h | 4 ++-- src/Platform/Paths.h | 1 + 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index d3e8901..0952a7a 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -13,10 +13,10 @@ namespace bayesnet { void BoostAODE::buildModel(const torch::Tensor& weights) { // Models shall be built in trainModel + models.clear(); + n_models = 0; // Prepare the validation dataset auto y_ = dataset.index({ -1, "..." }); - int nSamples = dataset.size(1); - int nFeatures = dataset.size(0) - 1; if (convergence) { // Prepare train & validation sets from train data auto fold = platform::StratifiedKFold(5, y_, 271); @@ -41,8 +41,8 @@ namespace bayesnet { X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." 
}); y_train = y_; } - if (cfs != "") { - initializeModels(nSamples, nFeatures); + if (cfs) { + initializeModels(); } } void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) @@ -82,18 +82,18 @@ namespace bayesnet { EVP_MD_CTX_free(mdctx); stringstream oss; for (unsigned int i = 0; i < hash_len; i++) { - oss << hex << (int)hash[i]; + oss << hex << setfill('0') << setw(2) << (int)hash[i]; } return oss.str(); } - void BoostAODE::initializeModels(int nSamples, int nFeatures) + void BoostAODE::initializeModels() { // Read the CFS features string output = "[", prefix = ""; bool first = true; for (const auto& feature : features) { - output += prefix + feature; + output += prefix + "'" + feature + "'"; if (first) { prefix = ", "; first = false; @@ -103,21 +103,30 @@ namespace bayesnet { // std::size_t str_hash = std::hash{}(output); string str_hash = sha256(output); stringstream oss; - oss << "cfs/" << str_hash << ".json"; + oss << platform::Paths::cfs() << str_hash << ".json"; string name = oss.str(); ifstream file(name); + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); if (file.is_open()) { - nlohmann::json features = nlohmann::json::parse(file); + nlohmann::json cfsFeatures = nlohmann::json::parse(file); file.close(); - cout << "features: " << features.dump() << endl; + for (const string& feature : cfsFeatures) { + // cout << "Feature: [" << feature << "]" << endl; + auto pos = find(features.begin(), features.end(), feature); + if (pos == features.end()) + throw runtime_error("Feature " + feature + " not found in dataset"); + int numFeature = pos - features.begin(); + cout << "Feature: [" << feature << "] " << numFeature << endl; + models.push_back(std::make_unique(numFeature)); + models.back()->fit(dataset, features, className, states, weights_); + n_models++; + } } else { throw runtime_error("File " + name + " not found"); } } void BoostAODE::trainModel(const torch::Tensor& weights) { - models.clear(); - n_models = 0; if (maxModels == 0) maxModels = .1 * n > 10 ? 
.1 * n : n; Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 3464a7d..683cb99 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -15,13 +15,13 @@ namespace bayesnet { private: torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - void initializeModels(int nSamples, int nFeatures); + void initializeModels(); // Hyperparameters bool repeatSparent = false; // if true, a feature can be selected more than once int maxModels = 0; bool ascending = false; //Process KBest features ascending or descending order bool convergence = false; //if true, stop when the model does not improve - string cfs = ""; // if not empty, use CFS to select features + bool cfs = false; // if true use CFS to select features stored in cfs folder with sha256(features) file_name }; } #endif \ No newline at end of file diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h index a1eb00c..16d459c 100644 --- a/src/Platform/Paths.h +++ b/src/Platform/Paths.h @@ -7,6 +7,7 @@ namespace platform { public: static std::string results() { return "results/"; } static std::string excel() { return "excel/"; } + static std::string cfs() { return "cfs/"; } static std::string datasets() { auto env = platform::DotEnv(); -- 2.45.2 From 47e2b138c5a92d2fc317054987c1eacb34b005ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 11 Oct 2023 11:33:29 +0200 Subject: [PATCH 05/15] Complete first working cfs --- CMakeLists.txt | 3 ++- src/BayesNet/BoostAODE.cc | 38 +++++++++++++++++++------------------- src/BayesNet/BoostAODE.h | 3 ++- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 37c674d..1f837ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,8 @@ endif (ENABLE_CLANG_TIDY) add_git_submodule("lib/mdlp") add_git_submodule("lib/argparse") add_git_submodule("lib/json") -find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib) + +find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib ${HOME}/lib/usr/local/lib) # Subdirectories # -------------- diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index 0952a7a..a9120a0 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -41,9 +41,6 @@ namespace bayesnet { X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." 
}); y_train = y_; } - if (cfs) { - initializeModels(); - } } void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) { @@ -87,8 +84,9 @@ namespace bayesnet { return oss.str(); } - void BoostAODE::initializeModels() + unordered_set BoostAODE::initializeModels() { + unordered_set featuresUsed; // Read the CFS features string output = "[", prefix = ""; bool first = true; @@ -110,28 +108,30 @@ namespace bayesnet { if (file.is_open()) { nlohmann::json cfsFeatures = nlohmann::json::parse(file); file.close(); - for (const string& feature : cfsFeatures) { - // cout << "Feature: [" << feature << "]" << endl; - auto pos = find(features.begin(), features.end(), feature); - if (pos == features.end()) - throw runtime_error("Feature " + feature + " not found in dataset"); - int numFeature = pos - features.begin(); - cout << "Feature: [" << feature << "] " << numFeature << endl; - models.push_back(std::make_unique(numFeature)); - models.back()->fit(dataset, features, className, states, weights_); + for (const int& feature : cfsFeatures) { + // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; + featuresUsed.insert(feature); + unique_ptr model = std::make_unique(feature); + model->fit(dataset, features, className, states, weights_); + models.push_back(std::move(model)); + significanceModels.push_back(1.0); n_models++; } } else { throw runtime_error("File " + name + " not found"); } + return featuresUsed; } void BoostAODE::trainModel(const torch::Tensor& weights) { + unordered_set featuresUsed; + if (cfs) { + featuresUsed = initializeModels(); + } if (maxModels == 0) maxModels = .1 * n > 10 ? .1 * n : n; Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); bool exitCondition = false; - unordered_set featuresUsed; // Variables to control the accuracy finish condition double priorAccuracy = 0.0; double delta = 1.0; @@ -150,16 +150,16 @@ namespace bayesnet { unique_ptr model; auto feature = featureSelection[0]; if (!repeatSparent || featuresUsed.size() < featureSelection.size()) { - bool found = false; - for (auto feat : featureSelection) { + bool used = true; + for (const auto& feat : featureSelection) { if (find(featuresUsed.begin(), featuresUsed.end(), feat) != featuresUsed.end()) { continue; } - found = true; + used = false; feature = feat; break; } - if (!found) { + if (used) { exitCondition = true; continue; } @@ -199,7 +199,7 @@ namespace bayesnet { count++; } } - exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance; + exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance; } if (featuresUsed.size() != features.size()) { status = WARNING; diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 683cb99..fb87fce 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -1,6 +1,7 @@ #ifndef BOOSTAODE_H #define BOOSTAODE_H #include "Ensemble.h" +#include #include "SPODE.h" namespace bayesnet { class BoostAODE : public Ensemble { @@ -15,7 +16,7 @@ namespace bayesnet { private: torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - void initializeModels(); + unordered_set initializeModels(); // Hyperparameters bool repeatSparent = false; // if true, a feature can be selected more than once int maxModels = 0; -- 2.45.2 From 40d1dad5d827f3729e38c7b3fe448e1b511bb880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 11 Oct 2023 21:17:26 +0200 Subject: [PATCH 06/15] Begin CFS implementation --- 
src/BayesNet/BayesMetrics.cc | 9 +-- src/BayesNet/BayesMetrics.h | 14 ++-- src/BayesNet/CFS.cc | 127 +++++++++++++++++++++++++++++++++++ src/BayesNet/CFS.h | 31 +++++++++ src/BayesNet/CMakeLists.txt | 2 +- src/BayesNet/Node.h | 2 +- 6 files changed, 173 insertions(+), 12 deletions(-) create mode 100644 src/BayesNet/CFS.cc create mode 100644 src/BayesNet/CFS.h diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 623656e..86de9ea 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -60,11 +60,12 @@ namespace bayesnet { { return scoresKBest; } - vector> Metrics::doCombinations(const vector& source) + template + vector> Metrics::doCombinations(const vector& source) { - vector> result; + vector> result; for (int i = 0; i < source.size(); ++i) { - string temp = source[i]; + T temp = source[i]; for (int j = i + 1; j < source.size(); ++j) { result.push_back({ temp, source[j] }); } @@ -76,7 +77,7 @@ namespace bayesnet { auto result = vector(); auto source = vector(features); source.push_back(className); - auto combinations = doCombinations(source); + auto combinations = doCombinations(source); // Compute class prior auto margin = torch::zeros({ classNumStates }, torch::kFloat); for (int value = 0; value < classNumStates; ++value) { diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 01841a7..30606c0 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -8,20 +8,22 @@ namespace bayesnet { using namespace torch; class Metrics { private: - Tensor samples; // nxm tensor used to fit the model - vector features; - string className; int classNumStates = 0; vector scoresKBest; vector featuresKBest; // sorted indices of the features - double entropy(const Tensor& feature, const Tensor& weights); double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); - vector> doCombinations(const vector&); + protected: + Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector + string className; + double entropy(const Tensor& feature, const Tensor& weights); + vector features; + template + vector> doCombinations(const vector& source); public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); Metrics(const vector>& vsamples, const vector& labels, const vector& features, const string& className, const int classNumStates); - vector SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0); + vector SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0); vector getScoresKBest() const; double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector conditionalEdgeWeights(vector& weights); // To use in Python diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc new file mode 100644 index 0000000..b3473cd --- /dev/null +++ b/src/BayesNet/CFS.cc @@ -0,0 +1,127 @@ +#include "CFS.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + CFS::CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : + Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights) + + { + } + double CFS::symmetricalUncertainty(int a, int b) + { + /* + Compute symmetrical uncertainty. 
Normalize* information gain (mutual + information) with the entropies of the features in order to compensate + the bias due to high cardinality features. *Range [0, 1] + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + */ + auto x = samples.index({ a, "..." }); + auto y = samples.index({ b, "..." }); + return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights)); + } + void CFS::computeSuLabels() + { + // Compute Simmetrical Uncertainty between features and labels + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + for (int i = 0; i < features.size(); ++i) { + suLabels[i] = symmetricalUncertainty(i, -1); + } + + } + double CFS::computeSuFeatures(const int firstFeature, const int secondFeature) + { + // Compute Simmetrical Uncertainty between features + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + // TODO: Implement Cache in this function + return symmetricalUncertainty(firstFeature, secondFeature); + } + double CFS::computeMerit() + { + double result; + double rcf = 0; + for (auto feature : cfsFeatures) { + rcf += suLabels[feature]; + } + double rff = 0; + int n = cfsFeatures.size(); + for (const auto& item : doCombinations(cfsFeatures)) { + rff += computeSuFeatures(item.first, item.second); + } + return rcf / sqrt(n + (n * n - n) * rff); + } + void CFS::fit() + { + cfsFeatures.clear(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto continueCondition = true; + auto feature = featureOrder[0]; + cfsFeatures.push_back(feature); + cfsScores.push_back(suLabels[feature]); + while (continueCondition) { + double merit = numeric_limits::lowest(); + int bestFeature = -1; + for (auto feature : featureOrder) { + cfsFeatures.push_back(feature); + auto meritNew = computeMerit(); // Compute merit with cfsFeatures + if (meritNew > merit) { + merit = meritNew; + bestFeature = feature; + } + cfsFeatures.pop_back(); + } + cfsFeatures.push_back(bestFeature); + cfsScores.push_back(merit); + featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), feature), featureOrder.end()); + continueCondition = computeContinueCondition(featureOrder); + } + fitted = true; + } + bool CFS::computeContinueCondition(const vector& featureOrder) + { + if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { + return false; + } + if (cfsScores.size() >= 5) { + /* + "To prevent the best first search from exploring the entire + feature subset search space, a stopping criterion is imposed. + The search will terminate if five consecutive fully expanded + subsets show no improvement over the current best subset." 
+ as stated in Mark A.Hall Thesis + */ + double item_ant = numeric_limits::lowest(); + int num = 0; + vector lastFive(cfsScores.end() - 5, cfsScores.end()); + for (auto item : lastFive) { + if (item_ant == numeric_limits::lowest()) { + item_ant = item; + } + if (item > item_ant) { + break; + } else { + num++; + item_ant = item; + } + } + if (num == 5) { + return false; + } + } + return true; + } + vector CFS::getFeatures() const + { + if (!fitted) { + throw runtime_error("CFS not fitted"); + } + return cfsFeatures; + } + vector CFS::getScores() const + { + if (!fitted) { + throw runtime_error("CFS not fitted"); + } + return cfsScores; + } +} \ No newline at end of file diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h new file mode 100644 index 0000000..1cf621d --- /dev/null +++ b/src/BayesNet/CFS.h @@ -0,0 +1,31 @@ +#ifndef CFS_H +#define CFS_H +#include +#include +#include "BayesMetrics.h" +using namespace std; +namespace bayesnet { + class CFS : public Metrics { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); + virtual ~CFS() {}; + void fit(); + vector getFeatures() const; + vector getScores() const; + private: + void computeSuLabels(); + double computeSuFeatures(const int a, const int b); + double symmetricalUncertainty(int a, int b); + double computeMerit(); + bool computeContinueCondition(const vector& featureOrder); + vector> combinations(const vector& features); + const torch::Tensor& weights; + int maxFeatures; + vector cfsFeatures; + vector cfsScores; + vector suLabels; + bool fitted = false; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 6ca1238..e22827e 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto) \ No newline at end of file diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 6758c5c..4979007 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -14,8 +14,8 @@ namespace bayesnet { int numStates; // number of states of the variable torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ... 
vector dimensions; // dimensions of the cpTable - public: vector> combinations(const vector&); + public: explicit Node(const string&); void clear(); void addParent(Node*); -- 2.45.2 From 5022a4dc90345c27bf373d351b039f9a0d83285c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 12:29:25 +0200 Subject: [PATCH 07/15] Complete CFS tested with Python mufs --- .vscode/c_cpp_properties.json | 18 +++++++++++++ src/BayesNet/BayesMetrics.cc | 20 +++++--------- src/BayesNet/BayesMetrics.h | 12 ++++++++- src/BayesNet/BoostAODE.cc | 6 +++-- src/BayesNet/CFS.cc | 49 +++++++++++++++++++++++++++++++--- src/BayesNet/CFS.h | 1 + src/Platform/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 50 +++++++++++++++++++++++++---------- 8 files changed, 123 insertions(+), 35 deletions(-) create mode 100644 .vscode/c_cpp_properties.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..6faaf51 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Mac", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "macFrameworkPath": [ + "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" + ], + "cStandard": "c17", + "cppStandard": "c++17", + "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 86de9ea..e98f41a 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -60,24 +60,13 @@ namespace bayesnet { { return scoresKBest; } - template - vector> Metrics::doCombinations(const vector& source) - { - vector> result; - for (int i = 0; i < source.size(); ++i) { - T temp = source[i]; - for (int j = i + 1; j < source.size(); ++j) { - result.push_back({ temp, source[j] }); - } - } - return result; - } + torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights) { auto result = vector(); auto source = vector(features); source.push_back(className); - auto combinations = doCombinations(source); + auto combinations = doCombinations(source); // Compute class prior auto margin = torch::zeros({ classNumStates }, torch::kFloat); for (int value = 0; value < classNumStates; ++value) { @@ -123,6 +112,11 @@ namespace bayesnet { torch::Tensor counts = feature.bincount(weights); double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; + // cout << "Probs: "; + // for (int i = 0; i < probs.size(0); ++i) { + // cout << probs[i].item() << ", "; + // } + // cout << endl; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; return entropy.nansum().item(); diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 30606c0..341951e 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -18,7 +18,17 @@ namespace bayesnet { double entropy(const Tensor& feature, const Tensor& weights); vector features; template - vector> doCombinations(const vector& source); + vector> doCombinations(const vector& source) + { + vector> result; + for (int i = 0; i < source.size(); ++i) { + T temp = source[i]; + for (int j = i + 1; j < source.size(); ++j) { + result.push_back({ temp, source[j] }); + } + } + return result; + } public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& 
className, const int classNumStates); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index a9120a0..a95d6e2 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -2,11 +2,11 @@ #include #include #include "BoostAODE.h" -#include "BayesMetrics.h" #include "Colors.h" #include "Folding.h" #include "Paths.h" #include +#include "CFS.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -98,13 +98,15 @@ namespace bayesnet { } } output += "]"; + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + int maxFeatures = 0; + auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); // std::size_t str_hash = std::hash{}(output); string str_hash = sha256(output); stringstream oss; oss << platform::Paths::cfs() << str_hash << ".json"; string name = oss.str(); ifstream file(name); - Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); if (file.is_open()) { nlohmann::json cfsFeatures = nlohmann::json::parse(file); file.close(); diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index b3473cd..51e30dc 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -17,14 +17,22 @@ namespace bayesnet { */ auto x = samples.index({ a, "..." }); auto y = samples.index({ b, "..." }); - return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights)); + auto mu = mutualInformation(x, y, weights); + // cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl; + auto hx = entropy(x, weights); + // cout << "Entropy X: " << hx << endl; + auto hy = entropy(y, weights); + // cout << "Entropy Y: " << hy << endl; + return 2.0 * mu / (hx + hy); } void CFS::computeSuLabels() { // Compute Simmetrical Uncertainty between features and labels // https://en.wikipedia.org/wiki/Symmetric_uncertainty + // cout << "SuLabels" << endl; for (int i = 0; i < features.size(); ++i) { - suLabels[i] = symmetricalUncertainty(i, -1); + suLabels.push_back(symmetricalUncertainty(i, -1)); + // cout << i << " -> " << suLabels[i] << endl; } } @@ -44,7 +52,7 @@ namespace bayesnet { } double rff = 0; int n = cfsFeatures.size(); - for (const auto& item : doCombinations(cfsFeatures)) { + for (const auto& item : doCombinations(cfsFeatures)) { rff += computeSuFeatures(item.first, item.second); } return rcf / sqrt(n + (n * n - n) * rff); @@ -58,25 +66,58 @@ namespace bayesnet { auto feature = featureOrder[0]; cfsFeatures.push_back(feature); cfsScores.push_back(suLabels[feature]); + cfsFeatures.erase(cfsFeatures.begin()); while (continueCondition) { double merit = numeric_limits::lowest(); int bestFeature = -1; for (auto feature : featureOrder) { cfsFeatures.push_back(feature); auto meritNew = computeMerit(); // Compute merit with cfsFeatures + //cout << "MeritNew: " << meritNew << " Merit: " << merit << endl; if (meritNew > merit) { merit = meritNew; bestFeature = feature; } cfsFeatures.pop_back(); } + if (bestFeature == -1) { + throw runtime_error("Feature not found"); + } cfsFeatures.push_back(bestFeature); cfsScores.push_back(merit); - featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), feature), featureOrder.end()); + featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end()); continueCondition = computeContinueCondition(featureOrder); } fitted = true; } + void CFS::test() + { + cout << "H(y): " << entropy(samples.index({ -1, "..." }), weights) << endl; + cout << "y: "; + auto y = samples.index({ -1, "..." 
}); + for (int i = 0; i < y.size(0); ++i) { + cout << y[i].item() << ", "; + } + cout << endl; + computeSuLabels(); + // cout << "Probabilites of features: " << endl; + // for (const auto& featureName : features) { + // int featureIdx = find(features.begin(), features.end(), featureName) - features.begin(); + // cout << featureName << "(" << featureIdx << "): "; + // auto feature = samples.index({ featureIdx, "..." }); + // torch::Tensor counts = feature.bincount(weights); + // double totalWeight = counts.sum().item(); + // torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; + // for (int i = 0; i < probs.size(0); ++i) { + // cout << probs[i].item() << ", "; + // } + // cout << endl; + // // for (int i = 0; i < x.size(0); ++i) { + // // cout << x[i].item() << ", "; + // // } + // // cout << endl; + // } + } bool CFS::computeContinueCondition(const vector& featureOrder) { if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h index 1cf621d..556659a 100644 --- a/src/BayesNet/CFS.h +++ b/src/BayesNet/CFS.h @@ -11,6 +11,7 @@ namespace bayesnet { CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); virtual ~CFS() {}; void fit(); + void test(); vector getFeatures() const; vector getScores() const; private: diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 4111c34..75e846f 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -9,7 +9,7 @@ add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Mo add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc) add_executable(b_list list.cc Datasets.cc Dataset.cc) add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc) -add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc) +add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc ) target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 43ab29c..c6b733e 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -7,6 +7,7 @@ #include "Network.h" #include "ArffFiles.h" #include "CPPFImdlp.h" +#include "CFS.h" using namespace std; using namespace platform; @@ -191,22 +192,43 @@ int main() // } // cout << "***********************************************************************************************" << endl; // } - const string file_name = "iris"; - auto net = bayesnet::Network(); + // const string file_name = "iris"; + // auto net = bayesnet::Network(); + // auto dt = Datasets(true, "Arff"); + // auto raw = RawDatasets("iris", true); + // auto [X, y] = dt.getVectors(file_name); + // cout << "Dataset dims " << raw.dataset.sizes() << endl; + // cout << "weights dims " << raw.weights.sizes() << endl; + // cout << "States dims " << raw.statest.size() << endl; + // cout << "features: "; + // for (const auto& feature : raw.featurest) { + // cout << feature << ", "; + // net.addNode(feature); + // } + // net.addNode(raw.classNamet); + // cout << endl; + // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); - auto raw = 
RawDatasets("iris", true); - auto [X, y] = dt.getVectors(file_name); - cout << "Dataset dims " << raw.dataset.sizes() << endl; - cout << "weights dims " << raw.weights.sizes() << endl; - cout << "States dims " << raw.statest.size() << endl; - cout << "features: "; - for (const auto& feature : raw.featurest) { - cout << feature << ", "; - net.addNode(feature); + for (const auto& name : dt.getNames()) { + //for (const auto& name : { "iris" }) { + auto [X, y] = dt.getTensors(name); + auto features = dt.getFeatures(name); + auto states = dt.getStates(name); + auto className = dt.getClassName(name); + int maxFeatures = 0; + auto classNumStates = states.at(className).size(); + torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble); + auto dataset = X; + auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); + dataset = torch::cat({ dataset, yresized }, 0); + auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); + cfs.fit(); + cout << "Dataset: " << name << " CFS features: "; + for (const auto& feature : cfs.getFeatures()) { + cout << feature << ", "; + } + cout << "end." << endl; } - net.addNode(raw.classNamet); - cout << endl; - net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); } -- 2.45.2 From 54b8939f35afac7b9f3779daaf429b3adce84336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 13:46:22 +0200 Subject: [PATCH 08/15] Prepare BoostAODE first try --- CMakeLists.txt | 1 - src/BayesNet/BayesMetrics.cc | 5 ---- src/BayesNet/BoostAODE.cc | 52 +++++++----------------------------- src/BayesNet/CFS.cc | 47 +++++++------------------------- src/BayesNet/CFS.h | 1 + src/BayesNet/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 4 +-- 7 files changed, 24 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f837ac..88d769f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # CMakes modules # -------------- set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) -find_package(OpenSSL REQUIRED) include(AddGitSubmodule) if (CODE_COVERAGE) enable_testing() diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index e98f41a..6bd3bbb 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -112,11 +112,6 @@ namespace bayesnet { torch::Tensor counts = feature.bincount(weights); double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; - // cout << "Probs: "; - // for (int i = 0; i < probs.size(0); ++i) { - // cout << probs[i].item() << ", "; - // } - // cout << endl; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; return entropy.nansum().item(); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index a95d6e2..cee8a51 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -5,7 +5,6 @@ #include "Colors.h" #include "Folding.h" #include "Paths.h" -#include #include "CFS.h" namespace bayesnet { @@ -63,27 +62,6 @@ namespace bayesnet { cfs = hyperparameters["cfs"]; } } - string sha256(const string& input) - { - EVP_MD_CTX* mdctx; - const EVP_MD* md; - unsigned char hash[EVP_MAX_MD_SIZE]; - unsigned int hash_len; - - OpenSSL_add_all_digests(); - md = EVP_get_digestbyname("sha256"); - mdctx = EVP_MD_CTX_new(); - EVP_DigestInit_ex(mdctx, md, nullptr); - EVP_DigestUpdate(mdctx, input.c_str(), 
input.size()); - EVP_DigestFinal_ex(mdctx, hash, &hash_len); - EVP_MD_CTX_free(mdctx); - stringstream oss; - for (unsigned int i = 0; i < hash_len; i++) { - oss << hex << setfill('0') << setw(2) << (int)hash[i]; - } - return oss.str(); - } - unordered_set BoostAODE::initializeModels() { unordered_set featuresUsed; @@ -101,26 +79,16 @@ namespace bayesnet { Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); int maxFeatures = 0; auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); - // std::size_t str_hash = std::hash{}(output); - string str_hash = sha256(output); - stringstream oss; - oss << platform::Paths::cfs() << str_hash << ".json"; - string name = oss.str(); - ifstream file(name); - if (file.is_open()) { - nlohmann::json cfsFeatures = nlohmann::json::parse(file); - file.close(); - for (const int& feature : cfsFeatures) { - // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; - featuresUsed.insert(feature); - unique_ptr model = std::make_unique(feature); - model->fit(dataset, features, className, states, weights_); - models.push_back(std::move(model)); - significanceModels.push_back(1.0); - n_models++; - } - } else { - throw runtime_error("File " + name + " not found"); + cfs.fit(); + auto cfsFeatures = cfs.getFeatures(); + for (const int& feature : cfsFeatures) { + // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; + featuresUsed.insert(feature); + unique_ptr model = std::make_unique(feature); + model->fit(dataset, features, className, states, weights_); + models.push_back(std::move(model)); + significanceModels.push_back(1.0); + n_models++; } return featuresUsed; } diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index 51e30dc..6b64220 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -18,21 +18,16 @@ namespace bayesnet { auto x = samples.index({ a, "..." }); auto y = samples.index({ b, "..." 
}); auto mu = mutualInformation(x, y, weights); - // cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl; auto hx = entropy(x, weights); - // cout << "Entropy X: " << hx << endl; auto hy = entropy(y, weights); - // cout << "Entropy Y: " << hy << endl; return 2.0 * mu / (hx + hy); } void CFS::computeSuLabels() { // Compute Simmetrical Uncertainty between features and labels // https://en.wikipedia.org/wiki/Symmetric_uncertainty - // cout << "SuLabels" << endl; for (int i = 0; i < features.size(); ++i) { suLabels.push_back(symmetricalUncertainty(i, -1)); - // cout << i << " -> " << suLabels[i] << endl; } } @@ -40,8 +35,14 @@ namespace bayesnet { { // Compute Simmetrical Uncertainty between features // https://en.wikipedia.org/wiki/Symmetric_uncertainty - // TODO: Implement Cache in this function - return symmetricalUncertainty(firstFeature, secondFeature); + try { + return suFeatures.at({ firstFeature, secondFeature }); + } + catch (const out_of_range& e) { + auto result = symmetricalUncertainty(firstFeature, secondFeature); + suFeatures[{firstFeature, secondFeature}] = result; + return result; + } } double CFS::computeMerit() { @@ -73,7 +74,6 @@ namespace bayesnet { for (auto feature : featureOrder) { cfsFeatures.push_back(feature); auto meritNew = computeMerit(); // Compute merit with cfsFeatures - //cout << "MeritNew: " << meritNew << " Merit: " << merit << endl; if (meritNew > merit) { merit = meritNew; bestFeature = feature; @@ -81,7 +81,8 @@ namespace bayesnet { cfsFeatures.pop_back(); } if (bestFeature == -1) { - throw runtime_error("Feature not found"); + // meritNew has to be nan due to constant features + break; } cfsFeatures.push_back(bestFeature); cfsScores.push_back(merit); @@ -90,34 +91,6 @@ namespace bayesnet { } fitted = true; } - void CFS::test() - { - cout << "H(y): " << entropy(samples.index({ -1, "..." }), weights) << endl; - cout << "y: "; - auto y = samples.index({ -1, "..." }); - for (int i = 0; i < y.size(0); ++i) { - cout << y[i].item() << ", "; - } - cout << endl; - computeSuLabels(); - // cout << "Probabilites of features: " << endl; - // for (const auto& featureName : features) { - // int featureIdx = find(features.begin(), features.end(), featureName) - features.begin(); - // cout << featureName << "(" << featureIdx << "): "; - // auto feature = samples.index({ featureIdx, "..." 
}); - // torch::Tensor counts = feature.bincount(weights); - // double totalWeight = counts.sum().item(); - // torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; - // for (int i = 0; i < probs.size(0); ++i) { - // cout << probs[i].item() << ", "; - // } - // cout << endl; - // // for (int i = 0; i < x.size(0); ++i) { - // // cout << x[i].item() << ", "; - // // } - // // cout << endl; - // } - } bool CFS::computeContinueCondition(const vector& featureOrder) { if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h index 556659a..eff5da6 100644 --- a/src/BayesNet/CFS.h +++ b/src/BayesNet/CFS.h @@ -26,6 +26,7 @@ namespace bayesnet { vector cfsFeatures; vector cfsScores; vector suLabels; + map, double> suFeatures; bool fitted = false; }; } diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index e22827e..27a2d3a 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -6,4 +6,4 @@ include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto) \ No newline at end of file +target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index c6b733e..1ab1d83 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -210,7 +210,7 @@ int main() // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); for (const auto& name : dt.getNames()) { - //for (const auto& name : { "iris" }) { + // for (const auto& name : { "iris" }) { auto [X, y] = dt.getTensors(name); auto features = dt.getFeatures(name); auto states = dt.getStates(name); @@ -222,8 +222,8 @@ int main() auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); dataset = torch::cat({ dataset, yresized }, 0); auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); + cout << "Dataset: " << name << " CFS features: " << flush; cfs.fit(); - cout << "Dataset: " << name << " CFS features: "; for (const auto& feature : cfs.getFeatures()) { cout << feature << ", "; } -- 2.45.2 From 977ff6fddbd02dc669bc697bd5829ab31d7cde90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 14:01:52 +0200 Subject: [PATCH 09/15] Update CMakeLists for Linux --- src/Platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 75e846f..3e52c59 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -11,7 +11,7 @@ add_executable(b_list list.cc Datasets.cc Dataset.cc) add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc) add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc ) target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") -if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") +if ( CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) target_link_libraries(b_best Boost::boost libxlsxwriter.so stdc++fs) else() -- 2.45.2 From 
d00b08cbe8b91ac1880608a1cf8c4747473785fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 14:26:47 +0200 Subject: [PATCH 10/15] Fix Header for Linux --- src/Platform/DotEnv.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h index 87ec50e..7d5ee2b 100644 --- a/src/Platform/DotEnv.h +++ b/src/Platform/DotEnv.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include "Utils.h" -- 2.45.2 From 6d5a25cdc8c9d44b1b13c7c155f15cf029f9a55c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 14 Oct 2023 11:27:46 +0200 Subject: [PATCH 11/15] Refactor CFS class creating abstract base class --- src/BayesNet/CFS.cc | 94 ++++++----------------------------- src/BayesNet/CFS.h | 26 +++------- src/BayesNet/CMakeLists.txt | 2 +- src/BayesNet/FeatureSelect.cc | 74 +++++++++++++++++++++++++++ src/BayesNet/FeatureSelect.h | 31 ++++++++++++ 5 files changed, 127 insertions(+), 100 deletions(-) create mode 100644 src/BayesNet/FeatureSelect.cc create mode 100644 src/BayesNet/FeatureSelect.h diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index 6b64220..50c0ea8 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -2,90 +2,38 @@ #include #include "bayesnetUtils.h" namespace bayesnet { - CFS::CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : - Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights) - { - } - double CFS::symmetricalUncertainty(int a, int b) - { - /* - Compute symmetrical uncertainty. Normalize* information gain (mutual - information) with the entropies of the features in order to compensate - the bias due to high cardinality features. *Range [0, 1] - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - */ - auto x = samples.index({ a, "..." }); - auto y = samples.index({ b, "..." 
}); - auto mu = mutualInformation(x, y, weights); - auto hx = entropy(x, weights); - auto hy = entropy(y, weights); - return 2.0 * mu / (hx + hy); - } - void CFS::computeSuLabels() - { - // Compute Simmetrical Uncertainty between features and labels - // https://en.wikipedia.org/wiki/Symmetric_uncertainty - for (int i = 0; i < features.size(); ++i) { - suLabels.push_back(symmetricalUncertainty(i, -1)); - } - } - double CFS::computeSuFeatures(const int firstFeature, const int secondFeature) - { - // Compute Simmetrical Uncertainty between features - // https://en.wikipedia.org/wiki/Symmetric_uncertainty - try { - return suFeatures.at({ firstFeature, secondFeature }); - } - catch (const out_of_range& e) { - auto result = symmetricalUncertainty(firstFeature, secondFeature); - suFeatures[{firstFeature, secondFeature}] = result; - return result; - } - } - double CFS::computeMerit() - { - double result; - double rcf = 0; - for (auto feature : cfsFeatures) { - rcf += suLabels[feature]; - } - double rff = 0; - int n = cfsFeatures.size(); - for (const auto& item : doCombinations(cfsFeatures)) { - rff += computeSuFeatures(item.first, item.second); - } - return rcf / sqrt(n + (n * n - n) * rff); - } + + void CFS::fit() { - cfsFeatures.clear(); + selectedFeatures.clear(); computeSuLabels(); auto featureOrder = argsort(suLabels); // sort descending order auto continueCondition = true; auto feature = featureOrder[0]; - cfsFeatures.push_back(feature); - cfsScores.push_back(suLabels[feature]); - cfsFeatures.erase(cfsFeatures.begin()); + selectedFeatures.push_back(feature); + selectedScores.push_back(suLabels[feature]); + selectedFeatures.erase(selectedFeatures.begin()); while (continueCondition) { double merit = numeric_limits::lowest(); int bestFeature = -1; for (auto feature : featureOrder) { - cfsFeatures.push_back(feature); - auto meritNew = computeMerit(); // Compute merit with cfsFeatures + selectedFeatures.push_back(feature); + auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures if (meritNew > merit) { merit = meritNew; bestFeature = feature; } - cfsFeatures.pop_back(); + selectedFeatures.pop_back(); } if (bestFeature == -1) { // meritNew has to be nan due to constant features break; } - cfsFeatures.push_back(bestFeature); - cfsScores.push_back(merit); + selectedFeatures.push_back(bestFeature); + selectedScores.push_back(merit); featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end()); continueCondition = computeContinueCondition(featureOrder); } @@ -93,10 +41,10 @@ namespace bayesnet { } bool CFS::computeContinueCondition(const vector& featureOrder) { - if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { + if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) { return false; } - if (cfsScores.size() >= 5) { + if (selectedScores.size() >= 5) { /* "To prevent the best first search from exploring the entire feature subset search space, a stopping criterion is imposed. 
@@ -106,7 +54,7 @@ namespace bayesnet { */ double item_ant = numeric_limits::lowest(); int num = 0; - vector lastFive(cfsScores.end() - 5, cfsScores.end()); + vector lastFive(selectedScores.end() - 5, selectedScores.end()); for (auto item : lastFive) { if (item_ant == numeric_limits::lowest()) { item_ant = item; @@ -124,18 +72,4 @@ namespace bayesnet { } return true; } - vector CFS::getFeatures() const - { - if (!fitted) { - throw runtime_error("CFS not fitted"); - } - return cfsFeatures; - } - vector CFS::getScores() const - { - if (!fitted) { - throw runtime_error("CFS not fitted"); - } - return cfsScores; - } } \ No newline at end of file diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h index eff5da6..36b7c52 100644 --- a/src/BayesNet/CFS.h +++ b/src/BayesNet/CFS.h @@ -2,32 +2,20 @@ #define CFS_H #include #include -#include "BayesMetrics.h" +#include "FeatureSelect.h" using namespace std; namespace bayesnet { - class CFS : public Metrics { + class CFS : public FeatureSelect { public: // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector - CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); + CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights) + { + } virtual ~CFS() {}; - void fit(); - void test(); - vector getFeatures() const; - vector getScores() const; + void fit() override; private: - void computeSuLabels(); - double computeSuFeatures(const int a, const int b); - double symmetricalUncertainty(int a, int b); - double computeMerit(); bool computeContinueCondition(const vector& featureOrder); - vector> combinations(const vector& features); - const torch::Tensor& weights; - int maxFeatures; - vector cfsFeatures; - vector cfsScores; - vector suLabels; - map, double> suFeatures; - bool fitted = false; }; } #endif \ No newline at end of file diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 27a2d3a..c9543ea 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.cc b/src/BayesNet/FeatureSelect.cc new file mode 100644 index 0000000..4eb45fe --- /dev/null +++ b/src/BayesNet/FeatureSelect.cc @@ -0,0 +1,74 @@ +#include "FeatureSelect.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : + Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? 
samples.size(0) - 1 : maxFeatures), weights(weights) + + { + } + double FeatureSelect::symmetricalUncertainty(int a, int b) + { + /* + Compute symmetrical uncertainty. Normalize* information gain (mutual + information) with the entropies of the features in order to compensate + the bias due to high cardinality features. *Range [0, 1] + (https://www.sciencedirect.com/science/article/pii/S0020025519303603) + */ + auto x = samples.index({ a, "..." }); + auto y = samples.index({ b, "..." }); + auto mu = mutualInformation(x, y, weights); + auto hx = entropy(x, weights); + auto hy = entropy(y, weights); + return 2.0 * mu / (hx + hy); + } + void FeatureSelect::computeSuLabels() + { + // Compute Simmetrical Uncertainty between features and labels + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + for (int i = 0; i < features.size(); ++i) { + suLabels.push_back(symmetricalUncertainty(i, -1)); + } + } + double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature) + { + // Compute Simmetrical Uncertainty between features + // https://en.wikipedia.org/wiki/Symmetric_uncertainty + try { + return suFeatures.at({ firstFeature, secondFeature }); + } + catch (const out_of_range& e) { + double result = symmetricalUncertainty(firstFeature, secondFeature); + suFeatures[{firstFeature, secondFeature}] = result; + return result; + } + } + double FeatureSelect::computeMeritCFS() + { + double result; + double rcf = 0; + for (auto feature : selectedFeatures) { + rcf += suLabels[feature]; + } + double rff = 0; + int n = selectedFeatures.size(); + for (const auto& item : doCombinations(selectedFeatures)) { + rff += computeSuFeatures(item.first, item.second); + } + return rcf / sqrt(n + (n * n - n) * rff); + } + vector FeatureSelect::getFeatures() const + { + if (!fitted) { + throw runtime_error("FeatureSelect not fitted"); + } + return selectedFeatures; + } + vector FeatureSelect::getScores() const + { + if (!fitted) { + throw runtime_error("FeatureSelect not fitted"); + } + return selectedScores; + } +} \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h new file mode 100644 index 0000000..c1e280c --- /dev/null +++ b/src/BayesNet/FeatureSelect.h @@ -0,0 +1,31 @@ +#ifndef FEATURE_SELECT_H +#define FEATURE_SELECT_H +#include +#include +#include "BayesMetrics.h" +using namespace std; +namespace bayesnet { + class FeatureSelect : public Metrics { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + FeatureSelect(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); + virtual ~FeatureSelect() {}; + virtual void fit() = 0; + vector getFeatures() const; + vector getScores() const; + protected: + void computeSuLabels(); + double computeSuFeatures(const int a, const int b); + double symmetricalUncertainty(int a, int b); + double computeMeritCFS(); + vector> combinations(const vector& features); + const torch::Tensor& weights; + int maxFeatures; + vector selectedFeatures; + vector selectedScores; + vector suLabels; + map, double> suFeatures; + bool fitted = false; + }; +} +#endif \ No newline at end of file -- 2.45.2 From 6ef49385ea3eb7389979ff97e8587eb45581b7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 14 Oct 2023 11:30:32 +0200 Subject: [PATCH 12/15] Remove unneeded method declaration FeatureSelect --- src/BayesNet/FeatureSelect.h | 1 - 1 file changed, 1 
deletion(-) diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h index c1e280c..c342468 100644 --- a/src/BayesNet/FeatureSelect.h +++ b/src/BayesNet/FeatureSelect.h @@ -18,7 +18,6 @@ namespace bayesnet { double computeSuFeatures(const int a, const int b); double symmetricalUncertainty(int a, int b); double computeMeritCFS(); - vector> combinations(const vector& features); const torch::Tensor& weights; int maxFeatures; vector selectedFeatures; -- 2.45.2 From b35532dd9e15862fd2fcab1c50894ce379fdc968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 14 Oct 2023 13:12:04 +0200 Subject: [PATCH 13/15] Implement IWSS and FCBF too for BoostAODE --- src/BayesNet/BayesMetrics.h | 7 +++++ src/BayesNet/BoostAODE.cc | 49 ++++++++++++++++++++++------------- src/BayesNet/BoostAODE.h | 6 ++++- src/BayesNet/CFS.cc | 9 +++---- src/BayesNet/CMakeLists.txt | 2 +- src/BayesNet/FCBF.cc | 44 +++++++++++++++++++++++++++++++ src/BayesNet/FCBF.h | 18 +++++++++++++ src/BayesNet/FeatureSelect.cc | 5 ++++ src/BayesNet/FeatureSelect.h | 1 + src/BayesNet/IWSS.cc | 47 +++++++++++++++++++++++++++++++++ src/BayesNet/IWSS.h | 18 +++++++++++++ 11 files changed, 180 insertions(+), 26 deletions(-) create mode 100644 src/BayesNet/FCBF.cc create mode 100644 src/BayesNet/FCBF.h create mode 100644 src/BayesNet/IWSS.cc create mode 100644 src/BayesNet/IWSS.h diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 341951e..66016a6 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -29,6 +29,13 @@ namespace bayesnet { } return result; } + template + T pop_first(vector& v) + { + T temp = v[0]; + v.erase(v.begin()); + return temp; + } public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index cee8a51..fb38a7c 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -6,6 +6,8 @@ #include "Folding.h" #include "Paths.h" #include "CFS.h" +#include "FCBF.h" +#include "IWSS.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -44,7 +46,7 @@ namespace bayesnet { void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) { // Check if hyperparameters are valid - const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "cfs" }; + const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" }; checkHyperparameters(validKeys, hyperparameters); if (hyperparameters.contains("repeatSparent")) { repeatSparent = hyperparameters["repeatSparent"]; @@ -58,29 +60,39 @@ namespace bayesnet { if (hyperparameters.contains("convergence")) { convergence = hyperparameters["convergence"]; } - if (hyperparameters.contains("cfs")) { - cfs = hyperparameters["cfs"]; + if (hyperparameters.contains("threshold")) { + threshold = hyperparameters["threshold"]; + } + if (hyperparameters.contains("select_features")) { + auto selectedAlgorithm = hyperparameters["select_features"]; + vector algos = { "IWSS", "FCBF", "CFS" }; + selectFeatures = true; + algorithm = selectedAlgorithm; + if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) { + throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]"); + } } } unordered_set BoostAODE::initializeModels() { unordered_set featuresUsed; - // Read the CFS features - string output = "[", prefix = ""; - bool first = true; - for (const 
auto& feature : features) { - output += prefix + "'" + feature + "'"; - if (first) { - prefix = ", "; - first = false; - } - } - output += "]"; Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); int maxFeatures = 0; - auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); - cfs.fit(); - auto cfsFeatures = cfs.getFeatures(); + if (algorithm == "CFS") { + featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); + } else if (algorithm == "IWSS") { + if (threshold < 0 || threshold >0.5) { + throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]"); + } + featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } else if (algorithm == "FCBF") { + if (threshold < 1e-7 || threshold > 1) { + throw invalid_argument("Invalid threshold value [1e-7, 1]"); + } + featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } + featureSelector->fit(); + auto cfsFeatures = featureSelector->getFeatures(); for (const int& feature : cfsFeatures) { // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; featuresUsed.insert(feature); @@ -90,12 +102,13 @@ namespace bayesnet { significanceModels.push_back(1.0); n_models++; } + delete featureSelector; return featuresUsed; } void BoostAODE::trainModel(const torch::Tensor& weights) { unordered_set featuresUsed; - if (cfs) { + if (selectFeatures) { featuresUsed = initializeModels(); } if (maxModels == 0) diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index fb87fce..dd1cf75 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -3,6 +3,7 @@ #include "Ensemble.h" #include #include "SPODE.h" +#include "FeatureSelect.h" namespace bayesnet { class BoostAODE : public Ensemble { public: @@ -22,7 +23,10 @@ namespace bayesnet { int maxModels = 0; bool ascending = false; //Process KBest features ascending or descending order bool convergence = false; //if true, stop when the model does not improve - bool cfs = false; // if true use CFS to select features stored in cfs folder with sha256(features) file_name + bool selectFeatures = false; // if true, use feature selection + string algorithm = ""; // Selected feature selection algorithm + FeatureSelect* featureSelector = nullptr; + double threshold = -1; }; } #endif \ No newline at end of file diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index 50c0ea8..f2ffc1e 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -2,13 +2,9 @@ #include #include "bayesnetUtils.h" namespace bayesnet { - - - - void CFS::fit() { - selectedFeatures.clear(); + initialize(); computeSuLabels(); auto featureOrder = argsort(suLabels); // sort descending order auto continueCondition = true; @@ -21,7 +17,8 @@ namespace bayesnet { int bestFeature = -1; for (auto feature : featureOrder) { selectedFeatures.push_back(feature); - auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures + // Compute merit with selectedFeatures + auto meritNew = computeMeritCFS(); if (meritNew > merit) { merit = meritNew; bestFeature = feature; diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index c9543ea..cc0f5a5 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) 
include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/FCBF.cc b/src/BayesNet/FCBF.cc new file mode 100644 index 0000000..db935af --- /dev/null +++ b/src/BayesNet/FCBF.cc @@ -0,0 +1,44 @@ +#include "bayesnetUtils.h" +#include "FCBF.h" +namespace bayesnet { + + FCBF::FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 1e-7) { + throw std::invalid_argument("Threshold cannot be less than 1e-7"); + } + } + void FCBF::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto featureOrderCopy = featureOrder; + for (const auto& feature : featureOrder) { + // Don't self compare + featureOrderCopy.erase(featureOrderCopy.begin()); + if (suLabels.at(feature) == 0.0) { + // The feature has been removed from the list + continue; + } + if (suLabels.at(feature) < threshold) { + break; + } + // Remove redundant features + for (const auto& featureCopy : featureOrderCopy) { + double value = computeSuFeatures(feature, featureCopy); + if (value >= suLabels.at(featureCopy)) { + // Remove feature from list + suLabels[featureCopy] = 0.0; + } + } + selectedFeatures.push_back(feature); + selectedScores.push_back(suLabels[feature]); + if (selectedFeatures.size() == maxFeatures) { + break; + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/FCBF.h b/src/BayesNet/FCBF.h new file mode 100644 index 0000000..aa7ff47 --- /dev/null +++ b/src/BayesNet/FCBF.h @@ -0,0 +1,18 @@ +#ifndef FCBF_H +#define FCBF_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class FCBF : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~FCBF() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.cc b/src/BayesNet/FeatureSelect.cc index 4eb45fe..11d929b 100644 --- a/src/BayesNet/FeatureSelect.cc +++ b/src/BayesNet/FeatureSelect.cc @@ -7,6 +7,11 @@ namespace bayesnet { { } + void FeatureSelect::initialize() + { + selectedFeatures.clear(); + selectedScores.clear(); + } double FeatureSelect::symmetricalUncertainty(int a, int b) { /* diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h index c342468..46923c9 100644 --- a/src/BayesNet/FeatureSelect.h +++ b/src/BayesNet/FeatureSelect.h @@ -14,6 +14,7 @@ namespace bayesnet { vector getFeatures() const; vector getScores() const; protected: + void initialize(); void computeSuLabels(); double computeSuFeatures(const int a, const int b); double 
symmetricalUncertainty(int a, int b); diff --git a/src/BayesNet/IWSS.cc b/src/BayesNet/IWSS.cc new file mode 100644 index 0000000..f39f137 --- /dev/null +++ b/src/BayesNet/IWSS.cc @@ -0,0 +1,47 @@ +#include "IWSS.h" +#include +#include "bayesnetUtils.h" +namespace bayesnet { + IWSS::IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 0 || threshold > .5) { + throw std::invalid_argument("Threshold has to be in [0, 0.5]"); + } + } + void IWSS::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto featureOrderCopy = featureOrder; + // Add first and second features to result + // First with its own score + auto first_feature = pop_first(featureOrderCopy); + selectedFeatures.push_back(first_feature); + selectedScores.push_back(suLabels.at(first_feature)); + // Second with the score of the candidates + selectedFeatures.push_back(pop_first(featureOrderCopy)); + auto merit = computeMeritCFS(); + selectedScores.push_back(merit); + for (const auto feature : featureOrderCopy) { + selectedFeatures.push_back(feature); + // Compute merit with selectedFeatures + auto meritNew = computeMeritCFS(); + double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0; + if (meritNew > merit || delta < threshold) { + if (meritNew > merit) { + merit = meritNew; + } + selectedScores.push_back(meritNew); + } else { + selectedFeatures.pop_back(); + break; + } + if (selectedFeatures.size() == maxFeatures) { + break; + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/IWSS.h b/src/BayesNet/IWSS.h new file mode 100644 index 0000000..88a1034 --- /dev/null +++ b/src/BayesNet/IWSS.h @@ -0,0 +1,18 @@ +#ifndef IWSS_H +#define IWSS_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class IWSS : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~IWSS() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file -- 2.45.2 From 660e78351761b7398c72d1848b384df5489638d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 14 Oct 2023 13:32:09 +0200 Subject: [PATCH 14/15] Update validation for feature selection --- src/Platform/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 3e52c59..26584e7 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -19,4 +19,4 @@ else() target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") endif() target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}") -target_link_libraries(testx ArffFiles mdlp BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 1ab1d83..dfd6a21 100644 --- a/src/Platform/testx.cpp +++ 
b/src/Platform/testx.cpp @@ -1,5 +1,6 @@ #include "Folding.h" #include +#include "nlohmann/json.hpp" #include "map" #include #include @@ -8,6 +9,8 @@ #include "ArffFiles.h" #include "CPPFImdlp.h" #include "CFS.h" +#include "IWSS.h" +#include "FCBF.h" using namespace std; using namespace platform; @@ -209,6 +212,7 @@ int main() // cout << endl; // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); + nlohmann::json output; for (const auto& name : dt.getNames()) { // for (const auto& name : { "iris" }) { auto [X, y] = dt.getTensors(name); @@ -222,13 +226,23 @@ int main() auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); dataset = torch::cat({ dataset, yresized }, 0); auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); - cout << "Dataset: " << name << " CFS features: " << flush; + auto fcbf = bayesnet::FCBF(dataset, features, className, maxFeatures, classNumStates, weights, 1e-7); + auto iwss = bayesnet::IWSS(dataset, features, className, maxFeatures, classNumStates, weights, 0.5); + cout << "Dataset: " << setw(20) << name << flush; cfs.fit(); - for (const auto& feature : cfs.getFeatures()) { - cout << feature << ", "; - } - cout << "end." << endl; + cout << " CFS: " << setw(4) << cfs.getFeatures().size() << flush; + fcbf.fit(); + cout << " FCBF: " << setw(4) << fcbf.getFeatures().size() << flush; + iwss.fit(); + cout << " IWSS: " << setw(4) << iwss.getFeatures().size() << flush; + cout << endl; + output[name]["CFS"] = cfs.getFeatures(); + output[name]["FCBF"] = fcbf.getFeatures(); + output[name]["IWSS"] = iwss.getFeatures(); } + ofstream file("features_cpp.json"); + file << output; + file.close(); } -- 2.45.2 From fa7fe081ad405d917a121154328ce8744f2fa568 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sun, 15 Oct 2023 11:19:58 +0200 Subject: [PATCH 15/15] Fix xlsx library finding --- CMakeLists.txt | 5 ++++- README.md | 4 +--- src/Platform/CMakeLists.txt | 9 ++------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88d769f..0a4515f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,10 @@ add_git_submodule("lib/mdlp") add_git_submodule("lib/argparse") add_git_submodule("lib/json") -find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib ${HOME}/lib/usr/local/lib) + +find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib) +message("XLSXWRITER_LIB=${XLSXWRITER_LIB}") + # Subdirectories # -------------- diff --git a/README.md b/README.md index 426be8d..ad2660c 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,9 @@ export BOOST_ROOT=/path/to/library/ ```bash cd lib/libxlsxwriter make -sudo make install +make install DESTDIR=/home/rmontanana/Code PREFIX= ``` -It has to be installed in /usr/local/lib otherwise CMakeLists.txt has to be modified accordingly - Environment variable has to be set: ```bash diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 26584e7..3a565e1 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -11,12 +11,7 @@ add_executable(b_list list.cc Datasets.cc Dataset.cc) add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc) add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc ) target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") -if ( 
CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux") - target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) - target_link_libraries(b_best Boost::boost libxlsxwriter.so stdc++fs) -else() - target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) - target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") -endif() +target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp) +target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}") target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file -- 2.45.2
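
Editor's illustrative sketch (not part of the patch series). Taken together, the patches above replace the original file-based CFS lookup with an in-process FeatureSelect hierarchy (CFS, FCBF, IWSS) that BoostAODE can drive through the "select_features" and "threshold" hyperparameters. The sketch below shows how those pieces are expected to fit together; the helper name featureSelectionSketch is hypothetical, the include paths are assumed from the repository layout (src/BayesNet), and the preparation of dataset, features, className, classNumStates and weights is assumed to follow src/Platform/testx.cpp (dataset is an (n+1) x m integer tensor whose last row is the class vector y). The commented-out fit call is a placeholder, since the classifier's fit signature is not shown in these patches.

#include <torch/torch.h>
#include <iostream>
#include "nlohmann/json.hpp"
#include "CFS.h"
#include "FCBF.h"
#include "IWSS.h"
#include "BoostAODE.h"

// Hypothetical helper: runs the three selectors stand-alone, then shows the
// hyperparameter route used inside BoostAODE.
void featureSelectionSketch(const torch::Tensor& dataset,
                            const std::vector<std::string>& features,
                            const std::string& className,
                            int classNumStates,
                            const torch::Tensor& weights)
{
    int maxFeatures = 0; // 0 means "no limit": up to all n features may be selected

    // Correlation-based Feature Selection: no threshold parameter.
    auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights);
    cfs.fit();
    std::cout << "CFS selected " << cfs.getFeatures().size() << " features" << std::endl;

    // FCBF: symmetrical-uncertainty threshold, validated to be in [1e-7, 1].
    auto fcbf = bayesnet::FCBF(dataset, features, className, maxFeatures, classNumStates, weights, 1e-7);
    fcbf.fit();
    std::cout << "FCBF selected " << fcbf.getFeatures().size() << " features" << std::endl;

    // IWSS: relative merit-improvement threshold, validated to be in [0, 0.5].
    auto iwss = bayesnet::IWSS(dataset, features, className, maxFeatures, classNumStates, weights, 0.5);
    iwss.fit();
    std::cout << "IWSS selected " << iwss.getFeatures().size() << " features" << std::endl;

    // Inside BoostAODE the same machinery is driven by hyperparameters:
    nlohmann::json hyperparameters = {
        { "select_features", "IWSS" }, // one of "CFS", "FCBF", "IWSS"
        { "threshold", 0.1 },          // not used by CFS; valid range depends on the algorithm
        { "convergence", true }
    };
    auto clf = bayesnet::BoostAODE();
    clf.setHyperparameters(hyperparameters);
    // clf.fit(...) would then call initializeModels(), which fits one SPODE per
    // selected feature (significance 1.0) before the boosting loop starts.
}

In all three selectors getFeatures() returns the selected feature indices in order of selection and getScores() the score recorded when each feature was added; both throw if fit() has not been called, as defined in FeatureSelect.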