From 54b8939f35afac7b9f3779daaf429b3adce84336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 13:46:22 +0200 Subject: [PATCH] Prepare BoostAODE first try --- CMakeLists.txt | 1 - src/BayesNet/BayesMetrics.cc | 5 ---- src/BayesNet/BoostAODE.cc | 52 +++++++----------------------------- src/BayesNet/CFS.cc | 47 +++++++------------------------- src/BayesNet/CFS.h | 1 + src/BayesNet/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 4 +-- 7 files changed, 24 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f837ac..88d769f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # CMakes modules # -------------- set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) -find_package(OpenSSL REQUIRED) include(AddGitSubmodule) if (CODE_COVERAGE) enable_testing() diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index e98f41a..6bd3bbb 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -112,11 +112,6 @@ namespace bayesnet { torch::Tensor counts = feature.bincount(weights); double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; - // cout << "Probs: "; - // for (int i = 0; i < probs.size(0); ++i) { - // cout << probs[i].item() << ", "; - // } - // cout << endl; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; return entropy.nansum().item(); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index a95d6e2..cee8a51 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -5,7 +5,6 @@ #include "Colors.h" #include "Folding.h" #include "Paths.h" -#include #include "CFS.h" namespace bayesnet { @@ -63,27 +62,6 @@ namespace bayesnet { cfs = hyperparameters["cfs"]; } } - string sha256(const string& input) - { - EVP_MD_CTX* mdctx; - const EVP_MD* md; - unsigned char hash[EVP_MAX_MD_SIZE]; - unsigned int hash_len; - - OpenSSL_add_all_digests(); - md = EVP_get_digestbyname("sha256"); - mdctx = EVP_MD_CTX_new(); - EVP_DigestInit_ex(mdctx, md, nullptr); - EVP_DigestUpdate(mdctx, input.c_str(), input.size()); - EVP_DigestFinal_ex(mdctx, hash, &hash_len); - EVP_MD_CTX_free(mdctx); - stringstream oss; - for (unsigned int i = 0; i < hash_len; i++) { - oss << hex << setfill('0') << setw(2) << (int)hash[i]; - } - return oss.str(); - } - unordered_set BoostAODE::initializeModels() { unordered_set featuresUsed; @@ -101,26 +79,16 @@ namespace bayesnet { Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); int maxFeatures = 0; auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); - // std::size_t str_hash = std::hash{}(output); - string str_hash = sha256(output); - stringstream oss; - oss << platform::Paths::cfs() << str_hash << ".json"; - string name = oss.str(); - ifstream file(name); - if (file.is_open()) { - nlohmann::json cfsFeatures = nlohmann::json::parse(file); - file.close(); - for (const int& feature : cfsFeatures) { - // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; - featuresUsed.insert(feature); - unique_ptr model = std::make_unique(feature); - model->fit(dataset, features, className, states, weights_); - models.push_back(std::move(model)); - significanceModels.push_back(1.0); - n_models++; - } - } else { - throw runtime_error("File " + name + " not found"); + cfs.fit(); + auto cfsFeatures = cfs.getFeatures(); + for (const int& feature : cfsFeatures) { + // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; + featuresUsed.insert(feature); + unique_ptr model = std::make_unique(feature); + model->fit(dataset, features, className, states, weights_); + models.push_back(std::move(model)); + significanceModels.push_back(1.0); + n_models++; } return featuresUsed; } diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index 51e30dc..6b64220 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -18,21 +18,16 @@ namespace bayesnet { auto x = samples.index({ a, "..." }); auto y = samples.index({ b, "..." }); auto mu = mutualInformation(x, y, weights); - // cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl; auto hx = entropy(x, weights); - // cout << "Entropy X: " << hx << endl; auto hy = entropy(y, weights); - // cout << "Entropy Y: " << hy << endl; return 2.0 * mu / (hx + hy); } void CFS::computeSuLabels() { // Compute Simmetrical Uncertainty between features and labels // https://en.wikipedia.org/wiki/Symmetric_uncertainty - // cout << "SuLabels" << endl; for (int i = 0; i < features.size(); ++i) { suLabels.push_back(symmetricalUncertainty(i, -1)); - // cout << i << " -> " << suLabels[i] << endl; } } @@ -40,8 +35,14 @@ namespace bayesnet { { // Compute Simmetrical Uncertainty between features // https://en.wikipedia.org/wiki/Symmetric_uncertainty - // TODO: Implement Cache in this function - return symmetricalUncertainty(firstFeature, secondFeature); + try { + return suFeatures.at({ firstFeature, secondFeature }); + } + catch (const out_of_range& e) { + auto result = symmetricalUncertainty(firstFeature, secondFeature); + suFeatures[{firstFeature, secondFeature}] = result; + return result; + } } double CFS::computeMerit() { @@ -73,7 +74,6 @@ namespace bayesnet { for (auto feature : featureOrder) { cfsFeatures.push_back(feature); auto meritNew = computeMerit(); // Compute merit with cfsFeatures - //cout << "MeritNew: " << meritNew << " Merit: " << merit << endl; if (meritNew > merit) { merit = meritNew; bestFeature = feature; @@ -81,7 +81,8 @@ namespace bayesnet { cfsFeatures.pop_back(); } if (bestFeature == -1) { - throw runtime_error("Feature not found"); + // meritNew has to be nan due to constant features + break; } cfsFeatures.push_back(bestFeature); cfsScores.push_back(merit); @@ -90,34 +91,6 @@ namespace bayesnet { } fitted = true; } - void CFS::test() - { - cout << "H(y): " << entropy(samples.index({ -1, "..." }), weights) << endl; - cout << "y: "; - auto y = samples.index({ -1, "..." }); - for (int i = 0; i < y.size(0); ++i) { - cout << y[i].item() << ", "; - } - cout << endl; - computeSuLabels(); - // cout << "Probabilites of features: " << endl; - // for (const auto& featureName : features) { - // int featureIdx = find(features.begin(), features.end(), featureName) - features.begin(); - // cout << featureName << "(" << featureIdx << "): "; - // auto feature = samples.index({ featureIdx, "..." }); - // torch::Tensor counts = feature.bincount(weights); - // double totalWeight = counts.sum().item(); - // torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; - // for (int i = 0; i < probs.size(0); ++i) { - // cout << probs[i].item() << ", "; - // } - // cout << endl; - // // for (int i = 0; i < x.size(0); ++i) { - // // cout << x[i].item() << ", "; - // // } - // // cout << endl; - // } - } bool CFS::computeContinueCondition(const vector& featureOrder) { if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h index 556659a..eff5da6 100644 --- a/src/BayesNet/CFS.h +++ b/src/BayesNet/CFS.h @@ -26,6 +26,7 @@ namespace bayesnet { vector cfsFeatures; vector cfsScores; vector suLabels; + map, double> suFeatures; bool fitted = false; }; } diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index e22827e..27a2d3a 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -6,4 +6,4 @@ include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto) \ No newline at end of file +target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index c6b733e..1ab1d83 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -210,7 +210,7 @@ int main() // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); for (const auto& name : dt.getNames()) { - //for (const auto& name : { "iris" }) { + // for (const auto& name : { "iris" }) { auto [X, y] = dt.getTensors(name); auto features = dt.getFeatures(name); auto states = dt.getStates(name); @@ -222,8 +222,8 @@ int main() auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); dataset = torch::cat({ dataset, yresized }, 0); auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); + cout << "Dataset: " << name << " CFS features: " << flush; cfs.fit(); - cout << "Dataset: " << name << " CFS features: "; for (const auto& feature : cfs.getFeatures()) { cout << feature << ", "; }