Prepare BoostAODE first try

This commit is contained in:
Ricardo Montañana Gómez 2023-10-13 13:46:22 +02:00
parent 5022a4dc90
commit 54b8939f35
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
7 changed files with 24 additions and 88 deletions

View File

@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# CMakes modules
# --------------
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
find_package(OpenSSL REQUIRED)
include(AddGitSubmodule)
if (CODE_COVERAGE)
enable_testing()

View File

@ -112,11 +112,6 @@ namespace bayesnet {
torch::Tensor counts = feature.bincount(weights);
double totalWeight = counts.sum().item<double>();
torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
// cout << "Probs: ";
// for (int i = 0; i < probs.size(0); ++i) {
// cout << probs[i].item<double>() << ", ";
// }
// cout << endl;
torch::Tensor logProbs = torch::log(probs);
torch::Tensor entropy = -probs * logProbs;
return entropy.nansum().item<double>();

View File

@ -5,7 +5,6 @@
#include "Colors.h"
#include "Folding.h"
#include "Paths.h"
#include <openssl/evp.h>
#include "CFS.h"
namespace bayesnet {
@ -63,27 +62,6 @@ namespace bayesnet {
cfs = hyperparameters["cfs"];
}
}
string sha256(const string& input)
{
    // Compute the SHA-256 digest of `input` and return it as a lowercase
    // hexadecimal string (64 characters).
    // Throws runtime_error if the OpenSSL EVP digest machinery fails.
    unsigned char hash[EVP_MAX_MD_SIZE];
    unsigned int hash_len = 0;
    // EVP_sha256() is always available and avoids the by-name lookup; the
    // former OpenSSL_add_all_digests() call is deprecated and a no-op since
    // OpenSSL 1.1.0, so it is dropped here.
    EVP_MD_CTX* mdctx = EVP_MD_CTX_new();
    if (mdctx == nullptr) {
        throw runtime_error("sha256: could not allocate digest context");
    }
    // Check every EVP return code: the original ignored them and would
    // dereference a null context / produce garbage on failure.
    if (EVP_DigestInit_ex(mdctx, EVP_sha256(), nullptr) != 1
        || EVP_DigestUpdate(mdctx, input.c_str(), input.size()) != 1
        || EVP_DigestFinal_ex(mdctx, hash, &hash_len) != 1) {
        EVP_MD_CTX_free(mdctx);
        throw runtime_error("sha256: digest computation failed");
    }
    EVP_MD_CTX_free(mdctx);
    stringstream oss;
    for (unsigned int i = 0; i < hash_len; i++) {
        oss << hex << setfill('0') << setw(2) << (int)hash[i];
    }
    return oss.str();
}
unordered_set<int> BoostAODE::initializeModels()
{
unordered_set<int> featuresUsed;
@ -101,26 +79,16 @@ namespace bayesnet {
Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
// std::size_t str_hash = std::hash<std::string>{}(output);
string str_hash = sha256(output);
stringstream oss;
oss << platform::Paths::cfs() << str_hash << ".json";
string name = oss.str();
ifstream file(name);
if (file.is_open()) {
nlohmann::json cfsFeatures = nlohmann::json::parse(file);
file.close();
for (const int& feature : cfsFeatures) {
// cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
featuresUsed.insert(feature);
unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0);
n_models++;
}
} else {
throw runtime_error("File " + name + " not found");
cfs.fit();
auto cfsFeatures = cfs.getFeatures();
for (const int& feature : cfsFeatures) {
// cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
featuresUsed.insert(feature);
unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0);
n_models++;
}
return featuresUsed;
}

View File

@ -18,21 +18,16 @@ namespace bayesnet {
auto x = samples.index({ a, "..." });
auto y = samples.index({ b, "..." });
auto mu = mutualInformation(x, y, weights);
// cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl;
auto hx = entropy(x, weights);
// cout << "Entropy X: " << hx << endl;
auto hy = entropy(y, weights);
// cout << "Entropy Y: " << hy << endl;
return 2.0 * mu / (hx + hy);
}
void CFS::computeSuLabels()
{
    // Compute Symmetrical Uncertainty between each feature and the labels,
    // caching the result in suLabels (suLabels[i] == SU(feature i, class)).
    // https://en.wikipedia.org/wiki/Symmetric_uncertainty
    // -1 denotes the class column in symmetricalUncertainty.
    suLabels.reserve(features.size());  // one entry per feature; avoid reallocations
    // size_t index avoids the signed/unsigned comparison of the previous int loop
    for (std::size_t i = 0; i < features.size(); ++i) {
        suLabels.push_back(symmetricalUncertainty(static_cast<int>(i), -1));
    }
}
@ -40,8 +35,14 @@ namespace bayesnet {
{
// Compute Symmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
// TODO: Implement Cache in this function
return symmetricalUncertainty(firstFeature, secondFeature);
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const out_of_range& e) {
auto result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
double CFS::computeMerit()
{
@ -73,7 +74,6 @@ namespace bayesnet {
for (auto feature : featureOrder) {
cfsFeatures.push_back(feature);
auto meritNew = computeMerit(); // Compute merit with cfsFeatures
//cout << "MeritNew: " << meritNew << " Merit: " << merit << endl;
if (meritNew > merit) {
merit = meritNew;
bestFeature = feature;
@ -81,7 +81,8 @@ namespace bayesnet {
cfsFeatures.pop_back();
}
if (bestFeature == -1) {
throw runtime_error("Feature not found");
// meritNew has to be nan due to constant features
break;
}
cfsFeatures.push_back(bestFeature);
cfsScores.push_back(merit);
@ -90,34 +91,6 @@ namespace bayesnet {
}
fitted = true;
}
void CFS::test()
{
    // Debug helper: print the entropy of the class labels and the raw label
    // vector, then populate the feature/label symmetrical-uncertainty cache.
    auto labels = samples.index({ -1, "..." });
    cout << "H(y): " << entropy(labels, weights) << endl;
    cout << "y: ";
    const int numSamples = labels.size(0);
    for (int idx = 0; idx < numSamples; ++idx) {
        cout << labels[idx].item<double>() << ", ";
    }
    cout << endl;
    computeSuLabels();
}
bool CFS::computeContinueCondition(const vector<int>& featureOrder)
{
if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {

View File

@ -26,6 +26,7 @@ namespace bayesnet {
vector<int> cfsFeatures;
vector<double> cfsScores;
vector<double> suLabels;
map<pair<int, int>, double> suFeatures;
bool fitted = false;
};
}

View File

@ -6,4 +6,4 @@ include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

View File

@ -210,7 +210,7 @@ int main()
// net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
auto dt = Datasets(true, "Arff");
for (const auto& name : dt.getNames()) {
//for (const auto& name : { "iris" }) {
// for (const auto& name : { "iris" }) {
auto [X, y] = dt.getTensors(name);
auto features = dt.getFeatures(name);
auto states = dt.getStates(name);
@ -222,8 +222,8 @@ int main()
auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
dataset = torch::cat({ dataset, yresized }, 0);
auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights);
cout << "Dataset: " << name << " CFS features: " << flush;
cfs.fit();
cout << "Dataset: " << name << " CFS features: ";
for (const auto& feature : cfs.getFeatures()) {
cout << feature << ", ";
}