From 684443a788d9e064bcbd7e75a4133e88b21d4ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sun, 9 Jun 2024 17:19:38 +0200 Subject: [PATCH] Implement Cestnik & Laplace smoothing --- CMakeLists.txt | 2 +- bayesnet/network/Network.cc | 24 +++++++++++++----------- bayesnet/network/Network.h | 10 +++++++--- bayesnet/network/Node.cc | 4 ++-- bayesnet/network/Node.h | 2 +- lib/mdlp | 2 +- tests/TestBayesModels.cc | 2 +- tests/TestModulesVersions.cc | 2 +- tests/lib/Files | 2 +- 9 files changed, 28 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eed7062..6e35773 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) project(BayesNet - VERSION 1.0.5.1 + VERSION 1.0.6 DESCRIPTION "Bayesian Network and basic classifiers Library." HOMEPAGE_URL "https://github.com/rmontanana/bayesnet" LANGUAGES CXX diff --git a/bayesnet/network/Network.cc b/bayesnet/network/Network.cc index e0e2e38..2cc8541 100644 --- a/bayesnet/network/Network.cc +++ b/bayesnet/network/Network.cc @@ -7,17 +7,18 @@ #include #include #include +#include #include "Network.h" #include "bayesnet/utils/bayesnetUtils.h" namespace bayesnet { - Network::Network() : fitted{ false }, maxThreads{ 0.95 }, classNumStates{ 0 }, laplaceSmoothing{ 0 } + Network::Network() : fitted{ false }, maxThreads{ 0.95 }, classNumStates{ 0 }, smoothing{ Smoothing_t::LAPLACE } { } - Network::Network(float maxT) : fitted{ false }, maxThreads{ maxT }, classNumStates{ 0 }, laplaceSmoothing{ 0 } + Network::Network(float maxT) : fitted{ false }, maxThreads{ maxT }, classNumStates{ 0 }, smoothing{ Smoothing_t::LAPLACE } { } - Network::Network(const Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), + Network::Network(const Network& other) : smoothing(other.smoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getMaxThreads()), fitted(other.fitted), samples(other.samples) { if (samples.defined()) @@ -164,14 +165,14 @@ namespace bayesnet { for (int i = 0; i < featureNames.size(); ++i) { auto row_feature = X.index({ i, "..." }); } - completeFit(states, weights); + completeFit(states, X.size(0), weights); } void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector& featureNames, const std::string& className, const std::map>& states) { checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights); this->className = className; this->samples = samples; - completeFit(states, weights); + completeFit(states, samples.size(1), weights); } // input_data comes in nxm, where n is the number of features and m the number of samples void Network::fit(const std::vector>& input_data, const std::vector& labels, const std::vector& weights_, const std::vector& featureNames, const std::string& className, const std::map>& states) @@ -185,16 +186,17 @@ namespace bayesnet { samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32)); } samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); - completeFit(states, weights); + completeFit(states, input_data[0].size(), weights); } - void Network::completeFit(const std::map>& states, const torch::Tensor& weights) + void Network::completeFit(const std::map>& states, const int n_samples, const torch::Tensor& weights) { setStates(states); - laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation std::vector threads; for (auto& node : nodes) { - threads.emplace_back([this, &node, &weights]() { - node.second->computeCPT(samples, features, laplaceSmoothing, weights); + threads.emplace_back([this, &node, &weights, n_samples]() { + auto numStates = node.second->getNumStates(); + double smoothing_factor = smoothing == Smoothing_t::CESTNIK ? static_cast(n_samples) / numStates : 1.0 / static_cast(n_samples); + node.second->computeCPT(samples, features, smoothing_factor, weights); }); } for (auto& thread : threads) { @@ -337,7 +339,7 @@ namespace bayesnet { thread.join(); } // Normalize result - double sum = accumulate(result.begin(), result.end(), 0.0); + double sum = std::accumulate(result.begin(), result.end(), 0.0); transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; }); return result; } diff --git a/bayesnet/network/Network.h b/bayesnet/network/Network.h index a87d5e1..dd08110 100644 --- a/bayesnet/network/Network.h +++ b/bayesnet/network/Network.h @@ -12,6 +12,10 @@ #include "Node.h" namespace bayesnet { + enum class Smoothing_t { + LAPLACE, + CESTNIK + }; class Network { public: Network(); @@ -54,15 +58,15 @@ namespace bayesnet { int classNumStates; std::vector features; // Including classname std::string className; - double laplaceSmoothing; + Smoothing_t smoothing; torch::Tensor samples; // n+1xm tensor used to fit the model bool isCyclic(const std::string&, std::unordered_set&, std::unordered_set&); std::vector predict_sample(const std::vector&); std::vector predict_sample(const torch::Tensor&); std::vector exactInference(std::map&); double computeFactor(std::map&); - void completeFit(const std::map>& states, const torch::Tensor& weights); - void checkFitData(int n_features, int n_samples, int n_samples_y, const std::vector& featureNames, const std::string& className, const std::map>& states, const torch::Tensor& weights); + void completeFit(const std::map>& states, const int n_samples, const torch::Tensor& weights); + void checkFitData(int n_samples, int n_features, int n_samples_y, const std::vector& featureNames, const std::string& className, const std::map>& states, const torch::Tensor& weights); void setStates(const std::map>&); }; } diff --git a/bayesnet/network/Node.cc b/bayesnet/network/Node.cc index cc63b29..fcb1e53 100644 --- a/bayesnet/network/Node.cc +++ b/bayesnet/network/Node.cc @@ -90,14 +90,14 @@ namespace bayesnet { } return result; } - void Node::computeCPT(const torch::Tensor& dataset, const std::vector& features, const double laplaceSmoothing, const torch::Tensor& weights) + void Node::computeCPT(const torch::Tensor& dataset, const std::vector& features, const double smoothing, const torch::Tensor& weights) { dimensions.clear(); // Get dimensions of the CPT dimensions.push_back(numStates); transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); }); // Create a tensor of zeros with the dimensions of the CPT - cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing; + cpTable = torch::zeros(dimensions, torch::kFloat) + smoothing; // Fill table with counts auto pos = find(features.begin(), features.end(), name); if (pos == features.end()) { diff --git a/bayesnet/network/Node.h b/bayesnet/network/Node.h index e1cfa06..dc21119 100644 --- a/bayesnet/network/Node.h +++ b/bayesnet/network/Node.h @@ -23,7 +23,7 @@ namespace bayesnet { std::vector& getParents(); std::vector& getChildren(); torch::Tensor& getCPT(); - void computeCPT(const torch::Tensor& dataset, const std::vector& features, const double laplaceSmoothing, const torch::Tensor& weights); + void computeCPT(const torch::Tensor& dataset, const std::vector& features, const double smoothing, const torch::Tensor& weights); int getNumStates() const; void setNumStates(int); unsigned minFill(); diff --git a/lib/mdlp b/lib/mdlp index 236d1b2..c4e6c04 160000 --- a/lib/mdlp +++ b/lib/mdlp @@ -1 +1 @@ -Subproject commit 236d1b2f8be185039493fe7fce04a83e02ed72e5 +Subproject commit c4e6c041fe7f769ec24c0a2bd66a5aff482fd630 diff --git a/tests/TestBayesModels.cc b/tests/TestBayesModels.cc index b5ee426..2d60d5e 100644 --- a/tests/TestBayesModels.cc +++ b/tests/TestBayesModels.cc @@ -20,7 +20,7 @@ #include "bayesnet/ensembles/BoostAODE.h" #include "TestUtils.h" -const std::string ACTUAL_VERSION = "1.0.5.1"; +const std::string ACTUAL_VERSION = "1.0.6"; TEST_CASE("Test Bayesian Classifiers score & version", "[Models]") { diff --git a/tests/TestModulesVersions.cc b/tests/TestModulesVersions.cc index a8b2ce2..5b29178 100644 --- a/tests/TestModulesVersions.cc +++ b/tests/TestModulesVersions.cc @@ -16,7 +16,7 @@ #include "TestUtils.h" std::map modules = { - { "mdlp", "1.1.2" }, + { "mdlp", "1.2.0" }, { "Folding", "1.1.0" }, { "json", "3.11" }, { "ArffFiles", "1.0.0" } diff --git a/tests/lib/Files b/tests/lib/Files index 40ac380..dbefa02 160000 --- a/tests/lib/Files +++ b/tests/lib/Files @@ -1 +1 @@ -Subproject commit 40ac38011a2445e00df8a18048c67abaff16fa59 +Subproject commit dbefa02d9c0ca0f029f77e744cd80cb0150725c8