From 684443a788d9e064bcbd7e75a4133e88b21d4ba3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
 <rmontanana@gmail.com>
Date: Sun, 9 Jun 2024 17:19:38 +0200
Subject: [PATCH] Implement Cestnik & Laplace smoothing

---
 CMakeLists.txt               |  2 +-
 bayesnet/network/Network.cc  | 24 +++++++++++++-----------
 bayesnet/network/Network.h   | 10 +++++++---
 bayesnet/network/Node.cc     |  4 ++--
 bayesnet/network/Node.h      |  2 +-
 lib/mdlp                     |  2 +-
 tests/TestBayesModels.cc     |  2 +-
 tests/TestModulesVersions.cc |  2 +-
 tests/lib/Files              |  2 +-
 9 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eed7062..6e35773 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.20)
 
 project(BayesNet
-  VERSION 1.0.5.1
+  VERSION 1.0.6
   DESCRIPTION "Bayesian Network and basic classifiers Library."
   HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
   LANGUAGES CXX
diff --git a/bayesnet/network/Network.cc b/bayesnet/network/Network.cc
index e0e2e38..2cc8541 100644
--- a/bayesnet/network/Network.cc
+++ b/bayesnet/network/Network.cc
@@ -7,17 +7,18 @@
 #include <thread>
 #include <mutex>
 #include <sstream>
+#include <numeric>
 #include "Network.h"
 #include "bayesnet/utils/bayesnetUtils.h"
 namespace bayesnet {
-    Network::Network() : fitted{ false }, maxThreads{ 0.95 }, classNumStates{ 0 }, laplaceSmoothing{ 0 }
+    Network::Network() : fitted{ false }, maxThreads{ 0.95 }, classNumStates{ 0 }, smoothing{ Smoothing_t::LAPLACE }
     {
     }
-    Network::Network(float maxT) : fitted{ false }, maxThreads{ maxT }, classNumStates{ 0 }, laplaceSmoothing{ 0 }
+    Network::Network(float maxT) : fitted{ false }, maxThreads{ maxT }, classNumStates{ 0 }, smoothing{ Smoothing_t::LAPLACE }
     {
 
     }
-    Network::Network(const Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()),
+    Network::Network(const Network& other) : smoothing(other.smoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()),
         maxThreads(other.getMaxThreads()), fitted(other.fitted), samples(other.samples)
     {
         if (samples.defined())
@@ -164,14 +165,14 @@ namespace bayesnet {
         for (int i = 0; i < featureNames.size(); ++i) {
             auto row_feature = X.index({ i, "..." });
         }
-        completeFit(states, weights);
+        completeFit(states, X.size(1), weights); // X is features x samples (dim 0 is indexed per feature above), so dim 1 is the sample count
     }
     void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
     {
         checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights);
         this->className = className;
         this->samples = samples;
-        completeFit(states, weights);
+        completeFit(states, samples.size(1), weights);
     }
     // input_data comes in nxm, where n is the number of features and m the number of samples
     void Network::fit(const std::vector<std::vector<int>>& input_data, const std::vector<int>& labels, const std::vector<double>& weights_, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
@@ -185,16 +186,17 @@ namespace bayesnet {
             samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32));
         }
         samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
-        completeFit(states, weights);
+        completeFit(states, input_data[0].size(), weights);
     }
-    void Network::completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
+    void Network::completeFit(const std::map<std::string, std::vector<int>>& states, const int n_samples, const torch::Tensor& weights) // n_samples feeds the smoothing factor computed per node below
     {
         setStates(states);
-        laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation
         std::vector<std::thread> threads;
         for (auto& node : nodes) {
-            threads.emplace_back([this, &node, &weights]() {
-                node.second->computeCPT(samples, features, laplaceSmoothing, weights);
+            threads.emplace_back([this, &node, &weights, n_samples]() { // one thread per node; n_samples is captured by value
+                auto numStates = node.second->getNumStates(); // only used by the CESTNIK branch below
+                double smoothing_factor = smoothing == Smoothing_t::CESTNIK ? static_cast<double>(n_samples) / numStates : 1.0 / static_cast<double>(n_samples); // NOTE(review): the LAPLACE branch keeps the former 1 / n_samples value, not an add-one count - confirm the naming is intended
+                node.second->computeCPT(samples, features, smoothing_factor, weights); // the factor is the value every CPT cell starts from
                 });
         }
         for (auto& thread : threads) {
@@ -337,7 +339,7 @@ namespace bayesnet {
             thread.join();
         }
         // Normalize result
-        double sum = accumulate(result.begin(), result.end(), 0.0);
+        double sum = std::accumulate(result.begin(), result.end(), 0.0);
         transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; });
         return result;
     }
diff --git a/bayesnet/network/Network.h b/bayesnet/network/Network.h
index a87d5e1..dd08110 100644
--- a/bayesnet/network/Network.h
+++ b/bayesnet/network/Network.h
@@ -12,6 +12,10 @@
 #include "Node.h"
 
 namespace bayesnet {
+    enum class Smoothing_t { // Selects the smoothing factor Network::completeFit passes to Node::computeCPT
+        LAPLACE, // factor = 1 / n_samples; value the Network() and Network(float) constructors start with
+        CESTNIK // factor = n_samples / number of states of the node
+    };
     class Network {
     public:
         Network();
@@ -54,15 +58,15 @@ namespace bayesnet {
         int classNumStates;
         std::vector<std::string> features; // Including classname
         std::string className;
-        double laplaceSmoothing;
+        Smoothing_t smoothing;
         torch::Tensor samples; // n+1xm tensor used to fit the model
         bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
         std::vector<double> predict_sample(const std::vector<int>&);
         std::vector<double> predict_sample(const torch::Tensor&);
         std::vector<double> exactInference(std::map<std::string, int>&);
         double computeFactor(std::map<std::string, int>&);
-        void completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
-        void checkFitData(int n_features, int n_samples, int n_samples_y, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
+        void completeFit(const std::map<std::string, std::vector<int>>& states, const int n_samples, const torch::Tensor& weights);
+        void checkFitData(int n_samples, int n_features, int n_samples_y, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
         void setStates(const std::map<std::string, std::vector<int>>&);
     };
 }
diff --git a/bayesnet/network/Node.cc b/bayesnet/network/Node.cc
index cc63b29..fcb1e53 100644
--- a/bayesnet/network/Node.cc
+++ b/bayesnet/network/Node.cc
@@ -90,14 +90,14 @@ namespace bayesnet {
         }
         return result;
     }
-    void Node::computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double laplaceSmoothing, const torch::Tensor& weights)
+    void Node::computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double smoothing, const torch::Tensor& weights)
     {
         dimensions.clear();
         // Get dimensions of the CPT
         dimensions.push_back(numStates);
         transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); });
         // Create a tensor of zeros with the dimensions of the CPT
-        cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing;
+        cpTable = torch::zeros(dimensions, torch::kFloat) + smoothing;
         // Fill table with counts
         auto pos = find(features.begin(), features.end(), name);
         if (pos == features.end()) {
diff --git a/bayesnet/network/Node.h b/bayesnet/network/Node.h
index e1cfa06..dc21119 100644
--- a/bayesnet/network/Node.h
+++ b/bayesnet/network/Node.h
@@ -23,7 +23,7 @@ namespace bayesnet {
         std::vector<Node*>& getParents();
         std::vector<Node*>& getChildren();
         torch::Tensor& getCPT();
-        void computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double laplaceSmoothing, const torch::Tensor& weights);
+        void computeCPT(const torch::Tensor& dataset, const std::vector<std::string>& features, const double smoothing, const torch::Tensor& weights);
         int getNumStates() const;
         void setNumStates(int);
         unsigned minFill();
diff --git a/lib/mdlp b/lib/mdlp
index 236d1b2..c4e6c04 160000
--- a/lib/mdlp
+++ b/lib/mdlp
@@ -1 +1 @@
-Subproject commit 236d1b2f8be185039493fe7fce04a83e02ed72e5
+Subproject commit c4e6c041fe7f769ec24c0a2bd66a5aff482fd630
diff --git a/tests/TestBayesModels.cc b/tests/TestBayesModels.cc
index b5ee426..2d60d5e 100644
--- a/tests/TestBayesModels.cc
+++ b/tests/TestBayesModels.cc
@@ -20,7 +20,7 @@
 #include "bayesnet/ensembles/BoostAODE.h"
 #include "TestUtils.h"
 
-const std::string ACTUAL_VERSION = "1.0.5.1";
+const std::string ACTUAL_VERSION = "1.0.6";
 
 TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
 {
diff --git a/tests/TestModulesVersions.cc b/tests/TestModulesVersions.cc
index a8b2ce2..5b29178 100644
--- a/tests/TestModulesVersions.cc
+++ b/tests/TestModulesVersions.cc
@@ -16,7 +16,7 @@
 #include "TestUtils.h"
 
 std::map<std::string, std::string> modules = {
-    { "mdlp", "1.1.2" },
+    { "mdlp", "1.2.0" },
     { "Folding", "1.1.0" },
     { "json", "3.11" },
     { "ArffFiles", "1.0.0" }
diff --git a/tests/lib/Files b/tests/lib/Files
index 40ac380..dbefa02 160000
--- a/tests/lib/Files
+++ b/tests/lib/Files
@@ -1 +1 @@
-Subproject commit 40ac38011a2445e00df8a18048c67abaff16fa59
+Subproject commit dbefa02d9c0ca0f029f77e744cd80cb0150725c8