From 9f3de4d924fc8c4e155f068b383041b2ee9865a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
Date: Sun, 29 Jun 2025 13:00:34 +0200
Subject: [PATCH] Add new hyperparameters to the Ld classifiers

- *ld_algorithm*: algorithm to use for local discretization, with the
  following options: "MDLP", "BINQ", "BINU".
- *ld_proposed_cuts*: number of cut points to return.
- *mdlp_min_length*: minimum length of a partition in the MDLP algorithm
  for it to be considered for further splitting.
- *mdlp_max_depth*: maximum level of recursion in the MDLP algorithm.
---
 CHANGELOG.md                     | 11 ++++++++
 bayesnet/classifiers/KDB.h       | 11 ++++----
 bayesnet/classifiers/KDBLd.cc    | 20 ++++++++++++-
 bayesnet/classifiers/KDBLd.h     |  2 +-
 bayesnet/classifiers/Proposal.cc | 48 ++++++++++++++++++++++++++++----
 bayesnet/classifiers/Proposal.h  | 18 ++++++++++--
 bayesnet/classifiers/SPODELd.cc  |  6 +++-
 bayesnet/ensembles/AODELd.cc     |  2 ++
 bayesnet/ensembles/AODELd.h      |  2 ++
 tests/TestModulesVersions.cc     |  2 +-
 10 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f31ad10..f4f63ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [1.2.0] - 2025-06-30
+
+### Internal
+
+- Add docs generation to CMakeLists.txt.
+- Add new hyperparameters to the Ld classifiers:
+  - *ld_algorithm*: algorithm to use for local discretization, with the following options: "MDLP", "BINQ", "BINU".
+  - *ld_proposed_cuts*: number of cut points to return.
+  - *mdlp_min_length*: minimum length of a partition in the MDLP algorithm for it to be considered for further splitting.
+  - *mdlp_max_depth*: maximum level of recursion in the MDLP algorithm.
+
 ## [1.1.1] - 2025-05-20
 
 ### Internal
diff --git a/bayesnet/classifiers/KDB.h b/bayesnet/classifiers/KDB.h
index 85e9353..0fb7420 100644
--- a/bayesnet/classifiers/KDB.h
+++ b/bayesnet/classifiers/KDB.h
@@ -10,17 +10,16 @@
 #include "Classifier.h"
 namespace bayesnet {
     class KDB : public Classifier {
-    private:
-        int k;
-        float theta;
-    protected:
-        void add_m_edges(int idx, std::vector<int>& S, torch::Tensor& weights);
-        void buildModel(const torch::Tensor& weights) override;
     public:
         explicit KDB(int k, float theta = 0.03);
         virtual ~KDB() = default;
         void setHyperparameters(const nlohmann::json& hyperparameters_) override;
         std::vector<std::string> graph(const std::string& name = "KDB") const override;
+    protected:
+        int k;
+        float theta;
+        void add_m_edges(int idx, std::vector<int>& S, torch::Tensor& weights);
+        void buildModel(const torch::Tensor& weights) override;
     };
 }
 #endif
diff --git a/bayesnet/classifiers/KDBLd.cc b/bayesnet/classifiers/KDBLd.cc
index 0decd1b..e112c1c 100644
--- a/bayesnet/classifiers/KDBLd.cc
+++ b/bayesnet/classifiers/KDBLd.cc
@@ -7,7 +7,25 @@
 #include "KDBLd.h"
 namespace bayesnet {
-    KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className) {}
+    KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className)
+    {
+        validHyperparameters = validHyperparameters_ld;
+        validHyperparameters.push_back("k");
+        validHyperparameters.push_back("theta");
+    }
+    void KDBLd::setHyperparameters(const nlohmann::json& hyperparameters_)
+    {
+        auto hyperparameters = hyperparameters_;
+        if (hyperparameters.contains("k")) {
+            k = hyperparameters["k"];
+            hyperparameters.erase("k");
+        }
+        if (hyperparameters.contains("theta")) {
+            theta = hyperparameters["theta"];
+            hyperparameters.erase("theta");
+        }
+        Proposal::setHyperparameters(hyperparameters);
+    }
     KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
     {
         checkInput(X_, y_);
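With this change the Ld-specific keys can be mixed with the classifier's own keys in a single JSON object: KDBLd::setHyperparameters consumes "k" and "theta" itself and forwards whatever is left to Proposal::setHyperparameters. A minimal usage sketch, not part of the patch (the installed include path is an assumption):

    #include <bayesnet/classifiers/KDBLd.h>
    #include <nlohmann/json.hpp>

    int main()
    {
        bayesnet::KDBLd clf(2); // k = 2 at construction
        // "k" and "theta" stay in KDBLd; the ld_*/mdlp_* keys travel on to Proposal.
        clf.setHyperparameters({
            { "k", 3 },
            { "theta", 0.1 },
            { "ld_algorithm", "BINQ" },
            { "ld_proposed_cuts", 4 }
        });
        return 0;
    }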
diff --git a/bayesnet/classifiers/KDBLd.h b/bayesnet/classifiers/KDBLd.h
index 6bdce0b..4fa5f82 100644
--- a/bayesnet/classifiers/KDBLd.h
+++ b/bayesnet/classifiers/KDBLd.h
@@ -11,12 +11,12 @@
 namespace bayesnet {
     class KDBLd : public KDB, public Proposal {
-    private:
     public:
         explicit KDBLd(int k);
         virtual ~KDBLd() = default;
         KDBLd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
         std::vector<std::string> graph(const std::string& name = "KDB") const override;
+        void setHyperparameters(const nlohmann::json& hyperparameters_) override;
         torch::Tensor predict(torch::Tensor& X) override;
         torch::Tensor predict_proba(torch::Tensor& X) override;
         static inline std::string version() { return "0.0.1"; };
diff --git a/bayesnet/classifiers/Proposal.cc b/bayesnet/classifiers/Proposal.cc
index 1029247..3ef8a78 100644
--- a/bayesnet/classifiers/Proposal.cc
+++ b/bayesnet/classifiers/Proposal.cc
@@ -7,13 +7,42 @@
 #include "Proposal.h"
 namespace bayesnet {
-    Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_) {}
-    Proposal::~Proposal()
+    Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_)
     {
-        for (auto& [key, value] : discretizers) {
-            delete value;
+    }
+    void Proposal::setHyperparameters(const nlohmann::json& hyperparameters_)
+    {
+        auto hyperparameters = hyperparameters_;
+        if (hyperparameters.contains("ld_proposed_cuts")) {
+            ld_params.proposed_cuts = hyperparameters["ld_proposed_cuts"];
+            hyperparameters.erase("ld_proposed_cuts");
+        }
+        if (hyperparameters.contains("mdlp_max_depth")) {
+            ld_params.max_depth = hyperparameters["mdlp_max_depth"];
+            hyperparameters.erase("mdlp_max_depth");
+        }
+        if (hyperparameters.contains("mdlp_min_length")) {
+            ld_params.min_length = hyperparameters["mdlp_min_length"];
+            hyperparameters.erase("mdlp_min_length");
+        }
+        if (hyperparameters.contains("ld_algorithm")) {
+            auto algorithm = hyperparameters["ld_algorithm"];
+            hyperparameters.erase("ld_algorithm");
+            if (algorithm == "MDLP") {
+                discretizationType = discretization_t::MDLP;
+            } else if (algorithm == "BINQ") {
+                discretizationType = discretization_t::BINQ;
+            } else if (algorithm == "BINU") {
+                discretizationType = discretization_t::BINU;
+            } else {
+                throw std::invalid_argument("Invalid discretization algorithm: " + algorithm.get<std::string>());
+            }
+        }
+        if (!hyperparameters.empty()) {
+            throw std::invalid_argument("Invalid hyperparameters for Proposal: " + hyperparameters.dump());
         }
     }
+
     void Proposal::checkInput(const torch::Tensor& X, const torch::Tensor& y)
     {
         if (!torch::is_floating_point(X)) {
@@ -84,8 +113,15 @@ namespace bayesnet {
         pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
         auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
         // discretize input data by feature(row)
+        std::unique_ptr<mdlp::Discretizer> discretizer;
         for (auto i = 0; i < pFeatures.size(); ++i) {
-            auto* discretizer = new mdlp::CPPFImdlp();
+            if (discretizationType == discretization_t::BINQ) {
+                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
+            } else if (discretizationType == discretization_t::BINU) {
+                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
+            } else { // Default is MDLP
+                discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
+            }
             auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
             auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
             discretizer->fit(Xt, yv);
@@ -93,7 +129,7 @@ namespace bayesnet {
             auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
             iota(xStates.begin(), xStates.end(), 0);
             states[pFeatures[i]] = xStates;
-            discretizers[pFeatures[i]] = discretizer;
+            discretizers[pFeatures[i]] = std::move(discretizer);
         }
         int n_classes = torch::max(y).item<int>() + 1;
         auto yStates = std::vector<int>(n_classes);
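The per-feature discretizer in fit_local_discretization is now chosen from discretizationType. A standalone sketch of the same dispatch under the patch's defaults, not part of the patch (the fimdlp include paths and the bin count of 4 are assumptions; the constructors are the ones the patch itself calls):

    #include <fimdlp/CPPFImdlp.h>
    #include <fimdlp/BinDisc.h>
    #include <limits>
    #include <memory>
    #include <string>

    std::unique_ptr<mdlp::Discretizer> make_discretizer(const std::string& algorithm)
    {
        if (algorithm == "BINQ") // equal-frequency bins
            return std::make_unique<mdlp::BinDisc>(4, mdlp::strategy_t::QUANTILE);
        if (algorithm == "BINU") // equal-width bins
            return std::make_unique<mdlp::BinDisc>(4, mdlp::strategy_t::UNIFORM);
        // MDLP with the defaults declared in Proposal.h below:
        // min_length = 3, max_depth = INT_MAX, proposed_cuts = 0 (no limit)
        return std::make_unique<mdlp::CPPFImdlp>(3, std::numeric_limits<int>::max(), 0.0);
    }

Note also that any key still present when Proposal::setHyperparameters reaches its final check raises std::invalid_argument, so misspelled hyperparameters now fail fast instead of being silently ignored.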
diff --git a/bayesnet/classifiers/Proposal.h b/bayesnet/classifiers/Proposal.h
index 26118bf..6823a38 100644
--- a/bayesnet/classifiers/Proposal.h
+++ b/bayesnet/classifiers/Proposal.h
@@ -10,14 +10,16 @@
 #include <map>
 #include <torch/torch.h>
 #include <fimdlp/CPPFImdlp.h>
+#include <fimdlp/BinDisc.h>
 #include "bayesnet/network/Network.h"
+#include <nlohmann/json.hpp>
 #include "Classifier.h"
 namespace bayesnet {
     class Proposal {
     public:
         Proposal(torch::Tensor& pDataset, std::vector<std::string>& features_, std::string& className_);
-        virtual ~Proposal();
+        void setHyperparameters(const nlohmann::json& hyperparameters_);
     protected:
         void checkInput(const torch::Tensor& X, const torch::Tensor& y);
         torch::Tensor prepareX(torch::Tensor& X);
@@ -25,12 +27,24 @@ namespace bayesnet {
         map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
         torch::Tensor Xf; // X continuous nxm tensor
         torch::Tensor y; // y discrete nx1 tensor
-        map<std::string, mdlp::CPPFImdlp*> discretizers;
+        map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
+        // MDLP parameters
+        struct {
+            size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
+            float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
+            int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
+        } ld_params;
+        nlohmann::json validHyperparameters_ld = { "ld_algorithm", "ld_proposed_cuts", "mdlp_min_length", "mdlp_max_depth" };
     private:
         std::vector<int> factorize(const std::vector<std::string>& labels_t);
         torch::Tensor& pDataset; // (n+1)xm tensor
         std::vector<std::string>& pFeatures;
         std::string& pClassName;
+        enum class discretization_t {
+            MDLP,
+            BINQ,
+            BINU
+        } discretizationType = discretization_t::MDLP; // Default discretization type
     };
 }
diff --git a/bayesnet/classifiers/SPODELd.cc b/bayesnet/classifiers/SPODELd.cc
index c68b7d9..1bb55fb 100644
--- a/bayesnet/classifiers/SPODELd.cc
+++ b/bayesnet/classifiers/SPODELd.cc
@@ -7,7 +7,11 @@
 #include "SPODELd.h"
 namespace bayesnet {
-    SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className) {}
+    SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className)
+    {
+        validHyperparameters = validHyperparameters_ld; // Inherits the valid hyperparameters from Proposal
+    }
+
     SPODELd& SPODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
     {
         checkInput(X_, y_);
diff --git a/bayesnet/ensembles/AODELd.cc b/bayesnet/ensembles/AODELd.cc
index 07a9295..3dc80bf 100644
--- a/bayesnet/ensembles/AODELd.cc
+++ b/bayesnet/ensembles/AODELd.cc
@@ -9,6 +9,7 @@ namespace bayesnet {
     AODELd::AODELd(bool predict_voting) : Ensemble(predict_voting), Proposal(dataset, features, className)
     {
+        validHyperparameters = validHyperparameters_ld; // Inherits the valid hyperparameters from Proposal
     }
     AODELd& AODELd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
     {
@@ -31,6 +32,7 @@ namespace bayesnet {
         models.clear();
         for (int i = 0; i < features.size(); ++i) {
             models.push_back(std::make_unique<SPODELd>(i));
+            models.back()->setHyperparameters(hyperparameters);
         }
         n_models = models.size();
         significanceModels = std::vector<double>(n_models, 1.0);
diff --git a/bayesnet/ensembles/AODELd.h b/bayesnet/ensembles/AODELd.h
index 4bf0b63..d697554 100644
--- a/bayesnet/ensembles/AODELd.h
+++ b/bayesnet/ensembles/AODELd.h
@@ -20,6 +20,8 @@ namespace bayesnet {
     protected:
         void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override;
         void buildModel(const torch::Tensor& weights) override;
+    private:
+        nlohmann::json hyperparameters = {}; // Hyperparameters for the model
    };
 }
 #endif // !AODELD_H
\ No newline at end of file
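With these ensemble pieces in place, configuring AODELd once is meant to configure every child: the ensemble keeps the JSON in its new private hyperparameters member (how that member is populated is outside this diff) and replays it on each SPODELd created in buildModel, so all children share one local-discretization setup. A hedged sketch of that propagation step in isolation, not part of the patch (include paths and the Classifier base type are assumptions; the loop mirrors buildModel above):

    #include <bayesnet/classifiers/Classifier.h>
    #include <bayesnet/classifiers/SPODELd.h>
    #include <nlohmann/json.hpp>
    #include <memory>
    #include <vector>

    int main()
    {
        nlohmann::json hyperparameters = { { "ld_algorithm", "MDLP" }, { "mdlp_max_depth", 4 } };
        std::vector<std::unique_ptr<bayesnet::Classifier>> models;
        for (int i = 0; i < 3; ++i) { // three stand-in features
            models.push_back(std::make_unique<bayesnet::SPODELd>(i));
            models.back()->setHyperparameters(hyperparameters); // the call this patch adds
        }
        return 0;
    }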
diff --git a/tests/TestModulesVersions.cc b/tests/TestModulesVersions.cc
index bd07c64..cf760d6 100644
--- a/tests/TestModulesVersions.cc
+++ b/tests/TestModulesVersions.cc
@@ -18,7 +18,7 @@
 std::map<std::string, std::string> modules = {
     { "mdlp", "2.0.1" },
     { "Folding", "1.1.1" },
-    { "json", "3.12" },
+    { "json", "3.11" },
     { "ArffFiles", "1.1.0" }
 };