From 9f9369269ad5ae4e23144f90b3e9deeb7e0bb6ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Tue, 19 Aug 2025 12:29:54 +0200
Subject: [PATCH] Fix ld that was discretizing all input features

---
 CHANGELOG.md                     |  6 ++++++
 CMakeLists.txt                   |  2 +-
 bayesnet/classifiers/Proposal.cc | 36 +++++++++++++++++++-------------
 bayesnet/classifiers/Proposal.h  |  9 +++------
 bayesnet/ensembles/AODELd.cc     |  2 +-
 5 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index da87b74..37cb24e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.2] - 2025-08-19
+
+### Fixed
+
+- Fixed an issue with local discretization that was discretizing all features, whether they were numeric or categorical.
+
 ## [1.2.1] - 2025-07-19
 
 ### Internal
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff301f4..13def32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.27)
 
 project(bayesnet
-  VERSION 1.2.1
+  VERSION 1.2.2
   DESCRIPTION "Bayesian Network and basic classifiers Library."
   HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
   LANGUAGES CXX
diff --git a/bayesnet/classifiers/Proposal.cc b/bayesnet/classifiers/Proposal.cc
index b3c7639..6974b70 100644
--- a/bayesnet/classifiers/Proposal.cc
+++ b/bayesnet/classifiers/Proposal.cc
@@ -118,31 +118,37 @@ namespace bayesnet {
         }
         return states;
     }
-    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
+    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
     {
         // Discretize the continuous input data and build pDataset (Classifier::dataset)
+        // states is expected to hold an empty vector for numeric features and a vector of states for already discretized features
         int m = Xf.size(1);
         int n = Xf.size(0);
-        map<std::string, std::vector<int>> states;
         pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
         auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
         // discretize input data by feature(row)
         std::unique_ptr<mdlp::Discretizer> discretizer;
         for (auto i = 0; i < pFeatures.size(); ++i) {
-            if (discretizationType == discretization_t::BINQ) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
-            } else if (discretizationType == discretization_t::BINU) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
-            } else { // Default is MDLP
-                discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
-            }
             auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
             auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
-            discretizer->fit(Xt, yv);
-            pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
-            auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
-            iota(xStates.begin(), xStates.end(), 0);
-            states[pFeatures[i]] = xStates;
+            if (states[pFeatures[i]].empty()) {
+                // If the feature is numeric, we discretize it
+                if (discretizationType == discretization_t::BINQ) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
+                } else if (discretizationType == discretization_t::BINU) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
+                } else { // Default is MDLP
+                    discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
+                }
+                pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
+                int n_states = discretizer->getCutPoints().size() + 1;
+                auto xStates = std::vector<int>(n_states);
+                iota(xStates.begin(), xStates.end(), 0);
+                states[pFeatures[i]] = xStates;
+            } else {
+                // If the feature is categorical, we just copy it
+                pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
+            }
             discretizers[pFeatures[i]] = std::move(discretizer);
         }
         int n_classes = torch::max(y).item<int>() + 1;
@@ -190,7 +196,7 @@ namespace bayesnet {
     )
     {
         // Phase 1: Initial discretization (same as original)
-        auto currentStates = fit_local_discretization(y);
+        auto currentStates = fit_local_discretization(y, initialStates);
         auto previousModel = Network();
 
         if (convergence_params.verbose) {
diff --git a/bayesnet/classifiers/Proposal.h b/bayesnet/classifiers/Proposal.h
index bb53776..4ee5a15 100644
--- a/bayesnet/classifiers/Proposal.h
+++ b/bayesnet/classifiers/Proposal.h
@@ -23,9 +23,8 @@ namespace bayesnet {
     protected:
         void checkInput(const torch::Tensor& X, const torch::Tensor& y);
         torch::Tensor prepareX(torch::Tensor& X);
-        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
-        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
-
+        // fit_local_discretization is only called by AODELd
+        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
         // Iterative discretization method
         template <typename Classifier>
         map<std::string, std::vector<int>> iterativeLocalDiscretization(
@@ -37,18 +36,15 @@ namespace bayesnet {
             const map<std::string, std::vector<int>>& initialStates,
             const Smoothing_t smoothing
         );
-
         torch::Tensor Xf; // X continuous nxm tensor
         torch::Tensor y; // y discrete nx1 tensor
         map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
-
         // MDLP parameters
         struct {
             size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
             float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
             int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
         } ld_params;
-
         // Convergence parameters
         struct {
             int maxIterations = 10;
@@ -60,6 +56,7 @@ namespace bayesnet {
             "max_iterations", "verbose_convergence"
         };
     private:
+        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
         std::vector<int> factorize(const std::vector<std::string>& labels_t);
         std::vector<std::string>& notes; // Notes during fit from BaseClassifier
         torch::Tensor& pDataset; // (n+1)xm tensor
diff --git a/bayesnet/ensembles/AODELd.cc b/bayesnet/ensembles/AODELd.cc
index 4f0f0cd..991e5b4 100644
--- a/bayesnet/ensembles/AODELd.cc
+++ b/bayesnet/ensembles/AODELd.cc
@@ -19,7 +19,7 @@ namespace bayesnet {
         Xf = X_;
         y = y_;
         // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
-        states = fit_local_discretization(y);
+        states = fit_local_discretization(y, states_);
         // We have discretized the input data
         // 1st we need to fit the model to build the normal AODE structure, Ensemble::fit
         // calls buildModel to initialize the base models
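
Reviewer note (not part of the patch): the sketch below illustrates the states convention that the reworked fit_local_discretization(y, states) appears to rely on, inferred only from the hunks above. An empty vector marks a numeric feature that will be discretized; a non-empty vector marks a categorical (already discretized) feature whose column is copied into pDataset unchanged. Feature names and values here are invented for illustration, and this is standalone demo code, not bayesnet library code.

    // Sketch only: demonstrates the assumed states convention; not library code.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        std::map<std::string, std::vector<int>> states;

        // Numeric feature: empty vector -> a discretizer would be fitted for it
        // and the discovered states (0..n_cuts) filled in afterwards.
        states["petal_length"] = {};

        // Categorical feature: states already known -> the column would be copied
        // through as-is and no discretizer fitted.
        states["color"] = { 0, 1, 2 };

        for (const auto& [feature, featureStates] : states) {
            std::cout << feature << ": "
                      << (featureStates.empty() ? "numeric, will be discretized"
                                                : "categorical, copied as-is")
                      << "\n";
        }
        return 0;
    }

With this convention, AODELd can forward its caller-provided map (states_ in the hunk above), so categorical columns keep their original encoding instead of being re-discretized, which is the behavior change described in the CHANGELOG entry.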