Fix ld that was discretizing all input features

Update version number
2025-08-19 12:29:54 +02:00 · 2025-07-19 22:47:32 +02:00
6 changed files with 42 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,21 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [Unreleased]
+## [1.2.2] - 2025-08-19
+
+### Fixed
+
+- Fixed an issue where local discretization was discretizing all features, whether they were numeric or categorical.
+
+## [1.2.1] - 2025-07-19
+
+### Internal
+
+- Update Libtorch to version 2.7.1
+- Update libraries versions:
+  - mdlp: 2.1.1
+  - Folding: 1.1.2
+  - ArffFiles: 1.2.1

 ## [1.2.0] - 2025-07-08

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.27)

 project(bayesnet
-  VERSION 1.2.0
+  VERSION 1.2.2
  DESCRIPTION "Bayesian Network and basic classifiers Library."
  HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
  LANGUAGES CXX
--- a/bayesnet/classifiers/Proposal.cc
+++ b/bayesnet/classifiers/Proposal.cc
@@ -118,31 +118,37 @@ namespace bayesnet {
        }
        return states;
    }
-    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
+    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
    {
        // Discretize the continuous input data and build pDataset (Classifier::dataset)
+        // We expect to have in states for numeric features an empty vector and for discretized features a vector of states
        int m = Xf.size(1);
        int n = Xf.size(0);
-        map<std::string, std::vector<int>> states;
        pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
        auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
        // discretize input data by feature(row)
        std::unique_ptr<mdlp::Discretizer> discretizer;
        for (auto i = 0; i < pFeatures.size(); ++i) {
-            if (discretizationType == discretization_t::BINQ) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
-            } else if (discretizationType == discretization_t::BINU) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
-            } else { // Default is MDLP
-                discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
-            }
            auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
            auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
-            discretizer->fit(Xt, yv);
-            pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
-            auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
-            iota(xStates.begin(), xStates.end(), 0);
-            states[pFeatures[i]] = xStates;
+            if (states[pFeatures[i]].empty()) {
+                // If the feature is numeric, we discretize it
+                if (discretizationType == discretization_t::BINQ) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
+                } else if (discretizationType == discretization_t::BINU) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
+                } else { // Default is MDLP
+                    discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
+                }
+                pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
+                int n_states = discretizer->getCutPoints().size() + 1;
+                auto xStates = std::vector<int>(n_states);
+                iota(xStates.begin(), xStates.end(), 0);
+                states[pFeatures[i]] = xStates;
+            } else {
+                // If the feature is categorical, we just copy it
+                pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
+            }
            discretizers[pFeatures[i]] = std::move(discretizer);
        }
        int n_classes = torch::max(y).item<int>() + 1;
@@ -190,7 +196,7 @@ namespace bayesnet {
    )
    {
        // Phase 1: Initial discretization (same as original)
-        auto currentStates = fit_local_discretization(y);
+        auto currentStates = fit_local_discretization(y, initialStates);
        auto previousModel = Network();

        if (convergence_params.verbose) {
--- a/bayesnet/classifiers/Proposal.h
+++ b/bayesnet/classifiers/Proposal.h
@@ -23,9 +23,8 @@ namespace bayesnet {
    protected:
        void checkInput(const torch::Tensor& X, const torch::Tensor& y);
        torch::Tensor prepareX(torch::Tensor& X);
-        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
-        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
-
+        // fit_local_discretization is called by AODELd and by the iterative local discretization in Proposal.cc
+        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
        // Iterative discretization method
        template<typename Classifier>
        map<std::string, std::vector<int>> iterativeLocalDiscretization(
@@ -37,18 +36,15 @@ namespace bayesnet {
            const map<std::string, std::vector<int>>& initialStates,
            const Smoothing_t smoothing
        );
-
        torch::Tensor Xf; // X continuous nxm tensor
        torch::Tensor y; // y discrete nx1 tensor
        map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
-
        // MDLP parameters
        struct {
            size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
            float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
            int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
        } ld_params;
-
        // Convergence parameters
        struct {
            int maxIterations = 10;
@@ -60,6 +56,7 @@ namespace bayesnet {
            "max_iterations", "verbose_convergence"
        };
    private:
+        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
        std::vector<int> factorize(const std::vector<std::string>& labels_t);
        std::vector<std::string>& notes; // Notes during fit from BaseClassifier
        torch::Tensor& pDataset; // (n+1)xm tensor
--- a/bayesnet/ensembles/AODELd.cc
+++ b/bayesnet/ensembles/AODELd.cc
@@ -19,7 +19,7 @@ namespace bayesnet {
        Xf = X_;
        y = y_;
        // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
-        states = fit_local_discretization(y);
+        states = fit_local_discretization(y, states_);
        // We have discretized the input data
        // 1st we need to fit the model to build the normal AODE structure, Ensemble::fit  
        // calls buildModel to initialize the base models
--- a/tests/TestBayesModels.cc
+++ b/tests/TestBayesModels.cc
@@ -20,7 +20,7 @@
 #include "bayesnet/ensembles/AODELd.h"
 #include "bayesnet/ensembles/BoostAODE.h"

-const std::string ACTUAL_VERSION = "1.2.0";
+const std::string ACTUAL_VERSION = "1.2.1";

 TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
 {
Author	SHA1	Message	Date
Ricardo Montañana	9f9369269a	Fix ld that was discretizing all input features	2025-08-19 12:29:54 +02:00
Ricardo Montañana	89142f8997	Update version number	2025-07-19 22:47:32 +02:00