From 9f9369269ad5ae4e23144f90b3e9deeb7e0bb6ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Tue, 19 Aug 2025 12:29:54 +0200
Subject: [PATCH] Fix ld that was discretizing all input features

---
 CHANGELOG.md                     |  6 ++++++
 CMakeLists.txt                   |  2 +-
 bayesnet/classifiers/Proposal.cc | 36 +++++++++++++++++++-------------
 bayesnet/classifiers/Proposal.h  |  9 +++------
 bayesnet/ensembles/AODELd.cc     |  2 +-
 5 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index da87b74..37cb24e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.2.2] - 2025-08-19
+
+### Fixed
+
+- Fixed an issue with local discretization that was discretizing all features, whether they were numeric or categorical.
+
 ## [1.2.1] - 2025-07-19
 
 ### Internal
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ff301f4..13def32 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.27)
 
 project(bayesnet
-  VERSION 1.2.1
+  VERSION 1.2.2
   DESCRIPTION "Bayesian Network and basic classifiers Library."
   HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
   LANGUAGES CXX
diff --git a/bayesnet/classifiers/Proposal.cc b/bayesnet/classifiers/Proposal.cc
index b3c7639..6974b70 100644
--- a/bayesnet/classifiers/Proposal.cc
+++ b/bayesnet/classifiers/Proposal.cc
@@ -118,31 +118,37 @@ namespace bayesnet {
         }
         return states;
     }
-    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
+    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
     {
         // Discretize the continuous input data and build pDataset (Classifier::dataset)
+        // states is expected to hold an empty vector for numeric features and a vector of states for already discretized features
         int m = Xf.size(1);
         int n = Xf.size(0);
-        map<std::string, std::vector<int>> states;
         pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
         auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
         // discretize input data by feature(row)
         std::unique_ptr<mdlp::Discretizer> discretizer;
         for (auto i = 0; i < pFeatures.size(); ++i) {
-            if (discretizationType == discretization_t::BINQ) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
-            } else if (discretizationType == discretization_t::BINU) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
-            } else { // Default is MDLP
-                discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
-            }
             auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
             auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
-            discretizer->fit(Xt, yv);
-            pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
-            auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
-            iota(xStates.begin(), xStates.end(), 0);
-            states[pFeatures[i]] = xStates;
+            if (states[pFeatures[i]].empty()) {
+                // If the feature is numeric, we discretize it
+                if (discretizationType == discretization_t::BINQ) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
+                } else if (discretizationType == discretization_t::BINU) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
+                } else { // Default is MDLP
+                    discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
+                }
+                pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
+                int n_states = discretizer->getCutPoints().size() + 1;
+                auto xStates = std::vector<int>(n_states);
+                iota(xStates.begin(), xStates.end(), 0);
+                states[pFeatures[i]] = xStates;
+            } else {
+                // If the feature is categorical, we just copy it
+                pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
+            }
             discretizers[pFeatures[i]] = std::move(discretizer);
         }
         int n_classes = torch::max(y).item<int>() + 1;
@@ -190,7 +196,7 @@ namespace bayesnet {
     )
     {
         // Phase 1: Initial discretization (same as original)
-        auto currentStates = fit_local_discretization(y);
+        auto currentStates = fit_local_discretization(y, initialStates);
         auto previousModel = Network();
 
         if (convergence_params.verbose) {
diff --git a/bayesnet/classifiers/Proposal.h b/bayesnet/classifiers/Proposal.h
index bb53776..4ee5a15 100644
--- a/bayesnet/classifiers/Proposal.h
+++ b/bayesnet/classifiers/Proposal.h
@@ -23,9 +23,8 @@ namespace bayesnet {
     protected:
         void checkInput(const torch::Tensor& X, const torch::Tensor& y);
         torch::Tensor prepareX(torch::Tensor& X);
-        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
-        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
-
+        // fit_local_discretization is only called by AODELd
+        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
         // Iterative discretization method
         template <typename Classifier>
         map<std::string, std::vector<int>> iterativeLocalDiscretization(
@@ -37,18 +36,15 @@ namespace bayesnet {
             const map<std::string, std::vector<int>>& initialStates,
             const Smoothing_t smoothing
         );
-
         torch::Tensor Xf; // X continuous nxm tensor
         torch::Tensor y; // y discrete nx1 tensor
         map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
-
         // MDLP parameters
         struct {
             size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
             float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
             int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
         } ld_params;
-
         // Convergence parameters
         struct {
             int maxIterations = 10;
@@ -60,6 +56,7 @@ namespace bayesnet {
             "max_iterations", "verbose_convergence"
         };
     private:
+        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
         std::vector<int> factorize(const std::vector<std::string>& labels_t);
         std::vector<std::string>& notes; // Notes during fit from BaseClassifier
         torch::Tensor& pDataset; // (n+1)xm tensor
diff --git a/bayesnet/ensembles/AODELd.cc b/bayesnet/ensembles/AODELd.cc
index 4f0f0cd..991e5b4 100644
--- a/bayesnet/ensembles/AODELd.cc
+++ b/bayesnet/ensembles/AODELd.cc
@@ -19,7 +19,7 @@ namespace bayesnet {
         Xf = X_;
         y = y_;
         // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
-        states = fit_local_discretization(y);
+        states = fit_local_discretization(y, states_);
         // We have discretized the input data
         // 1st we need to fit the model to build the normal AODE structure, Ensemble::fit
         // calls buildModel to initialize the base models
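
Reviewer note (not part of the patch): the sketch below illustrates the states convention that the reworked fit_local_discretization(y, states) appears to rely on, inferred only from the hunks above. An empty vector marks a numeric feature that will be discretized; a non-empty vector marks a categorical (already discretized) feature whose column is copied into pDataset unchanged. Feature names and values here are invented for illustration, and this is standalone demo code, not bayesnet library code.

    // Sketch only: demonstrates the assumed states convention; not library code.
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
        std::map<std::string, std::vector<int>> states;

        // Numeric feature: empty vector -> a discretizer would be fitted for it
        // and the discovered states (0..n_cuts) filled in afterwards.
        states["petal_length"] = {};

        // Categorical feature: states already known -> the column would be copied
        // through as-is and no discretizer fitted.
        states["color"] = { 0, 1, 2 };

        for (const auto& [feature, featureStates] : states) {
            std::cout << feature << ": "
                      << (featureStates.empty() ? "numeric, will be discretized"
                                                : "categorical, copied as-is")
                      << "\n";
        }
        return 0;
    }

With this convention, AODELd can forward its caller-provided map (states_ in the hunk above), so categorical columns keep their original encoding instead of being re-discretized, which is the behavior change described in the CHANGELOG entry.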