Update fimdlp version and change tests

Restart proposal
Fix adult numeric features mistake
2025-08-22 20:16:15 +02:00 · 2025-08-21 19:20:03 +02:00 · 2025-08-21 19:01:10 +02:00 · 2025-08-21 12:56:41 +02:00 · 2025-08-21 01:21:24 +02:00 · 2025-08-19 12:29:54 +02:00
14 changed files with 49143 additions and 31 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -16,7 +16,7 @@
            "name": "test",
            "program": "${workspaceFolder}/build_Debug/tests/TestBayesNet",
            "args": [
-                "[XBAODE]"
+                "Test Dataset Loading"
            ],
            "cwd": "${workspaceFolder}/build_Debug/tests"
        },
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [1.2.2] - 2025-08-19
 ### Fixed
 - Fixed an issue with local discretization that was discretizing all features whether they were numeric or categorical.
 - Fix testutils to return states for all features:
  - An empty vector is now returned for numeric features.
  - Categorical features now return their unique states.
 ## [1.2.1] - 2025-07-19
 ### Internal
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.27)
 project(bayesnet
-  VERSION 1.2.1
+  VERSION 1.2.2
  DESCRIPTION "Bayesian Network and basic classifiers Library."
  HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
  LANGUAGES CXX
--- a/49
+++ b/49
@@ -21,15 +21,18 @@ sed_command_diagram = 's/Diagram"/Diagram" width="100%" height="100%" /g'
 CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
                 || nproc --all 2>/dev/null \
                 || sysctl -n hw.ncpu)
 # --- Your desired job count: CPUs – 7, but never less than 1 --------------
 JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
 # Colors for output
 GREEN = \033[0;32m
 YELLOW = \033[1;33m
 RED = \033[0;31m
 NC = \033[0m # No Color
 define ClearTests
 	@for t in $(test_targets); do \
 		if [ -f $(f_debug)/tests/$$t ]; then \
-			echo ">>> Cleaning $$t..." ; \
+			echo ">>> Removing $$t..." ; \
 			rm -f $(f_debug)/tests/$$t ; \
 		fi ; \
 	done
@@ -48,6 +51,20 @@ define setup_target
 	@echo ">>> Done"
 endef
 # status_file_folder: recipe macro that prints a one-line status for a build folder.
 #   $(1) - folder path; checked with `-d` for existence, and `$(1)/libbayesnet.a` is checked with `-f`
 #   $(2) - label printed in yellow in front of the two status marks
 # Each check prints a green check mark when it succeeds and a red cross when it fails.
 define status_file_folder
 	@if [ -d $(1) ]; then \
 		st1=" ✅ $(GREEN)"; \
 	else \
 		st1=" ❌ $(RED)"; \
 	fi; \
 	if [ -f $(1)/libbayesnet.a ]; then \
 		st2=" ✅ $(GREEN)"; \
 	else \
 		st2=" ❌ $(RED)"; \
 	fi; \
 	printf "  $(YELLOW)$(2):$(NC) $$st1 Folder $(NC)  $$st2 Library $(NC)\n"
 endef
 setup: ## Install dependencies for tests and coverage
 	@if [ "$(shell uname)" = "Darwin" ]; then \
 		brew install gcovr; \
@@ -61,7 +78,7 @@ setup: ## Install dependencies for tests and coverage
 clean: ## Clean the project
 	@echo ">>> Cleaning the project..."
-	@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fimake 
+	@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
 	@for folder in $(f_release) $(f_debug) vpcpkg_installed install_test ; do \
 		if test -d "$$folder" ; then \
 			echo "- Deleting $$folder folder" ; \
@@ -80,11 +97,12 @@ debug: ## Setup debug version using Conan
 release: ## Setup release version using Conan
 	@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
-buildd: ## Build the debug targets
+buildd: ## Build the debug && test targets
-	cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
+	@cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
 	@cmake --build $(f_debug) -t $(test_targets) --parallel $(JOBS)
 buildr: ## Build the release targets
-	cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
+	@cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
 # Install targets
@@ -241,9 +259,24 @@ sample: ## Build sample with Conan
 	sample/build/bayesnet_sample $(fname) $(model)
 	@echo ">>> Done";
 # info: prints a project summary to the terminal:
 #   - the library version, extracted with grep/sed from the project(bayesnet ...) entry in CMakeLists.txt
 #   - folder/library status for $(f_release) and $(f_debug), via the status_file_folder macro
 #   - the usual build/test commands and the computed number of parallel jobs ($(JOBS))
 info: ## Show project information
 	@version=$$(grep -A1 "project(bayesnet" CMakeLists.txt | grep "VERSION" | sed 's/.*VERSION \([0-9.]*\).*/\1/'); \
 	printf "$(GREEN)BayesNet Library: $(YELLOW)ver. $$version$(NC)\n"
 	@echo ""
 	@printf "$(GREEN)Project folders:$(NC)\n"
 	$(call status_file_folder, $(f_release), "Build\ Release")
 	$(call status_file_folder, $(f_debug), "Build\ Debug\ \ ")
 	@echo ""
 	@printf "$(GREEN)Build commands:$(NC)\n"
 	@printf "   $(YELLOW)make release && make buildr$(NC) - Build library for release\n"
 	@printf "   $(YELLOW)make debug && make buildd$(NC)   - Build library for debug\n"
 	@printf "   $(YELLOW)make test$(NC) - Run tests\n"
 	@printf "   $(YELLOW)Usage:$(NC) make help\n"
 	@echo ""
 	@printf "   $(YELLOW)Parallel Jobs:   $(GREEN)$(JOBS)$(NC)\n"
 # Help target
 # ===========
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
--- a/bayesnet/classifiers/Proposal.cc
+++ b/bayesnet/classifiers/Proposal.cc
@@ -118,7 +118,7 @@ namespace bayesnet {
        }
        return states;
    }
-    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
+    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states_)
    {
        // Discretize the continuous input data and build pDataset (Classifier::dataset)
        int m = Xf.size(1);
@@ -190,7 +190,7 @@ namespace bayesnet {
    )
    {
        // Phase 1: Initial discretization (same as original)
-        auto currentStates = fit_local_discretization(y);
+        auto currentStates = fit_local_discretization(y, initialStates);
        auto previousModel = Network();
        if (convergence_params.verbose) {
--- a/bayesnet/classifiers/Proposal.h
+++ b/bayesnet/classifiers/Proposal.h
@@ -23,9 +23,8 @@ namespace bayesnet {
    protected:
        void checkInput(const torch::Tensor& X, const torch::Tensor& y);
        torch::Tensor prepareX(torch::Tensor& X);
-        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
+        // fit_local_discretization is only called by aodeld
-        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
+        map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
        // Iterative discretization method
        template<typename Classifier>
        map<std::string, std::vector<int>> iterativeLocalDiscretization(
@@ -37,18 +36,15 @@ namespace bayesnet {
            const map<std::string, std::vector<int>>& initialStates,
            const Smoothing_t smoothing
        );
        torch::Tensor Xf; // X continuous nxm tensor
        torch::Tensor y; // y discrete nx1 tensor
        map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
        // MDLP parameters
        struct {
            size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
            float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
            int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
        } ld_params;
        // Convergence parameters
        struct {
            int maxIterations = 10;
@@ -60,6 +56,7 @@ namespace bayesnet {
            "max_iterations", "verbose_convergence"
        };
    private:
        map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
        std::vector<int> factorize(const std::vector<std::string>& labels_t);
        std::vector<std::string>& notes; // Notes during fit from BaseClassifier
        torch::Tensor& pDataset; // (n+1)xm tensor
--- a/bayesnet/ensembles/AODELd.cc
+++ b/bayesnet/ensembles/AODELd.cc
@@ -19,7 +19,7 @@ namespace bayesnet {
        Xf = X_;
        y = y_;
        // Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
-        states = fit_local_discretization(y);
+        states = fit_local_discretization(y, states_);
        // We have discretized the input data
        // 1st we need to fit the model to build the normal AODE structure, Ensemble::fit  
        // calls buildModel to initialize the base models
--- a/conanfile.py
+++ b/conanfile.py
@@ -60,7 +60,7 @@ class BayesNetConan(ConanFile):
        self.requires("libtorch/2.7.1")
        self.requires("nlohmann_json/3.11.3")
        self.requires("folding/1.1.2")  # Custom package
-        self.requires("fimdlp/2.1.1")  # Custom package
+        self.requires("fimdlp/2.1.2")  # Custom package
    def build_requirements(self):
        self.build_requires("cmake/[>=3.27]")
--- a/tests/TestBayesModels.cc
+++ b/tests/TestBayesModels.cc
@@ -20,7 +20,7 @@
 #include "bayesnet/ensembles/AODELd.h"
 #include "bayesnet/ensembles/BoostAODE.h"
-const std::string ACTUAL_VERSION = "1.2.1";
+const std::string ACTUAL_VERSION = "1.2.2";
 TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
 {
@@ -496,3 +496,58 @@ TEST_CASE("Local discretization hyperparameters", "[Models]")
    REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
    REQUIRE(clft.getStatus() == bayesnet::NORMAL);
 }
 // Loads the iris and adult datasets, both discretized and raw, checks the expected
 // tensor dimensions, echoes the first samples and the per-feature state counts,
 // and finally fits a TANLd classifier on the raw adult data.
 TEST_CASE("Test Dataset Loading", "[Datasets]")
 {
    int max_sample = 4;
    RawDatasets dataset("iris", true);

    // Prints a title followed by the first max_sample samples of the dataset currently
    // held in `dataset`; feature values are read as int when `discretized`, as float otherwise.
    const auto show_samples = [&](const std::string& title, int num_features, bool discretized) {
        std::cout << title << std::endl;
        for (int sample = 0; sample < max_sample; ++sample) {
            for (int feature = 0; feature < num_features; ++feature) {
                if (discretized) {
                    std::cout << dataset.Xt[feature][sample].item<int>() << " ";
                } else {
                    std::cout << dataset.Xt[feature][sample].item<float>() << " ";
                }
            }
            std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
        }
    };
    // Prints how many states the current dataset holds for each of the 14 named features.
    const auto show_states = [&](const auto& names) {
        std::cout << "States:" << std::endl;
        for (int idx = 0; idx < 14; ++idx) {
            std::cout << idx << " has " << dataset.states.at(names[idx]).size() << " states." << std::endl;
        }
    };

    // iris, discretized
    REQUIRE(dataset.Xt.size(0) == 4);
    REQUIRE(dataset.Xt.size(1) == 150);
    REQUIRE(dataset.yt.size(0) == 150);
    show_samples("Dataset iris discretized ", 4, true);

    // iris, raw
    dataset = RawDatasets("iris", false);
    show_samples("Dataset iris raw ", 4, false);

    // adult, discretized
    dataset = RawDatasets("adult", true);
    REQUIRE(dataset.Xt.size(0) == 14);
    REQUIRE(dataset.Xt.size(1) == 45222);
    REQUIRE(dataset.yt.size(0) == 45222);
    show_samples("Dataset adult discretized ", 14, true);
    // Feature names are copied here and reused after the dataset is reloaded below.
    auto features = dataset.features;
    show_states(features);

    // adult, raw
    dataset = RawDatasets("adult", false);
    show_samples("Dataset adult raw ", 14, false);
    show_states(features);

    // Fit a locally-discretizing classifier on the raw adult data.
    auto clf = bayesnet::TANLd();
    clf.fit(dataset.Xt, dataset.yt, dataset.features, dataset.className, dataset.states, dataset.smoothing);
 }
--- a/tests/TestModulesVersions.cc
+++ b/tests/TestModulesVersions.cc
@@ -16,7 +16,7 @@
 #include "TestUtils.h"
 std::map<std::string, std::string> modules = {
-    { "mdlp", "2.1.1" },
+    { "mdlp", "2.1.2" },
    { "Folding", "1.1.2" },
    { "json", "3.11" },
    { "ArffFiles", "1.2.1" }
--- a/tests/TestUtils.cc
+++ b/tests/TestUtils.cc
@@ -5,6 +5,7 @@
 // ***************************************************************
 #include <random>
 #include <nlohmann/json.hpp>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
@@ -51,6 +52,7 @@ private:
 RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
 {
    catalog = loadCatalog();
    num_samples = num_samples_;
    shuffle = shuffle_;
    discretize = discretize_;
@@ -62,7 +64,7 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
    nSamples = dataset.size(1);
    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-    classNumStates = discretize ? states.at(className).size() : 0;
+    classNumStates = states.at(className).size();
    auto fold = folding::StratifiedKFold(5, yt, 271);
    auto [train, test] = fold.getFold(0);
    auto train_t = torch::tensor(train);
@@ -78,18 +80,90 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
 // Builds the integer-valued feature columns (appended to Xv) from the raw columns in X
 // and returns, for each feature name, its largest value plus one.
 // Columns flagged in is_numeric are fitted and transformed with CPPFImdlp against yv;
 // all other columns are converted value by value with static_cast<int>.
 map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
 {
    map<std::string, int> maxes;
    auto fimdlp = mdlp::CPPFImdlp();
    for (int i = 0; i < X.size(); i++) {
        mdlp::labels_t xd;
        if (is_numeric.at(i)) {
            // Numeric feature: fit the discretizer on this column, then transform it
            fimdlp.fit(X[i], yv);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
+            xd = fimdlp.transform(X[i]);
        } else {
            // Non-numeric feature: the stored values are cast to int as they are
            std::transform(X[i].begin(), X[i].end(), back_inserter(xd), [](const auto& val) {
                return static_cast<int>(val);
                });
        }
        // NOTE(review): max_element on an empty column would dereference end() - confirm columns are never empty
        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
        Xv.push_back(xd);
    }
    return maxes;
 }
 // Reads the dataset catalog file (Paths::datasets() + "all.txt") and returns, for every
 // dataset name, the indices of its numeric features.
 //
 // Each non-empty line that does not start with '#' has up to three ';'-separated fields:
 //   name[;className[;numericFeatures]]
 // where numericFeatures is "all", "none" or a JSON list of feature indices.
 // The className field is not used by this function.
 //
 // Returned vector per dataset:
 //   { -1 }       all features are numeric (also used when the third field is missing)
 //   { }          no feature is numeric
 //   { i, j, .. } only the listed features are numeric
 //
 // Throws std::invalid_argument if the file cannot be opened, a line has more than three
 // fields, or no entry was read. A malformed JSON list propagates the exception raised by
 // nlohmann::json::parse.
 map<std::string, std::vector<int>> RawDatasets::loadCatalog()
 {
    const std::string catalogPath = Paths::datasets() + "all.txt";
    // Named catalogFile so it does not shadow the `catalog` member.
    std::ifstream catalogFile(catalogPath);
    if (!catalogFile.is_open()) {
        throw std::invalid_argument("Unable to open catalog file. [" + catalogPath + "]");
    }
    // Collect the meaningful lines, skipping blanks and '#' comments.
    std::vector<std::string> entries;
    for (std::string line; std::getline(catalogFile, line);) {
        if (line.empty() || line[0] == '#') {
            continue;
        }
        entries.push_back(line);
    }
    // Case-insensitive ordering. std::tolower is undefined for negative char values,
    // so every character is converted to unsigned char first.
    const auto lowerLess = [](char a, char b) {
        return std::tolower(static_cast<unsigned char>(a)) < std::tolower(static_cast<unsigned char>(b));
    };
    std::sort(entries.begin(), entries.end(), [&lowerLess](const std::string& lhs, const std::string& rhs) {
        return std::lexicographical_compare(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), lowerLess);
        });
    map<std::string, std::vector<int>> catalogNames;
    for (const auto& entry : entries) {
        const std::vector<std::string> tokens = split(entry, ';');
        std::vector<int> numericFeaturesIdx;
        // The token count is validated before any token is accessed.
        switch (tokens.size()) {
            case 1:
            case 2:
                // No numeric-feature field: every feature is treated as numeric.
                numericFeaturesIdx.push_back(-1);
                break;
            case 3:
                {
                    const std::string& numericFeatures = tokens[2];
                    if (numericFeatures == "all") {
                        numericFeaturesIdx.push_back(-1);
                    } else if (numericFeatures != "none") {
                        const auto parsed = nlohmann::json::parse(numericFeatures);
                        for (const auto& f : parsed) {
                            numericFeaturesIdx.push_back(f);
                        }
                    }
                }
                break;
            default:
                throw std::invalid_argument("Invalid catalog file format.");
        }
        catalogNames[tokens[0]] = std::move(numericFeaturesIdx);
    }
    // catalogFile is closed by its destructor.
    if (catalogNames.empty()) {
        throw std::invalid_argument("Catalog is empty. Please check the catalog file.");
    }
    return catalogNames;
 }
 void RawDatasets::loadDataset(const std::string& name, bool class_last)
 {
    auto handler = ShuffleArffFiles(num_samples, shuffle);
@@ -101,6 +175,26 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
    className = handler.getClassName();
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
    is_numeric.clear();
    is_numeric.reserve(features.size());
    auto numericFeaturesIdx = catalog.at(name);
    if (numericFeaturesIdx.empty()) {
        // no numeric features
        is_numeric.assign(features.size(), false);
    } else {
        if (numericFeaturesIdx[0] == -1) {
            // all features are numeric
            is_numeric.assign(features.size(), true);
        } else {
            // some features are numeric
            is_numeric.assign(features.size(), false);
            for (const auto& idx : numericFeaturesIdx) {
                if (idx >= 0 && idx < features.size()) {
                    is_numeric[idx] = true;
                }
            }
        }
    }
    // Discretize Dataset
    auto maxValues = discretizeDataset(X);
    maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
@@ -113,13 +207,23 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
            Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
        }
        states[className] = std::vector<int>(maxValues[className]);
        iota(begin(states.at(className)), end(states.at(className)), 0);
    } else {
        Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
        for (int i = 0; i < features.size(); ++i) {
            Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
            if (!is_numeric.at(i)) {
                states[features[i]] = std::vector<int>(maxValues[features[i]]);
                iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
            } else {
                states[features[i]] = std::vector<int>();
            }
        }
        yt = torch::tensor(yv, torch::kInt32);
        int maxy = *max_element(yv.begin(), yv.end()) + 1;
        states[className] = std::vector<int>(maxy);
    }
    iota(begin(states.at(className)), end(states.at(className)), 0);
    yt = torch::tensor(yv, torch::kInt32);
 }
--- a/tests/TestUtils.h
+++ b/tests/TestUtils.h
@@ -27,7 +27,11 @@ public:
    std::vector<double> weightsv;
    std::vector<string> features;
    std::string className;
    std::vector<bool> is_numeric; // indicates whether each feature is numeric
    map<std::string, std::vector<int>> states;
    // catalog maps each dataset name to the indices of its numeric features:
    // a single -1 entry means all features are numeric, and an empty vector means none are.
    map<std::string, std::vector<int>> catalog;
    int nSamples, classNumStates;
    double epsilon = 1e-5;
    bool discretize;
@@ -65,8 +69,30 @@ private:
            + "classNumStates: " + std::to_string(classNumStates) + "\n"
            + "states: " + states_ + "\n";
    }
    // Returns a copy of str with leading and trailing whitespace (as classified by
    // std::isspace) removed. A string that is empty or all whitespace yields "".
    std::string trim(const std::string& str)
    {
        // std::isspace is undefined for negative values other than EOF, so each
        // character is received as unsigned char rather than widened from plain char.
        const auto notSpace = [](unsigned char ch) {
            return !std::isspace(ch);
            };
        const auto first = std::find_if(str.begin(), str.end(), notSpace);
        // Search backwards, but never past `first`, for the last non-space character.
        const auto last = std::find_if(str.rbegin(), std::string::const_reverse_iterator(first), notSpace).base();
        return std::string(first, last);
    }
    // Cuts text at every occurrence of delimiter and returns the pieces in order,
    // each one passed through trim(). Tokenisation follows std::getline, so an
    // empty text produces no pieces.
    std::vector<std::string> split(const std::string& text, char delimiter)
    {
        std::vector<std::string> pieces;
        std::stringstream stream(text);
        for (std::string piece; std::getline(stream, piece, delimiter);) {
            pieces.push_back(trim(piece));
        }
        return pieces;
    }
    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
    void loadDataset(const std::string& name, bool class_last);
    map<std::string, std::vector<int>> loadCatalog();
 };
 #endif //TEST_UTILS_H
--- a/tests/data/adult.arff
+++ b/tests/data/adult.arff
--- a/tests/data/all.txt
+++ b/tests/data/all.txt
@@ -0,0 +1,27 @@
 adult;class;[0,2,4,10,11,12]
 balance-scale;class; all
 breast-w;Class; all
 diabetes;class; all
 ecoli;class; all
 glass;Type; all
 hayes-roth;class; none
 heart-statlog;class; [0,3,4,7,9,11]
 ionosphere;class; all
 iris;class; all
 kdd_JapaneseVowels;speaker; all
 letter;class; all
 liver-disorders;selector; all
 mfeat-factors;class; all
 mfeat-fourier;class; all
 mfeat-karhunen;class; all
 mfeat-morphological;class; all
 mfeat-zernike;class; all
 optdigits;class; all
 page-blocks;class; all
 pendigits;class; all
 segment;class; all
 sonar;Class; all
 spambase;class; all
 vehicle;Class; all
 waveform-5000;class; all
 wine;class; all
Author	SHA1	Message	Date
Ricardo Montañana Gómez	0c7452e35b	Update fimdlp version and change tests	2025-08-22 20:16:15 +02:00
Ricardo Montañana	74b391907a	Restart proposal	2025-08-21 19:20:03 +02:00
Ricardo Montañana	1aa3b609e5	Fix adult numeric features mistake	2025-08-21 19:01:10 +02:00
Ricardo Montañana	f1a2349245	Fix discretize only numerics in tests	2025-08-21 12:56:41 +02:00
Ricardo Montañana	8578d68c57	Add make info command	2025-08-21 01:21:24 +02:00
Ricardo Montañana	9f9369269a	Fix ld that was discretizing all input features	2025-08-19 12:29:54 +02:00