Compare commits

..

18 Commits

SHA1 Message Date
22e846f47f Fix discrete feature discretizer once they are rediscretized in the algorithm 2025-08-24 12:44:09 +02:00
7c01646726 Complete proposal with only discretizing numeric features 2025-08-24 01:09:25 +02:00
0c7452e35b Update fimdlp version and change tests 2025-08-22 20:16:15 +02:00
74b391907a Restart proposal 2025-08-21 19:20:03 +02:00
1aa3b609e5 Fix adult numeric features mistake 2025-08-21 19:01:10 +02:00
f1a2349245 Fix discretize only numerics in tests 2025-08-21 12:56:41 +02:00
8578d68c57 Add make info command 2025-08-21 01:21:24 +02:00
9f9369269a Fix ld that was discretizing all input features 2025-08-19 12:29:54 +02:00
89142f8997 Update version number 2025-07-19 22:47:32 +02:00
17ee6a909a Merge pull request 'Create version 1.2.1' (#40) from ldi into main
Reviewed-on: #40
2025-07-19 20:42:25 +00:00
56d85b1a43 Update test libraries version number 2025-07-19 22:25:17 +02:00
481c702302 Update libraries versions 2025-07-19 22:12:27 +02:00
3e0b790cfe Update Changelog 2025-07-08 18:57:57 +02:00
e2a0c5f4a5 Add Notes to Proposal convergence 2025-07-08 18:50:09 +02:00
aa77745e55 Fix TANLd valid_hyperparameters 2025-07-08 17:28:27 +02:00
e5227c5f4b Add dataset tests to Ld models 2025-07-08 16:07:16 +02:00
ed380b1494 Complete implementation with tests 2025-07-08 11:42:20 +02:00
2c7352ac38 Fix classifier build in proposal 2025-07-07 02:10:08 +02:00
27 changed files with 1018 additions and 261 deletions

View File

@@ -1,4 +1,5 @@
{
"sonarCloudOrganization": "rmontanana",
"projectKey": "rmontanana_BayesNet"
"projectKey": "rmontanana_BayesNet",
"region": "EU"
}

2
.vscode/launch.json vendored
View File

@@ -16,7 +16,7 @@
"name": "test",
"program": "${workspaceFolder}/build_Debug/tests/TestBayesNet",
"args": [
"[XBAODE]"
"Test Dataset Loading"
],
"cwd": "${workspaceFolder}/build_Debug/tests"
},

View File

@@ -5,9 +5,26 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.2.2] - 2025-08-19
## [1.2.0] - 2025-06-30
### Fixed
- Fixed an issue with local discretization that was discretizing all features whether they were numeric or categorical.
- Fix testutils to return states for all features (see the sketch after this list):
- An empty vector is now returned for numeric features.
- Categorical features now return their unique states.
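For illustration, a minimal sketch of this convention, using the `map<std::string, std::vector<int>>` states type seen throughout the diffs below (feature names here are hypothetical):

    std::map<std::string, std::vector<int>> states;
    states["age"] = {};             // numeric feature: empty vector, discretized during fit
    states["color"] = { 0, 1, 2 };  // categorical feature: its unique states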
## [1.2.1] - 2025-07-19
### Internal
- Update Libtorch to version 2.7.1
- Update libraries versions:
- mdlp: 2.1.1
- Folding: 1.1.2
- ArffFiles: 1.2.1
## [1.2.0] - 2025-07-08
### Internal
@@ -17,6 +34,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- *ld_proposed_cuts*: number of cut points to return.
- *mdlp_min_length*: minimum length of a partition in MDLP algorithm to be evaluated for partition.
- *mdlp_max_depth*: maximum level of recursion in MDLP algorithm.
- *max_iterations*: maximum number of iterations of discretization-build model loop.
- *verbose_convergence*: display status messages during the convergence process (see the sketch after this list).
- Remove vcpkg as a dependency manager, now the library is built with Conan package manager and CMake.
- Add `build_type` option to the sample target in the Makefile to allow building in *Debug* or *Release* mode. Default is *Debug*.
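A hedged example of setting the new convergence hyperparameters from code, mirroring the hyperparameter tests near the end of this diff (values are illustrative):

    auto clf = bayesnet::TANLd();
    clf.setHyperparameters({
        {"ld_proposed_cuts", 7},
        {"mdlp_min_length", 3},
        {"mdlp_max_depth", 5},
        {"max_iterations", 10},
        {"verbose_convergence", false},
    });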

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.27)
project(bayesnet
VERSION 1.2.0
VERSION 1.2.2
DESCRIPTION "Bayesian Network and basic classifiers Library."
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
LANGUAGES CXX

View File

@@ -21,15 +21,18 @@ sed_command_diagram = 's/Diagram"/Diagram" width="100%" height="100%" /g'
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|| nproc --all 2>/dev/null \
|| sysctl -n hw.ncpu)
# --- Your desired job count: CPUs - 7, but never less than 1 --------------
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
# Colors for output
GREEN = \033[0;32m
YELLOW = \033[1;33m
RED = \033[0;31m
NC = \033[0m # No Color
define ClearTests
@for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \
echo ">>> Cleaning $$t..." ; \
echo ">>> Removing $$t..." ; \
rm -f $(f_debug)/tests/$$t ; \
fi ; \
done
@@ -48,6 +51,20 @@ define setup_target
@echo ">>> Done"
endef
define status_file_folder
@if [ -d $(1) ]; then \
st1="$(GREEN)"; \
else \
st1="$(RED)"; \
fi; \
if [ -f $(1)/libbayesnet.a ]; then \
st2="$(GREEN)"; \
else \
st2="$(RED)"; \
fi; \
printf " $(YELLOW)$(2):$(NC) $$st1 Folder $(NC) $$st2 Library $(NC)\n"
endef
setup: ## Install dependencies for tests and coverage
@if [ "$(shell uname)" = "Darwin" ]; then \
brew install gcovr; \
@@ -61,12 +78,12 @@ setup: ## Install dependencies for tests and coverage
clean: ## Clean the project
@echo ">>> Cleaning the project..."
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fimake
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
@for folder in $(f_release) $(f_debug) vpcpkg_installed install_test ; do \
if test -d "$$folder" ; then \
echo "- Deleting $$folder folder" ; \
rm -rf "$$folder"; \
fi; \
if test -d "$$folder" ; then \
echo "- Deleting $$folder folder" ; \
rm -rf "$$folder"; \
fi; \
done
@$(MAKE) clean-test
@echo ">>> Done";
@@ -80,11 +97,12 @@ debug: ## Setup debug version using Conan
release: ## Setup release version using Conan
@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
buildd: ## Build the debug targets
cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
buildd: ## Build the debug && test targets
@cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
@cmake --build $(f_debug) -t $(test_targets) --parallel $(JOBS)
buildr: ## Build the release targets
cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
@cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
# Install targets
@@ -237,13 +255,28 @@ sample: ## Build sample with Conan
@if [ -d ./sample/build ]; then rm -rf ./sample/build; fi
@cd sample && conan install . --output-folder=build --build=missing -s build_type=$(build_type) -o "&:enable_coverage=False" -o "&:enable_testing=False"
@cd sample && cmake -B build -S . -DCMAKE_BUILD_TYPE=$(build_type) -DCMAKE_TOOLCHAIN_FILE=build/conan_toolchain.cmake && \
cmake --build build -t bayesnet_sample
cmake --build build -t bayesnet_sample --parallel $(JOBS)
sample/build/bayesnet_sample $(fname) $(model)
@echo ">>> Done";
info: ## Show project information
@version=$$(grep -A1 "project(bayesnet" CMakeLists.txt | grep "VERSION" | sed 's/.*VERSION \([0-9.]*\).*/\1/'); \
printf "$(GREEN)BayesNet Library: $(YELLOW)ver. $$version$(NC)\n"
@echo ""
@printf "$(GREEN)Project folders:$(NC)\n"
$(call status_file_folder, $(f_release), "Build\ Release")
$(call status_file_folder, $(f_debug), "Build\ Debug\ \ ")
@echo ""
@printf "$(GREEN)Build commands:$(NC)\n"
@printf " $(YELLOW)make release && make buildr$(NC) - Build library for release\n"
@printf " $(YELLOW)make debug && make buildd$(NC) - Build library for debug\n"
@printf " $(YELLOW)make test$(NC) - Run tests\n"
@printf " $(YELLOW)Usage:$(NC) make help\n"
@echo ""
@printf " $(YELLOW)Parallel Jobs: $(GREEN)$(JOBS)$(NC)\n"
# Help target
# ===========
help: ## Show help message
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

View File

@@ -8,7 +8,7 @@
[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_BayesNet&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=rmontanana_BayesNet)
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/Doctorado-ML/BayesNet)
![Gitea Last Commit](https://img.shields.io/gitea/last-commit/rmontanana/bayesnet?gitea_url=https://gitea.rmontanana.es&logo=gitea)
[![Coverage Badge](https://img.shields.io/badge/Coverage-98,0%25-green)](https://gitea.rmontanana.es/rmontanana/BayesNet)
[![Coverage Badge](https://img.shields.io/badge/Coverage-99,1%25-green)](https://gitea.rmontanana.es/rmontanana/BayesNet)
[![DOI](https://zenodo.org/badge/667782806.svg)](https://doi.org/10.5281/zenodo.14210344)
Bayesian Network Classifiers library

View File

@@ -37,6 +37,7 @@ namespace bayesnet {
std::vector<std::string> getNotes() const override { return notes; }
std::string dump_cpt() const override;
void setHyperparameters(const nlohmann::json& hyperparameters) override; //For classifiers that don't have hyperparameters
Network& getModel() { return model; }
protected:
bool fitted;
unsigned int m, n; // m: number of samples, n: number of features
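The new accessor lets Proposal, which is not itself a Classifier, inspect the trained network; the convergence loop later in this diff uses it as follows (quoted pattern, not new API):

    // From Proposal::iterativeLocalDiscretization below:
    if (iteration > 0 && previousModel == classifier->getModel()) { /* converged */ }
    previousModel = classifier->getModel();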

View File

@@ -5,41 +5,39 @@
// ***************************************************************
#include "KDBLd.h"
#include <memory>
namespace bayesnet {
KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className)
KDBLd::KDBLd(int k) : KDB(k), Proposal(dataset, features, className, KDB::notes)
{
validHyperparameters = validHyperparameters_ld;
validHyperparameters.push_back("k");
validHyperparameters.push_back("theta");
}
void KDBLd::setHyperparameters(const nlohmann::json& hyperparameters_)
{
auto hyperparameters = hyperparameters_;
if (hyperparameters.contains("k")) {
k = hyperparameters["k"];
hyperparameters.erase("k");
}
if (hyperparameters.contains("theta")) {
theta = hyperparameters["theta"];
hyperparameters.erase("theta");
}
Proposal::setHyperparameters(hyperparameters);
}
KDBLd& KDBLd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;
y = y_;
// Use iterative local discretization instead of the two-phase approach
states = iterativeLocalDiscretization(y, this, dataset, features, className, states_, smoothing);
// Final fit with converged discretization
return commonFit(features_, className_, states_, smoothing);
}
KDBLd& KDBLd::fit(torch::Tensor& dataset, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
if (!torch::is_floating_point(dataset)) {
throw std::runtime_error("Dataset must be a floating point tensor");
}
Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
y = dataset.index({ -1, "..." }).clone().to(torch::kInt32);
return commonFit(features_, className_, states_, smoothing);
}
KDBLd& KDBLd::commonFit(const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
features = features_;
className = className_;
states = iterativeLocalDiscretization(y, static_cast<KDB*>(this), dataset, features, className, states_, smoothing);
KDB::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}
torch::Tensor KDBLd::predict(torch::Tensor& X)
@@ -56,4 +54,4 @@ namespace bayesnet {
{
return KDB::graph(name);
}
}
}
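A usage sketch of the two fit entry points above (names, values, the include path, and the smoothing enumerator are assumptions for illustration):

    #include <torch/torch.h>
    #include "bayesnet/classifiers/KDBLd.h"  // assumed include path

    void exampleFit() {  // hypothetical driver
        auto X = torch::rand({ 4, 150 });                             // 4 numeric features
        auto y = torch::randint(0, 3, { 150 }).to(torch::kInt32);
        std::vector<std::string> features{ "f0", "f1", "f2", "f3" };  // hypothetical names
        std::string className = "class";
        std::map<std::string, std::vector<int>> states;               // empty vector => numeric
        for (const auto& f : features) states[f] = {};
        states[className] = { 0, 1, 2 };
        auto clf = bayesnet::KDBLd(2);
        // (a) X/y overload:
        clf.fit(X, y, features, className, states, bayesnet::Smoothing_t::ORIGINAL);
        // (b) single-tensor overload: class in the last row, floating point required or it throws
        auto dataset = torch::cat({ X, y.to(torch::kFloat32).unsqueeze(0) });
        clf.fit(dataset, features, className, states, bayesnet::Smoothing_t::ORIGINAL);
    }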

View File

@@ -15,8 +15,15 @@ namespace bayesnet {
explicit KDBLd(int k);
virtual ~KDBLd() = default;
KDBLd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
KDBLd& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
KDBLd& commonFit(const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing);
std::vector<std::string> graph(const std::string& name = "KDB") const override;
void setHyperparameters(const nlohmann::json& hyperparameters_) override;
void setHyperparameters(const nlohmann::json& hyperparameters_) override
{
auto hyperparameters = hyperparameters_;
Proposal::setHyperparameters(hyperparameters);
KDB::setHyperparameters(hyperparameters);
}
torch::Tensor predict(torch::Tensor& X) override;
torch::Tensor predict_proba(torch::Tensor& X) override;
static inline std::string version() { return "0.0.1"; };
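Since Proposal now erases only the keys it owns, a derived class can accept a mixed set in one call; e.g. (usage sketch mirroring the hyperparameter tests below):

    auto clf = bayesnet::KDBLd(2);
    clf.setHyperparameters({
        {"k", 3},                 // consumed by KDB
        {"theta", 1e-4},          // consumed by KDB
        {"ld_proposed_cuts", 7},  // consumed by Proposal
    });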

View File

@@ -8,14 +8,19 @@
#include <iostream>
#include <cmath>
#include <limits>
#include "Classifier.h"
#include "KDB.h"
#include "TAN.h"
#include "SPODE.h"
#include "KDBLd.h"
#include "TANLd.h"
namespace bayesnet {
Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_) : pDataset(dataset_), pFeatures(features_), pClassName(className_)
Proposal::Proposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_, std::vector<std::string>& notes_) : pDataset(dataset_), pFeatures(features_), pClassName(className_), notes(notes_)
{
}
void Proposal::setHyperparameters(const nlohmann::json& hyperparameters_)
void Proposal::setHyperparameters(nlohmann::json& hyperparameters)
{
auto hyperparameters = hyperparameters_;
if (hyperparameters.contains("ld_proposed_cuts")) {
ld_params.proposed_cuts = hyperparameters["ld_proposed_cuts"];
hyperparameters.erase("ld_proposed_cuts");
@@ -50,9 +55,6 @@ namespace bayesnet {
convergence_params.verbose = hyperparameters["verbose_convergence"];
hyperparameters.erase("verbose_convergence");
}
if (!hyperparameters.empty()) {
throw std::invalid_argument("Invalid hyperparameters for Proposal: " + hyperparameters.dump());
}
}
void Proposal::checkInput(const torch::Tensor& X, const torch::Tensor& y)
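A note on the removed guard: Proposal::setHyperparameters now takes the json by non-const reference, consumes only its own keys, and leaves anything it does not recognize in place, so the derived classifier can validate the remainder. The pattern, as adopted by the Ld headers in this diff (SomeLd and Base are stand-ins for the concrete classes):

    void SomeLd::setHyperparameters(const nlohmann::json& hyperparameters_) {
        auto hyperparameters = hyperparameters_;       // local mutable copy
        Proposal::setHyperparameters(hyperparameters); // erases ld_* / mdlp_* / convergence keys
        Base::setHyperparameters(hyperparameters);     // validates whatever remains
    }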
@@ -99,6 +101,9 @@ namespace bayesnet {
auto xvf_ptr = Xf.index({ index }).data_ptr<float>();
auto xvf = std::vector<mdlp::precision_t>(xvf_ptr, xvf_ptr + Xf.size(1));
discretizers[feature]->fit(xvf, yxv);
// Enable the discretizer at predict time: there is now a discretizer fitted for this feature,
// whether or not it started out as a numeric feature
wasNumeric[index] = true;
}
if (upgrade) {
// Discretize again X (only the affected indices) with the new fitted discretizers
@@ -116,17 +121,20 @@ namespace bayesnet {
}
return states;
}
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
{
// Discretize the continuous input data and build pDataset (Classifier::dataset)
// We expect states to hold an empty vector for numeric features and a vector of states for categorical features
int m = Xf.size(1);
int n = Xf.size(0);
map<std::string, std::vector<int>> states;
pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
// discretize input data by feature(row)
std::unique_ptr<mdlp::Discretizer> discretizer;
wasNumeric.resize(pFeatures.size());
for (auto i = 0; i < pFeatures.size(); ++i) {
auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
if (discretizationType == discretization_t::BINQ) {
discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
} else if (discretizationType == discretization_t::BINU) {
@@ -134,13 +142,19 @@ namespace bayesnet {
} else { // Default is MDLP
discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
}
auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
discretizer->fit(Xt, yv);
pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates;
if (states[pFeatures[i]].empty()) {
// If the feature is numeric, we discretize it
pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
int n_states = discretizer->getCutPoints().size() + 1;
auto xStates = std::vector<int>(n_states);
iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates;
wasNumeric[i] = true;
} else {
wasNumeric[i] = false;
// If the feature is categorical, we just copy it
pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
}
discretizers[pFeatures[i]] = std::move(discretizer);
}
int n_classes = torch::max(y).item<int>() + 1;
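Why cut points + 1: c cut points split a numeric feature into c + 1 bins, so the states vector is iota-filled with 0..c. A standalone sketch of that accounting, using the default mdlp::CPPFImdlp constructor as the test utilities further down do (header name assumed):

    #include <numeric>
    #include <vector>
    #include "CPPFImdlp.h"  // assumed fimdlp header

    std::vector<int> statesFor(std::vector<float>& X, std::vector<int>& y) {
        mdlp::CPPFImdlp disc;                            // default MDLP parameters
        disc.fit(X, y);
        int n_states = disc.getCutPoints().size() + 1;   // c cuts -> c + 1 bins
        std::vector<int> states(n_states);
        std::iota(states.begin(), states.end(), 0);      // {0, 1, ..., c}
        return states;
    }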
@@ -155,8 +169,13 @@ namespace bayesnet {
auto Xtd = torch::zeros_like(X, torch::kInt32);
for (int i = 0; i < X.size(0); ++i) {
auto Xt = std::vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1));
auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
std::vector<int> Xd;
if (wasNumeric[i]) {
auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
} else {
Xtd.index_put_({ i }, Xf[i].to(torch::kInt32));
}
}
return Xtd;
}
@@ -180,7 +199,7 @@ namespace bayesnet {
map<std::string, std::vector<int>> Proposal::iterativeLocalDiscretization(
const torch::Tensor& y,
Classifier* classifier,
const torch::Tensor& dataset,
torch::Tensor& dataset,
const std::vector<std::string>& features,
const std::string& className,
const map<std::string, std::vector<int>>& initialStates,
@@ -188,7 +207,7 @@ namespace bayesnet {
)
{
// Phase 1: Initial discretization (same as original)
auto currentStates = fit_local_discretization(y);
auto currentStates = fit_local_discretization(y, initialStates);
auto previousModel = Network();
if (convergence_params.verbose) {
@@ -196,73 +215,44 @@ namespace bayesnet {
<< convergence_params.maxIterations << " max iterations" << std::endl;
}
const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble);
for (int iteration = 0; iteration < convergence_params.maxIterations; ++iteration) {
if (convergence_params.verbose) {
std::cout << "Iteration " << (iteration + 1) << "/" << convergence_params.maxIterations << std::endl;
}
// Phase 2: Build model with current discretization
classifier->fit(dataset, features, className, currentStates, smoothing);
classifier->fit(dataset, features, className, currentStates, weights, smoothing);
// Phase 3: Network-aware discretization refinement
currentStates = localDiscretizationProposal(currentStates, classifier->model);
currentStates = localDiscretizationProposal(currentStates, classifier->getModel());
// Check convergence
if (iteration > 0 && previousModel == classifier->model) {
if (iteration > 0 && previousModel == classifier->getModel()) {
if (convergence_params.verbose) {
std::cout << "Converged after " << (iteration + 1) << " iterations" << std::endl;
}
notes.push_back("Converged after " + std::to_string(iteration + 1) + " of "
+ std::to_string(convergence_params.maxIterations) + " iterations");
break;
}
// Update for next iteration
previousModel = classifier->model;
previousModel = classifier->getModel();
}
return currentStates;
}
double Proposal::computeLogLikelihood(Network& model, const torch::Tensor& dataset)
{
double logLikelihood = 0.0;
int n_samples = dataset.size(0);
int n_features = dataset.size(1);
for (int i = 0; i < n_samples; ++i) {
double sampleLogLikelihood = 0.0;
// Get class value for this sample
int classValue = dataset[i][n_features - 1].item<int>();
// Compute log-likelihood for each feature given its parents and class
for (const auto& node : model.getNodes()) {
if (node.first == model.getClassName()) {
// For class node, add log P(class)
auto classCounts = node.second->getCPT();
double classProb = classCounts[classValue].item<double>() / dataset.size(0);
sampleLogLikelihood += std::log(std::max(classProb, 1e-10));
} else {
// For feature nodes, add log P(feature | parents, class)
int featureIdx = std::distance(model.getFeatures().begin(),
std::find(model.getFeatures().begin(),
model.getFeatures().end(),
node.first));
int featureValue = dataset[i][featureIdx].item<int>();
// Simplified probability computation - in practice would need full CPT lookup
double featureProb = 0.1; // Placeholder - would compute from CPT
sampleLogLikelihood += std::log(std::max(featureProb, 1e-10));
}
}
logLikelihood += sampleLogLikelihood;
}
return logLikelihood;
}
// Explicit template instantiation for common classifier types
// template map<std::string, std::vector<int>> Proposal::iterativeLocalDiscretization<Classifier>(
// const torch::Tensor&, Classifier*, const torch::Tensor&, const std::vector<std::string>&,
// const std::string&, const map<std::string, std::vector<int>>&, Smoothing_t);
template map<std::string, std::vector<int>> Proposal::iterativeLocalDiscretization<KDB>(
const torch::Tensor&, KDB*, torch::Tensor&, const std::vector<std::string>&,
const std::string&, const map<std::string, std::vector<int>>&, Smoothing_t);
template map<std::string, std::vector<int>> Proposal::iterativeLocalDiscretization<TAN>(
const torch::Tensor&, TAN*, torch::Tensor&, const std::vector<std::string>&,
const std::string&, const map<std::string, std::vector<int>>&, Smoothing_t);
template map<std::string, std::vector<int>> Proposal::iterativeLocalDiscretization<SPODE>(
const torch::Tensor&, SPODE*, torch::Tensor&, const std::vector<std::string>&,
const std::string&, const map<std::string, std::vector<int>>&, Smoothing_t);
}
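In outline, the iterative scheme implemented above (a paraphrase of the loop, not new API):

    // 1. states <- initial per-feature local discretization (fit_local_discretization)
    // 2. for iteration in 1..max_iterations:
    //      fit the classifier on (dataset, states, weights)
    //      states <- localDiscretizationProposal(states, classifier->getModel())
    //      if the learned network equals the previous one:
    //          record "Converged after N of M iterations" in notes and stop
    //      previous network <- classifier->getModel()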

View File

@@ -18,37 +18,33 @@
namespace bayesnet {
class Proposal {
public:
Proposal(torch::Tensor& pDataset, std::vector<std::string>& features_, std::string& className_);
void setHyperparameters(const nlohmann::json& hyperparameters_);
Proposal(torch::Tensor& pDataset, std::vector<std::string>& features_, std::string& className_, std::vector<std::string>& notes);
void setHyperparameters(nlohmann::json& hyperparameters_);
protected:
void checkInput(const torch::Tensor& X, const torch::Tensor& y);
torch::Tensor prepareX(torch::Tensor& X);
map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
// fit_local_discretization is only called by AODELd
map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
// Iterative discretization method
template<typename Classifier>
map<std::string, std::vector<int>> iterativeLocalDiscretization(
const torch::Tensor& y,
Classifier* classifier,
const torch::Tensor& dataset,
torch::Tensor& dataset,
const std::vector<std::string>& features,
const std::string& className,
const map<std::string, std::vector<int>>& initialStates,
const Smoothing_t smoothing
);
torch::Tensor Xf; // X continuous nxm tensor
torch::Tensor y; // y discrete nx1 tensor
map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
// MDLP parameters
struct {
size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
} ld_params;
// Convergence parameters
struct {
int maxIterations = 10;
@@ -60,10 +56,12 @@ namespace bayesnet {
"max_iterations", "verbose_convergence"
};
private:
map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
std::vector<int> factorize(const std::vector<std::string>& labels_t);
double computeLogLikelihood(Network& model, const torch::Tensor& dataset);
std::vector<std::string>& notes; // Notes during fit from BaseClassifier
torch::Tensor& pDataset; // (n+1)xm tensor
std::vector<std::string>& pFeatures;
std::vector<bool> wasNumeric;
std::string& pClassName;
enum class discretization_t {
MDLP,

View File

@@ -7,7 +7,7 @@
#include "SPODELd.h"
namespace bayesnet {
SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className)
SPODELd::SPODELd(int root) : SPODE(root), Proposal(dataset, features, className, SPODE::notes)
{
validHyperparameters = validHyperparameters_ld; // Inherits the valid hyperparameters from Proposal
}
@@ -34,12 +34,9 @@ namespace bayesnet {
{
features = features_;
className = className_;
// Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
// We have discretized the input data
// 1st we need to fit the model to build the normal SPODE structure, SPODE::fit initializes the base Bayesian network
states = iterativeLocalDiscretization(y, static_cast<SPODE*>(this), dataset, features, className, states_, smoothing);
SPODE::fit(dataset, features, className, states, smoothing);
states = localDiscretizationProposal(states, model);
fitted = true;
return *this;
}
torch::Tensor SPODELd::predict(torch::Tensor& X)

View File

@@ -18,6 +18,12 @@ namespace bayesnet {
SPODELd& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
SPODELd& commonFit(const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing);
std::vector<std::string> graph(const std::string& name = "SPODELd") const override;
void setHyperparameters(const nlohmann::json& hyperparameters_) override
{
auto hyperparameters = hyperparameters_;
Proposal::setHyperparameters(hyperparameters);
SPODE::setHyperparameters(hyperparameters);
}
torch::Tensor predict(torch::Tensor& X) override;
torch::Tensor predict_proba(torch::Tensor& X) override;
static inline std::string version() { return "0.0.1"; };

View File

@@ -5,23 +5,37 @@
// ***************************************************************
#include "TANLd.h"
#include <memory>
namespace bayesnet {
TANLd::TANLd() : TAN(), Proposal(dataset, features, className) {}
TANLd::TANLd() : TAN(), Proposal(dataset, features, className, TAN::notes)
{
validHyperparameters = validHyperparameters_ld; // Inherits the valid hyperparameters from Proposal
}
TANLd& TANLd::fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
checkInput(X_, y_);
features = features_;
className = className_;
Xf = X_;
y = y_;
// Use iterative local discretization instead of the two-phase approach
states = iterativeLocalDiscretization(y, this, dataset, features, className, states_, smoothing);
// Final fit with converged discretization
return commonFit(features_, className_, states_, smoothing);
}
TANLd& TANLd::fit(torch::Tensor& dataset, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
if (!torch::is_floating_point(dataset)) {
throw std::runtime_error("Dataset must be a floating point tensor");
}
Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
y = dataset.index({ -1, "..." }).clone().to(torch::kInt32);
return commonFit(features_, className_, states_, smoothing);
}
TANLd& TANLd::commonFit(const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing)
{
features = features_;
className = className_;
states = iterativeLocalDiscretization(y, static_cast<TAN*>(this), dataset, features, className, states_, smoothing);
TAN::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}
torch::Tensor TANLd::predict(torch::Tensor& X)
@@ -38,4 +52,4 @@ namespace bayesnet {
{
return TAN::graph(name);
}
}
}

View File

@@ -16,7 +16,15 @@ namespace bayesnet {
TANLd();
virtual ~TANLd() = default;
TANLd& fit(torch::Tensor& X, torch::Tensor& y, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
TANLd& fit(torch::Tensor& dataset, const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing) override;
TANLd& commonFit(const std::vector<std::string>& features, const std::string& className, map<std::string, std::vector<int>>& states, const Smoothing_t smoothing);
std::vector<std::string> graph(const std::string& name = "TANLd") const override;
void setHyperparameters(const nlohmann::json& hyperparameters_) override
{
auto hyperparameters = hyperparameters_;
Proposal::setHyperparameters(hyperparameters);
TAN::setHyperparameters(hyperparameters);
}
torch::Tensor predict(torch::Tensor& X) override;
torch::Tensor predict_proba(torch::Tensor& X) override;
};

View File

@@ -7,7 +7,7 @@
#include "AODELd.h"
namespace bayesnet {
AODELd::AODELd(bool predict_voting) : Ensemble(predict_voting), Proposal(dataset, features, className)
AODELd::AODELd(bool predict_voting) : Ensemble(predict_voting), Proposal(dataset, features, className, Ensemble::notes)
{
validHyperparameters = validHyperparameters_ld; // Inherits the valid hyperparameters from Proposal
}
@@ -19,11 +19,12 @@ namespace bayesnet {
Xf = X_;
y = y_;
// Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
states = fit_local_discretization(y);
states = fit_local_discretization(y, states_);
// We have discretized the input data
// 1st we need to fit the model to build the normal AODE structure, Ensemble::fit
// calls buildModel to initialize the base models
Ensemble::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}

View File

@@ -17,6 +17,10 @@ namespace bayesnet {
virtual ~AODELd() = default;
AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, const std::vector<std::string>& features_, const std::string& className_, map<std::string, std::vector<int>>& states_, const Smoothing_t smoothing) override;
std::vector<std::string> graph(const std::string& name = "AODELd") const override;
void setHyperparameters(const nlohmann::json& hyperparameters_) override
{
hyperparameters = hyperparameters_;
}
protected:
void trainModel(const torch::Tensor& weights, const Smoothing_t smoothing) override;
void buildModel(const torch::Tensor& weights) override;

View File

@@ -3,6 +3,7 @@ from conan import ConanFile
from conan.tools.cmake import CMakeToolchain, CMake, cmake_layout, CMakeDeps
from conan.tools.files import copy
class BayesNetConan(ConanFile):
name = "bayesnet"
settings = "os", "compiler", "build_type", "arch"
@@ -10,26 +11,35 @@ class BayesNetConan(ConanFile):
"shared": [True, False],
"fPIC": [True, False],
"enable_testing": [True, False],
"enable_coverage": [True, False]
"enable_coverage": [True, False],
}
default_options = {
"shared": False,
"fPIC": True,
"enable_testing": False,
"enable_coverage": False
"enable_coverage": False,
}
# Sources are located in the same place as this recipe, copy them to the recipe
exports_sources = "CMakeLists.txt", "bayesnet/*", "config/*", "cmake/*", "docs/*", "tests/*", "bayesnetConfig.cmake.in"
exports_sources = (
"CMakeLists.txt",
"bayesnet/*",
"config/*",
"cmake/*",
"docs/*",
"tests/*",
"bayesnetConfig.cmake.in",
)
def set_version(self) -> None:
cmake = pathlib.Path(self.recipe_folder) / "CMakeLists.txt"
text = cmake.read_text(encoding="utf-8")
text = cmake.read_text(encoding="utf-8")
# Accept either: project(foo VERSION 1.2.3) or set(foo_VERSION 1.2.3)
match = re.search(
r"""project\s*\([^\)]*VERSION\s+([0-9]+\.[0-9]+\.[0-9]+)""",
text, re.IGNORECASE | re.VERBOSE
text,
re.IGNORECASE | re.VERBOSE,
)
if match:
self.version = match.group(1)
@@ -40,26 +50,26 @@ class BayesNetConan(ConanFile):
def config_options(self):
if self.settings.os == "Windows":
del self.options.fPIC
def configure(self):
if self.options.shared:
self.options.rm_safe("fPIC")
def requirements(self):
# Core dependencies
self.requires("libtorch/2.7.0")
self.requires("libtorch/2.7.1")
self.requires("nlohmann_json/3.11.3")
self.requires("folding/1.1.1") # Custom package
self.requires("fimdlp/2.1.0") # Custom package
self.requires("folding/1.1.2") # Custom package
self.requires("fimdlp/2.1.2") # Custom package
def build_requirements(self):
self.build_requires("cmake/[>=3.27]")
self.test_requires("arff-files/1.2.0") # Custom package
self.test_requires("arff-files/1.2.1") # Custom package
self.test_requires("catch2/3.8.1")
def layout(self):
cmake_layout(self)
def generate(self):
deps = CMakeDeps(self)
deps.generate()
@@ -67,27 +77,32 @@ class BayesNetConan(ConanFile):
tc.variables["ENABLE_TESTING"] = self.options.enable_testing
tc.variables["CODE_COVERAGE"] = self.options.enable_coverage
tc.generate()
def build(self):
cmake = CMake(self)
cmake.configure()
cmake.build()
if self.options.enable_testing:
# Run tests only if we're building with testing enabled
self.run("ctest --output-on-failure", cwd=self.build_folder)
def package(self):
copy(self, "LICENSE", src=self.source_folder, dst=os.path.join(self.package_folder, "licenses"))
copy(
self,
"LICENSE",
src=self.source_folder,
dst=os.path.join(self.package_folder, "licenses"),
)
cmake = CMake(self)
cmake.install()
def package_info(self):
self.cpp_info.libs = ["bayesnet"]
self.cpp_info.includedirs = ["include"]
self.cpp_info.set_property("cmake_find_mode", "both")
self.cpp_info.set_property("cmake_target_name", "bayesnet::bayesnet")
# Add compiler flags that might be needed
if self.settings.os == "Linux":
self.cpp_info.system_libs = ["pthread"]
self.cpp_info.system_libs = ["pthread"]

View File

@@ -8,7 +8,7 @@ if(ENABLE_TESTING)
add_executable(TestBayesNet TestBayesNetwork.cc TestBayesNode.cc TestBayesClassifier.cc TestXSPnDE.cc TestXBA2DE.cc
TestBayesModels.cc TestBayesMetrics.cc TestFeatureSelection.cc TestBoostAODE.cc TestXBAODE.cc TestA2DE.cc
TestUtils.cc TestBayesEnsemble.cc TestModulesVersions.cc TestBoostA2DE.cc TestMST.cc TestXSPODE.cc ${BayesNet_SOURCES})
target_link_libraries(TestBayesNet PUBLIC "${TORCH_LIBRARIES}" fimdlp::fimdlp PRIVATE Catch2::Catch2WithMain folding::folding)
target_link_libraries(TestBayesNet PRIVATE torch::torch fimdlp::fimdlp Catch2::Catch2WithMain folding::folding)
add_test(NAME BayesNetworkTest COMMAND TestBayesNet)
add_test(NAME A2DE COMMAND TestBayesNet "[A2DE]")
add_test(NAME BoostA2DE COMMAND TestBayesNet "[BoostA2DE]")

View File

@@ -20,7 +20,7 @@
#include "bayesnet/ensembles/AODELd.h"
#include "bayesnet/ensembles/BoostAODE.h"
const std::string ACTUAL_VERSION = "1.2.0";
const std::string ACTUAL_VERSION = "1.2.2";
TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{
@@ -31,9 +31,9 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{{"diabetes", "SPODE"}, 0.802083},
{{"diabetes", "TAN"}, 0.821615},
{{"diabetes", "AODELd"}, 0.8125f},
{{"diabetes", "KDBLd"}, 0.80208f},
{{"diabetes", "KDBLd"}, 0.804688f},
{{"diabetes", "SPODELd"}, 0.7890625f},
{{"diabetes", "TANLd"}, 0.803385437f},
{{"diabetes", "TANLd"}, 0.8125f},
{{"diabetes", "BoostAODE"}, 0.83984f},
// Ecoli
{{"ecoli", "AODE"}, 0.889881},
@@ -42,9 +42,9 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{{"ecoli", "SPODE"}, 0.880952},
{{"ecoli", "TAN"}, 0.892857},
{{"ecoli", "AODELd"}, 0.875f},
{{"ecoli", "KDBLd"}, 0.880952358f},
{{"ecoli", "KDBLd"}, 0.872024f},
{{"ecoli", "SPODELd"}, 0.839285731f},
{{"ecoli", "TANLd"}, 0.848214269f},
{{"ecoli", "TANLd"}, 0.869047642f},
{{"ecoli", "BoostAODE"}, 0.89583f},
// Glass
{{"glass", "AODE"}, 0.79439},
@@ -53,9 +53,9 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{{"glass", "SPODE"}, 0.775701},
{{"glass", "TAN"}, 0.827103},
{{"glass", "AODELd"}, 0.799065411f},
{{"glass", "KDBLd"}, 0.82710278f},
{{"glass", "KDBLd"}, 0.864485979f},
{{"glass", "SPODELd"}, 0.780373812f},
{{"glass", "TANLd"}, 0.869158864f},
{{"glass", "TANLd"}, 0.831775725f},
{{"glass", "BoostAODE"}, 0.84579f},
// Iris
{{"iris", "AODE"}, 0.973333},
@@ -68,29 +68,29 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{{"iris", "SPODELd"}, 0.96f},
{{"iris", "TANLd"}, 0.97333f},
{{"iris", "BoostAODE"}, 0.98f} };
std::map<std::string, bayesnet::BaseClassifier*> models{ {"AODE", new bayesnet::AODE()},
{"AODELd", new bayesnet::AODELd()},
{"BoostAODE", new bayesnet::BoostAODE()},
{"KDB", new bayesnet::KDB(2)},
{"KDBLd", new bayesnet::KDBLd(2)},
{"XSPODE", new bayesnet::XSpode(1)},
{"SPODE", new bayesnet::SPODE(1)},
{"SPODELd", new bayesnet::SPODELd(1)},
{"TAN", new bayesnet::TAN()},
{"TANLd", new bayesnet::TANLd()} };
std::map<std::string, std::unique_ptr<bayesnet::BaseClassifier>> models;
models["AODE"] = std::make_unique<bayesnet::AODE>();
models["AODELd"] = std::make_unique<bayesnet::AODELd>();
models["BoostAODE"] = std::make_unique<bayesnet::BoostAODE>();
models["KDB"] = std::make_unique<bayesnet::KDB>(2);
models["KDBLd"] = std::make_unique<bayesnet::KDBLd>(2);
models["XSPODE"] = std::make_unique<bayesnet::XSpode>(1);
models["SPODE"] = std::make_unique<bayesnet::SPODE>(1);
models["SPODELd"] = std::make_unique<bayesnet::SPODELd>(1);
models["TAN"] = std::make_unique<bayesnet::TAN>();
models["TANLd"] = std::make_unique<bayesnet::TANLd>();
std::string name = GENERATE("AODE", "AODELd", "KDB", "KDBLd", "SPODE", "XSPODE", "SPODELd", "TAN", "TANLd");
auto clf = models[name];
auto clf = std::move(models[name]);
SECTION("Test " + name + " classifier")
{
for (const std::string& file_name : { "glass", "iris", "ecoli", "diabetes" }) {
auto clf = models[name];
auto discretize = name.substr(name.length() - 2) != "Ld";
auto raw = RawDatasets(file_name, discretize);
clf->fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto score = clf->score(raw.Xt, raw.yt);
// std::cout << "Classifier: " << name << " File: " << file_name << " Score: " << score << " expected = " <<
// scores[{file_name, name}] << std::endl;
// scores[{file_name, name}] << std::endl;
INFO("Classifier: " << name << " File: " << file_name);
REQUIRE(score == Catch::Approx(scores[{file_name, name}]).epsilon(raw.epsilon));
REQUIRE(clf->getStatus() == bayesnet::NORMAL);
@@ -101,7 +101,6 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
INFO("Checking version of " << name << " classifier");
REQUIRE(clf->getVersion() == ACTUAL_VERSION);
}
delete clf;
}
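The ownership change above, distilled (a sketch; the include path is assumed):

    #include <map>
    #include <memory>
    #include "bayesnet/classifiers/TAN.h"  // assumed include path

    void exampleOwnership() {  // hypothetical driver
        std::map<std::string, std::unique_ptr<bayesnet::BaseClassifier>> models;
        models["TAN"] = std::make_unique<bayesnet::TAN>();
        auto clf = std::move(models["TAN"]);  // take ownership; the map entry becomes null
        // ... clf->fit(...) / clf->score(...) as before ...
    }  // no explicit delete: the unique_ptr frees the classifier here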
TEST_CASE("Models features & Graph", "[Models]")
{
@@ -133,7 +132,7 @@ TEST_CASE("Models features & Graph", "[Models]")
clf.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
REQUIRE(clf.getNumberOfNodes() == 5);
REQUIRE(clf.getNumberOfEdges() == 7);
REQUIRE(clf.getNumberOfStates() == 27);
REQUIRE(clf.getNumberOfStates() == 26);
REQUIRE(clf.getClassNumStates() == 3);
REQUIRE(clf.show() == std::vector<std::string>{"class -> sepallength, sepalwidth, petallength, petalwidth, ",
"petallength -> sepallength, ", "petalwidth -> ",
@@ -149,7 +148,6 @@ TEST_CASE("Get num features & num edges", "[Models]")
REQUIRE(clf.getNumberOfNodes() == 5);
REQUIRE(clf.getNumberOfEdges() == 8);
}
TEST_CASE("Model predict_proba", "[Models]")
{
std::string model = GENERATE("TAN", "SPODE", "BoostAODEproba", "BoostAODEvoting", "TANLd", "SPODELd", "KDBLd");
@@ -180,15 +178,15 @@ TEST_CASE("Model predict_proba", "[Models]")
{0.0284828, 0.770524, 0.200993},
{0.0213182, 0.857189, 0.121493},
{0.00868436, 0.949494, 0.0418215} });
auto res_prob_tanld = std::vector<std::vector<double>>({ {0.000544493, 0.995796, 0.00365992 },
{0.000908092, 0.997268, 0.00182429 },
{0.000908092, 0.997268, 0.00182429 },
{0.000908092, 0.997268, 0.00182429 },
{0.00228423, 0.994645, 0.00307078 },
{0.00120539, 0.0666788, 0.932116 },
{0.00361847, 0.979203, 0.017179 },
{0.00483293, 0.985326, 0.00984064 },
{0.000595606, 0.9977, 0.00170441 } });
auto res_prob_tanld = std::vector<std::vector<double>>({ {0.000597557, 0.9957, 0.00370254},
{0.000731377, 0.997914, 0.0013544},
{0.000731377, 0.997914, 0.0013544},
{0.000731377, 0.997914, 0.0013544},
{0.000838614, 0.998122, 0.00103923},
{0.00130852, 0.0659492, 0.932742},
{0.00365946, 0.979412, 0.0169281},
{0.00435035, 0.986248, 0.00940212},
{0.000583815, 0.997746, 0.00167066} });
auto res_prob_spodeld = std::vector<std::vector<double>>({ {0.000908024, 0.993742, 0.00535024 },
{0.00187726, 0.99167, 0.00645308 },
{0.00187726, 0.99167, 0.00645308 },
@@ -216,29 +214,33 @@ TEST_CASE("Model predict_proba", "[Models]")
{"TANLd", res_prob_tanld},
{"SPODELd", res_prob_spodeld},
{"KDBLd", res_prob_kdbld} };
std::map<std::string, bayesnet::BaseClassifier*> models{ {"TAN", new bayesnet::TAN()},
{"SPODE", new bayesnet::SPODE(0)},
{"BoostAODEproba", new bayesnet::BoostAODE(false)},
{"BoostAODEvoting", new bayesnet::BoostAODE(true)},
{"TANLd", new bayesnet::TANLd()},
{"SPODELd", new bayesnet::SPODELd(0)},
{"KDBLd", new bayesnet::KDBLd(2)} };
std::map<std::string, std::unique_ptr<bayesnet::BaseClassifier>> models;
models["TAN"] = std::make_unique<bayesnet::TAN>();
models["SPODE"] = std::make_unique<bayesnet::SPODE>(0);
models["BoostAODEproba"] = std::make_unique<bayesnet::BoostAODE>(false);
models["BoostAODEvoting"] = std::make_unique<bayesnet::BoostAODE>(true);
models["TANLd"] = std::make_unique<bayesnet::TANLd>();
models["SPODELd"] = std::make_unique<bayesnet::SPODELd>(0);
models["KDBLd"] = std::make_unique<bayesnet::KDBLd>(2);
int init_index = 78;
SECTION("Test " + model + " predict_proba")
{
INFO("Testing " << model << " predict_proba");
auto ld_model = model.substr(model.length() - 2) == "Ld";
auto discretize = !ld_model;
auto raw = RawDatasets("iris", discretize);
auto clf = models[model];
clf->fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto yt_pred_proba = clf->predict_proba(raw.Xt);
auto yt_pred = clf->predict(raw.Xt);
auto& clf = *models[model];
clf.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto yt_pred_proba = clf.predict_proba(raw.Xt);
auto yt_pred = clf.predict(raw.Xt);
std::vector<int> y_pred;
std::vector<std::vector<double>> y_pred_proba;
if (!ld_model) {
y_pred = clf->predict(raw.Xv);
y_pred_proba = clf->predict_proba(raw.Xv);
y_pred = clf.predict(raw.Xv);
y_pred_proba = clf.predict_proba(raw.Xv);
REQUIRE(y_pred.size() == y_pred_proba.size());
REQUIRE(y_pred.size() == yt_pred.size(0));
REQUIRE(y_pred.size() == yt_pred_proba.size(0));
@@ -267,18 +269,20 @@ TEST_CASE("Model predict_proba", "[Models]")
} else {
// Check predict_proba values for vectors and tensors
auto predictedClasses = yt_pred_proba.argmax(1);
// std::cout << model << std::endl;
for (int i = 0; i < 9; i++) {
REQUIRE(predictedClasses[i].item<int>() == yt_pred[i].item<int>());
// std::cout << "{";
for (int j = 0; j < 3; j++) {
// std::cout << yt_pred_proba[i + init_index][j].item<double>() << ", ";
REQUIRE(res_prob[model][i][j] ==
Catch::Approx(yt_pred_proba[i + init_index][j].item<double>()).epsilon(raw.epsilon));
}
// std::cout << "\b\b}," << std::endl;
}
}
delete clf;
}
}
TEST_CASE("AODE voting-proba", "[Models]")
{
auto raw = RawDatasets("glass", true);
@@ -297,17 +301,30 @@ TEST_CASE("AODE voting-proba", "[Models]")
REQUIRE(pred_proba[67][0] == Catch::Approx(0.702184).epsilon(raw.epsilon));
REQUIRE(clf.topological_order() == std::vector<std::string>());
}
TEST_CASE("SPODELd dataset", "[Models]")
TEST_CASE("Ld models with dataset", "[Models]")
{
auto raw = RawDatasets("iris", false);
auto clf = bayesnet::SPODELd(0);
// raw.dataset.to(torch::kFloat32);
clf.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing);
auto score = clf.score(raw.Xt, raw.yt);
clf.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto scoret = clf.score(raw.Xt, raw.yt);
REQUIRE(score == Catch::Approx(0.97333f).epsilon(raw.epsilon));
REQUIRE(scoret == Catch::Approx(0.97333f).epsilon(raw.epsilon));
auto clf2 = bayesnet::TANLd();
clf2.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing);
auto score2 = clf2.score(raw.Xt, raw.yt);
clf2.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto score2t = clf2.score(raw.Xt, raw.yt);
REQUIRE(score2 == Catch::Approx(0.97333f).epsilon(raw.epsilon));
REQUIRE(score2t == Catch::Approx(0.97333f).epsilon(raw.epsilon));
auto clf3 = bayesnet::KDBLd(2);
clf3.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing);
auto score3 = clf3.score(raw.Xt, raw.yt);
clf3.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
auto score3t = clf3.score(raw.Xt, raw.yt);
REQUIRE(score3 == Catch::Approx(0.97333f).epsilon(raw.epsilon));
REQUIRE(score3t == Catch::Approx(0.97333f).epsilon(raw.epsilon));
}
TEST_CASE("KDB with hyperparameters", "[Models]")
{
@@ -324,11 +341,15 @@ TEST_CASE("KDB with hyperparameters", "[Models]")
REQUIRE(score == Catch::Approx(0.827103).epsilon(raw.epsilon));
REQUIRE(scoret == Catch::Approx(0.761682).epsilon(raw.epsilon));
}
TEST_CASE("Incorrect type of data for SPODELd", "[Models]")
TEST_CASE("Incorrect type of data for Ld models", "[Models]")
{
auto raw = RawDatasets("iris", true);
auto clf = bayesnet::SPODELd(0);
REQUIRE_THROWS_AS(clf.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing), std::runtime_error);
auto clfs = bayesnet::SPODELd(0);
REQUIRE_THROWS_AS(clfs.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing), std::runtime_error);
auto clft = bayesnet::TANLd();
REQUIRE_THROWS_AS(clft.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing), std::runtime_error);
auto clfk = bayesnet::KDBLd(0);
REQUIRE_THROWS_AS(clfk.fit(raw.dataset, raw.features, raw.className, raw.states, raw.smoothing), std::runtime_error);
}
TEST_CASE("Predict, predict_proba & score without fitting", "[Models]")
{
@@ -386,14 +407,15 @@ TEST_CASE("Check proposal checkInput", "[Models]")
{
class testProposal : public bayesnet::Proposal {
public:
testProposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_)
: Proposal(dataset_, features_, className_)
testProposal(torch::Tensor& dataset_, std::vector<std::string>& features_, std::string& className_, std::vector<std::string>& notes_)
: Proposal(dataset_, features_, className_, notes_)
{
}
void test_X_y(const torch::Tensor& X, const torch::Tensor& y) { checkInput(X, y); }
};
auto raw = RawDatasets("iris", true);
auto clf = testProposal(raw.dataset, raw.features, raw.className);
std::vector<std::string> notes;
auto clf = testProposal(raw.dataset, raw.features, raw.className, notes);
torch::Tensor X = torch::randint(0, 3, { 10, 4 });
torch::Tensor y = torch::rand({ 10 });
INFO("Check X is not float");
@@ -428,3 +450,105 @@ TEST_CASE("Check KDB loop detection", "[Models]")
REQUIRE_NOTHROW(clf.test_add_m_edges(features, 0, S, weights));
REQUIRE_NOTHROW(clf.test_add_m_edges(features, 1, S, weights));
}
TEST_CASE("Local discretization hyperparameters", "[Models]")
{
auto raw = RawDatasets("iris", false);
auto clfs = bayesnet::SPODELd(0);
clfs.setHyperparameters({
{"max_iterations", 7},
{"verbose_convergence", true},
});
REQUIRE_NOTHROW(clfs.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clfs.getStatus() == bayesnet::NORMAL);
auto clfk = bayesnet::KDBLd(0);
clfk.setHyperparameters({
{"k", 3},
{"theta", 1e-4},
});
REQUIRE_NOTHROW(clfk.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clfk.getStatus() == bayesnet::NORMAL);
auto clfa = bayesnet::AODELd();
clfa.setHyperparameters({
{"ld_proposed_cuts", 9},
{"ld_algorithm", "BINQ"},
});
REQUIRE_NOTHROW(clfa.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clfa.getStatus() == bayesnet::NORMAL);
auto clft = bayesnet::TANLd();
clft.setHyperparameters({
{"ld_proposed_cuts", 7},
{"mdlp_max_depth", 5},
{"mdlp_min_length", 3},
{"ld_algorithm", "MDLP"},
});
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clft.getStatus() == bayesnet::NORMAL);
clft.setHyperparameters({
{"ld_proposed_cuts", 9},
{"ld_algorithm", "BINQ"},
});
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clft.getStatus() == bayesnet::NORMAL);
clft.setHyperparameters({
{"ld_proposed_cuts", 5},
{"ld_algorithm", "BINU"},
});
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clft.getStatus() == bayesnet::NORMAL);
}
TEST_CASE("Test Dataset Loading", "[Datasets]")
{
int max_sample = 4;
// Test loading a dataset
RawDatasets dataset("iris", true);
REQUIRE(dataset.Xt.size(0) == 4);
REQUIRE(dataset.Xt.size(1) == 150);
REQUIRE(dataset.yt.size(0) == 150);
std::cout << "Dataset iris discretized " << std::endl;
for (int sample = 0; sample < max_sample; sample++) {
for (int feature = 0; feature < 4; feature++) {
std::cout << dataset.Xt[feature][sample].item<int>() << " ";
}
std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
}
dataset = RawDatasets("iris", false);
std::cout << "Dataset iris raw " << std::endl;
for (int sample = 0; sample < max_sample; sample++) {
for (int feature = 0; feature < 4; feature++) {
std::cout << dataset.Xt[feature][sample].item<float>() << " ";
}
std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
}
// Test loading a dataset
dataset = RawDatasets("heart-statlog", true);
REQUIRE(dataset.Xt.size(0) == 13);
REQUIRE(dataset.Xt.size(1) == 270);
REQUIRE(dataset.yt.size(0) == 270);
std::cout << "Dataset heart-statlog discretized " << std::endl;
for (int sample = 0; sample < max_sample; sample++) {
for (int feature = 0; feature < 13; feature++) {
std::cout << dataset.Xt[feature][sample].item<int>() << " ";
}
std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
}
auto features = dataset.features;
std::cout << "States:" << std::endl;
for (int i = 0; i < 13; i++) {
std::cout << i << " has " << dataset.states.at(features[i]).size() << " states." << std::endl;
}
dataset = RawDatasets("heart-statlog", false);
std::cout << "Dataset heart-statlog raw " << std::endl;
for (int sample = 0; sample < max_sample; sample++) {
for (int feature = 0; feature < 13; feature++) {
std::cout << dataset.Xt[feature][sample].item<float>() << " ";
}
std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
}
std::cout << "States:" << std::endl;
for (int i = 0; i < 13; i++) {
std::cout << i << " has " << dataset.states.at(features[i]).size() << " states." << std::endl;
}
auto clf = bayesnet::TANLd();
clf.fit(dataset.Xt, dataset.yt, dataset.features, dataset.className, dataset.states, dataset.smoothing);
std::cout << "Score: " << clf.score(dataset.Xt, dataset.yt) << std::endl;
}

View File

@@ -345,12 +345,12 @@ TEST_CASE("Test Bayesian Network", "[Network]")
auto net1 = bayesnet::Network();
buildModel(net1, raw.features, raw.className);
net1.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, raw.states, raw.smoothing);
// Create empty network and assign
auto net2 = bayesnet::Network();
net2.addNode("TempNode"); // Add something to make sure it gets cleared
net2 = net1;
// Verify they are equal
REQUIRE(net1.getFeatures() == net2.getFeatures());
REQUIRE(net1.getEdges() == net2.getEdges());
@@ -361,10 +361,10 @@ TEST_CASE("Test Bayesian Network", "[Network]")
REQUIRE(net1.getSamples().size(0) == net2.getSamples().size(0));
REQUIRE(net1.getSamples().size(1) == net2.getSamples().size(1));
REQUIRE(net1.getNodes().size() == net2.getNodes().size());
// Verify topology equality
REQUIRE(net1 == net2);
// Verify they are separate objects by modifying one
net2.initialize();
net2.addNode("OnlyInNet2");
@@ -376,46 +376,47 @@ TEST_CASE("Test Bayesian Network", "[Network]")
INFO("Test self assignment");
buildModel(net, raw.features, raw.className);
net.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, raw.states, raw.smoothing);
int original_edges = net.getNumEdges();
int original_nodes = net.getNodes().size();
// Self assignment should not corrupt the network
net = net;
auto all_features = raw.features;
all_features.push_back(raw.className);
REQUIRE(net.getNumEdges() == original_edges);
REQUIRE(net.getNodes().size() == original_nodes);
REQUIRE(net.getFeatures() == raw.features);
REQUIRE(net.getFeatures() == all_features);
REQUIRE(net.getClassName() == raw.className);
}
SECTION("Test operator== topology comparison")
{
INFO("Test operator== topology comparison");
// Test 1: Two identical networks
auto net1 = bayesnet::Network();
auto net2 = bayesnet::Network();
net1.addNode("A");
net1.addNode("B");
net1.addNode("C");
net1.addEdge("A", "B");
net1.addEdge("B", "C");
net2.addNode("A");
net2.addNode("B");
net2.addNode("C");
net2.addEdge("A", "B");
net2.addEdge("B", "C");
REQUIRE(net1 == net2);
// Test 2: Different nodes
auto net3 = bayesnet::Network();
net3.addNode("A");
net3.addNode("D"); // Different node
REQUIRE_FALSE(net1 == net3);
// Test 3: Same nodes, different edges
auto net4 = bayesnet::Network();
net4.addNode("A");
@@ -424,12 +425,12 @@ TEST_CASE("Test Bayesian Network", "[Network]")
net4.addEdge("A", "C"); // Different topology
net4.addEdge("B", "C");
REQUIRE_FALSE(net1 == net4);
// Test 4: Empty networks
auto net5 = bayesnet::Network();
auto net6 = bayesnet::Network();
REQUIRE(net5 == net6);
// Test 5: Same topology, different edge order
auto net7 = bayesnet::Network();
net7.addNode("A");
@@ -442,35 +443,36 @@ TEST_CASE("Test Bayesian Network", "[Network]")
SECTION("Test RAII compliance with smart pointers")
{
INFO("Test RAII compliance with smart pointers");
std::unique_ptr<bayesnet::Network> net1 = std::make_unique<bayesnet::Network>();
buildModel(*net1, raw.features, raw.className);
net1->fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, raw.states, raw.smoothing);
// Test that copy constructor works with smart pointers
std::unique_ptr<bayesnet::Network> net2 = std::make_unique<bayesnet::Network>(*net1);
REQUIRE(*net1 == *net2);
REQUIRE(net1->getNumEdges() == net2->getNumEdges());
REQUIRE(net1->getNodes().size() == net2->getNodes().size());
// Destroy original
net1.reset();
// Test predictions still work
std::vector<std::vector<int>> test = { {1}, {2}, {0}, {1} };
REQUIRE_NOTHROW(net2->predict(test));
// net2 should still be valid and functional
net2->initialize();
REQUIRE_NOTHROW(net2->addNode("NewNode"));
REQUIRE(net2->getNodes().count("NewNode") == 1);
// Test predictions still work
std::vector<std::vector<int>> test = { {1, 2, 0, 1, 1} };
REQUIRE_NOTHROW(net2->predict(test));
}
SECTION("Test complex topology copy")
{
INFO("Test complex topology copy");
auto original = bayesnet::Network();
// Create a more complex network
original.addNode("Root");
original.addNode("Child1");
@@ -478,45 +480,45 @@ TEST_CASE("Test Bayesian Network", "[Network]")
original.addNode("Grandchild1");
original.addNode("Grandchild2");
original.addNode("Grandchild3");
original.addEdge("Root", "Child1");
original.addEdge("Root", "Child2");
original.addEdge("Child1", "Grandchild1");
original.addEdge("Child1", "Grandchild2");
original.addEdge("Child2", "Grandchild3");
// Copy it
auto copy = original;
// Verify topology is identical
REQUIRE(original == copy);
REQUIRE(original.getNodes().size() == copy.getNodes().size());
REQUIRE(original.getNumEdges() == copy.getNumEdges());
// Verify edges are properly reconstructed
auto originalEdges = original.getEdges();
auto copyEdges = copy.getEdges();
REQUIRE(originalEdges.size() == copyEdges.size());
// Verify node relationships are properly copied
for (const auto& nodePair : original.getNodes()) {
const std::string& nodeName = nodePair.first;
auto* originalNode = nodePair.second.get();
auto* copyNode = copy.getNodes().at(nodeName).get();
REQUIRE(originalNode->getParents().size() == copyNode->getParents().size());
REQUIRE(originalNode->getChildren().size() == copyNode->getChildren().size());
// Verify parent names match
for (size_t i = 0; i < originalNode->getParents().size(); ++i) {
REQUIRE(originalNode->getParents()[i]->getName() ==
copyNode->getParents()[i]->getName());
REQUIRE(originalNode->getParents()[i]->getName() ==
copyNode->getParents()[i]->getName());
}
// Verify child names match
for (size_t i = 0; i < originalNode->getChildren().size(); ++i) {
REQUIRE(originalNode->getChildren()[i]->getName() ==
copyNode->getChildren()[i]->getName());
REQUIRE(originalNode->getChildren()[i]->getName() ==
copyNode->getChildren()[i]->getName());
}
}
}

View File

@@ -158,4 +158,48 @@ TEST_CASE("TEST MinFill method", "[Node]")
REQUIRE(node_2.minFill() == 6);
REQUIRE(node_3.minFill() == 3);
REQUIRE(node_4.minFill() == 1);
}
TEST_CASE("Test operator =", "[Node]")
{
// Test the assignment operator of the Node class
// Create a node with one parent and one child
auto node = bayesnet::Node("N1");
auto parent_1 = bayesnet::Node("P1");
parent_1.setNumStates(3);
auto child_1 = bayesnet::Node("H1");
child_1.setNumStates(2);
node.addParent(&parent_1);
node.addChild(&child_1);
// Create a cpt in the node using computeCPT
auto dataset = torch::tensor({ {1, 0, 0, 1}, {0, 1, 2, 1}, {0, 1, 1, 0} });
auto states = std::vector<int>({ 2, 3, 3 });
auto features = std::vector<std::string>{ "N1", "P1", "H1" };
auto className = std::string("Class");
auto weights = torch::tensor({ 1.0, 1.0, 1.0, 1.0 }, torch::kDouble);
node.setNumStates(2);
node.computeCPT(dataset, features, 0.0, weights);
// Get the cpt of the node
auto cpt = node.getCPT();
// Check that the cpt is not empty
REQUIRE(cpt.numel() > 0);
// Check that the cpt has the correct dimensions
auto dimensions = cpt.sizes();
REQUIRE(dimensions.size() == 2);
REQUIRE(dimensions[0] == 2); // Number of states of the node
REQUIRE(dimensions[1] == 3); // Number of states of the first parent
// Create a copy of the node
bayesnet::Node node_copy("XX");
node_copy = node;
// Check that the copy has no parents or children
auto parents = node_copy.getParents();
auto children = node_copy.getChildren();
REQUIRE(parents.size() == 0);
REQUIRE(children.size() == 0);
// Check that the copy has the same name
REQUIRE(node_copy.getName() == "N1");
// Check that the copy has the same cpt
auto cpt_copy = node_copy.getCPT();
REQUIRE(cpt_copy.equal(cpt));
// Check that the copy has the same number of states
REQUIRE(node_copy.getNumStates() == node.getNumStates());
}
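
A minimal sketch of the semantics this test pins down, using only the Node API shown above: assignment copies the node's identity, CPT and state count, but deliberately drops graph links, so a copied node has to be re-wired before use.

    // Illustration only; values are hypothetical.
    bayesnet::Node source("N1");
    source.setNumStates(2);
    bayesnet::Node target("XX");
    target = source;   // target is now named "N1" and has 2 states
    // target.getParents().empty() and target.getChildren().empty() both hold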

View File

@@ -16,10 +16,10 @@
#include "TestUtils.h"
std::map<std::string, std::string> modules = {
{ "mdlp", "2.1.0" },
{ "Folding", "1.1.1" },
{ "mdlp", "2.1.2" },
{ "Folding", "1.1.2" },
{ "json", "3.11" },
{ "ArffFiles", "1.2.0" }
{ "ArffFiles", "1.2.1" }
};
TEST_CASE("MDLP", "[Modules]")

View File

@@ -5,6 +5,7 @@
// ***************************************************************
#include <random>
#include <nlohmann/json.hpp>
#include "TestUtils.h"
#include "bayesnet/config.h"
@@ -51,6 +52,7 @@ private:
RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
{
catalog = loadCatalog();
num_samples = num_samples_;
shuffle = shuffle_;
discretize = discretize_;
@@ -62,7 +64,7 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
nSamples = dataset.size(1);
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
classNumStates = states.at(className).size();
auto fold = folding::StratifiedKFold(5, yt, 271);
auto [train, test] = fold.getFold(0);
auto train_t = torch::tensor(train);
@@ -78,18 +80,90 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
{
map<std::string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
mdlp::labels_t xd;
if (is_numeric.at(i)) {
fimdlp.fit(X[i], yv);
xd = fimdlp.transform(X[i]);
} else {
std::transform(X[i].begin(), X[i].end(), back_inserter(xd), [](const auto& val) {
return static_cast<int>(val);
});
}
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
Xv.push_back(xd);
}
return maxes;
}
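
A minimal sketch of the numeric branch above, using only the mdlp calls that appear in this diff; the include path and sample values are assumptions for illustration.

    #include <algorithm>
    #include <fimdlp/CPPFImdlp.h>   // assumed header location; may differ per install

    // Hypothetical numeric column and class labels.
    mdlp::samples_t column = { 5.1f, 4.9f, 6.3f, 5.8f };
    mdlp::labels_t y = { 0, 0, 1, 1 };
    auto discretizer = mdlp::CPPFImdlp();
    discretizer.fit(column, y);                             // compute MDLP cut points
    mdlp::labels_t binned = discretizer.transform(column);  // map values to bin indices
    // Number of discrete states, derived the same way discretizeDataset() does it:
    int numStates = *std::max_element(binned.begin(), binned.end()) + 1;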
map<std::string, std::vector<int>> RawDatasets::loadCatalog()
{
map<std::string, std::vector<int>> catalogNames;
ifstream catalogFile(Paths::datasets() + "all.txt");
std::vector<int> numericFeaturesIdx;
if (!catalogFile.is_open()) {
throw std::invalid_argument("Unable to open catalog file. [" + Paths::datasets() + "all.txt]");
}
std::string line;
std::vector<std::string> sorted_lines;
while (getline(catalogFile, line)) {
if (line.empty() || line[0] == '#') {
continue;
}
sorted_lines.push_back(line);
}
sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(),
[](const auto& a, const auto& b) { return tolower(a) == tolower(b); });
return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
});
for (const auto& line : sorted_lines) {
std::vector<std::string> tokens = split(line, ';');
std::string name = tokens[0];
std::string className;
numericFeaturesIdx.clear();
int size = tokens.size();
switch (size) {
case 1:
className = "-1";
numericFeaturesIdx.push_back(-1);
break;
case 2:
className = tokens[1];
numericFeaturesIdx.push_back(-1);
break;
case 3:
{
className = tokens[1];
auto numericFeatures = tokens[2];
if (numericFeatures == "all") {
numericFeaturesIdx.push_back(-1);
} else {
if (numericFeatures != "none") {
auto features = nlohmann::json::parse(numericFeatures);
for (auto& f : features) {
numericFeaturesIdx.push_back(f);
}
}
}
}
break;
default:
throw std::invalid_argument("Invalid catalog file format.");
}
catalogNames[name] = numericFeaturesIdx;
}
catalogFile.close();
if (catalogNames.empty()) {
throw std::invalid_argument("Catalog is empty. Please check the catalog file.");
}
return catalogNames;
}
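
For quick reference, these are the line shapes loadCatalog() accepts and the numeric-feature index vectors they produce; this is an illustration of the format, not part of the diff.

    name                 -> className "-1", indices {-1}   (treated as all numeric)
    name;class           -> indices {-1}                   (all features numeric)
    name;class;all       -> indices {-1}                   (all features numeric)
    name;class;none      -> indices {}                     (no numeric features)
    name;class;[0,2,4]   -> indices {0, 2, 4}              (listed features numeric)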
void RawDatasets::loadDataset(const std::string& name, bool class_last)
{
auto handler = ShuffleArffFiles(num_samples, shuffle);
@@ -101,6 +175,26 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
className = handler.getClassName();
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
is_numeric.clear();
is_numeric.reserve(features.size());
auto numericFeaturesIdx = catalog.at(name);
if (numericFeaturesIdx.empty()) {
// no numeric features
is_numeric.assign(features.size(), false);
} else {
if (numericFeaturesIdx[0] == -1) {
// all features are numeric
is_numeric.assign(features.size(), true);
} else {
// some features are numeric
is_numeric.assign(features.size(), false);
for (const auto& idx : numericFeaturesIdx) {
if (idx >= 0 && static_cast<size_t>(idx) < features.size()) {
is_numeric[idx] = true;
}
}
}
}
// Discretize Dataset
auto maxValues = discretizeDataset(X);
maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
@@ -113,13 +207,23 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
}
states[className] = std::vector<int>(maxValues[className]);
} else {
Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
if (!is_numeric.at(i)) {
states[features[i]] = std::vector<int>(maxValues[features[i]]);
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
} else {
states[features[i]] = std::vector<int>();
}
}
int maxy = *max_element(yv.begin(), yv.end()) + 1;
states[className] = std::vector<int>(maxy);
}
iota(begin(states.at(className)), end(states.at(className)), 0);
yt = torch::tensor(yv, torch::kInt32);
}
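
A hedged sketch of the states map this produces for a mixed, non-discretized dataset (the feature names are hypothetical): categorical features get an enumerated state list, numeric features an empty vector until the local discretization fills them in, and the class is always enumerated.

    map<std::string, std::vector<int>> states;
    states["chest"] = {0, 1, 2, 3};   // categorical: states 0..max-1
    states["age"]   = {};             // numeric: empty, discretized later
    states["class"] = {0, 1};         // class states are always enumerated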

View File

@@ -27,7 +27,11 @@ public:
std::vector<double> weightsv;
std::vector<string> features;
std::string className;
std::vector<bool> is_numeric; // indicates whether each feature is numeric
map<std::string, std::vector<int>> states;
//catalog maps each dataset name to the indices of its numeric features:
//{-1} means every feature is numeric and an empty vector means none are
map<std::string, std::vector<int>> catalog;
int nSamples, classNumStates;
double epsilon = 1e-5;
bool discretize;
@@ -65,8 +69,30 @@ private:
+ "classNumStates: " + std::to_string(classNumStates) + "\n"
+ "states: " + states_ + "\n";
}
std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](unsigned char ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](unsigned char ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
std::stringstream ss(text);
std::string token;
while (std::getline(ss, token, delimiter)) {
result.push_back(trim(token));
}
return result;
}
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
void loadDataset(const std::string& name, bool class_last);
map<std::string, std::vector<int>> loadCatalog();
};
#endif //TEST_UTILS_H
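
As a usage note, trim() is what lets the catalog parser tolerate the mixed spacing seen in all.txt; a hypothetical call to the helpers above:

    auto tokens = split("heart-statlog;class; [0,3,4,7,9,11]", ';');
    // tokens == {"heart-statlog", "class", "[0,3,4,7,9,11]"}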

27
tests/data/all.txt Normal file
View File

@@ -0,0 +1,27 @@
adult;class;[0,2,4,10,11,12]
balance-scale;class; all
breast-w;Class; all
diabetes;class; all
ecoli;class; all
glass;Type; all
hayes-roth;class; none
heart-statlog;class; [0,3,4,7,9,11]
ionosphere;class; all
iris;class; all
kdd_JapaneseVowels;speaker; all
letter;class; all
liver-disorders;selector; all
mfeat-factors;class; all
mfeat-fourier;class; all
mfeat-karhunen;class; all
mfeat-morphological;class; all
mfeat-zernike;class; all
optdigits;class; all
page-blocks;class; all
pendigits;class; all
segment;class; all
sonar;Class; all
spambase;class; all
vehicle;Class; all
waveform-5000;class; all
wine;class; all

338
tests/data/heart-statlog.arff Executable file
View File

@@ -0,0 +1,338 @@
% This database contains 13 attributes (which have been extracted from
% a larger set of 75)
%
%
%
% Attribute Information:
% ------------------------
% 0 -- 1. age
% 1 -- 2. sex
% 2 -- 3. chest pain type (4 values)
% 3 -- 4. resting blood pressure
% 4 -- 5. serum cholestoral in mg/dl
% 5 -- 6. fasting blood sugar > 120 mg/dl
% 6 -- 7. resting electrocardiographic results (values 0,1,2)
% 7 -- 8. maximum heart rate achieved
% 8 -- 9. exercise induced angina
% 9 -- 10. oldpeak = ST depression induced by exercise relative to rest
% 10 -- 11. the slope of the peak exercise ST segment
% 11 -- 12. number of major vessels (0-3) colored by flourosopy
% 12 -- 13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
%
% Attributes types
% -----------------
%
% Real: 1,4,5,8,10,12
% Ordered:11,
% Binary: 2,6,9
% Nominal:7,3,13
%
% Variable to be predicted
% ------------------------
% Absence (1) or presence (2) of heart disease
%
% Cost Matrix
%
% abse pres
% absence 0 1
% presence 5 0
%
% where the rows represent the true values and the columns the predicted.
%
% No missing values.
%
% 270 observations
%
%
%
%
% Relabeled values in attribute class
% From: 1 To: absent
% From: 2 To: present
%
@relation heart-statlog
@attribute age real
@attribute sex real
@attribute chest real
@attribute resting_blood_pressure real
@attribute serum_cholestoral real
@attribute fasting_blood_sugar real
@attribute resting_electrocardiographic_results real
@attribute maximum_heart_rate_achieved real
@attribute exercise_induced_angina real
@attribute oldpeak real
@attribute slope real
@attribute number_of_major_vessels real
@attribute thal real
@attribute class { absent, present}
@data
70,1,4,130,322,0,2,109,0,2.4,2,3,3,present
67,0,3,115,564,0,2,160,0,1.6,2,0,7,absent
57,1,2,124,261,0,0,141,0,0.3,1,0,7,present
64,1,4,128,263,0,0,105,1,0.2,2,1,7,absent
74,0,2,120,269,0,2,121,1,0.2,1,1,3,absent
65,1,4,120,177,0,0,140,0,0.4,1,0,7,absent
56,1,3,130,256,1,2,142,1,0.6,2,1,6,present
59,1,4,110,239,0,2,142,1,1.2,2,1,7,present
60,1,4,140,293,0,2,170,0,1.2,2,2,7,present
63,0,4,150,407,0,2,154,0,4,2,3,7,present
59,1,4,135,234,0,0,161,0,0.5,2,0,7,absent
53,1,4,142,226,0,2,111,1,0,1,0,7,absent
44,1,3,140,235,0,2,180,0,0,1,0,3,absent
61,1,1,134,234,0,0,145,0,2.6,2,2,3,present
57,0,4,128,303,0,2,159,0,0,1,1,3,absent
71,0,4,112,149,0,0,125,0,1.6,2,0,3,absent
46,1,4,140,311,0,0,120,1,1.8,2,2,7,present
53,1,4,140,203,1,2,155,1,3.1,3,0,7,present
64,1,1,110,211,0,2,144,1,1.8,2,0,3,absent
40,1,1,140,199,0,0,178,1,1.4,1,0,7,absent
67,1,4,120,229,0,2,129,1,2.6,2,2,7,present
48,1,2,130,245,0,2,180,0,0.2,2,0,3,absent
43,1,4,115,303,0,0,181,0,1.2,2,0,3,absent
47,1,4,112,204,0,0,143,0,0.1,1,0,3,absent
54,0,2,132,288,1,2,159,1,0,1,1,3,absent
48,0,3,130,275,0,0,139,0,0.2,1,0,3,absent
46,0,4,138,243,0,2,152,1,0,2,0,3,absent
51,0,3,120,295,0,2,157,0,0.6,1,0,3,absent
58,1,3,112,230,0,2,165,0,2.5,2,1,7,present
71,0,3,110,265,1,2,130,0,0,1,1,3,absent
57,1,3,128,229,0,2,150,0,0.4,2,1,7,present
66,1,4,160,228,0,2,138,0,2.3,1,0,6,absent
37,0,3,120,215,0,0,170,0,0,1,0,3,absent
59,1,4,170,326,0,2,140,1,3.4,3,0,7,present
50,1,4,144,200,0,2,126,1,0.9,2,0,7,present
48,1,4,130,256,1,2,150,1,0,1,2,7,present
61,1,4,140,207,0,2,138,1,1.9,1,1,7,present
59,1,1,160,273,0,2,125,0,0,1,0,3,present
42,1,3,130,180,0,0,150,0,0,1,0,3,absent
48,1,4,122,222,0,2,186,0,0,1,0,3,absent
40,1,4,152,223,0,0,181,0,0,1,0,7,present
62,0,4,124,209,0,0,163,0,0,1,0,3,absent
44,1,3,130,233,0,0,179,1,0.4,1,0,3,absent
46,1,2,101,197,1,0,156,0,0,1,0,7,absent
59,1,3,126,218,1,0,134,0,2.2,2,1,6,present
58,1,3,140,211,1,2,165,0,0,1,0,3,absent
49,1,3,118,149,0,2,126,0,0.8,1,3,3,present
44,1,4,110,197,0,2,177,0,0,1,1,3,present
66,1,2,160,246,0,0,120,1,0,2,3,6,present
65,0,4,150,225,0,2,114,0,1,2,3,7,present
42,1,4,136,315,0,0,125,1,1.8,2,0,6,present
52,1,2,128,205,1,0,184,0,0,1,0,3,absent
65,0,3,140,417,1,2,157,0,0.8,1,1,3,absent
63,0,2,140,195,0,0,179,0,0,1,2,3,absent
45,0,2,130,234,0,2,175,0,0.6,2,0,3,absent
41,0,2,105,198,0,0,168,0,0,1,1,3,absent
61,1,4,138,166,0,2,125,1,3.6,2,1,3,present
60,0,3,120,178,1,0,96,0,0,1,0,3,absent
59,0,4,174,249,0,0,143,1,0,2,0,3,present
62,1,2,120,281,0,2,103,0,1.4,2,1,7,present
57,1,3,150,126,1,0,173,0,0.2,1,1,7,absent
51,0,4,130,305,0,0,142,1,1.2,2,0,7,present
44,1,3,120,226,0,0,169,0,0,1,0,3,absent
60,0,1,150,240,0,0,171,0,0.9,1,0,3,absent
63,1,1,145,233,1,2,150,0,2.3,3,0,6,absent
57,1,4,150,276,0,2,112,1,0.6,2,1,6,present
51,1,4,140,261,0,2,186,1,0,1,0,3,absent
58,0,2,136,319,1,2,152,0,0,1,2,3,present
44,0,3,118,242,0,0,149,0,0.3,2,1,3,absent
47,1,3,108,243,0,0,152,0,0,1,0,3,present
61,1,4,120,260,0,0,140,1,3.6,2,1,7,present
57,0,4,120,354,0,0,163,1,0.6,1,0,3,absent
70,1,2,156,245,0,2,143,0,0,1,0,3,absent
76,0,3,140,197,0,1,116,0,1.1,2,0,3,absent
67,0,4,106,223,0,0,142,0,0.3,1,2,3,absent
45,1,4,142,309,0,2,147,1,0,2,3,7,present
45,1,4,104,208,0,2,148,1,3,2,0,3,absent
39,0,3,94,199,0,0,179,0,0,1,0,3,absent
42,0,3,120,209,0,0,173,0,0,2,0,3,absent
56,1,2,120,236,0,0,178,0,0.8,1,0,3,absent
58,1,4,146,218,0,0,105,0,2,2,1,7,present
35,1,4,120,198,0,0,130,1,1.6,2,0,7,present
58,1,4,150,270,0,2,111,1,0.8,1,0,7,present
41,1,3,130,214,0,2,168,0,2,2,0,3,absent
57,1,4,110,201,0,0,126,1,1.5,2,0,6,absent
42,1,1,148,244,0,2,178,0,0.8,1,2,3,absent
62,1,2,128,208,1,2,140,0,0,1,0,3,absent
59,1,1,178,270,0,2,145,0,4.2,3,0,7,absent
41,0,2,126,306,0,0,163,0,0,1,0,3,absent
50,1,4,150,243,0,2,128,0,2.6,2,0,7,present
59,1,2,140,221,0,0,164,1,0,1,0,3,absent
61,0,4,130,330,0,2,169,0,0,1,0,3,present
54,1,4,124,266,0,2,109,1,2.2,2,1,7,present
54,1,4,110,206,0,2,108,1,0,2,1,3,present
52,1,4,125,212,0,0,168,0,1,1,2,7,present
47,1,4,110,275,0,2,118,1,1,2,1,3,present
66,1,4,120,302,0,2,151,0,0.4,2,0,3,absent
58,1,4,100,234,0,0,156,0,0.1,1,1,7,present
64,0,3,140,313,0,0,133,0,0.2,1,0,7,absent
50,0,2,120,244,0,0,162,0,1.1,1,0,3,absent
44,0,3,108,141,0,0,175,0,0.6,2,0,3,absent
67,1,4,120,237,0,0,71,0,1,2,0,3,present
49,0,4,130,269,0,0,163,0,0,1,0,3,absent
57,1,4,165,289,1,2,124,0,1,2,3,7,present
63,1,4,130,254,0,2,147,0,1.4,2,1,7,present
48,1,4,124,274,0,2,166,0,0.5,2,0,7,present
51,1,3,100,222,0,0,143,1,1.2,2,0,3,absent
60,0,4,150,258,0,2,157,0,2.6,2,2,7,present
59,1,4,140,177,0,0,162,1,0,1,1,7,present
45,0,2,112,160,0,0,138,0,0,2,0,3,absent
55,0,4,180,327,0,1,117,1,3.4,2,0,3,present
41,1,2,110,235,0,0,153,0,0,1,0,3,absent
60,0,4,158,305,0,2,161,0,0,1,0,3,present
54,0,3,135,304,1,0,170,0,0,1,0,3,absent
42,1,2,120,295,0,0,162,0,0,1,0,3,absent
49,0,2,134,271,0,0,162,0,0,2,0,3,absent
46,1,4,120,249,0,2,144,0,0.8,1,0,7,present
56,0,4,200,288,1,2,133,1,4,3,2,7,present
66,0,1,150,226,0,0,114,0,2.6,3,0,3,absent
56,1,4,130,283,1,2,103,1,1.6,3,0,7,present
49,1,3,120,188,0,0,139,0,2,2,3,7,present
54,1,4,122,286,0,2,116,1,3.2,2,2,3,present
57,1,4,152,274,0,0,88,1,1.2,2,1,7,present
65,0,3,160,360,0,2,151,0,0.8,1,0,3,absent
54,1,3,125,273,0,2,152,0,0.5,3,1,3,absent
54,0,3,160,201,0,0,163,0,0,1,1,3,absent
62,1,4,120,267,0,0,99,1,1.8,2,2,7,present
52,0,3,136,196,0,2,169,0,0.1,2,0,3,absent
52,1,2,134,201,0,0,158,0,0.8,1,1,3,absent
60,1,4,117,230,1,0,160,1,1.4,1,2,7,present
63,0,4,108,269,0,0,169,1,1.8,2,2,3,present
66,1,4,112,212,0,2,132,1,0.1,1,1,3,present
42,1,4,140,226,0,0,178,0,0,1,0,3,absent
64,1,4,120,246,0,2,96,1,2.2,3,1,3,present
54,1,3,150,232,0,2,165,0,1.6,1,0,7,absent
46,0,3,142,177,0,2,160,1,1.4,3,0,3,absent
67,0,3,152,277,0,0,172,0,0,1,1,3,absent
56,1,4,125,249,1,2,144,1,1.2,2,1,3,present
34,0,2,118,210,0,0,192,0,0.7,1,0,3,absent
57,1,4,132,207,0,0,168,1,0,1,0,7,absent
64,1,4,145,212,0,2,132,0,2,2,2,6,present
59,1,4,138,271,0,2,182,0,0,1,0,3,absent
50,1,3,140,233,0,0,163,0,0.6,2,1,7,present
51,1,1,125,213,0,2,125,1,1.4,1,1,3,absent
54,1,2,192,283,0,2,195,0,0,1,1,7,present
53,1,4,123,282,0,0,95,1,2,2,2,7,present
52,1,4,112,230,0,0,160,0,0,1,1,3,present
40,1,4,110,167,0,2,114,1,2,2,0,7,present
58,1,3,132,224,0,2,173,0,3.2,1,2,7,present
41,0,3,112,268,0,2,172,1,0,1,0,3,absent
41,1,3,112,250,0,0,179,0,0,1,0,3,absent
50,0,3,120,219,0,0,158,0,1.6,2,0,3,absent
54,0,3,108,267,0,2,167,0,0,1,0,3,absent
64,0,4,130,303,0,0,122,0,2,2,2,3,absent
51,0,3,130,256,0,2,149,0,0.5,1,0,3,absent
46,0,2,105,204,0,0,172,0,0,1,0,3,absent
55,1,4,140,217,0,0,111,1,5.6,3,0,7,present
45,1,2,128,308,0,2,170,0,0,1,0,3,absent
56,1,1,120,193,0,2,162,0,1.9,2,0,7,absent
66,0,4,178,228,1,0,165,1,1,2,2,7,present
38,1,1,120,231,0,0,182,1,3.8,2,0,7,present
62,0,4,150,244,0,0,154,1,1.4,2,0,3,present
55,1,2,130,262,0,0,155,0,0,1,0,3,absent
58,1,4,128,259,0,2,130,1,3,2,2,7,present
43,1,4,110,211,0,0,161,0,0,1,0,7,absent
64,0,4,180,325,0,0,154,1,0,1,0,3,absent
50,0,4,110,254,0,2,159,0,0,1,0,3,absent
53,1,3,130,197,1,2,152,0,1.2,3,0,3,absent
45,0,4,138,236,0,2,152,1,0.2,2,0,3,absent
65,1,1,138,282,1,2,174,0,1.4,2,1,3,present
69,1,1,160,234,1,2,131,0,0.1,2,1,3,absent
69,1,3,140,254,0,2,146,0,2,2,3,7,present
67,1,4,100,299,0,2,125,1,0.9,2,2,3,present
68,0,3,120,211,0,2,115,0,1.5,2,0,3,absent
34,1,1,118,182,0,2,174,0,0,1,0,3,absent
62,0,4,138,294,1,0,106,0,1.9,2,3,3,present
51,1,4,140,298,0,0,122,1,4.2,2,3,7,present
46,1,3,150,231,0,0,147,0,3.6,2,0,3,present
67,1,4,125,254,1,0,163,0,0.2,2,2,7,present
50,1,3,129,196,0,0,163,0,0,1,0,3,absent
42,1,3,120,240,1,0,194,0,0.8,3,0,7,absent
56,0,4,134,409,0,2,150,1,1.9,2,2,7,present
41,1,4,110,172,0,2,158,0,0,1,0,7,present
42,0,4,102,265,0,2,122,0,0.6,2,0,3,absent
53,1,3,130,246,1,2,173,0,0,1,3,3,absent
43,1,3,130,315,0,0,162,0,1.9,1,1,3,absent
56,1,4,132,184,0,2,105,1,2.1,2,1,6,present
52,1,4,108,233,1,0,147,0,0.1,1,3,7,absent
62,0,4,140,394,0,2,157,0,1.2,2,0,3,absent
70,1,3,160,269,0,0,112,1,2.9,2,1,7,present
54,1,4,140,239,0,0,160,0,1.2,1,0,3,absent
70,1,4,145,174,0,0,125,1,2.6,3,0,7,present
54,1,2,108,309,0,0,156,0,0,1,0,7,absent
35,1,4,126,282,0,2,156,1,0,1,0,7,present
48,1,3,124,255,1,0,175,0,0,1,2,3,absent
55,0,2,135,250,0,2,161,0,1.4,2,0,3,absent
58,0,4,100,248,0,2,122,0,1,2,0,3,absent
54,0,3,110,214,0,0,158,0,1.6,2,0,3,absent
69,0,1,140,239,0,0,151,0,1.8,1,2,3,absent
77,1,4,125,304,0,2,162,1,0,1,3,3,present
68,1,3,118,277,0,0,151,0,1,1,1,7,absent
58,1,4,125,300,0,2,171,0,0,1,2,7,present
60,1,4,125,258,0,2,141,1,2.8,2,1,7,present
51,1,4,140,299,0,0,173,1,1.6,1,0,7,present
55,1,4,160,289,0,2,145,1,0.8,2,1,7,present
52,1,1,152,298,1,0,178,0,1.2,2,0,7,absent
60,0,3,102,318,0,0,160,0,0,1,1,3,absent
58,1,3,105,240,0,2,154,1,0.6,2,0,7,absent
64,1,3,125,309,0,0,131,1,1.8,2,0,7,present
37,1,3,130,250,0,0,187,0,3.5,3,0,3,absent
59,1,1,170,288,0,2,159,0,0.2,2,0,7,present
51,1,3,125,245,1,2,166,0,2.4,2,0,3,absent
43,0,3,122,213,0,0,165,0,0.2,2,0,3,absent
58,1,4,128,216,0,2,131,1,2.2,2,3,7,present
29,1,2,130,204,0,2,202,0,0,1,0,3,absent
41,0,2,130,204,0,2,172,0,1.4,1,0,3,absent
63,0,3,135,252,0,2,172,0,0,1,0,3,absent
51,1,3,94,227,0,0,154,1,0,1,1,7,absent
54,1,3,120,258,0,2,147,0,0.4,2,0,7,absent
44,1,2,120,220,0,0,170,0,0,1,0,3,absent
54,1,4,110,239,0,0,126,1,2.8,2,1,7,present
65,1,4,135,254,0,2,127,0,2.8,2,1,7,present
57,1,3,150,168,0,0,174,0,1.6,1,0,3,absent
63,1,4,130,330,1,2,132,1,1.8,1,3,7,present
35,0,4,138,183,0,0,182,0,1.4,1,0,3,absent
41,1,2,135,203,0,0,132,0,0,2,0,6,absent
62,0,3,130,263,0,0,97,0,1.2,2,1,7,present
43,0,4,132,341,1,2,136,1,3,2,0,7,present
58,0,1,150,283,1,2,162,0,1,1,0,3,absent
52,1,1,118,186,0,2,190,0,0,2,0,6,absent
61,0,4,145,307,0,2,146,1,1,2,0,7,present
39,1,4,118,219,0,0,140,0,1.2,2,0,7,present
45,1,4,115,260,0,2,185,0,0,1,0,3,absent
52,1,4,128,255,0,0,161,1,0,1,1,7,present
62,1,3,130,231,0,0,146,0,1.8,2,3,7,absent
62,0,4,160,164,0,2,145,0,6.2,3,3,7,present
53,0,4,138,234,0,2,160,0,0,1,0,3,absent
43,1,4,120,177,0,2,120,1,2.5,2,0,7,present
47,1,3,138,257,0,2,156,0,0,1,0,3,absent
52,1,2,120,325,0,0,172,0,0.2,1,0,3,absent
68,1,3,180,274,1,2,150,1,1.6,2,0,7,present
39,1,3,140,321,0,2,182,0,0,1,0,3,absent
53,0,4,130,264,0,2,143,0,0.4,2,0,3,absent
62,0,4,140,268,0,2,160,0,3.6,3,2,3,present
51,0,3,140,308,0,2,142,0,1.5,1,1,3,absent
60,1,4,130,253,0,0,144,1,1.4,1,1,7,present
65,1,4,110,248,0,2,158,0,0.6,1,2,6,present
65,0,3,155,269,0,0,148,0,0.8,1,0,3,absent
60,1,3,140,185,0,2,155,0,3,2,0,3,present
60,1,4,145,282,0,2,142,1,2.8,2,2,7,present
54,1,4,120,188,0,0,113,0,1.4,2,1,7,present
44,1,2,130,219,0,2,188,0,0,1,0,3,absent
44,1,4,112,290,0,2,153,0,0,1,1,3,present
51,1,3,110,175,0,0,123,0,0.6,1,0,3,absent
59,1,3,150,212,1,0,157,0,1.6,1,0,3,absent
71,0,2,160,302,0,0,162,0,0.4,1,2,3,absent
61,1,3,150,243,1,0,137,1,1,2,0,3,absent
55,1,4,132,353,0,0,132,1,1.2,2,1,7,present
64,1,3,140,335,0,0,158,0,0,1,0,3,present
43,1,4,150,247,0,0,171,0,1.5,1,0,3,absent
58,0,3,120,340,0,0,172,0,0,1,0,3,absent
60,1,4,130,206,0,2,132,1,2.4,2,2,7,present
58,1,2,120,284,0,2,160,0,1.8,2,0,3,present
49,1,2,130,266,0,0,171,0,0.6,1,0,3,absent
48,1,2,110,229,0,0,168,0,1,3,0,7,present
52,1,3,172,199,1,0,162,0,0.5,1,0,7,absent
44,1,2,120,263,0,0,173,0,0,1,0,7,absent
56,0,2,140,294,0,2,153,0,1.3,2,0,3,absent
57,1,4,140,192,0,0,148,0,0.4,2,0,6,absent
67,1,4,160,286,0,2,108,1,1.5,2,3,3,present