Compare commits
6 Commits
Author | SHA1 | Date | |
---|---|---|---|
0c7452e35b
|
|||
74b391907a
|
|||
1aa3b609e5
|
|||
f1a2349245
|
|||
8578d68c57
|
|||
9f9369269a
|
2
.vscode/launch.json
vendored
2
.vscode/launch.json
vendored
@@ -16,7 +16,7 @@
|
|||||||
"name": "test",
|
"name": "test",
|
||||||
"program": "${workspaceFolder}/build_Debug/tests/TestBayesNet",
|
"program": "${workspaceFolder}/build_Debug/tests/TestBayesNet",
|
||||||
"args": [
|
"args": [
|
||||||
"[XBAODE]"
|
"Test Dataset Loading"
|
||||||
],
|
],
|
||||||
"cwd": "${workspaceFolder}/build_Debug/tests"
|
"cwd": "${workspaceFolder}/build_Debug/tests"
|
||||||
},
|
},
|
||||||
|
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [1.2.2] - 2025-08-19
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- Fixed an issue with local discretization that was discretizing all features, whether they were numeric or categorical.
|
||||||
|
- Fix testutils to return states for all features:
|
||||||
|
- An empty vector is now returned for numeric features.
|
||||||
|
- Categorical features now return their unique states.
|
||||||
|
|
||||||
## [1.2.1] - 2025-07-19
|
## [1.2.1] - 2025-07-19
|
||||||
|
|
||||||
### Internal
|
### Internal
|
||||||
|
@@ -1,7 +1,7 @@
|
|||||||
cmake_minimum_required(VERSION 3.27)
|
cmake_minimum_required(VERSION 3.27)
|
||||||
|
|
||||||
project(bayesnet
|
project(bayesnet
|
||||||
VERSION 1.2.1
|
VERSION 1.2.2
|
||||||
DESCRIPTION "Bayesian Network and basic classifiers Library."
|
DESCRIPTION "Bayesian Network and basic classifiers Library."
|
||||||
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
|
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
|
||||||
LANGUAGES CXX
|
LANGUAGES CXX
|
||||||
|
49
Makefile
49
Makefile
@@ -21,15 +21,18 @@ sed_command_diagram = 's/Diagram"/Diagram" width="100%" height="100%" /g'
|
|||||||
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|
||||||
|| nproc --all 2>/dev/null \
|
|| nproc --all 2>/dev/null \
|
||||||
|| sysctl -n hw.ncpu)
|
|| sysctl -n hw.ncpu)
|
||||||
|
|
||||||
# --- Your desired job count: CPUs – 7, but never less than 1 --------------
|
|
||||||
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
|
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
GREEN = \033[0;32m
|
||||||
|
YELLOW = \033[1;33m
|
||||||
|
RED = \033[0;31m
|
||||||
|
NC = \033[0m # No Color
|
||||||
|
|
||||||
define ClearTests
|
define ClearTests
|
||||||
@for t in $(test_targets); do \
|
@for t in $(test_targets); do \
|
||||||
if [ -f $(f_debug)/tests/$$t ]; then \
|
if [ -f $(f_debug)/tests/$$t ]; then \
|
||||||
echo ">>> Cleaning $$t..." ; \
|
echo ">>> Removing $$t..." ; \
|
||||||
rm -f $(f_debug)/tests/$$t ; \
|
rm -f $(f_debug)/tests/$$t ; \
|
||||||
fi ; \
|
fi ; \
|
||||||
done
|
done
|
||||||
@@ -48,6 +51,20 @@ define setup_target
|
|||||||
@echo ">>> Done"
|
@echo ">>> Done"
|
||||||
endef
|
endef
|
||||||
|
|
||||||
|
define status_file_folder
|
||||||
|
@if [ -d $(1) ]; then \
|
||||||
|
st1=" ✅ $(GREEN)"; \
|
||||||
|
else \
|
||||||
|
st1=" ❌ $(RED)"; \
|
||||||
|
fi; \
|
||||||
|
if [ -f $(1)/libbayesnet.a ]; then \
|
||||||
|
st2=" ✅ $(GREEN)"; \
|
||||||
|
else \
|
||||||
|
st2=" ❌ $(RED)"; \
|
||||||
|
fi; \
|
||||||
|
printf " $(YELLOW)$(2):$(NC) $$st1 Folder $(NC) $$st2 Library $(NC)\n"
|
||||||
|
endef
|
||||||
|
|
||||||
setup: ## Install dependencies for tests and coverage
|
setup: ## Install dependencies for tests and coverage
|
||||||
@if [ "$(shell uname)" = "Darwin" ]; then \
|
@if [ "$(shell uname)" = "Darwin" ]; then \
|
||||||
brew install gcovr; \
|
brew install gcovr; \
|
||||||
@@ -61,7 +78,7 @@ setup: ## Install dependencies for tests and coverage
|
|||||||
|
|
||||||
clean: ## Clean the project
|
clean: ## Clean the project
|
||||||
@echo ">>> Cleaning the project..."
|
@echo ">>> Cleaning the project..."
|
||||||
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fimake
|
@if test -f CMakeCache.txt ; then echo "- Deleting CMakeCache.txt"; rm -f CMakeCache.txt; fi
|
||||||
@for folder in $(f_release) $(f_debug) vpcpkg_installed install_test ; do \
|
@for folder in $(f_release) $(f_debug) vpcpkg_installed install_test ; do \
|
||||||
if test -d "$$folder" ; then \
|
if test -d "$$folder" ; then \
|
||||||
echo "- Deleting $$folder folder" ; \
|
echo "- Deleting $$folder folder" ; \
|
||||||
@@ -80,11 +97,12 @@ debug: ## Setup debug version using Conan
|
|||||||
release: ## Setup release version using Conan
|
release: ## Setup release version using Conan
|
||||||
@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
|
@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
|
||||||
|
|
||||||
buildd: ## Build the debug targets
|
buildd: ## Build the debug && test targets
|
||||||
cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
|
@cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
|
||||||
|
@cmake --build $(f_debug) -t $(test_targets) --parallel $(JOBS)
|
||||||
|
|
||||||
buildr: ## Build the release targets
|
buildr: ## Build the release targets
|
||||||
cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
|
@cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
|
||||||
|
|
||||||
|
|
||||||
# Install targets
|
# Install targets
|
||||||
@@ -241,9 +259,24 @@ sample: ## Build sample with Conan
|
|||||||
sample/build/bayesnet_sample $(fname) $(model)
|
sample/build/bayesnet_sample $(fname) $(model)
|
||||||
@echo ">>> Done";
|
@echo ">>> Done";
|
||||||
|
|
||||||
|
info: ## Show project information
|
||||||
|
@version=$$(grep -A1 "project(bayesnet" CMakeLists.txt | grep "VERSION" | sed 's/.*VERSION \([0-9.]*\).*/\1/'); \
|
||||||
|
printf "$(GREEN)BayesNet Library: $(YELLOW)ver. $$version$(NC)\n"
|
||||||
|
@echo ""
|
||||||
|
@printf "$(GREEN)Project folders:$(NC)\n"
|
||||||
|
$(call status_file_folder, $(f_release), "Build\ Release")
|
||||||
|
$(call status_file_folder, $(f_debug), "Build\ Debug\ \ ")
|
||||||
|
@echo ""
|
||||||
|
@printf "$(GREEN)Build commands:$(NC)\n"
|
||||||
|
@printf " $(YELLOW)make release && make buildr$(NC) - Build library for release\n"
|
||||||
|
@printf " $(YELLOW)make debug && make buildd$(NC) - Build library for debug\n"
|
||||||
|
@printf " $(YELLOW)make test$(NC) - Run tests\n"
|
||||||
|
@printf " $(YELLOW)Usage:$(NC) make help\n"
|
||||||
|
@echo ""
|
||||||
|
@printf " $(YELLOW)Parallel Jobs: $(GREEN)$(JOBS)$(NC)\n"
|
||||||
|
|
||||||
# Help target
|
# Help target
|
||||||
# ===========
|
# ===========
|
||||||
|
|
||||||
help: ## Show help message
|
help: ## Show help message
|
||||||
@IFS=$$'\n' ; \
|
@IFS=$$'\n' ; \
|
||||||
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
|
||||||
|
@@ -118,7 +118,7 @@ namespace bayesnet {
|
|||||||
}
|
}
|
||||||
return states;
|
return states;
|
||||||
}
|
}
|
||||||
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y)
|
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states_)
|
||||||
{
|
{
|
||||||
// Discretize the continuous input data and build pDataset (Classifier::dataset)
|
// Discretize the continuous input data and build pDataset (Classifier::dataset)
|
||||||
int m = Xf.size(1);
|
int m = Xf.size(1);
|
||||||
@@ -190,7 +190,7 @@ namespace bayesnet {
|
|||||||
)
|
)
|
||||||
{
|
{
|
||||||
// Phase 1: Initial discretization (same as original)
|
// Phase 1: Initial discretization (same as original)
|
||||||
auto currentStates = fit_local_discretization(y);
|
auto currentStates = fit_local_discretization(y, initialStates);
|
||||||
auto previousModel = Network();
|
auto previousModel = Network();
|
||||||
|
|
||||||
if (convergence_params.verbose) {
|
if (convergence_params.verbose) {
|
||||||
|
@@ -23,9 +23,8 @@ namespace bayesnet {
|
|||||||
protected:
|
protected:
|
||||||
void checkInput(const torch::Tensor& X, const torch::Tensor& y);
|
void checkInput(const torch::Tensor& X, const torch::Tensor& y);
|
||||||
torch::Tensor prepareX(torch::Tensor& X);
|
torch::Tensor prepareX(torch::Tensor& X);
|
||||||
map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
|
// fit_local_discretization is only called by aodeld
|
||||||
map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y);
|
map<std::string, std::vector<int>> fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states);
|
||||||
|
|
||||||
// Iterative discretization method
|
// Iterative discretization method
|
||||||
template<typename Classifier>
|
template<typename Classifier>
|
||||||
map<std::string, std::vector<int>> iterativeLocalDiscretization(
|
map<std::string, std::vector<int>> iterativeLocalDiscretization(
|
||||||
@@ -37,18 +36,15 @@ namespace bayesnet {
|
|||||||
const map<std::string, std::vector<int>>& initialStates,
|
const map<std::string, std::vector<int>>& initialStates,
|
||||||
const Smoothing_t smoothing
|
const Smoothing_t smoothing
|
||||||
);
|
);
|
||||||
|
|
||||||
torch::Tensor Xf; // X continuous nxm tensor
|
torch::Tensor Xf; // X continuous nxm tensor
|
||||||
torch::Tensor y; // y discrete nx1 tensor
|
torch::Tensor y; // y discrete nx1 tensor
|
||||||
map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
|
map<std::string, std::unique_ptr<mdlp::Discretizer>> discretizers;
|
||||||
|
|
||||||
// MDLP parameters
|
// MDLP parameters
|
||||||
struct {
|
struct {
|
||||||
size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
|
size_t min_length = 3; // Minimum length of the interval to consider it in mdlp
|
||||||
float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
|
float proposed_cuts = 0.0; // Proposed cuts for the Discretization algorithm
|
||||||
int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
|
int max_depth = std::numeric_limits<int>::max(); // Maximum depth of the MDLP tree
|
||||||
} ld_params;
|
} ld_params;
|
||||||
|
|
||||||
// Convergence parameters
|
// Convergence parameters
|
||||||
struct {
|
struct {
|
||||||
int maxIterations = 10;
|
int maxIterations = 10;
|
||||||
@@ -60,6 +56,7 @@ namespace bayesnet {
|
|||||||
"max_iterations", "verbose_convergence"
|
"max_iterations", "verbose_convergence"
|
||||||
};
|
};
|
||||||
private:
|
private:
|
||||||
|
map<std::string, std::vector<int>> localDiscretizationProposal(const map<std::string, std::vector<int>>& states, Network& model);
|
||||||
std::vector<int> factorize(const std::vector<std::string>& labels_t);
|
std::vector<int> factorize(const std::vector<std::string>& labels_t);
|
||||||
std::vector<std::string>& notes; // Notes during fit from BaseClassifier
|
std::vector<std::string>& notes; // Notes during fit from BaseClassifier
|
||||||
torch::Tensor& pDataset; // (n+1)xm tensor
|
torch::Tensor& pDataset; // (n+1)xm tensor
|
||||||
|
@@ -19,7 +19,7 @@ namespace bayesnet {
|
|||||||
Xf = X_;
|
Xf = X_;
|
||||||
y = y_;
|
y = y_;
|
||||||
// Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
|
// Fills std::vectors Xv & yv with the data from tensors X_ (discretized) & y
|
||||||
states = fit_local_discretization(y);
|
states = fit_local_discretization(y, states_);
|
||||||
// We have discretized the input data
|
// We have discretized the input data
|
||||||
// 1st we need to fit the model to build the normal AODE structure, Ensemble::fit
|
// 1st we need to fit the model to build the normal AODE structure, Ensemble::fit
|
||||||
// calls buildModel to initialize the base models
|
// calls buildModel to initialize the base models
|
||||||
|
@@ -60,7 +60,7 @@ class BayesNetConan(ConanFile):
|
|||||||
self.requires("libtorch/2.7.1")
|
self.requires("libtorch/2.7.1")
|
||||||
self.requires("nlohmann_json/3.11.3")
|
self.requires("nlohmann_json/3.11.3")
|
||||||
self.requires("folding/1.1.2") # Custom package
|
self.requires("folding/1.1.2") # Custom package
|
||||||
self.requires("fimdlp/2.1.1") # Custom package
|
self.requires("fimdlp/2.1.2") # Custom package
|
||||||
|
|
||||||
def build_requirements(self):
|
def build_requirements(self):
|
||||||
self.build_requires("cmake/[>=3.27]")
|
self.build_requires("cmake/[>=3.27]")
|
||||||
|
@@ -20,7 +20,7 @@
|
|||||||
#include "bayesnet/ensembles/AODELd.h"
|
#include "bayesnet/ensembles/AODELd.h"
|
||||||
#include "bayesnet/ensembles/BoostAODE.h"
|
#include "bayesnet/ensembles/BoostAODE.h"
|
||||||
|
|
||||||
const std::string ACTUAL_VERSION = "1.2.1";
|
const std::string ACTUAL_VERSION = "1.2.2";
|
||||||
|
|
||||||
TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
|
TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
|
||||||
{
|
{
|
||||||
@@ -496,3 +496,58 @@ TEST_CASE("Local discretization hyperparameters", "[Models]")
|
|||||||
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
|
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
|
||||||
REQUIRE(clft.getStatus() == bayesnet::NORMAL);
|
REQUIRE(clft.getStatus() == bayesnet::NORMAL);
|
||||||
}
|
}
|
||||||
|
TEST_CASE("Test Dataset Loading", "[Datasets]")
{
    // Smoke test for RawDatasets: loads iris and adult, discretized and raw,
    // checks the tensor dimensions of the discretized variants, dumps the
    // first samples and per-feature state counts, and finally fits a TANLd
    // on the raw adult data.
    const int max_sample = 4;
    const int iris_features = 4;
    const int adult_features = 14;

    // Prints the first max_sample samples, one per line, followed by the label.
    // Xt is indexed [feature][sample]; discretized data is read as int, raw as float.
    const auto showSamples = [&](const RawDatasets& data, int num_features, bool discretized) {
        for (int row = 0; row < max_sample; ++row) {
            for (int col = 0; col < num_features; ++col) {
                const auto cell = data.Xt[col][row];
                if (discretized) {
                    std::cout << cell.item<int>() << " ";
                } else {
                    std::cout << cell.item<float>() << " ";
                }
            }
            std::cout << "| " << data.yt[row].item<int>() << std::endl;
        }
    };

    // Prints how many states each of the first num_features features has.
    const auto showStates = [](const RawDatasets& data, const std::vector<std::string>& names, int num_features) {
        std::cout << "States:" << std::endl;
        for (int idx = 0; idx < num_features; ++idx) {
            std::cout << idx << " has " << data.states.at(names[idx]).size() << " states." << std::endl;
        }
    };

    // iris, discretized
    RawDatasets dataset("iris", true);
    REQUIRE(dataset.Xt.size(0) == 4);
    REQUIRE(dataset.Xt.size(1) == 150);
    REQUIRE(dataset.yt.size(0) == 150);
    std::cout << "Dataset iris discretized " << std::endl;
    showSamples(dataset, iris_features, true);

    // iris, raw
    dataset = RawDatasets("iris", false);
    std::cout << "Dataset iris raw " << std::endl;
    showSamples(dataset, iris_features, false);

    // adult, discretized
    dataset = RawDatasets("adult", true);
    REQUIRE(dataset.Xt.size(0) == 14);
    REQUIRE(dataset.Xt.size(1) == 45222);
    REQUIRE(dataset.yt.size(0) == 45222);
    std::cout << "Dataset adult discretized " << std::endl;
    showSamples(dataset, adult_features, true);
    // The feature names captured here are reused for the raw variant below.
    auto features = dataset.features;
    showStates(dataset, features, adult_features);

    // adult, raw
    dataset = RawDatasets("adult", false);
    std::cout << "Dataset adult raw " << std::endl;
    showSamples(dataset, adult_features, false);
    showStates(dataset, features, adult_features);

    auto clf = bayesnet::TANLd();
    clf.fit(dataset.Xt, dataset.yt, dataset.features, dataset.className, dataset.states, dataset.smoothing);
}
|
||||||
|
@@ -16,7 +16,7 @@
|
|||||||
#include "TestUtils.h"
|
#include "TestUtils.h"
|
||||||
|
|
||||||
std::map<std::string, std::string> modules = {
|
std::map<std::string, std::string> modules = {
|
||||||
{ "mdlp", "2.1.1" },
|
{ "mdlp", "2.1.2" },
|
||||||
{ "Folding", "1.1.2" },
|
{ "Folding", "1.1.2" },
|
||||||
{ "json", "3.11" },
|
{ "json", "3.11" },
|
||||||
{ "ArffFiles", "1.2.1" }
|
{ "ArffFiles", "1.2.1" }
|
||||||
|
@@ -5,6 +5,7 @@
|
|||||||
// ***************************************************************
|
// ***************************************************************
|
||||||
|
|
||||||
#include <random>
|
#include <random>
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
#include "TestUtils.h"
|
#include "TestUtils.h"
|
||||||
#include "bayesnet/config.h"
|
#include "bayesnet/config.h"
|
||||||
|
|
||||||
@@ -51,6 +52,7 @@ private:
|
|||||||
|
|
||||||
RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
|
RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
|
||||||
{
|
{
|
||||||
|
catalog = loadCatalog();
|
||||||
num_samples = num_samples_;
|
num_samples = num_samples_;
|
||||||
shuffle = shuffle_;
|
shuffle = shuffle_;
|
||||||
discretize = discretize_;
|
discretize = discretize_;
|
||||||
@@ -62,7 +64,7 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
|
|||||||
nSamples = dataset.size(1);
|
nSamples = dataset.size(1);
|
||||||
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
|
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
|
||||||
weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
|
weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
|
||||||
classNumStates = discretize ? states.at(className).size() : 0;
|
classNumStates = states.at(className).size();
|
||||||
auto fold = folding::StratifiedKFold(5, yt, 271);
|
auto fold = folding::StratifiedKFold(5, yt, 271);
|
||||||
auto [train, test] = fold.getFold(0);
|
auto [train, test] = fold.getFold(0);
|
||||||
auto train_t = torch::tensor(train);
|
auto train_t = torch::tensor(train);
|
||||||
@@ -78,18 +80,90 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
|
|||||||
|
|
||||||
map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
|
map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
|
||||||
{
|
{
|
||||||
|
|
||||||
map<std::string, int> maxes;
|
map<std::string, int> maxes;
|
||||||
auto fimdlp = mdlp::CPPFImdlp();
|
auto fimdlp = mdlp::CPPFImdlp();
|
||||||
for (int i = 0; i < X.size(); i++) {
|
for (int i = 0; i < X.size(); i++) {
|
||||||
|
mdlp::labels_t xd;
|
||||||
|
if (is_numeric.at(i)) {
|
||||||
fimdlp.fit(X[i], yv);
|
fimdlp.fit(X[i], yv);
|
||||||
mdlp::labels_t& xd = fimdlp.transform(X[i]);
|
xd = fimdlp.transform(X[i]);
|
||||||
|
} else {
|
||||||
|
std::transform(X[i].begin(), X[i].end(), back_inserter(xd), [](const auto& val) {
|
||||||
|
return static_cast<int>(val);
|
||||||
|
});
|
||||||
|
}
|
||||||
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
|
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
|
||||||
Xv.push_back(xd);
|
Xv.push_back(xd);
|
||||||
}
|
}
|
||||||
return maxes;
|
return maxes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads the dataset catalog (Paths::datasets() + "all.txt") and returns, for
// every dataset name, the indices of its numeric features.
//
// Empty lines and lines starting with '#' are ignored. Every other line holds
// one to three ';'-separated fields:
//     name[;className[;numericFeatures]]
// where numericFeatures is "all", "none" or a JSON array of feature indices.
//
// The vector stored for a dataset is
//   - {-1}        when all features are numeric ("all", or no third field),
//   - empty       when the third field is "none",
//   - the indices listed in the JSON array otherwise.
// If a name is listed more than once, the last line in the file wins.
//
// Throws std::invalid_argument if the file cannot be opened, if a line has more
// than three fields, or if the catalog contains no dataset at all.
map<std::string, std::vector<int>> RawDatasets::loadCatalog()
{
    const std::string catalogPath = Paths::datasets() + "all.txt";
    // Deliberately not named `catalog`: that would hide the RawDatasets::catalog member.
    std::ifstream catalogFile(catalogPath);
    if (!catalogFile.is_open()) {
        throw std::invalid_argument("Unable to open catalog file. [" + catalogPath + "]");
    }

    std::map<std::string, std::vector<int>> catalogNames;
    std::string line;
    // No pre-sorting of the lines is needed: std::map keeps its keys ordered.
    while (std::getline(catalogFile, line)) {
        if (line.empty() || line[0] == '#') {
            continue;
        }
        const std::vector<std::string> tokens = split(line, ';');
        if (tokens.empty() || tokens.size() > 3) {
            throw std::invalid_argument("Invalid catalog file format.");
        }
        // tokens[1], when present, is the class name; it is not needed here.
        std::vector<int>& numericFeaturesIdx = catalogNames[tokens[0]];
        numericFeaturesIdx.clear();
        if (tokens.size() < 3 || tokens[2] == "all") {
            // -1 is the marker for "every feature is numeric"
            numericFeaturesIdx.push_back(-1);
        } else if (tokens[2] != "none") {
            const auto indices = nlohmann::json::parse(tokens[2]);
            for (const auto& index : indices) {
                numericFeaturesIdx.push_back(index.get<int>());
            }
        }
    }
    if (catalogNames.empty()) {
        throw std::invalid_argument("Catalog is empty. Please check the catalog file.");
    }
    return catalogNames;
}
|
||||||
|
|
||||||
void RawDatasets::loadDataset(const std::string& name, bool class_last)
|
void RawDatasets::loadDataset(const std::string& name, bool class_last)
|
||||||
{
|
{
|
||||||
auto handler = ShuffleArffFiles(num_samples, shuffle);
|
auto handler = ShuffleArffFiles(num_samples, shuffle);
|
||||||
@@ -101,6 +175,26 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
|
|||||||
className = handler.getClassName();
|
className = handler.getClassName();
|
||||||
auto attributes = handler.getAttributes();
|
auto attributes = handler.getAttributes();
|
||||||
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
|
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
|
||||||
|
is_numeric.clear();
|
||||||
|
is_numeric.reserve(features.size());
|
||||||
|
auto numericFeaturesIdx = catalog.at(name);
|
||||||
|
if (numericFeaturesIdx.empty()) {
|
||||||
|
// no numeric features
|
||||||
|
is_numeric.assign(features.size(), false);
|
||||||
|
} else {
|
||||||
|
if (numericFeaturesIdx[0] == -1) {
|
||||||
|
// all features are numeric
|
||||||
|
is_numeric.assign(features.size(), true);
|
||||||
|
} else {
|
||||||
|
// some features are numeric
|
||||||
|
is_numeric.assign(features.size(), false);
|
||||||
|
for (const auto& idx : numericFeaturesIdx) {
|
||||||
|
if (idx >= 0 && idx < features.size()) {
|
||||||
|
is_numeric[idx] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
// Discretize Dataset
|
// Discretize Dataset
|
||||||
auto maxValues = discretizeDataset(X);
|
auto maxValues = discretizeDataset(X);
|
||||||
maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
|
maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
|
||||||
@@ -113,13 +207,23 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
|
|||||||
Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
|
Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
|
||||||
}
|
}
|
||||||
states[className] = std::vector<int>(maxValues[className]);
|
states[className] = std::vector<int>(maxValues[className]);
|
||||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
|
||||||
} else {
|
} else {
|
||||||
Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
|
Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
|
||||||
for (int i = 0; i < features.size(); ++i) {
|
for (int i = 0; i < features.size(); ++i) {
|
||||||
Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
|
Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
|
||||||
|
if (!is_numeric.at(i)) {
|
||||||
|
states[features[i]] = std::vector<int>(maxValues[features[i]]);
|
||||||
|
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
|
||||||
|
} else {
|
||||||
|
states[features[i]] = std::vector<int>();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
yt = torch::tensor(yv, torch::kInt32);
|
yt = torch::tensor(yv, torch::kInt32);
|
||||||
|
int maxy = *max_element(yv.begin(), yv.end()) + 1;
|
||||||
|
states[className] = std::vector<int>(maxy);
|
||||||
|
}
|
||||||
|
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||||
|
yt = torch::tensor(yv, torch::kInt32);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -27,7 +27,11 @@ public:
|
|||||||
std::vector<double> weightsv;
|
std::vector<double> weightsv;
|
||||||
std::vector<string> features;
|
std::vector<string> features;
|
||||||
std::string className;
|
std::string className;
|
||||||
|
std::vector<bool> is_numeric; // indicates whether each feature is numeric
|
||||||
map<std::string, std::vector<int>> states;
|
map<std::string, std::vector<int>> states;
|
||||||
|
//catalog maps each dataset name to the indices of its numeric features; a single -1 entry means all features are numeric
|
||||||
|
//and an empty vector means none are numeric
|
||||||
|
map<std::string, std::vector<int>> catalog;
|
||||||
int nSamples, classNumStates;
|
int nSamples, classNumStates;
|
||||||
double epsilon = 1e-5;
|
double epsilon = 1e-5;
|
||||||
bool discretize;
|
bool discretize;
|
||||||
@@ -65,8 +69,30 @@ private:
|
|||||||
+ "classNumStates: " + std::to_string(classNumStates) + "\n"
|
+ "classNumStates: " + std::to_string(classNumStates) + "\n"
|
||||||
+ "states: " + states_ + "\n";
|
+ "states: " + states_ + "\n";
|
||||||
}
|
}
|
||||||
|
// Returns a copy of str with leading and trailing whitespace removed.
// Whitespace is whatever std::isspace reports for the current C locale.
// An empty or all-whitespace input yields an empty string.
std::string trim(const std::string& str)
{
    // std::isspace is undefined for negative values other than EOF, so the
    // character must reach it as unsigned char (matters for non-ASCII bytes
    // on platforms where plain char is signed).
    const auto notSpace = [](unsigned char ch) {
        return !std::isspace(ch);
    };
    const auto first = std::find_if(str.begin(), str.end(), notSpace);
    // Search backwards, but never past `first`, so an all-whitespace string
    // produces the empty range [first, first).
    const auto last = std::find_if(str.rbegin(), std::string::const_reverse_iterator(first), notSpace).base();
    return std::string(first, last);
}
|
||||||
|
std::vector<std::string> split(const std::string& text, char delimiter)
|
||||||
|
{
|
||||||
|
std::vector<std::string> result;
|
||||||
|
std::stringstream ss(text);
|
||||||
|
std::string token;
|
||||||
|
while (std::getline(ss, token, delimiter)) {
|
||||||
|
result.push_back(trim(token));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
|
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
|
||||||
void loadDataset(const std::string& name, bool class_last);
|
void loadDataset(const std::string& name, bool class_last);
|
||||||
|
map<std::string, std::vector<int>> loadCatalog();
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif //TEST_UTILS_H
|
#endif //TEST_UTILS_H
|
48861
tests/data/adult.arff
Normal file
48861
tests/data/adult.arff
Normal file
File diff suppressed because it is too large
Load Diff
27
tests/data/all.txt
Normal file
27
tests/data/all.txt
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
adult;class;[0,2,4,10,11,12]
|
||||||
|
balance-scale;class; all
|
||||||
|
breast-w;Class; all
|
||||||
|
diabetes;class; all
|
||||||
|
ecoli;class; all
|
||||||
|
glass;Type; all
|
||||||
|
hayes-roth;class; none
|
||||||
|
heart-statlog;class; [0,3,4,7,9,11]
|
||||||
|
ionosphere;class; all
|
||||||
|
iris;class; all
|
||||||
|
kdd_JapaneseVowels;speaker; all
|
||||||
|
letter;class; all
|
||||||
|
liver-disorders;selector; all
|
||||||
|
mfeat-factors;class; all
|
||||||
|
mfeat-fourier;class; all
|
||||||
|
mfeat-karhunen;class; all
|
||||||
|
mfeat-morphological;class; all
|
||||||
|
mfeat-zernike;class; all
|
||||||
|
optdigits;class; all
|
||||||
|
page-blocks;class; all
|
||||||
|
pendigits;class; all
|
||||||
|
segment;class; all
|
||||||
|
sonar;Class; all
|
||||||
|
spambase;class; all
|
||||||
|
vehicle;Class; all
|
||||||
|
waveform-5000;class; all
|
||||||
|
wine;class; all
|
Reference in New Issue
Block a user