Compare commits

1 commit

Author | SHA1 | Date
---|---|---
 | baa631dd66 |

@@ -1,6 +1,6 @@
 FROM mcr.microsoft.com/devcontainers/cpp:ubuntu22.04

-ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.29.3"
+ARG REINSTALL_CMAKE_VERSION_FROM_SOURCE="3.22.2"

 # Optionally install the cmake for vcpkg
 COPY ./reinstall-cmake.sh /tmp/
@@ -23,7 +23,7 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test
 RUN apt-get update

 # Install GCC 13.1
-RUN apt-get install -y gcc-13 g++-13 doxygen
+RUN apt-get install -y gcc-13 g++-13

 # Install lcov 2.1
 RUN wget --quiet https://github.com/linux-test-project/lcov/releases/download/v2.1/lcov-2.1.tar.gz && \

.gitmodules (vendored): 8 changed lines

@@ -1,3 +1,8 @@
+[submodule "lib/mdlp"]
+path = lib/mdlp
+url = https://github.com/rmontanana/mdlp
+main = main
+update = merge
 [submodule "lib/json"]
 path = lib/json
 url = https://github.com/nlohmann/json.git
@@ -16,6 +21,3 @@
 [submodule "tests/lib/Files"]
 path = tests/lib/Files
 url = https://github.com/rmontanana/ArffFiles
-[submodule "lib/mdlp"]
-path = lib/mdlp
-url = https://github.com/rmontanana/mdlp

.vscode/launch.json (vendored): 2 changed lines

@@ -16,7 +16,7 @@
 "name": "test",
 "program": "${workspaceFolder}/build_Debug/tests/TestBayesNet",
 "args": [
-"No features selected"
+"[Network]"
 ],
 "cwd": "${workspaceFolder}/build_Debug/tests"
 },

CHANGELOG.md: 24 changed lines

@@ -7,15 +7,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ## [Unreleased]

-## [1.0.6] 2024-11-23
-
-### Fixed
-
-- Prevent existing edges to be added to the network in the `add_edge` method.
-- Don't allow to add nodes or edges on already fiited networks.
-- Number of threads spawned
-- Network class tests
-
 ### Added

 - Library logo generated with <https://openart.ai> to README.md
@@ -23,21 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - *convergence_best* hyperparameter to the BoostAODE class, to control the way the prior accuracy is computed if convergence is set. Default value is *false*.
 - SPnDE model.
 - A2DE model.
-- BoostA2DE model.
 - A2DE & SPnDE tests.
 - Add tests to reach 99% of coverage.
 - Add tests to check the correct version of the mdlp, folding and json libraries.
 - Library documentation generated with Doxygen.
 - Link to documentation in the README.md.
-- Three types of smoothing the Bayesian Network ORIGINAL, LAPLACE and CESTNIK.
+- Three types of smoothing the Bayesian Network OLD_LAPLACE, LAPLACE and CESTNIK.

 ### Internal

-- Fixed doxygen optional dependency
-- Add env parallel variable to Makefile
-- Add CountingSemaphore class to manage the number of threads spawned.
-- Ignore CUDA language in CMake CodeCoverage module.
-- Update mdlp library as a git submodule.
 - Create library ShuffleArffFile to limit the number of samples with a parameter and shuffle them.
 - Refactor catch2 library location to test/lib
 - Refactor loadDataset function in tests.
@@ -48,13 +33,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add a Makefile target (doc) to generate the documentation.
 - Add a Makefile target (doc-install) to install the documentation.

-### Libraries versions
-
-- mdlp: 2.0.1
-- Folding: 1.1.0
-- json: 3.11
-- ArffFiles: 1.1.0
-
 ## [1.0.5] 2024-04-20

 ### Added

@@ -49,13 +49,17 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug")
 set(CODE_COVERAGE ON)
 endif (CMAKE_BUILD_TYPE STREQUAL "Debug")

-get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
-message(STATUS "Languages=${LANGUAGES}")
 if (CODE_COVERAGE)
+get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
+message("ALL LANGUAGES: ${LANGUAGES}")
+foreach(LANG ${LANGUAGES})
+message("${LANG} compiler is \"${CMAKE_${LANG}_COMPILER_ID}\"")
+endforeach()
 enable_testing()
-include(CodeCoverage)
-MESSAGE(STATUS "Code coverage enabled")
-SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
+#include(CodeCoverage)
+#MESSAGE("Code coverage enabled")
+#SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
 endif (CODE_COVERAGE)

 if (ENABLE_CLANG_TIDY)
@@ -64,7 +68,6 @@ endif (ENABLE_CLANG_TIDY)

 # External libraries - dependencies of BayesNet
 # ---------------------------------------------

 # include(FetchContent)
 add_git_submodule("lib/json")
 add_git_submodule("lib/mdlp")
@@ -77,7 +80,7 @@ add_subdirectory(bayesnet)
 # Testing
 # -------
 if (ENABLE_TESTING)
-MESSAGE(STATUS "Testing enabled")
+MESSAGE("Testing enabled")
 add_subdirectory(tests/lib/catch2)
 include(CTest)
 add_subdirectory(tests)
@@ -95,14 +98,10 @@ install(FILES ${CMAKE_BINARY_DIR}/configured_files/include/bayesnet/config.h DES
 # Documentation
 # -------------
 find_package(Doxygen)
-if (Doxygen_FOUND)
 set(DOC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/docs)
 set(doxyfile_in ${DOC_DIR}/Doxyfile.in)
 set(doxyfile ${DOC_DIR}/Doxyfile)
 configure_file(${doxyfile_in} ${doxyfile} @ONLY)
 doxygen_add_docs(doxygen
 WORKING_DIRECTORY ${DOC_DIR}
 CONFIG_FILE ${doxyfile})
-else (Doxygen_FOUND)
-MESSAGE("* Doxygen not found")
-endif (Doxygen_FOUND)

Makefile: 8 changed lines

@@ -43,7 +43,7 @@ setup: ## Install dependencies for tests and coverage
 fi
 @echo "* You should install plantuml & graphviz for the diagrams"

-diagrams: ## Create an UML class diagram & dependency of the project (diagrams/BayesNet.png)
+diagrams: ## Create an UML class diagram & depnendency of the project (diagrams/BayesNet.png)
 @which $(plantuml) || (echo ">>> Please install plantuml"; exit 1)
 @which $(dot) || (echo ">>> Please install graphviz"; exit 1)
 @which $(clang-uml) || (echo ">>> Please install clang-uml"; exit 1)
@@ -58,10 +58,10 @@ diagrams: ## Create an UML class diagram & dependency of the project (diagrams/B
 @$(dot) -Tsvg $(f_debug)/dependency.dot.BayesNet -o $(f_diagrams)/dependency.svg

 buildd: ## Build the debug targets
-	cmake --build $(f_debug) -t $(app_targets) --parallel $(CMAKE_BUILD_PARALLEL_LEVEL)
+	cmake --build $(f_debug) -t $(app_targets) --parallel

 buildr: ## Build the release targets
-	cmake --build $(f_release) -t $(app_targets) --parallel $(CMAKE_BUILD_PARALLEL_LEVEL)
+	cmake --build $(f_release) -t $(app_targets) --parallel

 clean: ## Clean the tests info
 @echo ">>> Cleaning Debug BayesNet tests...";
@@ -105,7 +105,7 @@ opt = ""
 test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section
 @echo ">>> Running BayesNet tests...";
 @$(MAKE) clean
-	@cmake --build $(f_debug) -t $(test_targets) --parallel $(CMAKE_BUILD_PARALLEL_LEVEL)
+	@cmake --build $(f_debug) -t $(test_targets) --parallel
 @for t in $(test_targets); do \
 echo ">>> Running $$t...";\
 if [ -f $(f_debug)/tests/$$t ]; then \

@@ -7,9 +7,9 @@
 [](https://sonarcloud.io/summary/new_code?id=rmontanana_BayesNet)
 [](https://sonarcloud.io/summary/new_code?id=rmontanana_BayesNet)
 
 [](html/index.html)

-Bayesian Network Classifiers library
+Bayesian Network Classifiers using libtorch from scratch

 ## Dependencies

@@ -71,8 +71,6 @@ make sample fname=tests/data/glass.arff

 #### - AODE

-#### - A2DE
-
 #### - [BoostAODE](docs/BoostAODE.md)

 #### - BoostA2DE

@@ -9,4 +9,4 @@ include_directories(
 file(GLOB_RECURSE Sources "*.cc")

 add_library(BayesNet ${Sources})
-target_link_libraries(BayesNet fimdlp "${TORCH_LIBRARIES}")
+target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

@@ -9,7 +9,15 @@
 #include "Classifier.h"

 namespace bayesnet {
-Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
+Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false), device(torch::kCPU)
+{
+if (torch::cuda::is_available()) {
+device = torch::Device(torch::kCUDA);
+std::cout << "CUDA is available! Using GPU." << std::endl;
+} else {
+std::cout << "CUDA is not available. Using CPU." << std::endl;
+}
+}
 const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted";
 Classifier& Classifier::build(const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights, const Smoothing_t smoothing)
 {
@@ -31,7 +39,7 @@ namespace bayesnet {
 {
 try {
 auto yresized = torch::transpose(ytmp.view({ ytmp.size(0), 1 }), 0, 1);
-dataset = torch::cat({ dataset, yresized }, 0);
+dataset = torch::cat({ dataset, yresized }, 0).to(device);
 }
 catch (const std::exception& e) {
 std::stringstream oss;
@@ -50,7 +58,7 @@ namespace bayesnet {
 {
 dataset = X;
 buildDataset(y);
-const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble);
+const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble).to(device);
 return build(features, className, states, weights, smoothing);
 }
 // X is nxm where n is the number of features and m the number of samples

@@ -38,6 +38,7 @@ namespace bayesnet {
 std::string dump_cpt() const override;
 void setHyperparameters(const nlohmann::json& hyperparameters) override; //For classifiers that don't have hyperparameters
 protected:
+torch::Device device;
 bool fitted;
 unsigned int m, n; // m: number of samples, n: number of features
 Network model;

@@ -59,9 +59,6 @@ namespace bayesnet {
 std::vector<int> featuresUsed;
 if (selectFeatures) {
 featuresUsed = initializeModels(smoothing);
-if (featuresUsed.size() == 0) {
-return;
-}
 auto ypred = predict(X_train);
 std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred, weights_);
 // Update significance of the models

@@ -209,7 +209,7 @@ namespace bayesnet {
 pthread_setname_np(threadName.c_str());
 #endif
 double numStates = static_cast<double>(node.second->getNumStates());
-double smoothing_factor;
+double smoothing_factor = 0.0;
 switch (smoothing) {
 case Smoothing_t::ORIGINAL:
 smoothing_factor = 1.0 / n_samples;
@@ -221,7 +221,7 @@ namespace bayesnet {
 smoothing_factor = 1 / numStates;
 break;
 default:
-smoothing_factor = 0.0; // No smoothing
+throw std::invalid_argument("Smoothing method not recognized " + std::to_string(static_cast<int>(smoothing)));
 }
 node.second->computeCPT(samples, features, smoothing_factor, weights);
 semaphore.release();
@@ -234,6 +234,16 @@ namespace bayesnet {
 for (auto& thread : threads) {
 thread.join();
 }
+// std::fstream file;
+// file.open("cpt.txt", std::fstream::out | std::fstream::app);
+// file << std::string(80, '*') << std::endl;
+// for (const auto& item : graph("Test")) {
+// file << item << std::endl;
+// }
+// file << std::string(80, '-') << std::endl;
+// file << dump_cpt() << std::endl;
+// file << std::string(80, '=') << std::endl;
+// file.close();
 fitted = true;
 }
 torch::Tensor Network::predict_tensor(const torch::Tensor& samples, const bool proba)
|
@@ -97,7 +97,7 @@ namespace bayesnet {
|
|||||||
dimensions.push_back(numStates);
|
dimensions.push_back(numStates);
|
||||||
transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); });
|
transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); });
|
||||||
// Create a tensor of zeros with the dimensions of the CPT
|
// Create a tensor of zeros with the dimensions of the CPT
|
||||||
cpTable = torch::zeros(dimensions, torch::kDouble) + smoothing;
|
cpTable = torch::zeros(dimensions, torch::kDouble).to(device) + smoothing;
|
||||||
// Fill table with counts
|
// Fill table with counts
|
||||||
auto pos = find(features.begin(), features.end(), name);
|
auto pos = find(features.begin(), features.end(), name);
|
||||||
if (pos == features.end()) {
|
if (pos == features.end()) {
|
||||||
|

@@ -53,14 +53,14 @@
 }
 }

-void MST::insertElement(std::list<int>& variables, int variable)
+void insertElement(std::list<int>& variables, int variable)
 {
 if (std::find(variables.begin(), variables.end(), variable) == variables.end()) {
 variables.push_front(variable);
 }
 }

-std::vector<std::pair<int, int>> MST::reorder(std::vector<std::pair<float, std::pair<int, int>>> T, int root_original)
+std::vector<std::pair<int, int>> reorder(std::vector<std::pair<float, std::pair<int, int>>> T, int root_original)
 {
 // Create the edges of a DAG from the MST
 // replacing unordered_set with list because unordered_set cannot guarantee the order of the elements inserted

@@ -14,8 +14,6 @@ namespace bayesnet {
 public:
 MST() = default;
 MST(const std::vector<std::string>& features, const torch::Tensor& weights, const int root);
-void insertElement(std::list<int>& variables, int variable);
-std::vector<std::pair<int, int>> reorder(std::vector<std::pair<float, std::pair<int, int>>> T, int root_original);
 std::vector<std::pair<int, int>> maximumSpanningTree();
 private:
 torch::Tensor weights;

@@ -137,7 +137,7 @@

 include(CMakeParseArguments)

-option(CODE_COVERAGE_VERBOSE "Verbose information" TRUE)
+option(CODE_COVERAGE_VERBOSE "Verbose information" FALSE)

 # Check prereqs
 find_program( GCOV_PATH gcov )
@@ -160,11 +160,7 @@ foreach(LANG ${LANGUAGES})
 endif()
 elseif(NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "GNU"
 AND NOT "${CMAKE_${LANG}_COMPILER_ID}" MATCHES "(LLVM)?[Ff]lang")
-if ("${LANG}" MATCHES "CUDA")
-message(STATUS "Ignoring CUDA")
-else()
-message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
-endif()
+message(FATAL_ERROR "Compiler is not GNU or Flang! Aborting...")
 endif()
 endforeach()


lib/json: 2 changed lines
Submodule lib/json updated: 378e091795...960b763ecd

lib/mdlp: 2 changed lines
Submodule lib/mdlp updated: 7d62d6af4a...2db60e007d

@@ -5,21 +5,15 @@ project(bayesnet_sample)
 set(CMAKE_CXX_STANDARD 17)

 find_package(Torch REQUIRED)
-find_library(BayesNet NAMES libBayesNet BayesNet libBayesNet.a REQUIRED)
-find_path(Bayesnet_INCLUDE_DIRS REQUIRED NAMES bayesnet)
-find_library(FImdlp NAMES libfimdlp.a PATHS REQUIRED)
-
-message(STATUS "FImdlp=${FImdlp}")
-message(STATUS "FImdlp_INCLUDE_DIRS=${FImdlp_INCLUDE_DIRS}")
-message(STATUS "BayesNet=${BayesNet}")
-message(STATUS "Bayesnet_INCLUDE_DIRS=${Bayesnet_INCLUDE_DIRS}")
+find_library(BayesNet NAMES BayesNet.a libBayesNet.a REQUIRED)

 include_directories(
 ../tests/lib/Files
+lib/mdlp
 lib/json/include
 /usr/local/include
-${FImdlp_INCLUDE_DIRS}
 )

+add_subdirectory(lib/mdlp)
 add_executable(bayesnet_sample sample.cc)
-target_link_libraries(bayesnet_sample fimdlp "${TORCH_LIBRARIES}" "${BayesNet}")
+target_link_libraries(bayesnet_sample mdlp "${TORCH_LIBRARIES}" "${BayesNet}")

sample/lib/mdlp/CMakeLists.txt (new file): 11 lines

@@ -0,0 +1,11 @@
cmake_minimum_required(VERSION 3.20)
project(mdlp)

if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif ()

set(CMAKE_CXX_STANDARD 11)

add_library(mdlp CPPFImdlp.cpp Metrics.cpp)

sample/lib/mdlp/CPPFImdlp.cpp (new file): 222 lines

@@ -0,0 +1,222 @@
#include <numeric>
#include <algorithm>
#include <set>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"

namespace mdlp {

CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_),
max_depth(max_depth_),
proposed_cuts(proposed)
{
}

CPPFImdlp::CPPFImdlp() = default;

CPPFImdlp::~CPPFImdlp() = default;

size_t CPPFImdlp::compute_max_num_cut_points() const
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
}
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
throw invalid_argument("wrong proposed num_cuts value");
}
if (proposed_cuts < 1)
return static_cast<size_t>(round(static_cast<float>(X.size()) * proposed_cuts));
return static_cast<size_t>(proposed_cuts);
}

void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{
X = X_;
y = y_;
num_cut_points = compute_max_num_cut_points();
depth = 0;
discretizedData.clear();
cutPoints.clear();
if (X.size() != y.size()) {
throw invalid_argument("X and y must have the same size");
}
if (X.empty() || y.empty()) {
throw invalid_argument("X and y must have at least one element");
}
if (min_length < 3) {
throw invalid_argument("min_length must be greater than 2");
}
if (max_depth < 1) {
throw invalid_argument("max_depth must be greater than 0");
}
indices = sortIndices(X_, y_);
metrics.setData(y, indices);
computeCutPoints(0, X.size(), 1);
sort(cutPoints.begin(), cutPoints.end());
if (num_cut_points > 0) {
// Select the best (with lower entropy) cut points
while (cutPoints.size() > num_cut_points) {
resizeCutPoints();
}
}
}

pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
{
size_t n;
size_t m;
size_t idxPrev = cut - 1 >= start ? cut - 1 : cut;
size_t idxNext = cut + 1 < end ? cut + 1 : cut;
bool backWall; // true if duplicates reach beginning of the interval
precision_t previous;
precision_t actual;
precision_t next;
previous = X[indices[idxPrev]];
actual = X[indices[cut]];
next = X[indices[idxNext]];
// definition 2 of the paper => X[t-1] < X[t]
// get the first equal value of X in the interval
while (idxPrev > start && actual == previous) {
previous = X[indices[--idxPrev]];
}
backWall = idxPrev == start && actual == previous;
// get the last equal value of X in the interval
while (idxNext < end - 1 && actual == next) {
next = X[indices[++idxNext]];
}
// # of duplicates before cutpoint
n = cut - 1 - idxPrev;
// # of duplicates after cutpoint
m = idxNext - cut - 1;
// Decide which values to use
cut = cut + (backWall ? m + 1 : -n);
actual = X[indices[cut]];
return { (actual + previous) / 2, cut };
}

void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_)
{
size_t cut;
pair<precision_t, size_t> result;
// Check if the interval length and the depth are Ok
if (end - start < min_length || depth_ > max_depth)
return;
depth = depth_ > depth ? depth_ : depth;
cut = getCandidate(start, end);
if (cut == numeric_limits<size_t>::max())
return;
if (mdlp(start, cut, end)) {
result = valueCutPoint(start, cut, end);
cut = result.second;
cutPoints.push_back(result.first);
computeCutPoints(start, cut, depth_ + 1);
computeCutPoints(cut, end, depth_ + 1);
}
}

size_t CPPFImdlp::getCandidate(size_t start, size_t end)
{
/* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which
E(A, TA; S) is minimal amongst all the candidate cut points. */
size_t candidate = numeric_limits<size_t>::max();
size_t elements = end - start;
bool sameValues = true;
precision_t entropy_left;
precision_t entropy_right;
precision_t minEntropy;
// Check if all the values of the variable in the interval are the same
for (size_t idx = start + 1; idx < end; idx++) {
if (X[indices[idx]] != X[indices[start]]) {
sameValues = false;
break;
}
}
if (sameValues)
return candidate;
minEntropy = metrics.entropy(start, end);
for (size_t idx = start + 1; idx < end; idx++) {
// Cutpoints are always on boundaries (definition 2)
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / static_cast<precision_t>(elements) * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / static_cast<precision_t>(elements) * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
}
}
return candidate;
}

bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end)
{
int k;
int k1;
int k2;
precision_t ig;
precision_t delta;
precision_t ent;
precision_t ent1;
precision_t ent2;
auto N = precision_t(end - start);
k = metrics.computeNumClasses(start, end);
k1 = metrics.computeNumClasses(start, cut);
k2 = metrics.computeNumClasses(cut, end);
ent = metrics.entropy(start, end);
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = static_cast<precision_t>(log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}

// Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes
indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_)
{
indices_t idx(X_.size());
iota(idx.begin(), idx.end(), 0);
stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) {
if (X_[i1] == X_[i2])
return y_[i1] < y_[i2];
else
return X_[i1] < X_[i2];
});
return idx;
}

void CPPFImdlp::resizeCutPoints()
{
//Compute entropy of each of the whole cutpoint set and discards the biggest value
precision_t maxEntropy = 0;
precision_t entropy;
size_t maxEntropyIdx = 0;
size_t begin = 0;
size_t end;
for (size_t idx = 0; idx < cutPoints.size(); idx++) {
end = begin;
while (X[indices[end]] < cutPoints[idx] && end < X.size())
end++;
entropy = metrics.entropy(begin, end);
if (entropy > maxEntropy) {
maxEntropy = entropy;
maxEntropyIdx = idx;
}
begin = end;
}
cutPoints.erase(cutPoints.begin() + static_cast<long>(maxEntropyIdx));
}
labels_t& CPPFImdlp::transform(const samples_t& data)
{
discretizedData.clear();
discretizedData.reserve(data.size());
for (const precision_t& item : data) {
auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item);
discretizedData.push_back(upper - cutPoints.begin());
}
return discretizedData;
}
}

sample/lib/mdlp/CPPFImdlp.h (new file): 51 lines

@@ -0,0 +1,51 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#ifndef CPPFIMDLP_H
#define CPPFIMDLP_H

#include "typesFImdlp.h"
#include "Metrics.h"
#include <limits>
#include <utility>
#include <string>

namespace mdlp {
class CPPFImdlp {
protected:
size_t min_length = 3;
int depth = 0;
int max_depth = numeric_limits<int>::max();
float proposed_cuts = 0;
indices_t indices = indices_t();
samples_t X = samples_t();
labels_t y = labels_t();
Metrics metrics = Metrics(y, indices);
cutPoints_t cutPoints;
size_t num_cut_points = numeric_limits<size_t>::max();
labels_t discretizedData = labels_t();

static indices_t sortIndices(samples_t&, labels_t&);

void computeCutPoints(size_t, size_t, int);
void resizeCutPoints();
bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points() const;
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);

public:
CPPFImdlp();
CPPFImdlp(size_t, int, float);
~CPPFImdlp();
void fit(samples_t&, labels_t&);
inline cutPoints_t getCutPoints() const { return cutPoints; };
labels_t& transform(const samples_t&);
inline int get_depth() const { return depth; };
static inline string version() { return "1.1.2"; };
};
}
#endif

sample/lib/mdlp/LICENSE (new file): 21 lines

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Ricardo Montañana Gómez

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

sample/lib/mdlp/Metrics.cpp (new file): 78 lines

@@ -0,0 +1,78 @@
#include "Metrics.h"
#include <set>
#include <cmath>

using namespace std;
namespace mdlp {
Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_),
numClasses(computeNumClasses(0, indices.size()))
{
}

int Metrics::computeNumClasses(size_t start, size_t end)
{
set<int> nClasses;
for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]);
}
return static_cast<int>(nClasses.size());
}

void Metrics::setData(const labels_t& y_, const indices_t& indices_)
{
indices = indices_;
y = y_;
numClasses = computeNumClasses(0, indices.size());
entropyCache.clear();
igCache.clear();
}

precision_t Metrics::entropy(size_t start, size_t end)
{
precision_t p;
precision_t ventropy = 0;
int nElements = 0;
labels_t counts(numClasses + 1, 0);
if (end - start < 2)
return 0;
if (entropyCache.find({ start, end }) != entropyCache.end()) {
return entropyCache[{start, end}];
}
for (auto i = &indices[start]; i != &indices[end]; ++i) {
counts[y[*i]]++;
nElements++;
}
for (auto count : counts) {
if (count > 0) {
p = static_cast<precision_t>(count) / static_cast<precision_t>(nElements);
ventropy -= p * log2(p);
}
}
entropyCache[{start, end}] = ventropy;
return ventropy;
}

precision_t Metrics::informationGain(size_t start, size_t cut, size_t end)
{
precision_t iGain;
precision_t entropyInterval;
precision_t entropyLeft;
precision_t entropyRight;
size_t nElementsLeft = cut - start;
size_t nElementsRight = end - cut;
size_t nElements = end - start;
if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) {
return igCache[make_tuple(start, cut, end)];
}
entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end);
iGain = entropyInterval -
(static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements);
igCache[make_tuple(start, cut, end)] = iGain;
return iGain;
}

}

sample/lib/mdlp/Metrics.h (new file): 28 lines

@@ -0,0 +1,28 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#ifndef CCMETRICS_H
#define CCMETRICS_H

#include "typesFImdlp.h"

namespace mdlp {
class Metrics {
protected:
labels_t& y;
indices_t& indices;
int numClasses;
cacheEnt_t entropyCache = cacheEnt_t();
cacheIg_t igCache = cacheIg_t();
public:
Metrics(labels_t&, indices_t&);
void setData(const labels_t&, const indices_t&);
int computeNumClasses(size_t, size_t);
precision_t entropy(size_t, size_t);
precision_t informationGain(size_t, size_t, size_t);
};
}
#endif

sample/lib/mdlp/README.md (new file): 41 lines

@@ -0,0 +1,41 @@
[](https://github.com/rmontanana/mdlp/actions/workflows/build.yml)
[](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)
[](https://sonarcloud.io/summary/new_code?id=rmontanana_mdlp)

# mdlp

Discretization algorithm based on the paper by Fayyad & Irani [Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning](https://www.ijcai.org/Proceedings/93-2/Papers/022.pdf)

The implementation tries to mitigate the problem of different label values with the same value of the variable:

- Sorts the values of the variable using the label values as a tie-breaker
- Once found a valid candidate for the split, it checks if the previous value is the same as actual one, and tries to get previous one, or next if the former is not possible.

Other features:

- Intervals with the same value of the variable are not taken into account for cutpoints.
- Intervals have to have more than two examples to be evaluated.

The algorithm returns the cut points for the variable.

## Sample

To run the sample, just execute the following commands:

```bash
cd sample
cmake -B build
cd build
make
./sample -f iris -m 2
./sample -h
```

## Test

To run the tests and see coverage (llvm & gcovr have to be installed), execute the following commands:

```bash
cd tests
./test
```

sample/lib/mdlp/typesFImdlp.h (new file): 24 lines

@@ -0,0 +1,24 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************

#ifndef TYPES_H
#define TYPES_H

#include <vector>
#include <map>
#include <stdexcept>

using namespace std;
namespace mdlp {
typedef float precision_t;
typedef vector<precision_t> samples_t;
typedef vector<int> labels_t;
typedef vector<size_t> indices_t;
typedef vector<precision_t> cutPoints_t;
typedef map<pair<int, int>, precision_t> cacheEnt_t;
typedef map<tuple<int, int, int>, precision_t> cacheIg_t;
}
#endif

@@ -7,6 +7,7 @@
 #include <ArffFiles.hpp>
 #include <CPPFImdlp.h>
 #include <bayesnet/ensembles/BoostAODE.h>
+#include <torch/torch.h>

 std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
 {
@@ -19,7 +20,8 @@ std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, m
 }
 return Xd;
 }
-tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last)
+tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, torch::Device device)
 {
 auto handler = ArffFiles();
 handler.load(name, class_last);
@@ -34,16 +36,16 @@ tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<s
 torch::Tensor Xd;
 auto states = map<std::string, std::vector<int>>();
 auto Xr = discretizeDataset(X, y);
-Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
+Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32).to(device);
 for (int i = 0; i < features.size(); ++i) {
 states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
 auto item = states.at(features[i]);
 iota(begin(item), end(item), 0);
-Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
+Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32).to(device));
 }
 states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
 iota(begin(states.at(className)), end(states.at(className)), 0);
-return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
+return { Xd, torch::tensor(y, torch::kInt32).to(device), features, className, states };
 }

 int main(int argc, char* argv[])
@@ -53,16 +55,22 @@ int main(int argc, char* argv[])
 return 1;
 }
 std::string file_name = argv[1];
+torch::Device device(torch::kCPU);
+if (torch::cuda::is_available()) {
+device = torch::Device(torch::kCUDA);
+std::cout << "CUDA is available! Using GPU." << std::endl;
+} else {
+std::cout << "CUDA is not available. Using CPU." << std::endl;
+}
 torch::Tensor X, y;
 std::vector<std::string> features;
 std::string className;
 map<std::string, std::vector<int>> states;
 auto clf = bayesnet::BoostAODE(false); // false for not using voting in predict
 std::cout << "Library version: " << clf.getVersion() << std::endl;
-tie(X, y, features, className, states) = loadDataset(file_name, true);
+tie(X, y, features, className, states) = loadDataset(file_name, true, device);
 clf.fit(X, y, features, className, states, bayesnet::Smoothing_t::LAPLACE);
 auto score = clf.score(X, y);
 std::cout << "File: " << file_name << " Model: BoostAODE score: " << score << std::endl;
 return 0;
 }

@@ -10,8 +10,8 @@ if(ENABLE_TESTING)
 file(GLOB_RECURSE BayesNet_SOURCES "${BayesNet_SOURCE_DIR}/bayesnet/*.cc")
 add_executable(TestBayesNet TestBayesNetwork.cc TestBayesNode.cc TestBayesClassifier.cc
 TestBayesModels.cc TestBayesMetrics.cc TestFeatureSelection.cc TestBoostAODE.cc TestA2DE.cc
-TestUtils.cc TestBayesEnsemble.cc TestModulesVersions.cc TestBoostA2DE.cc TestMST.cc ${BayesNet_SOURCES})
+TestUtils.cc TestBayesEnsemble.cc TestModulesVersions.cc TestBoostA2DE.cc ${BayesNet_SOURCES})
-target_link_libraries(TestBayesNet PUBLIC "${TORCH_LIBRARIES}" fimdlp PRIVATE Catch2::Catch2WithMain)
+target_link_libraries(TestBayesNet PUBLIC "${TORCH_LIBRARIES}" mdlp PRIVATE Catch2::Catch2WithMain)
 add_test(NAME BayesNetworkTest COMMAND TestBayesNet)
 add_test(NAME A2DE COMMAND TestBayesNet "[A2DE]")
 add_test(NAME BoostA2DE COMMAND TestBayesNet "[BoostA2DE]")
@@ -24,5 +24,4 @@ if(ENABLE_TESTING)
 add_test(NAME Modules COMMAND TestBayesNet "[Modules]")
 add_test(NAME Network COMMAND TestBayesNet "[Network]")
 add_test(NAME Node COMMAND TestBayesNet "[Node]")
-add_test(NAME MST COMMAND TestBayesNet "[MST]")
 endif(ENABLE_TESTING)

@@ -45,5 +45,5 @@ TEST_CASE("Test graph", "[A2DE]")
 auto graph = clf.graph();
 REQUIRE(graph.size() == 78);
 REQUIRE(graph[0] == "digraph BayesNet {\nlabel=<BayesNet A2DE_0>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n");
-REQUIRE(graph[1] == "\"class\" [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n");
+REQUIRE(graph[1] == "class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n");
 }

@@ -85,7 +85,7 @@ TEST_CASE("Dump_cpt", "[Classifier]")
 auto raw = RawDatasets("iris", true);
 model.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
 auto cpt = model.dump_cpt();
-REQUIRE(cpt.size() == 1718);
+REQUIRE(cpt.size() == 1713);
 }
 TEST_CASE("Not fitted model", "[Classifier]")
 {
@@ -27,13 +27,13 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
|
|||||||
map <pair<std::string, std::string>, float> scores{
|
map <pair<std::string, std::string>, float> scores{
|
||||||
// Diabetes
|
// Diabetes
|
||||||
{{"diabetes", "AODE"}, 0.82161}, {{"diabetes", "KDB"}, 0.852865}, {{"diabetes", "SPODE"}, 0.802083}, {{"diabetes", "TAN"}, 0.821615},
|
{{"diabetes", "AODE"}, 0.82161}, {{"diabetes", "KDB"}, 0.852865}, {{"diabetes", "SPODE"}, 0.802083}, {{"diabetes", "TAN"}, 0.821615},
|
||||||
{{"diabetes", "AODELd"}, 0.8125f}, {{"diabetes", "KDBLd"}, 0.80208f}, {{"diabetes", "SPODELd"}, 0.7890625f}, {{"diabetes", "TANLd"}, 0.803385437f}, {{"diabetes", "BoostAODE"}, 0.83984f},
|
{{"diabetes", "AODELd"}, 0.8138f}, {{"diabetes", "KDBLd"}, 0.80208f}, {{"diabetes", "SPODELd"}, 0.78646f}, {{"diabetes", "TANLd"}, 0.8099f}, {{"diabetes", "BoostAODE"}, 0.83984f},
|
||||||
// Ecoli
|
// Ecoli
|
||||||
{{"ecoli", "AODE"}, 0.889881}, {{"ecoli", "KDB"}, 0.889881}, {{"ecoli", "SPODE"}, 0.880952}, {{"ecoli", "TAN"}, 0.892857},
|
{{"ecoli", "AODE"}, 0.889881}, {{"ecoli", "KDB"}, 0.889881}, {{"ecoli", "SPODE"}, 0.880952}, {{"ecoli", "TAN"}, 0.892857},
|
||||||
{{"ecoli", "AODELd"}, 0.875f}, {{"ecoli", "KDBLd"}, 0.880952358f}, {{"ecoli", "SPODELd"}, 0.839285731f}, {{"ecoli", "TANLd"}, 0.848214269f}, {{"ecoli", "BoostAODE"}, 0.89583f},
|
{{"ecoli", "AODELd"}, 0.8869f}, {{"ecoli", "KDBLd"}, 0.875f}, {{"ecoli", "SPODELd"}, 0.84226f}, {{"ecoli", "TANLd"}, 0.86905f}, {{"ecoli", "BoostAODE"}, 0.89583f},
|
||||||
// Glass
|
// Glass
|
||||||
{{"glass", "AODE"}, 0.79439}, {{"glass", "KDB"}, 0.827103}, {{"glass", "SPODE"}, 0.775701}, {{"glass", "TAN"}, 0.827103},
|
{{"glass", "AODE"}, 0.79439}, {{"glass", "KDB"}, 0.827103}, {{"glass", "SPODE"}, 0.775701}, {{"glass", "TAN"}, 0.827103},
|
||||||
{{"glass", "AODELd"}, 0.799065411f}, {{"glass", "KDBLd"}, 0.82710278f}, {{"glass", "SPODELd"}, 0.780373812f}, {{"glass", "TANLd"}, 0.869158864f}, {{"glass", "BoostAODE"}, 0.84579f},
|
{{"glass", "AODELd"}, 0.79439f}, {{"glass", "KDBLd"}, 0.85047f}, {{"glass", "SPODELd"}, 0.79439f}, {{"glass", "TANLd"}, 0.86449f}, {{"glass", "BoostAODE"}, 0.84579f},
|
||||||
// Iris
|
// Iris
|
||||||
{{"iris", "AODE"}, 0.973333}, {{"iris", "KDB"}, 0.973333}, {{"iris", "SPODE"}, 0.973333}, {{"iris", "TAN"}, 0.973333},
|
{{"iris", "AODE"}, 0.973333}, {{"iris", "KDB"}, 0.973333}, {{"iris", "SPODE"}, 0.973333}, {{"iris", "TAN"}, 0.973333},
|
||||||
{{"iris", "AODELd"}, 0.973333}, {{"iris", "KDBLd"}, 0.973333}, {{"iris", "SPODELd"}, 0.96f}, {{"iris", "TANLd"}, 0.97333f}, {{"iris", "BoostAODE"}, 0.98f}
|
{{"iris", "AODELd"}, 0.973333}, {{"iris", "KDBLd"}, 0.973333}, {{"iris", "SPODELd"}, 0.96f}, {{"iris", "TANLd"}, 0.97333f}, {{"iris", "BoostAODE"}, 0.98f}
|
||||||
@@ -71,10 +71,10 @@ TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
|
|||||||
TEST_CASE("Models features & Graph", "[Models]")
|
TEST_CASE("Models features & Graph", "[Models]")
|
||||||
{
|
{
|
||||||
auto graph = std::vector<std::string>({ "digraph BayesNet {\nlabel=<BayesNet Test>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n",
|
auto graph = std::vector<std::string>({ "digraph BayesNet {\nlabel=<BayesNet Test>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n",
|
||||||
"\"class\" [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n",
|
"class [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n",
|
||||||
"\"class\" -> \"sepallength\"", "\"class\" -> \"sepalwidth\"", "\"class\" -> \"petallength\"", "\"class\" -> \"petalwidth\"", "\"petallength\" [shape=circle] \n",
|
"class -> sepallength", "class -> sepalwidth", "class -> petallength", "class -> petalwidth", "petallength [shape=circle] \n",
|
||||||
"\"petallength\" -> \"sepallength\"", "\"petalwidth\" [shape=circle] \n", "\"sepallength\" [shape=circle] \n",
|
"petallength -> sepallength", "petalwidth [shape=circle] \n", "sepallength [shape=circle] \n",
|
||||||
"\"sepallength\" -> \"sepalwidth\"", "\"sepalwidth\" [shape=circle] \n", "\"sepalwidth\" -> \"petalwidth\"", "}\n"
|
"sepallength -> sepalwidth", "sepalwidth [shape=circle] \n", "sepalwidth -> petalwidth", "}\n"
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
SECTION("Test TAN")
|
SECTION("Test TAN")
|
||||||
@@ -96,7 +96,7 @@ TEST_CASE("Models features & Graph", "[Models]")
|
|||||||
clf.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
|
clf.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 5);
|
REQUIRE(clf.getNumberOfNodes() == 5);
|
||||||
REQUIRE(clf.getNumberOfEdges() == 7);
|
REQUIRE(clf.getNumberOfEdges() == 7);
|
||||||
REQUIRE(clf.getNumberOfStates() == 27);
|
REQUIRE(clf.getNumberOfStates() == 19);
|
||||||
REQUIRE(clf.getClassNumStates() == 3);
|
REQUIRE(clf.getClassNumStates() == 3);
|
||||||
REQUIRE(clf.show() == std::vector<std::string>{"class -> sepallength, sepalwidth, petallength, petalwidth, ", "petallength -> sepallength, ", "petalwidth -> ", "sepallength -> sepalwidth, ", "sepalwidth -> petalwidth, "});
|
REQUIRE(clf.show() == std::vector<std::string>{"class -> sepallength, sepalwidth, petallength, petalwidth, ", "petallength -> sepallength, ", "petalwidth -> ", "sepallength -> sepalwidth, ", "sepalwidth -> petalwidth, "});
|
||||||
REQUIRE(clf.graph("Test") == graph);
|
REQUIRE(clf.graph("Test") == graph);
|
||||||
|
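Note: the expected strings in the graph tests above are Graphviz DOT fragments returned by `graph()`. A minimal sketch of how one might dump them to a `.dot` file for rendering (the helper `dumpGraph` and the file name are illustrative, not part of the library):

```cpp
#include <fstream>
#include <string>
#include <vector>

// Illustrative helper: concatenate the fragments returned by graph() into a
// .dot file that Graphviz can render afterwards (e.g. `dot -Tpng model.dot`).
void dumpGraph(const std::vector<std::string>& fragments, const std::string& path)
{
    std::ofstream out(path);
    for (const auto& fragment : fragments) {
        out << fragment;
    }
}
// Usage sketch: dumpGraph(clf.graph("Test"), "model.dot");
```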
@@ -186,11 +186,11 @@ TEST_CASE("Test Bayesian Network", "[Network]")
|
|||||||
auto str = net.graph("Test Graph");
|
auto str = net.graph("Test Graph");
|
||||||
REQUIRE(str.size() == 7);
|
REQUIRE(str.size() == 7);
|
||||||
REQUIRE(str[0] == "digraph BayesNet {\nlabel=<BayesNet Test Graph>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n");
|
REQUIRE(str[0] == "digraph BayesNet {\nlabel=<BayesNet Test Graph>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n");
|
||||||
REQUIRE(str[1] == "\"A\" [shape=circle] \n");
|
REQUIRE(str[1] == "A [shape=circle] \n");
|
||||||
REQUIRE(str[2] == "\"A\" -> \"B\"");
|
REQUIRE(str[2] == "A -> B");
|
||||||
REQUIRE(str[3] == "\"A\" -> \"C\"");
|
REQUIRE(str[3] == "A -> C");
|
||||||
REQUIRE(str[4] == "\"B\" [shape=circle] \n");
|
REQUIRE(str[4] == "B [shape=circle] \n");
|
||||||
REQUIRE(str[5] == "\"C\" [shape=circle] \n");
|
REQUIRE(str[5] == "C [shape=circle] \n");
|
||||||
REQUIRE(str[6] == "}\n");
|
REQUIRE(str[6] == "}\n");
|
||||||
}
|
}
|
||||||
SECTION("Test predict")
|
SECTION("Test predict")
|
||||||
@@ -257,9 +257,9 @@ TEST_CASE("Test Bayesian Network", "[Network]")
|
|||||||
REQUIRE(node->getCPT().equal(node2->getCPT()));
|
REQUIRE(node->getCPT().equal(node2->getCPT()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
SECTION("Network oddities")
|
SECTION("Test oddities")
|
||||||
{
|
{
|
||||||
INFO("Network oddities");
|
INFO("Test oddities");
|
||||||
buildModel(net, raw.features, raw.className);
|
buildModel(net, raw.features, raw.className);
|
||||||
// predict without fitting
|
// predict without fitting
|
||||||
std::vector<std::vector<int>> test = { {1, 2, 0, 1, 1}, {0, 1, 2, 0, 1}, {0, 0, 0, 0, 1}, {2, 2, 2, 2, 1} };
|
std::vector<std::vector<int>> test = { {1, 2, 0, 1, 1}, {0, 1, 2, 0, 1}, {0, 0, 0, 0, 1}, {2, 2, 2, 2, 1} };
|
||||||
@@ -329,14 +329,6 @@ TEST_CASE("Test Bayesian Network", "[Network]")
|
|||||||
std::string invalid_state = "Feature sepallength not found in states";
|
std::string invalid_state = "Feature sepallength not found in states";
|
||||||
REQUIRE_THROWS_AS(net4.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, std::map<std::string, std::vector<int>>(), raw.smoothing), std::invalid_argument);
|
REQUIRE_THROWS_AS(net4.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, std::map<std::string, std::vector<int>>(), raw.smoothing), std::invalid_argument);
|
||||||
REQUIRE_THROWS_WITH(net4.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, std::map<std::string, std::vector<int>>(), raw.smoothing), invalid_state);
|
REQUIRE_THROWS_WITH(net4.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, std::map<std::string, std::vector<int>>(), raw.smoothing), invalid_state);
|
||||||
// Try to add node or edge to a fitted network
|
|
||||||
auto net5 = bayesnet::Network();
|
|
||||||
buildModel(net5, raw.features, raw.className);
|
|
||||||
net5.fit(raw.Xv, raw.yv, raw.weightsv, raw.features, raw.className, raw.states, raw.smoothing);
|
|
||||||
REQUIRE_THROWS_AS(net5.addNode("A"), std::logic_error);
|
|
||||||
REQUIRE_THROWS_WITH(net5.addNode("A"), "Cannot add node to a fitted network. Initialize first.");
|
|
||||||
REQUIRE_THROWS_AS(net5.addEdge("A", "B"), std::logic_error);
|
|
||||||
REQUIRE_THROWS_WITH(net5.addEdge("A", "B"), "Cannot add edge to a fitted network. Initialize first.");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -381,7 +373,7 @@ TEST_CASE("Dump CPT", "[Network]")
|
|||||||
0.3333
|
0.3333
|
||||||
0.3333
|
0.3333
|
||||||
0.3333
|
0.3333
|
||||||
[ CPUDoubleType{3} ]
|
[ CPUFloatType{3} ]
|
||||||
* petallength: (4) : [4, 3, 3]
|
* petallength: (4) : [4, 3, 3]
|
||||||
(1,.,.) =
|
(1,.,.) =
|
||||||
0.9388 0.1000 0.2000
|
0.9388 0.1000 0.2000
|
||||||
@@ -402,7 +394,7 @@ TEST_CASE("Dump CPT", "[Network]")
|
|||||||
0.0204 0.1000 0.2000
|
0.0204 0.1000 0.2000
|
||||||
0.1250 0.0526 0.1667
|
0.1250 0.0526 0.1667
|
||||||
0.2000 0.0606 0.8235
|
0.2000 0.0606 0.8235
|
||||||
[ CPUDoubleType{4,3,3} ]
|
[ CPUFloatType{4,3,3} ]
|
||||||
* petalwidth: (3) : [3, 6, 3]
|
* petalwidth: (3) : [3, 6, 3]
|
||||||
(1,.,.) =
|
(1,.,.) =
|
||||||
0.5000 0.0417 0.0714
|
0.5000 0.0417 0.0714
|
||||||
@@ -427,12 +419,12 @@ TEST_CASE("Dump CPT", "[Network]")
|
|||||||
0.1111 0.0909 0.8000
|
0.1111 0.0909 0.8000
|
||||||
0.0667 0.2000 0.8667
|
0.0667 0.2000 0.8667
|
||||||
0.0303 0.2500 0.7500
|
0.0303 0.2500 0.7500
|
||||||
[ CPUDoubleType{3,6,3} ]
|
[ CPUFloatType{3,6,3} ]
|
||||||
* sepallength: (3) : [3, 3]
|
* sepallength: (3) : [3, 3]
|
||||||
0.8679 0.1321 0.0377
|
0.8679 0.1321 0.0377
|
||||||
0.0943 0.3019 0.0566
|
0.0943 0.3019 0.0566
|
||||||
0.0377 0.5660 0.9057
|
0.0377 0.5660 0.9057
|
||||||
[ CPUDoubleType{3,3} ]
|
[ CPUFloatType{3,3} ]
|
||||||
* sepalwidth: (6) : [6, 3, 3]
|
* sepalwidth: (6) : [6, 3, 3]
|
||||||
(1,.,.) =
|
(1,.,.) =
|
||||||
0.0392 0.5000 0.2857
|
0.0392 0.5000 0.2857
|
||||||
@@ -463,7 +455,7 @@ TEST_CASE("Dump CPT", "[Network]")
|
|||||||
0.5098 0.0833 0.1429
|
0.5098 0.0833 0.1429
|
||||||
0.5000 0.0476 0.1250
|
0.5000 0.0476 0.1250
|
||||||
0.2857 0.0571 0.1132
|
0.2857 0.0571 0.1132
|
||||||
[ CPUDoubleType{6,3,3} ]
|
[ CPUFloatType{6,3,3} ]
|
||||||
)";
|
)";
|
||||||
REQUIRE(res == expected);
|
REQUIRE(res == expected);
|
||||||
}
|
}
|
||||||
@@ -533,7 +525,6 @@ TEST_CASE("Test Smoothing A", "[Network]")
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE("Test Smoothing B", "[Network]")
|
TEST_CASE("Test Smoothing B", "[Network]")
|
||||||
{
|
{
|
||||||
auto net = bayesnet::Network();
|
auto net = bayesnet::Network();
|
||||||
@@ -558,41 +549,19 @@ TEST_CASE("Test Smoothing B", "[Network]")
|
|||||||
{ "C", {0, 1} }
|
{ "C", {0, 1} }
|
||||||
};
|
};
|
||||||
auto weights = std::vector<double>(C.size(), 1);
|
auto weights = std::vector<double>(C.size(), 1);
|
||||||
// See https://www.overleaf.com/read/tfnhpfysfkfx#2d576c for the example calculations
|
// Simple
|
||||||
INFO("Test Smoothing B - Laplace");
|
std::cout << "LAPLACE\n";
|
||||||
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::LAPLACE);
|
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::LAPLACE);
|
||||||
auto laplace_values = std::vector<std::vector<float>>({ {0.377418, 0.622582}, {0.217821, 0.782179} });
|
std::cout << net.dump_cpt();
|
||||||
auto laplace_score = net.predict_proba({ {0, 1}, {1, 2}, {2, 3} });
|
std::cout << "Predict proba of {0, 1, 2} y {1, 2, 3} = " << net.predict_proba({ {0, 1}, {1, 2}, {2, 3} }) << std::endl;
|
||||||
for (auto i = 0; i < 2; ++i) {
|
std::cout << "ORIGINAL\n";
|
||||||
for (auto j = 0; j < 2; ++j) {
|
|
||||||
REQUIRE(laplace_score.at(i).at(j) == Catch::Approx(laplace_values.at(i).at(j)).margin(threshold));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
INFO("Test Smoothing B - Original");
|
|
||||||
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::ORIGINAL);
|
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::ORIGINAL);
|
||||||
auto original_values = std::vector<std::vector<float>>({ {0.344769, 0.655231}, {0.0421263, 0.957874} });
|
std::cout << net.dump_cpt();
|
||||||
auto original_score = net.predict_proba({ {0, 1}, {1, 2}, {2, 3} });
|
std::cout << "Predict proba of {0, 1, 2} y {1, 2, 3} = " << net.predict_proba({ {0, 1}, {1, 2}, {2, 3} }) << std::endl;
|
||||||
for (auto i = 0; i < 2; ++i) {
|
std::cout << "CESTNIK\n";
|
||||||
for (auto j = 0; j < 2; ++j) {
|
|
||||||
REQUIRE(original_score.at(i).at(j) == Catch::Approx(original_values.at(i).at(j)).margin(threshold));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
INFO("Test Smoothing B - Cestnik");
|
|
||||||
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::CESTNIK);
|
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::CESTNIK);
|
||||||
auto cestnik_values = std::vector<std::vector<float>>({ {0.353422, 0.646578}, {0.12364, 0.87636} });
|
std::cout << net.dump_cpt();
|
||||||
auto cestnik_score = net.predict_proba({ {0, 1}, {1, 2}, {2, 3} });
|
std::cout << "Predict proba of {0, 1, 2} y {1, 2, 3} = " << net.predict_proba({ {0, 1}, {1, 2}, {2, 3} }) << std::endl;
|
||||||
for (auto i = 0; i < 2; ++i) {
|
|
||||||
for (auto j = 0; j < 2; ++j) {
|
|
||||||
REQUIRE(cestnik_score.at(i).at(j) == Catch::Approx(cestnik_values.at(i).at(j)).margin(threshold));
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
INFO("Test Smoothing B - No smoothing");
|
|
||||||
net.fit(Data, C, weights, { "X", "Y", "Z" }, "C", states, bayesnet::Smoothing_t::NONE);
|
|
||||||
auto nosmooth_values = std::vector<std::vector<float>>({ {0.342465753, 0.65753424}, {0.0, 1.0} });
|
|
||||||
auto nosmooth_score = net.predict_proba({ {0, 1}, {1, 2}, {2, 3} });
|
|
||||||
for (auto i = 0; i < 2; ++i) {
|
|
||||||
for (auto j = 0; j < 2; ++j) {
|
|
||||||
REQUIRE(nosmooth_score.at(i).at(j) == Catch::Approx(nosmooth_values.at(i).at(j)).margin(threshold));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
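Note: the Smoothing B expectations come from the worked example linked in the comment above. For quick reference, here is a minimal sketch of add-one (Laplace) smoothing, which is presumably what `Smoothing_t::LAPLACE` applies to each conditional count vector; the helper below is illustrative only, not the library's implementation:

```cpp
#include <vector>

// Illustrative add-one (Laplace) smoothing of raw counts for the states of
// one variable: every state gets an extra pseudo-count before normalizing.
std::vector<double> laplaceSmooth(const std::vector<double>& counts)
{
    double total = 0.0;
    for (double c : counts) total += c + 1.0;
    std::vector<double> probs;
    probs.reserve(counts.size());
    for (double c : counts) probs.push_back((c + 1.0) / total);
    return probs;
}
// Example: counts {2, 1} -> probabilities {0.6, 0.4}.
```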
@@ -62,17 +62,15 @@ TEST_CASE("Test Node computeCPT", "[Node]")
|
|||||||
// Create a vector with the names of the classes
|
// Create a vector with the names of the classes
|
||||||
auto className = std::string("Class");
|
auto className = std::string("Class");
|
||||||
// weights
|
// weights
|
||||||
auto weights = torch::tensor({ 1.0, 1.0, 1.0, 1.0 }, torch::kDouble);
|
auto weights = torch::tensor({ 1.0, 1.0, 1.0, 1.0 });
|
||||||
std::vector<bayesnet::Node> nodes;
|
std::vector<bayesnet::Node> nodes;
|
||||||
for (int i = 0; i < features.size(); i++) {
|
for (int i = 0; i < features.size(); i++) {
|
||||||
auto node = bayesnet::Node(features[i]);
|
auto node = bayesnet::Node(features[i]);
|
||||||
node.setNumStates(states[i]);
|
node.setNumStates(states[i]);
|
||||||
nodes.push_back(node);
|
nodes.push_back(node);
|
||||||
}
|
}
|
||||||
// Create node class with 2 states
|
|
||||||
nodes.push_back(bayesnet::Node(className));
|
nodes.push_back(bayesnet::Node(className));
|
||||||
nodes[features.size()].setNumStates(2);
|
nodes[features.size()].setNumStates(2);
|
||||||
// The network is c->f1, f2, f3 and f1->f2, f3
|
|
||||||
for (int i = 0; i < features.size(); i++) {
|
for (int i = 0; i < features.size(); i++) {
|
||||||
// Add class node as parent of all feature nodes
|
// Add class node as parent of all feature nodes
|
||||||
nodes[i].addParent(&nodes[features.size()]);
|
nodes[i].addParent(&nodes[features.size()]);
|
||||||
|
@@ -27,192 +27,189 @@ TEST_CASE("Build basic model", "[BoostA2DE]")
|
|||||||
auto score = clf.score(raw.Xv, raw.yv);
|
auto score = clf.score(raw.Xv, raw.yv);
|
||||||
REQUIRE(score == Catch::Approx(0.919271).epsilon(raw.epsilon));
|
REQUIRE(score == Catch::Approx(0.919271).epsilon(raw.epsilon));
|
||||||
}
|
}
|
||||||
TEST_CASE("Feature_select IWSS", "[BoostA2DE]")
|
// TEST_CASE("Feature_select IWSS", "[BoostAODE]")
|
||||||
{
|
// {
|
||||||
auto raw = RawDatasets("glass", true);
|
// auto raw = RawDatasets("glass", true);
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// auto clf = bayesnet::BoostAODE();
|
||||||
clf.setHyperparameters({ {"select_features", "IWSS"}, {"threshold", 0.5 } });
|
// clf.setHyperparameters({ {"select_features", "IWSS"}, {"threshold", 0.5 } });
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 140);
|
// REQUIRE(clf.getNumberOfNodes() == 90);
|
||||||
REQUIRE(clf.getNumberOfEdges() == 294);
|
// REQUIRE(clf.getNumberOfEdges() == 153);
|
||||||
REQUIRE(clf.getNotes().size() == 4);
|
// REQUIRE(clf.getNotes().size() == 2);
|
||||||
REQUIRE(clf.getNotes()[0] == "Used features in initialization: 4 of 9 with IWSS");
|
// REQUIRE(clf.getNotes()[0] == "Used features in initialization: 4 of 9 with IWSS");
|
||||||
REQUIRE(clf.getNotes()[1] == "Convergence threshold reached & 15 models eliminated");
|
// REQUIRE(clf.getNotes()[1] == "Number of models: 9");
|
||||||
REQUIRE(clf.getNotes()[2] == "Pairs not used in train: 2");
|
// }
|
||||||
REQUIRE(clf.getNotes()[3] == "Number of models: 14");
|
// TEST_CASE("Feature_select FCBF", "[BoostAODE]")
|
||||||
}
|
// {
|
||||||
TEST_CASE("Feature_select FCBF", "[BoostA2DE]")
|
// auto raw = RawDatasets("glass", true);
|
||||||
{
|
// auto clf = bayesnet::BoostAODE();
|
||||||
auto raw = RawDatasets("glass", true);
|
// clf.setHyperparameters({ {"select_features", "FCBF"}, {"threshold", 1e-7 } });
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
clf.setHyperparameters({ {"select_features", "FCBF"}, {"threshold", 1e-7 } });
|
// REQUIRE(clf.getNumberOfNodes() == 90);
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// REQUIRE(clf.getNumberOfEdges() == 153);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 110);
|
// REQUIRE(clf.getNotes().size() == 2);
|
||||||
REQUIRE(clf.getNumberOfEdges() == 231);
|
// REQUIRE(clf.getNotes()[0] == "Used features in initialization: 4 of 9 with FCBF");
|
||||||
REQUIRE(clf.getNotes()[0] == "Used features in initialization: 4 of 9 with FCBF");
|
// REQUIRE(clf.getNotes()[1] == "Number of models: 9");
|
||||||
REQUIRE(clf.getNotes()[1] == "Convergence threshold reached & 15 models eliminated");
|
// }
|
||||||
REQUIRE(clf.getNotes()[2] == "Pairs not used in train: 2");
|
// TEST_CASE("Test used features in train note and score", "[BoostAODE]")
|
||||||
REQUIRE(clf.getNotes()[3] == "Number of models: 11");
|
// {
|
||||||
}
|
// auto raw = RawDatasets("diabetes", true);
|
||||||
TEST_CASE("Test used features in train note and score", "[BoostA2DE]")
|
// auto clf = bayesnet::BoostAODE(true);
|
||||||
{
|
// clf.setHyperparameters({
|
||||||
auto raw = RawDatasets("diabetes", true);
|
// {"order", "asc"},
|
||||||
auto clf = bayesnet::BoostA2DE(true);
|
// {"convergence", true},
|
||||||
clf.setHyperparameters({
|
// {"select_features","CFS"},
|
||||||
{"order", "asc"},
|
// });
|
||||||
{"convergence", true},
|
// clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
{"select_features","CFS"},
|
// REQUIRE(clf.getNumberOfNodes() == 72);
|
||||||
});
|
// REQUIRE(clf.getNumberOfEdges() == 120);
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// REQUIRE(clf.getNotes().size() == 2);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 144);
|
// REQUIRE(clf.getNotes()[0] == "Used features in initialization: 6 of 8 with CFS");
|
||||||
REQUIRE(clf.getNumberOfEdges() == 288);
|
// REQUIRE(clf.getNotes()[1] == "Number of models: 8");
|
||||||
REQUIRE(clf.getNotes().size() == 2);
|
// auto score = clf.score(raw.Xv, raw.yv);
|
||||||
REQUIRE(clf.getNotes()[0] == "Used features in initialization: 6 of 8 with CFS");
|
// auto scoret = clf.score(raw.Xt, raw.yt);
|
||||||
REQUIRE(clf.getNotes()[1] == "Number of models: 16");
|
// REQUIRE(score == Catch::Approx(0.809895813).epsilon(raw.epsilon));
|
||||||
auto score = clf.score(raw.Xv, raw.yv);
|
// REQUIRE(scoret == Catch::Approx(0.809895813).epsilon(raw.epsilon));
|
||||||
auto scoret = clf.score(raw.Xt, raw.yt);
|
// }
|
||||||
REQUIRE(score == Catch::Approx(0.856771).epsilon(raw.epsilon));
|
// TEST_CASE("Voting vs proba", "[BoostAODE]")
|
||||||
REQUIRE(scoret == Catch::Approx(0.856771).epsilon(raw.epsilon));
|
// {
|
||||||
}
|
// auto raw = RawDatasets("iris", true);
|
||||||
TEST_CASE("Voting vs proba", "[BoostA2DE]")
|
// auto clf = bayesnet::BoostAODE(false);
|
||||||
{
|
// clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
auto raw = RawDatasets("iris", true);
|
// auto score_proba = clf.score(raw.Xv, raw.yv);
|
||||||
auto clf = bayesnet::BoostA2DE(false);
|
// auto pred_proba = clf.predict_proba(raw.Xv);
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// clf.setHyperparameters({
|
||||||
auto score_proba = clf.score(raw.Xv, raw.yv);
|
// {"predict_voting",true},
|
||||||
auto pred_proba = clf.predict_proba(raw.Xv);
|
// });
|
||||||
clf.setHyperparameters({
|
// auto score_voting = clf.score(raw.Xv, raw.yv);
|
||||||
{"predict_voting",true},
|
// auto pred_voting = clf.predict_proba(raw.Xv);
|
||||||
});
|
// REQUIRE(score_proba == Catch::Approx(0.97333).epsilon(raw.epsilon));
|
||||||
auto score_voting = clf.score(raw.Xv, raw.yv);
|
// REQUIRE(score_voting == Catch::Approx(0.98).epsilon(raw.epsilon));
|
||||||
auto pred_voting = clf.predict_proba(raw.Xv);
|
// REQUIRE(pred_voting[83][2] == Catch::Approx(1.0).epsilon(raw.epsilon));
|
||||||
REQUIRE(score_proba == Catch::Approx(0.98).epsilon(raw.epsilon));
|
// REQUIRE(pred_proba[83][2] == Catch::Approx(0.86121525).epsilon(raw.epsilon));
|
||||||
REQUIRE(score_voting == Catch::Approx(0.946667).epsilon(raw.epsilon));
|
// REQUIRE(clf.dump_cpt() == "");
|
||||||
REQUIRE(pred_voting[83][2] == Catch::Approx(0.53508).epsilon(raw.epsilon));
|
// REQUIRE(clf.topological_order() == std::vector<std::string>());
|
||||||
REQUIRE(pred_proba[83][2] == Catch::Approx(0.48394).epsilon(raw.epsilon));
|
// }
|
||||||
REQUIRE(clf.dump_cpt() == "");
|
// TEST_CASE("Order asc, desc & random", "[BoostAODE]")
|
||||||
REQUIRE(clf.topological_order() == std::vector<std::string>());
|
// {
|
||||||
}
|
// auto raw = RawDatasets("glass", true);
|
||||||
TEST_CASE("Order asc, desc & random", "[BoostA2DE]")
|
// std::map<std::string, double> scores{
|
||||||
{
|
// {"asc", 0.83645f }, { "desc", 0.84579f }, { "rand", 0.84112 }
|
||||||
auto raw = RawDatasets("glass", true);
|
// };
|
||||||
std::map<std::string, double> scores{
|
// for (const std::string& order : { "asc", "desc", "rand" }) {
|
||||||
{"asc", 0.752336f }, { "desc", 0.813084f }, { "rand", 0.850467 }
|
// auto clf = bayesnet::BoostAODE();
|
||||||
};
|
// clf.setHyperparameters({
|
||||||
for (const std::string& order : { "asc", "desc", "rand" }) {
|
// {"order", order},
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// {"bisection", false},
|
||||||
clf.setHyperparameters({
|
// {"maxTolerance", 1},
|
||||||
{"order", order},
|
// {"convergence", false},
|
||||||
{"bisection", false},
|
// });
|
||||||
{"maxTolerance", 1},
|
// clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
{"convergence", false},
|
// auto score = clf.score(raw.Xv, raw.yv);
|
||||||
});
|
// auto scoret = clf.score(raw.Xt, raw.yt);
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// INFO("BoostAODE order: " + order);
|
||||||
auto score = clf.score(raw.Xv, raw.yv);
|
// REQUIRE(score == Catch::Approx(scores[order]).epsilon(raw.epsilon));
|
||||||
auto scoret = clf.score(raw.Xt, raw.yt);
|
// REQUIRE(scoret == Catch::Approx(scores[order]).epsilon(raw.epsilon));
|
||||||
INFO("BoostA2DE order: " + order);
|
// }
|
||||||
REQUIRE(score == Catch::Approx(scores[order]).epsilon(raw.epsilon));
|
// }
|
||||||
REQUIRE(scoret == Catch::Approx(scores[order]).epsilon(raw.epsilon));
|
// TEST_CASE("Oddities", "[BoostAODE]")
|
||||||
}
|
// {
|
||||||
}
|
// auto clf = bayesnet::BoostAODE();
|
||||||
TEST_CASE("Oddities2", "[BoostA2DE]")
|
// auto raw = RawDatasets("iris", true);
|
||||||
{
|
// auto bad_hyper = nlohmann::json{
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// { { "order", "duck" } },
|
||||||
auto raw = RawDatasets("iris", true);
|
// { { "select_features", "duck" } },
|
||||||
auto bad_hyper = nlohmann::json{
|
// { { "maxTolerance", 0 } },
|
||||||
{ { "order", "duck" } },
|
// { { "maxTolerance", 5 } },
|
||||||
{ { "select_features", "duck" } },
|
// };
|
||||||
{ { "maxTolerance", 0 } },
|
// for (const auto& hyper : bad_hyper.items()) {
|
||||||
{ { "maxTolerance", 5 } },
|
// INFO("BoostAODE hyper: " + hyper.value().dump());
|
||||||
};
|
// REQUIRE_THROWS_AS(clf.setHyperparameters(hyper.value()), std::invalid_argument);
|
||||||
for (const auto& hyper : bad_hyper.items()) {
|
// }
|
||||||
INFO("BoostA2DE hyper: " + hyper.value().dump());
|
// REQUIRE_THROWS_AS(clf.setHyperparameters({ {"maxTolerance", 0 } }), std::invalid_argument);
|
||||||
REQUIRE_THROWS_AS(clf.setHyperparameters(hyper.value()), std::invalid_argument);
|
// auto bad_hyper_fit = nlohmann::json{
|
||||||
}
|
// { { "select_features","IWSS" }, { "threshold", -0.01 } },
|
||||||
REQUIRE_THROWS_AS(clf.setHyperparameters({ {"maxTolerance", 0 } }), std::invalid_argument);
|
// { { "select_features","IWSS" }, { "threshold", 0.51 } },
|
||||||
auto bad_hyper_fit = nlohmann::json{
|
// { { "select_features","FCBF" }, { "threshold", 1e-8 } },
|
||||||
{ { "select_features","IWSS" }, { "threshold", -0.01 } },
|
// { { "select_features","FCBF" }, { "threshold", 1.01 } },
|
||||||
{ { "select_features","IWSS" }, { "threshold", 0.51 } },
|
// };
|
||||||
{ { "select_features","FCBF" }, { "threshold", 1e-8 } },
|
// for (const auto& hyper : bad_hyper_fit.items()) {
|
||||||
{ { "select_features","FCBF" }, { "threshold", 1.01 } },
|
// INFO("BoostAODE hyper: " + hyper.value().dump());
|
||||||
};
|
// clf.setHyperparameters(hyper.value());
|
||||||
for (const auto& hyper : bad_hyper_fit.items()) {
|
// REQUIRE_THROWS_AS(clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing), std::invalid_argument);
|
||||||
INFO("BoostA2DE hyper: " + hyper.value().dump());
|
// }
|
||||||
clf.setHyperparameters(hyper.value());
|
// }
|
||||||
REQUIRE_THROWS_AS(clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing), std::invalid_argument);
|
|
||||||
}
|
// TEST_CASE("Bisection Best", "[BoostAODE]")
|
||||||
}
|
// {
|
||||||
TEST_CASE("No features selected", "[BoostA2DE]")
|
// auto clf = bayesnet::BoostAODE();
|
||||||
{
|
// auto raw = RawDatasets("kdd_JapaneseVowels", true, 1200, true, false);
|
||||||
// Check that the note "No features selected in initialization" is added
|
// clf.setHyperparameters({
|
||||||
//
|
// {"bisection", true},
|
||||||
auto raw = RawDatasets("iris", true);
|
// {"maxTolerance", 3},
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// {"convergence", true},
|
||||||
clf.setHyperparameters({ {"select_features","FCBF"}, {"threshold", 1 } });
|
// {"block_update", false},
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// {"convergence_best", false},
|
||||||
REQUIRE(clf.getNotes().size() == 1);
|
// });
|
||||||
REQUIRE(clf.getNotes()[0] == "No features selected in initialization");
|
// clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
}
|
// REQUIRE(clf.getNumberOfNodes() == 210);
|
||||||
TEST_CASE("Bisection Best", "[BoostA2DE]")
|
// REQUIRE(clf.getNumberOfEdges() == 378);
|
||||||
{
|
// REQUIRE(clf.getNotes().size() == 1);
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// REQUIRE(clf.getNotes().at(0) == "Number of models: 14");
|
||||||
auto raw = RawDatasets("kdd_JapaneseVowels", true, 1200, true, false);
|
// auto score = clf.score(raw.X_test, raw.y_test);
|
||||||
clf.setHyperparameters({
|
// auto scoret = clf.score(raw.X_test, raw.y_test);
|
||||||
{"bisection", true},
|
// REQUIRE(score == Catch::Approx(0.991666675f).epsilon(raw.epsilon));
|
||||||
{"maxTolerance", 3},
|
// REQUIRE(scoret == Catch::Approx(0.991666675f).epsilon(raw.epsilon));
|
||||||
{"convergence", true},
|
// }
|
||||||
{"block_update", false},
|
// TEST_CASE("Bisection Best vs Last", "[BoostAODE]")
|
||||||
{"convergence_best", false},
|
// {
|
||||||
});
|
// auto raw = RawDatasets("kdd_JapaneseVowels", true, 1500, true, false);
|
||||||
clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
// auto clf = bayesnet::BoostAODE(true);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 480);
|
// auto hyperparameters = nlohmann::json{
|
||||||
REQUIRE(clf.getNumberOfEdges() == 1152);
|
// {"bisection", true},
|
||||||
REQUIRE(clf.getNotes().size() == 3);
|
// {"maxTolerance", 3},
|
||||||
REQUIRE(clf.getNotes().at(0) == "Convergence threshold reached & 15 models eliminated");
|
// {"convergence", true},
|
||||||
REQUIRE(clf.getNotes().at(1) == "Pairs not used in train: 83");
|
// {"convergence_best", true},
|
||||||
REQUIRE(clf.getNotes().at(2) == "Number of models: 32");
|
// };
|
||||||
auto score = clf.score(raw.X_test, raw.y_test);
|
// clf.setHyperparameters(hyperparameters);
|
||||||
auto scoret = clf.score(raw.X_test, raw.y_test);
|
// clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
REQUIRE(score == Catch::Approx(0.966667f).epsilon(raw.epsilon));
|
// auto score_best = clf.score(raw.X_test, raw.y_test);
|
||||||
REQUIRE(scoret == Catch::Approx(0.966667f).epsilon(raw.epsilon));
|
// REQUIRE(score_best == Catch::Approx(0.980000019f).epsilon(raw.epsilon));
|
||||||
}
|
// // Now we will set the hyperparameter to use the last accuracy
|
||||||
TEST_CASE("Block Update", "[BoostA2DE]")
|
// hyperparameters["convergence_best"] = false;
|
||||||
{
|
// clf.setHyperparameters(hyperparameters);
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
auto raw = RawDatasets("spambase", true, 500);
|
// auto score_last = clf.score(raw.X_test, raw.y_test);
|
||||||
clf.setHyperparameters({
|
// REQUIRE(score_last == Catch::Approx(0.976666689f).epsilon(raw.epsilon));
|
||||||
{"bisection", true},
|
// }
|
||||||
{"block_update", true},
|
|
||||||
{"maxTolerance", 3},
|
// TEST_CASE("Block Update", "[BoostAODE]")
|
||||||
{"convergence", true},
|
// {
|
||||||
});
|
// auto clf = bayesnet::BoostAODE();
|
||||||
clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
// auto raw = RawDatasets("mfeat-factors", true, 500);
|
||||||
REQUIRE(clf.getNumberOfNodes() == 58);
|
// clf.setHyperparameters({
|
||||||
REQUIRE(clf.getNumberOfEdges() == 165);
|
// {"bisection", true},
|
||||||
REQUIRE(clf.getNotes().size() == 3);
|
// {"block_update", true},
|
||||||
REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
|
// {"maxTolerance", 3},
|
||||||
REQUIRE(clf.getNotes()[1] == "Pairs not used in train: 1588");
|
// {"convergence", true},
|
||||||
REQUIRE(clf.getNotes()[2] == "Number of models: 1");
|
// });
|
||||||
auto score = clf.score(raw.X_test, raw.y_test);
|
// clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states, raw.smoothing);
|
||||||
auto scoret = clf.score(raw.X_test, raw.y_test);
|
// REQUIRE(clf.getNumberOfNodes() == 868);
|
||||||
REQUIRE(score == Catch::Approx(1.0f).epsilon(raw.epsilon));
|
// REQUIRE(clf.getNumberOfEdges() == 1724);
|
||||||
REQUIRE(scoret == Catch::Approx(1.0f).epsilon(raw.epsilon));
|
// REQUIRE(clf.getNotes().size() == 3);
|
||||||
//
|
// REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
|
||||||
// std::cout << "Number of nodes " << clf.getNumberOfNodes() << std::endl;
|
// REQUIRE(clf.getNotes()[1] == "Used features in train: 19 of 216");
|
||||||
// std::cout << "Number of edges " << clf.getNumberOfEdges() << std::endl;
|
// REQUIRE(clf.getNotes()[2] == "Number of models: 4");
|
||||||
// std::cout << "Notes size " << clf.getNotes().size() << std::endl;
|
// auto score = clf.score(raw.X_test, raw.y_test);
|
||||||
// for (auto note : clf.getNotes()) {
|
// auto scoret = clf.score(raw.X_test, raw.y_test);
|
||||||
// std::cout << note << std::endl;
|
// REQUIRE(score == Catch::Approx(0.99f).epsilon(raw.epsilon));
|
||||||
// }
|
// REQUIRE(scoret == Catch::Approx(0.99f).epsilon(raw.epsilon));
|
||||||
// std::cout << "Score " << score << std::endl;
|
// //
|
||||||
}
|
// // std::cout << "Number of nodes " << clf.getNumberOfNodes() << std::endl;
|
||||||
TEST_CASE("Test graph b2a2de", "[BoostA2DE]")
|
// // std::cout << "Number of edges " << clf.getNumberOfEdges() << std::endl;
|
||||||
{
|
// // std::cout << "Notes size " << clf.getNotes().size() << std::endl;
|
||||||
auto raw = RawDatasets("iris", true);
|
// // for (auto note : clf.getNotes()) {
|
||||||
auto clf = bayesnet::BoostA2DE();
|
// // std::cout << note << std::endl;
|
||||||
clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states, raw.smoothing);
|
// // }
|
||||||
auto graph = clf.graph();
|
// // std::cout << "Score " << score << std::endl;
|
||||||
REQUIRE(graph.size() == 26);
|
// }
|
||||||
REQUIRE(graph[0] == "digraph BayesNet {\nlabel=<BayesNet BoostA2DE_0>\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n");
|
|
||||||
REQUIRE(graph[1] == "\"class\" [shape=circle, fontcolor=red, fillcolor=lightblue, style=filled ] \n");
|
|
||||||
}
|
|
@@ -1,72 +0,0 @@
|
|||||||
// ***************************************************************
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
||||||
// SPDX-FileType: SOURCE
|
|
||||||
// SPDX-License-Identifier: MIT
|
|
||||||
// ***************************************************************
|
|
||||||
|
|
||||||
#include <catch2/catch_test_macros.hpp>
|
|
||||||
#include <catch2/catch_approx.hpp>
|
|
||||||
#include <catch2/generators/catch_generators.hpp>
|
|
||||||
#include <catch2/matchers/catch_matchers.hpp>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
#include "TestUtils.h"
|
|
||||||
#include "bayesnet/utils/Mst.h"
|
|
||||||
|
|
||||||
|
|
||||||
TEST_CASE("MST::insertElement tests", "[MST]")
|
|
||||||
{
|
|
||||||
bayesnet::MST mst({}, torch::tensor({}), 0);
|
|
||||||
SECTION("Insert into an empty list")
|
|
||||||
{
|
|
||||||
std::list<int> variables;
|
|
||||||
mst.insertElement(variables, 5);
|
|
||||||
REQUIRE(variables == std::list<int>{5});
|
|
||||||
}
|
|
||||||
SECTION("Insert a non-duplicate element")
|
|
||||||
{
|
|
||||||
std::list<int> variables = { 1, 2, 3 };
|
|
||||||
mst.insertElement(variables, 4);
|
|
||||||
REQUIRE(variables == std::list<int>{4, 1, 2, 3});
|
|
||||||
}
|
|
||||||
SECTION("Insert a duplicate element")
|
|
||||||
{
|
|
||||||
std::list<int> variables = { 1, 2, 3 };
|
|
||||||
mst.insertElement(variables, 2);
|
|
||||||
REQUIRE(variables == std::list<int>{1, 2, 3});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_CASE("MST::reorder tests", "[MST]")
|
|
||||||
{
|
|
||||||
bayesnet::MST mst({}, torch::tensor({}), 0);
|
|
||||||
SECTION("Reorder simple graph")
|
|
||||||
{
|
|
||||||
std::vector<std::pair<float, std::pair<int, int>>> T = { {2.0, {1, 2}}, {1.0, {0, 1}} };
|
|
||||||
auto result = mst.reorder(T, 0);
|
|
||||||
REQUIRE(result == std::vector<std::pair<int, int>>{{0, 1}, { 1, 2 }});
|
|
||||||
}
|
|
||||||
SECTION("Reorder with disconnected graph")
|
|
||||||
{
|
|
||||||
std::vector<std::pair<float, std::pair<int, int>>> T = { {2.0, {2, 3}}, {1.0, {0, 1}} };
|
|
||||||
auto result = mst.reorder(T, 0);
|
|
||||||
REQUIRE(result == std::vector<std::pair<int, int>>{{0, 1}, { 2, 3 }});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST_CASE("MST::maximumSpanningTree tests", "[MST]")
|
|
||||||
{
|
|
||||||
std::vector<std::string> features = { "A", "B", "C" };
|
|
||||||
auto weights = torch::tensor({
|
|
||||||
{0.0, 1.0, 2.0},
|
|
||||||
{1.0, 0.0, 3.0},
|
|
||||||
{2.0, 3.0, 0.0}
|
|
||||||
});
|
|
||||||
bayesnet::MST mst(features, weights, 0);
|
|
||||||
|
|
||||||
SECTION("MST of a complete graph")
|
|
||||||
{
|
|
||||||
auto result = mst.maximumSpanningTree();
|
|
||||||
REQUIRE(result.size() == 2); // An MST over 3 nodes has 2 edges
|
|
||||||
}
|
|
||||||
}
|
|
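Note: the `maximumSpanningTree` assertion above relies on the fact that a spanning tree of n nodes always has n - 1 edges. A Kruskal-style sketch of a maximum spanning tree over a dense symmetric weight matrix (illustrative only, not the `bayesnet::MST` implementation):

```cpp
#include <algorithm>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Illustrative maximum spanning tree: take edges heaviest-first and accept
// each one that does not close a cycle (union-find tracks the components).
std::vector<std::pair<int, int>> maxSpanningTree(const std::vector<std::vector<double>>& w)
{
    int n = static_cast<int>(w.size());
    struct Edge { double weight; int u; int v; };
    std::vector<Edge> edges;
    for (int i = 0; i < n; ++i)
        for (int j = i + 1; j < n; ++j)
            edges.push_back({ w[i][j], i, j });
    std::sort(edges.begin(), edges.end(),
        [](const Edge& a, const Edge& b) { return a.weight > b.weight; });
    std::vector<int> parent(n);
    std::iota(parent.begin(), parent.end(), 0);
    std::function<int(int)> find = [&](int x) {
        return parent[x] == x ? x : parent[x] = find(parent[x]);
    };
    std::vector<std::pair<int, int>> tree;
    for (const auto& e : edges) {
        int ru = find(e.u), rv = find(e.v);
        if (ru != rv) {
            parent[ru] = rv;
            tree.emplace_back(e.u, e.v);
        }
    }
    return tree;  // n - 1 edges when the graph is connected
}
```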
@@ -16,7 +16,7 @@
|
|||||||
#include "TestUtils.h"
|
#include "TestUtils.h"
|
||||||
|
|
||||||
std::map<std::string, std::string> modules = {
|
std::map<std::string, std::string> modules = {
|
||||||
{ "mdlp", "2.0.1" },
|
{ "mdlp", "2.0.0" },
|
||||||
{ "Folding", "1.1.0" },
|
{ "Folding", "1.1.0" },
|
||||||
{ "json", "3.11" },
|
{ "json", "3.11" },
|
||||||
{ "ArffFiles", "1.1.0" }
|
{ "ArffFiles", "1.1.0" }
|
||||||
|
File diff suppressed because it is too large
Submodule tests/lib/Files updated: a4329f5f9d...a5316928d4
Submodule tests/lib/catch2 updated: 506276c592...4e8d92bf02