diff --git a/.clang-tidy b/.clang-tidy index 81f7209..ef88702 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -5,6 +5,7 @@ Checks: '-*, cppcoreguidelines-*, modernize-*, performance-*, + -modernize-use-nodiscard, -cppcoreguidelines-pro-type-vararg, -modernize-use-trailing-return-type, -bugprone-exception-escape' diff --git a/.vscode/launch.json b/.vscode/launch.json index a384091..65760c3 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,102 +5,10 @@ "type": "lldb", "request": "launch", "name": "sample", - "program": "${workspaceFolder}/build_debug/sample/BayesNetSample", + "program": "${workspaceFolder}/build_release/sample/bayesnet_sample", "args": [ - "-d", - "iris", - "-m", - "TANLd", - "-s", - "271", - "-p", - "/Users/rmontanana/Code/discretizbench/datasets/", + "${workspaceFolder}/tests/data/glass.arff" ], - //"cwd": "${workspaceFolder}/build/sample/", - }, - { - "type": "lldb", - "request": "launch", - "name": "experimentPy", - "program": "${workspaceFolder}/build_debug/src/Platform/b_main", - "args": [ - "-m", - "STree", - "--stratified", - "-d", - "iris", - //"--discretize" - // "--hyperparameters", - // "{\"repeatSparent\": true, \"maxModels\": 12}" - ], - "cwd": "${workspaceFolder}/../discretizbench", - }, - { - "type": "lldb", - "request": "launch", - "name": "gridsearch", - "program": "${workspaceFolder}/build_debug/src/Platform/b_grid", - "args": [ - "-m", - "KDB", - "--discretize", - "--continue", - "glass", - "--only", - "--compute" - ], - "cwd": "${workspaceFolder}/../discretizbench", - }, - { - "type": "lldb", - "request": "launch", - "name": "experimentBayes", - "program": "${workspaceFolder}/build_debug/src/Platform/b_main", - "args": [ - "-m", - "TAN", - "--stratified", - "--discretize", - "-d", - "iris", - "--hyperparameters", - "{\"repeatSparent\": true, \"maxModels\": 12}" - ], - "cwd": "/home/rmontanana/Code/discretizbench", - }, - { - "type": "lldb", - "request": "launch", - "name": "best", - "program": 
"${workspaceFolder}/build_debug/src/Platform/b_best", - "args": [ - "-m", - "BoostAODE", - "-s", - "accuracy", - "--build", - ], - "cwd": "${workspaceFolder}/../discretizbench", - }, - { - "type": "lldb", - "request": "launch", - "name": "manage", - "program": "${workspaceFolder}/build_debug/src/Platform/b_manage", - "args": [ - "-n", - "20" - ], - "cwd": "${workspaceFolder}/../discretizbench", - }, - { - "type": "lldb", - "request": "launch", - "name": "list", - "program": "${workspaceFolder}/build_debug/src/Platform/b_list", - "args": [], - //"cwd": "/Users/rmontanana/Code/discretizbench", - "cwd": "${workspaceFolder}/../discretizbench", }, { "type": "lldb", @@ -112,19 +20,6 @@ // "-s", ], "cwd": "${workspaceFolder}/build_debug/tests", - }, - { - "name": "Build & debug active file", - "type": "cppdbg", - "request": "launch", - "program": "${workspaceFolder}/build_debug/bayesnet", - "args": [], - "stopAtEntry": false, - "cwd": "${workspaceFolder}", - "environment": [], - "externalConsole": false, - "MIMode": "lldb", - "preLaunchTask": "CMake: build" } ] } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bea3e3..dc0a6a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [1.0.4] ### Added -- Change _ascending_ hyperparameter to _order_ with these possible values _{"asc", "desc", "rand"}_ +- Change _ascending_ hyperparameter to _order_ with these possible values _{"asc", "desc", "rand"}_, Default is _"desc"_. +- Add the _predict_single_ hyperparameter to control if only the last model created is used to predict in boost training or the whole ensemble (all the models built so far). Default is true. 
+- sample app to show how to use the library (make sample) + +### Changed + +- Change the library structure adding folders for each group of classes (classifiers, ensembles, etc). +- The significances of the models generated under the feature selection algorithm are now computed after all the models have been generated and an αt value is computed and assigned to each model. ## [1.0.3] diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d42041..327f09b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.20) project(BayesNet - VERSION 1.0.3 + VERSION 1.0.4 DESCRIPTION "Bayesian Network and basic classifiers Library." HOMEPAGE_URL "https://github.com/rmontanana/bayesnet" LANGUAGES CXX @@ -36,12 +36,19 @@ option(CODE_COVERAGE "Collect coverage from test library" OFF) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) include(AddGitSubmodule) +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + MESSAGE("Debug mode") + set(ENABLE_TESTING ON) + set(CODE_COVERAGE ON) +endif (CMAKE_BUILD_TYPE STREQUAL "Debug") + + if (CODE_COVERAGE) - enable_testing() - include(CodeCoverage) - MESSAGE("Code coverage enabled") - set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g") - SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage") + enable_testing() + include(CodeCoverage) + MESSAGE("Code coverage enabled") + set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g") + SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage") endif (CODE_COVERAGE) if (ENABLE_CLANG_TIDY) @@ -58,10 +65,9 @@ add_git_submodule("lib/json") # -------------- add_subdirectory(config) add_subdirectory(lib/Files) +add_subdirectory(sample) add_subdirectory(src) -file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/*.cc) - # Testing # ------- if (ENABLE_TESTING) diff --git a/Makefile b/Makefile index 2cda612..d376787 100644 --- 
a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ SHELL := /bin/bash .DEFAULT_GOAL := help -.PHONY: coverage setup help buildr buildd test clean debug release +.PHONY: coverage setup help buildr buildd test clean debug release sample f_release = build_release f_debug = build_debug app_targets = BayesNet -test_targets = unit_tests_bayesnet +test_targets = unit_tests_bayesnet n_procs = -j 16 define ClearTests @@ -59,6 +59,13 @@ release: ## Build a Release version of the project @if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi @mkdir $(f_release); @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release + @echo ">>> Done"; + +fname = "tests/data/iris.arff" +sample: ## Build sample + @echo ">>> Building Sample..."; + cmake --build $(f_release) -t bayesnet_sample $(n_procs) + $(f_release)/sample/bayesnet_sample $(fname) @echo ">>> Done"; opt = "" diff --git a/README.md b/README.md index ee21fa4..2dd4bb3 100644 --- a/README.md +++ b/README.md @@ -19,4 +19,14 @@ make test make coverage ``` -## 1. Introduction +### Sample app + +```bash +make release +make sample +make sample fname=tests/data/glass.arff +``` + +## Models + +### [BoostAODE](docs/BoostAODE.md) diff --git a/docs/BoostAODE.docx b/docs/BoostAODE.docx deleted file mode 100644 index ec04a70..0000000 Binary files a/docs/BoostAODE.docx and /dev/null differ diff --git a/docs/BoostAODE.md b/docs/BoostAODE.md new file mode 100644 index 0000000..a04cc0e --- /dev/null +++ b/docs/BoostAODE.md @@ -0,0 +1,71 @@ +# BoostAODE Algorithm Operation + +The algorithm is based on the AdaBoost algorithm with some new proposals that can be activated using the following hyperparameters. + +## Hyperparameters + +The hyperparameters defined in the algorithm are: + +- ***repeatSparent*** (*boolean*): Allows dataset variables to be repeated as parents of an *SPODE*. Default value: *false*. + +- ***maxModels*** (*int*): Maximum number of models (*SPODEs*) to build. 
This hyperparameter is only taken into account if ***repeatSparent*** is set to *true*. Default value: *0*. + +- ***order*** (*{"asc", "desc", "rand"}*): Sets the order (ascending/descending/random) in which dataset variables will be processed to choose the parents of the *SPODEs*. Default value: *"desc"*. + +- ***convergence*** (*boolean*): Sets whether the convergence of the result will be used as a termination condition. If this hyperparameter is set to true, the training dataset passed to the model is divided into two sets, one serving as training data and the other as a test set (so the original test partition will become a validation partition in this case). The partition is made by taking the first partition generated by a process of generating a 5 fold partition with stratification using a predetermined seed. The exit condition used in this *convergence* is that the difference between the accuracy obtained by the current model and that obtained by the previous model is greater than *1e-4*; otherwise, one will be added to the number of models that worsen the result (see next hyperparameter). Default value: *false*. + +- ***tolerance*** (*int*): Sets the maximum number of models that can worsen the result without constituting a termination condition. Default value: *0*. + +- ***select_features*** (*{"IWSS", "FCBF", "CFS", ""}*): Selects the variable selection method to be used to build initial models for the ensemble that will be included without considering any of the other exit conditions. Once the models of the selected variables are built, the algorithm will update the weights using the ensemble and set the significance of all the models built with the same αt. Default value: *""*. + +- ***threshold*** (*double*): Sets the necessary value for the IWSS and FCBF algorithms to function. 
Accepted values are: + - IWSS: $threshold \in [0, 0.5]$ + - FCBF: $threshold \in [10^{-7}, 1]$ + + Default value is *-1* so every time any of those algorithms are called, the threshold has to be set to the desired value. + +- ***predict_voting*** (*boolean*): Sets whether the algorithm will use *model voting* to predict the result. If set to false, the weighted average of the probabilities of each model's prediction will be used. Default value: *true*. + +- ***predict_single*** (*boolean*): Sets whether the algorithm will use single-model prediction in the learning process. If set to *false*, all models trained up to that point will be used to calculate the prediction necessary to update the weights in the learning process. Default value: *true*. + +## Operation + +The algorithm performs the following steps: + +1. **Initialization** + + - If ***select_features*** is set, as many *SPODEs* are created as variables selected by the corresponding feature selection algorithm, and these variables are marked as used. + + - Initial weights of the examples are set to *1/m*. + +1. **Main Training Loop:** + + - Variables are sorted by mutual information order with the class variable and processed in ascending, descending or random order, according to the value of the *order* hyperparameter. If it is random, the variables are shuffled. + + - If the parent repetition is not established, the variable is marked as used. + + - A *SPODE* is created using the selected variable as the parent. + + - The model is trained, and the class variable corresponding to the training dataset is calculated. The calculation can be done using the last trained model or the set of models trained up to that point, according to the value of the *predict_single* hyperparameter. 
+ + - The weights associated with the examples are updated using this expression: + + - $w_i \cdot e^{\alpha_t}$ (if the example has been misclassified) + + - $w_i \cdot e^{-\alpha_t}$ (if the example has been correctly classified) + + - The model significance is set to $\alpha_t$. + + - If the ***convergence*** hyperparameter is set, the accuracy value on the test dataset that we separated in an initial step is calculated. + +1. **Exit Conditions:** + + - $\epsilon_t > 0.5$ => misclassified examples are penalized. + + - Number of models with worse accuracy greater than ***tolerance*** and ***convergence*** established. + + - There are no more variables to create models, and ***repeatSparent*** is not set. + + - Number of models > ***maxModels*** if ***repeatSparent*** is set. + +### [Proposal for *predict_single = false*](./BoostAODE_train_predict.pdf) diff --git a/docs/BoostAODE_train_predict.odp b/docs/BoostAODE_train_predict.odp new file mode 100644 index 0000000..4931f0c Binary files /dev/null and b/docs/BoostAODE_train_predict.odp differ diff --git a/docs/BoostAODE_train_predict.pdf b/docs/BoostAODE_train_predict.pdf new file mode 100644 index 0000000..51fa031 Binary files /dev/null and b/docs/BoostAODE_train_predict.pdf differ diff --git a/gcovr.cfg b/gcovr.cfg index 89e0877..01ad57e 100644 --- a/gcovr.cfg +++ b/gcovr.cfg @@ -1,4 +1,4 @@ filter = src/ exclude-directories = build_debug/lib/ print-summary = yes -sort-percentage = yes +sort = uncovered-percent diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt new file mode 100644 index 0000000..b56a20c --- /dev/null +++ b/sample/CMakeLists.txt @@ -0,0 +1,14 @@ +include_directories( + ${BayesNet_SOURCE_DIR}/src + ${BayesNet_SOURCE_DIR}/src/classifiers + ${BayesNet_SOURCE_DIR}/src/ensembles + ${BayesNet_SOURCE_DIR}/src/bayesian_network + ${BayesNet_SOURCE_DIR}/src/utils + ${BayesNet_SOURCE_DIR}/src/feature_selection + ${BayesNet_SOURCE_DIR}/lib/Files + ${BayesNet_SOURCE_DIR}/lib/mdlp + ${BayesNet_SOURCE_DIR}/lib/json/include + 
${CMAKE_BINARY_DIR}/configured_files/include +) +add_executable(bayesnet_sample sample.cc) +target_link_libraries(bayesnet_sample ArffFiles BayesNet) \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc new file mode 100644 index 0000000..54b8639 --- /dev/null +++ b/sample/sample.cc @@ -0,0 +1,62 @@ +#include "ArffFiles.h" +#include "CPPFImdlp.h" +#include "BoostAODE.h" + +std::vector discretizeDataset(std::vector& X, mdlp::labels_t& y) +{ + std::vector Xd; + auto fimdlp = mdlp::CPPFImdlp(); + for (int i = 0; i < X.size(); i++) { + fimdlp.fit(X[i], y); + mdlp::labels_t& xd = fimdlp.transform(X[i]); + Xd.push_back(xd); + } + return Xd; +} +tuple, std::string, map>> loadDataset(const std::string& name, bool class_last) +{ + auto handler = ArffFiles(); + handler.load(name, class_last); + // Get Dataset X, y + std::vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features + auto className = handler.getClassName(); + std::vector features; + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); + torch::Tensor Xd; + auto states = map>(); + auto Xr = discretizeDataset(X, y); + Xd = torch::zeros({ static_cast(Xr.size()), static_cast(Xr[0].size()) }, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + states[features[i]] = std::vector(*max_element(Xr[i].begin(), Xr[i].end()) + 1); + auto& item = states.at(features[i]); // bind by reference so iota fills the vector stored in states, not a copy + iota(begin(item), end(item), 0); + Xd.index_put_({ i, "..." 
}, torch::tensor(Xr[i], torch::kInt32)); + } + states[className] = std::vector(*max_element(y.begin(), y.end()) + 1); + iota(begin(states.at(className)), end(states.at(className)), 0); + return { Xd, torch::tensor(y, torch::kInt32), features, className, states }; +} + +int main(int argc, char* argv[]) +{ + if (argc < 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + std::string file_name = argv[1]; + torch::Tensor X, y; + std::vector features; + std::string className; + map> states; + auto clf = bayesnet::BoostAODE(false); // false for not using voting in predict + std::cout << "Library version: " << clf.getVersion() << std::endl; + tie(X, y, features, className, states) = loadDataset(file_name, true); + clf.fit(X, y, features, className, states); + auto score = clf.score(X, y); + std::cout << "File: " << file_name << " score: " << score << std::endl; + return 0; +} + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 461c6a9..e798319 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -4,10 +4,15 @@ include_directories( ${BayesNet_SOURCE_DIR}/lib/folding ${BayesNet_SOURCE_DIR}/lib/json/include ${BayesNet_SOURCE_DIR}/src + ${BayesNet_SOURCE_DIR}/src/feature_selection + ${BayesNet_SOURCE_DIR}/src/bayesian_network + ${BayesNet_SOURCE_DIR}/src/classifiers + ${BayesNet_SOURCE_DIR}/src/ensembles + ${BayesNet_SOURCE_DIR}/src/utils ${CMAKE_BINARY_DIR}/configured_files/include ) -add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc - KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ) +file(GLOB_RECURSE Sources "*.cc") + +add_library(BayesNet ${Sources}) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Network.cc b/src/bayesian_network/Network.cc similarity index 99% rename from src/Network.cc rename to src/bayesian_network/Network.cc 
index 32e6ecf..0357ba6 100644 --- a/src/Network.cc +++ b/src/bayesian_network/Network.cc @@ -71,7 +71,7 @@ namespace bayesnet { for (Node* child : nodes[nodeId]->getChildren()) { if (visited.find(child->getName()) == visited.end() && isCyclic(child->getName(), visited, recStack)) return true; - else if (recStack.find(child->getName()) != recStack.end()) + if (recStack.find(child->getName()) != recStack.end()) return true; } } diff --git a/src/Network.h b/src/bayesian_network/Network.h similarity index 100% rename from src/Network.h rename to src/bayesian_network/Network.h diff --git a/src/Node.cc b/src/bayesian_network/Node.cc similarity index 100% rename from src/Node.cc rename to src/bayesian_network/Node.cc diff --git a/src/Node.h b/src/bayesian_network/Node.h similarity index 100% rename from src/Node.h rename to src/bayesian_network/Node.h diff --git a/src/Classifier.cc b/src/classifiers/Classifier.cc similarity index 100% rename from src/Classifier.cc rename to src/classifiers/Classifier.cc diff --git a/src/Classifier.h b/src/classifiers/Classifier.h similarity index 100% rename from src/Classifier.h rename to src/classifiers/Classifier.h diff --git a/src/KDB.cc b/src/classifiers/KDB.cc similarity index 100% rename from src/KDB.cc rename to src/classifiers/KDB.cc diff --git a/src/KDB.h b/src/classifiers/KDB.h similarity index 100% rename from src/KDB.h rename to src/classifiers/KDB.h diff --git a/src/KDBLd.cc b/src/classifiers/KDBLd.cc similarity index 100% rename from src/KDBLd.cc rename to src/classifiers/KDBLd.cc diff --git a/src/KDBLd.h b/src/classifiers/KDBLd.h similarity index 100% rename from src/KDBLd.h rename to src/classifiers/KDBLd.h diff --git a/src/Proposal.cc b/src/classifiers/Proposal.cc similarity index 100% rename from src/Proposal.cc rename to src/classifiers/Proposal.cc diff --git a/src/Proposal.h b/src/classifiers/Proposal.h similarity index 100% rename from src/Proposal.h rename to src/classifiers/Proposal.h diff --git a/src/SPODE.cc 
b/src/classifiers/SPODE.cc similarity index 100% rename from src/SPODE.cc rename to src/classifiers/SPODE.cc diff --git a/src/SPODE.h b/src/classifiers/SPODE.h similarity index 100% rename from src/SPODE.h rename to src/classifiers/SPODE.h diff --git a/src/SPODELd.cc b/src/classifiers/SPODELd.cc similarity index 100% rename from src/SPODELd.cc rename to src/classifiers/SPODELd.cc diff --git a/src/SPODELd.h b/src/classifiers/SPODELd.h similarity index 100% rename from src/SPODELd.h rename to src/classifiers/SPODELd.h diff --git a/src/TAN.cc b/src/classifiers/TAN.cc similarity index 100% rename from src/TAN.cc rename to src/classifiers/TAN.cc diff --git a/src/TAN.h b/src/classifiers/TAN.h similarity index 100% rename from src/TAN.h rename to src/classifiers/TAN.h diff --git a/src/TANLd.cc b/src/classifiers/TANLd.cc similarity index 100% rename from src/TANLd.cc rename to src/classifiers/TANLd.cc diff --git a/src/TANLd.h b/src/classifiers/TANLd.h similarity index 100% rename from src/TANLd.h rename to src/classifiers/TANLd.h diff --git a/src/AODE.cc b/src/ensembles/AODE.cc similarity index 100% rename from src/AODE.cc rename to src/ensembles/AODE.cc diff --git a/src/AODE.h b/src/ensembles/AODE.h similarity index 100% rename from src/AODE.h rename to src/ensembles/AODE.h diff --git a/src/AODELd.cc b/src/ensembles/AODELd.cc similarity index 100% rename from src/AODELd.cc rename to src/ensembles/AODELd.cc diff --git a/src/AODELd.h b/src/ensembles/AODELd.h similarity index 100% rename from src/AODELd.h rename to src/ensembles/AODELd.h diff --git a/src/BoostAODE.cc b/src/ensembles/BoostAODE.cc similarity index 70% rename from src/BoostAODE.cc rename to src/ensembles/BoostAODE.cc index 9f11f02..e8a5166 100644 --- a/src/BoostAODE.cc +++ b/src/ensembles/BoostAODE.cc @@ -1,6 +1,7 @@ #include #include #include +#include #include "BoostAODE.h" #include "CFS.h" #include "FCBF.h" @@ -8,9 +9,22 @@ #include "folding.hpp" namespace bayesnet { + struct { + std::string CFS = "CFS"; + 
std::string FCBF = "FCBF"; + std::string IWSS = "IWSS"; + }SelectFeatures; + struct { + std::string ASC = "asc"; + std::string DESC = "desc"; + std::string RAND = "rand"; + }Orders; BoostAODE::BoostAODE(bool predict_voting) : Ensemble(predict_voting) { - validHyperparameters = { "repeatSparent", "maxModels", "order", "convergence", "threshold", "select_features", "tolerance", "predict_voting" }; + validHyperparameters = { + "repeatSparent", "maxModels", "order", "convergence", "threshold", + "select_features", "tolerance", "predict_voting", "predict_single" + }; } void BoostAODE::buildModel(const torch::Tensor& weights) @@ -58,10 +72,10 @@ namespace bayesnet { hyperparameters.erase("maxModels"); } if (hyperparameters.contains("order")) { - std::vector algos = { "asc", "desc", "rand" }; + std::vector algos = { Orders.ASC, Orders.DESC, Orders.RAND }; order_algorithm = hyperparameters["order"]; if (std::find(algos.begin(), algos.end(), order_algorithm) == algos.end()) { - throw std::invalid_argument("Invalid order algorithm, valid values [asc, desc, rand]"); + throw std::invalid_argument("Invalid order algorithm, valid values [" + Orders.ASC + ", " + Orders.DESC + ", " + Orders.RAND + "]"); } hyperparameters.erase("order"); } @@ -69,6 +83,10 @@ namespace bayesnet { convergence = hyperparameters["convergence"]; hyperparameters.erase("convergence"); } + if (hyperparameters.contains("predict_single")) { + predict_single = hyperparameters["predict_single"]; + hyperparameters.erase("predict_single"); + } if (hyperparameters.contains("threshold")) { threshold = hyperparameters["threshold"]; hyperparameters.erase("threshold"); @@ -83,11 +101,11 @@ namespace bayesnet { } if (hyperparameters.contains("select_features")) { auto selectedAlgorithm = hyperparameters["select_features"]; - std::vector algos = { "IWSS", "FCBF", "CFS" }; + std::vector algos = { SelectFeatures.IWSS, SelectFeatures.CFS, SelectFeatures.FCBF }; selectFeatures = true; select_features_algorithm = 
selectedAlgorithm; if (std::find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) { - throw std::invalid_argument("Invalid selectFeatures value, valid values [IWSS, FCBF, CFS]"); + throw std::invalid_argument("Invalid selectFeatures value, valid values [" + SelectFeatures.IWSS + ", " + SelectFeatures.CFS + ", " + SelectFeatures.FCBF + "]"); } hyperparameters.erase("select_features"); } @@ -95,28 +113,54 @@ namespace bayesnet { throw std::invalid_argument("Invalid hyperparameters" + hyperparameters.dump()); } } + std::tuple update_weights(torch::Tensor& ytrain, torch::Tensor& ypred, torch::Tensor& weights) + { + bool terminate = false; + double alpha_t = 0; + auto mask_wrong = ypred != ytrain; + auto mask_right = ypred == ytrain; + auto masked_weights = weights * mask_wrong.to(weights.dtype()); + double epsilon_t = masked_weights.sum().item(); + if (epsilon_t > 0.5) { + // Inverse the weights policy (plot ln(wt)) + // "In each round of AdaBoost, there is a sanity check to ensure that the current base + // learner is better than random guess" (Zhi-Hua Zhou, 2012) + terminate = true; + } else { + double wt = (1 - epsilon_t) / epsilon_t; + alpha_t = epsilon_t == 0 ? 
1 : 0.5 * log(wt); + // Step 3.2: Update weights for next classifier + // Step 3.2.1: Update weights of wrong samples + weights += mask_wrong.to(weights.dtype()) * exp(alpha_t) * weights; + // Step 3.2.2: Update weights of right samples + weights += mask_right.to(weights.dtype()) * exp(-alpha_t) * weights; + // Step 3.3: Normalise the weights + double totalWeights = torch::sum(weights).item(); + weights = weights / totalWeights; + } + return { weights, alpha_t, terminate }; + } std::unordered_set BoostAODE::initializeModels() { std::unordered_set featuresUsed; torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); int maxFeatures = 0; - if (select_features_algorithm == "CFS") { + if (select_features_algorithm == SelectFeatures.CFS) { featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); - } else if (select_features_algorithm == "IWSS") { + } else if (select_features_algorithm == SelectFeatures.IWSS) { if (threshold < 0 || threshold >0.5) { - throw std::invalid_argument("Invalid threshold value for IWSS [0, 0.5]"); + throw std::invalid_argument("Invalid threshold value for " + SelectFeatures.IWSS + " [0, 0.5]"); } featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); - } else if (select_features_algorithm == "FCBF") { + } else if (select_features_algorithm == SelectFeatures.FCBF) { if (threshold < 1e-7 || threshold > 1) { - throw std::invalid_argument("Invalid threshold value [1e-7, 1]"); + throw std::invalid_argument("Invalid threshold value for " + SelectFeatures.FCBF + " [1e-7, 1]"); } featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); } featureSelector->fit(); auto cfsFeatures = featureSelector->getFeatures(); for (const int& feature : cfsFeatures) { - // std::cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << std::endl; 
featuresUsed.insert(feature); std::unique_ptr model = std::make_unique(feature); model->fit(dataset, features, className, states, weights_); @@ -128,22 +172,46 @@ namespace bayesnet { delete featureSelector; return featuresUsed; } + torch::Tensor BoostAODE::ensemble_predict(torch::Tensor& X, SPODE* model) + { + if (initialize_prob_table) { + initialize_prob_table = false; + prob_table = model->predict_proba(X) * 1.0; + } else { + prob_table += model->predict_proba(X) * 1.0; + } + // prob_table doesn't store probabilities but the sum of them + // to have them we need to divide by the sum of the "weights" used to + // consider the results obtained in the model's predict_proba. + return prob_table.argmax(1); + } void BoostAODE::trainModel(const torch::Tensor& weights) { - fitted = true; // Algorithm based on the adaboost algorithm for classification // as explained in Ensemble methods (Zhi-Hua Zhou, 2012) + initialize_prob_table = true; + fitted = true; + double alpha_t = 0; + torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + bool exitCondition = false; std::unordered_set featuresUsed; if (selectFeatures) { featuresUsed = initializeModels(); + auto ypred = predict(X_train); + std::tie(weights_, alpha_t, exitCondition) = update_weights(y_train, ypred, weights_); + // Update significance of the models + for (int i = 0; i < n_models; ++i) { + significanceModels[i] = alpha_t; + } + if (exitCondition) { + return; + } } bool resetMaxModels = false; if (maxModels == 0) { maxModels = .1 * n > 10 ? 
.1 * n : n; resetMaxModels = true; // Flag to unset maxModels } - torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); - bool exitCondition = false; // Variables to control the accuracy finish condition double priorAccuracy = 0.0; double delta = 1.0; @@ -154,12 +222,12 @@ namespace bayesnet { // n_models == maxModels // epsilon sub t > 0.5 => inverse the weights policy // validation error is not decreasing - bool ascending = order_algorithm == "asc"; + bool ascending = order_algorithm == Orders.ASC; std::mt19937 g{ 173 }; while (!exitCondition) { // Step 1: Build ranking with mutual information auto featureSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted - if (order_algorithm == "rand") { + if (order_algorithm == Orders.RAND) { std::shuffle(featureSelection.begin(), featureSelection.end(), g); } auto feature = featureSelection[0]; @@ -181,28 +249,17 @@ namespace bayesnet { std::unique_ptr model; model = std::make_unique(feature); model->fit(dataset, features, className, states, weights_); - auto ypred = model->predict(X_train); + torch::Tensor ypred; + if (predict_single) { + ypred = model->predict(X_train); + } else { + ypred = ensemble_predict(X_train, dynamic_cast(model.get())); + } // Step 3.1: Compute the classifier amout of say - auto mask_wrong = ypred != y_train; - auto mask_right = ypred == y_train; - auto masked_weights = weights_ * mask_wrong.to(weights_.dtype()); - double epsilon_t = masked_weights.sum().item(); - if (epsilon_t > 0.5) { - // Inverse the weights policy (plot ln(wt)) - // "In each round of AdaBoost, there is a sanity check to ensure that the current base - // learner is better than random guess" (Zhi-Hua Zhou, 2012) + std::tie(weights_, alpha_t, exitCondition) = update_weights(y_train, ypred, weights_); + if (exitCondition) { break; } - double wt = (1 - epsilon_t) / epsilon_t; - double alpha_t = epsilon_t == 0 ? 
1 : 0.5 * log(wt); - // Step 3.2: Update weights for next classifier - // Step 3.2.1: Update weights of wrong samples - weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_; - // Step 3.2.2: Update weights of right samples - weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_; - // Step 3.3: Normalise the weights - double totalWeights = torch::sum(weights_).item(); - weights_ = weights_ / totalWeights; // Step 3.4: Store classifier and its accuracy to weigh its future vote featuresUsed.insert(feature); models.push_back(std::move(model)); diff --git a/src/BoostAODE.h b/src/ensembles/BoostAODE.h similarity index 71% rename from src/BoostAODE.h rename to src/ensembles/BoostAODE.h index 7119194..c58bc3e 100644 --- a/src/BoostAODE.h +++ b/src/ensembles/BoostAODE.h @@ -15,17 +15,21 @@ namespace bayesnet { void buildModel(const torch::Tensor& weights) override; void trainModel(const torch::Tensor& weights) override; private: + std::unordered_set initializeModels(); + torch::Tensor ensemble_predict(torch::Tensor& X, SPODE* model); torch::Tensor dataset_; torch::Tensor X_train, y_train, X_test, y_test; - std::unordered_set initializeModels(); // Hyperparameters bool repeatSparent = false; // if true, a feature can be selected more than once int maxModels = 0; int tolerance = 0; + bool predict_single = true; // whether the last model is used to predict in training or the whole ensemble std::string order_algorithm; // order to process the KBest features asc, desc, rand bool convergence = false; //if true, stop when the model does not improve bool selectFeatures = false; // if true, use feature selection - std::string select_features_algorithm = ""; // Selected feature selection algorithm + std::string select_features_algorithm = ""; // Selected feature selection algorithm: IWSS, FCBF, CFS or "" (none) + bool initialize_prob_table = true; // if true, initialize the prob_table with the first model (used in train) + torch::Tensor prob_table; // Table of probabilities for 
ensemble predicting if predict_single is false FeatureSelect* featureSelector = nullptr; double threshold = -1; }; diff --git a/src/Ensemble.cc b/src/ensembles/Ensemble.cc similarity index 100% rename from src/Ensemble.cc rename to src/ensembles/Ensemble.cc diff --git a/src/Ensemble.h b/src/ensembles/Ensemble.h similarity index 100% rename from src/Ensemble.h rename to src/ensembles/Ensemble.h diff --git a/src/CFS.cc b/src/feature_selection/CFS.cc similarity index 100% rename from src/CFS.cc rename to src/feature_selection/CFS.cc diff --git a/src/CFS.h b/src/feature_selection/CFS.h similarity index 100% rename from src/CFS.h rename to src/feature_selection/CFS.h diff --git a/src/FCBF.cc b/src/feature_selection/FCBF.cc similarity index 100% rename from src/FCBF.cc rename to src/feature_selection/FCBF.cc diff --git a/src/FCBF.h b/src/feature_selection/FCBF.h similarity index 100% rename from src/FCBF.h rename to src/feature_selection/FCBF.h diff --git a/src/FeatureSelect.cc b/src/feature_selection/FeatureSelect.cc similarity index 99% rename from src/FeatureSelect.cc rename to src/feature_selection/FeatureSelect.cc index b8300a5..fba1228 100644 --- a/src/FeatureSelect.cc +++ b/src/feature_selection/FeatureSelect.cc @@ -50,7 +50,6 @@ namespace bayesnet { } double FeatureSelect::computeMeritCFS() { - double result; double rcf = 0; for (auto feature : selectedFeatures) { rcf += suLabels[feature]; diff --git a/src/FeatureSelect.h b/src/feature_selection/FeatureSelect.h similarity index 100% rename from src/FeatureSelect.h rename to src/feature_selection/FeatureSelect.h diff --git a/src/IWSS.cc b/src/feature_selection/IWSS.cc similarity index 95% rename from src/IWSS.cc rename to src/feature_selection/IWSS.cc index 4fd11ea..e63bf6b 100644 --- a/src/IWSS.cc +++ b/src/feature_selection/IWSS.cc @@ -28,7 +28,7 @@ namespace bayesnet { selectedFeatures.push_back(feature); // Compute merit with selectedFeatures auto meritNew = computeMeritCFS(); - double delta = merit != 0.0 ? 
abs(merit - meritNew) / merit : 0.0; + double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0; if (meritNew > merit || delta < threshold) { if (meritNew > merit) { merit = meritNew; diff --git a/src/IWSS.h b/src/feature_selection/IWSS.h similarity index 100% rename from src/IWSS.h rename to src/feature_selection/IWSS.h diff --git a/src/BayesMetrics.cc b/src/utils/BayesMetrics.cc similarity index 100% rename from src/BayesMetrics.cc rename to src/utils/BayesMetrics.cc diff --git a/src/BayesMetrics.h b/src/utils/BayesMetrics.h similarity index 100% rename from src/BayesMetrics.h rename to src/utils/BayesMetrics.h diff --git a/src/Mst.cc b/src/utils/Mst.cc similarity index 100% rename from src/Mst.cc rename to src/utils/Mst.cc diff --git a/src/Mst.h b/src/utils/Mst.h similarity index 100% rename from src/Mst.h rename to src/utils/Mst.h diff --git a/src/bayesnetUtils.cc b/src/utils/bayesnetUtils.cc similarity index 100% rename from src/bayesnetUtils.cc rename to src/utils/bayesnetUtils.cc diff --git a/src/bayesnetUtils.h b/src/utils/bayesnetUtils.h similarity index 100% rename from src/bayesnetUtils.h rename to src/utils/bayesnetUtils.h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index efccc48..630beab 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,13 +2,18 @@ if(ENABLE_TESTING) set(TEST_BAYESNET "unit_tests_bayesnet") include_directories( ${BayesNet_SOURCE_DIR}/src - ${BayesNet_SOURCE_DIR}/src/Platform + ${BayesNet_SOURCE_DIR}/src/feature_selection + ${BayesNet_SOURCE_DIR}/src/bayesian_network + ${BayesNet_SOURCE_DIR}/src/classifiers + ${BayesNet_SOURCE_DIR}/src/utils + ${BayesNet_SOURCE_DIR}/src/ensembles ${BayesNet_SOURCE_DIR}/lib/Files ${BayesNet_SOURCE_DIR}/lib/mdlp ${BayesNet_SOURCE_DIR}/lib/folding ${BayesNet_SOURCE_DIR}/lib/json/include ${CMAKE_BINARY_DIR}/configured_files/include ) + file(GLOB_RECURSE BayesNet_SOURCES "${BayesNet_SOURCE_DIR}/src/*.cc") set(TEST_SOURCES_BAYESNET TestBayesModels.cc 
TestBayesNetwork.cc TestBayesMetrics.cc TestUtils.cc ${BayesNet_SOURCES}) add_executable(${TEST_BAYESNET} ${TEST_SOURCES_BAYESNET}) target_link_libraries(${TEST_BAYESNET} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain ) diff --git a/tests/TestBayesModels.cc b/tests/TestBayesModels.cc index 0234747..8488f05 100644 --- a/tests/TestBayesModels.cc +++ b/tests/TestBayesModels.cc @@ -224,6 +224,8 @@ TEST_CASE("BoostAODE voting-proba", "[BayesNet]") REQUIRE(score_voting == Catch::Approx(0.98).epsilon(raw.epsilon)); REQUIRE(pred_voting[83][2] == Catch::Approx(0.552091).epsilon(raw.epsilon)); REQUIRE(pred_proba[83][2] == Catch::Approx(0.546017).epsilon(raw.epsilon)); + clf.dump_cpt(); + REQUIRE(clf.topological_order() == std::vector()); } TEST_CASE("BoostAODE order asc, desc & random", "[BayesNet]") { @@ -240,10 +242,28 @@ TEST_CASE("BoostAODE order asc, desc & random", "[BayesNet]") clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv); auto score = clf.score(raw.Xv, raw.yv); auto scoret = clf.score(raw.Xt, raw.yt); - auto score2 = clf.score(raw.Xv, raw.yv); - auto scoret2 = clf.score(raw.Xt, raw.yt); INFO("order: " + order); REQUIRE(score == Catch::Approx(scores[order]).epsilon(raw.epsilon)); REQUIRE(scoret == Catch::Approx(scores[order]).epsilon(raw.epsilon)); } } +TEST_CASE("BoostAODE predict_single", "[BayesNet]") +{ + + auto raw = RawDatasets("glass", true); + std::map scores{ + {true, 0.84579f }, { false, 0.80841f } + }; + for (const bool kind : { true, false}) { + auto clf = bayesnet::BoostAODE(); + clf.setHyperparameters({ + {"predict_single", kind}, {"order", "desc" }, + }); + clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv); + auto score = clf.score(raw.Xv, raw.yv); + auto scoret = clf.score(raw.Xt, raw.yt); + INFO("kind: " + std::string(kind ? "true" : "false")); + REQUIRE(score == Catch::Approx(scores[kind]).epsilon(raw.epsilon)); + REQUIRE(scoret == Catch::Approx(scores[kind]).epsilon(raw.epsilon)); + } +}