predict_single #22

Merged
rmontanana merged 11 commits from predict_single into main 2024-03-06 16:16:19 +00:00
58 changed files with 330 additions and 167 deletions

View File

@ -5,6 +5,7 @@ Checks: '-*,
cppcoreguidelines-*,
modernize-*,
performance-*,
-modernize-use-nodiscard,
-cppcoreguidelines-pro-type-vararg,
-modernize-use-trailing-return-type,
-bugprone-exception-escape'

109
.vscode/launch.json vendored
View File

@ -5,102 +5,10 @@
"type": "lldb",
"request": "launch",
"name": "sample",
"program": "${workspaceFolder}/build_debug/sample/BayesNetSample",
"program": "${workspaceFolder}/build_release/sample/bayesnet_sample",
"args": [
"-d",
"iris",
"-m",
"TANLd",
"-s",
"271",
"-p",
"/Users/rmontanana/Code/discretizbench/datasets/",
"${workspaceFolder}/tests/data/glass.arff"
],
//"cwd": "${workspaceFolder}/build/sample/",
},
{
"type": "lldb",
"request": "launch",
"name": "experimentPy",
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"args": [
"-m",
"STree",
"--stratified",
"-d",
"iris",
//"--discretize"
// "--hyperparameters",
// "{\"repeatSparent\": true, \"maxModels\": 12}"
],
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "gridsearch",
"program": "${workspaceFolder}/build_debug/src/Platform/b_grid",
"args": [
"-m",
"KDB",
"--discretize",
"--continue",
"glass",
"--only",
"--compute"
],
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "experimentBayes",
"program": "${workspaceFolder}/build_debug/src/Platform/b_main",
"args": [
"-m",
"TAN",
"--stratified",
"--discretize",
"-d",
"iris",
"--hyperparameters",
"{\"repeatSparent\": true, \"maxModels\": 12}"
],
"cwd": "/home/rmontanana/Code/discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "best",
"program": "${workspaceFolder}/build_debug/src/Platform/b_best",
"args": [
"-m",
"BoostAODE",
"-s",
"accuracy",
"--build",
],
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "manage",
"program": "${workspaceFolder}/build_debug/src/Platform/b_manage",
"args": [
"-n",
"20"
],
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"type": "lldb",
"request": "launch",
"name": "list",
"program": "${workspaceFolder}/build_debug/src/Platform/b_list",
"args": [],
//"cwd": "/Users/rmontanana/Code/discretizbench",
"cwd": "${workspaceFolder}/../discretizbench",
},
{
"type": "lldb",
@ -112,19 +20,6 @@
// "-s",
],
"cwd": "${workspaceFolder}/build_debug/tests",
},
{
"name": "Build & debug active file",
"type": "cppdbg",
"request": "launch",
"program": "${workspaceFolder}/build_debug/bayesnet",
"args": [],
"stopAtEntry": false,
"cwd": "${workspaceFolder}",
"environment": [],
"externalConsole": false,
"MIMode": "lldb",
"preLaunchTask": "CMake: build"
}
]
}

View File

@ -5,11 +5,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.0.4]
### Added
- Change _ascending_ hyperparameter to _order_ with these possible values _{"asc", "desc", "rand"}_
- Change _ascending_ hyperparameter to _order_ with these possible values _{"asc", "desc", "rand"}_, Default is _"desc"_.
- Add the _predict_single_ hyperparameter to control if only the last model created is used to predict in boost training or the whole ensemble (all the models built so far). Default is true.
- sample app to show how to use the library (make sample)
### Changed
- Change the library structure adding folders for each group of classes (classifiers, ensembles, etc).
- The significances of the models generated under the feature selection algorithm are now computed after all the models have been generated and an &alpha;<sub>t</sub> value is computed and assigned to each model.
## [1.0.3]

View File

@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(BayesNet
VERSION 1.0.3
VERSION 1.0.4
DESCRIPTION "Bayesian Network and basic classifiers Library."
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
LANGUAGES CXX
@ -36,12 +36,19 @@ option(CODE_COVERAGE "Collect coverage from test library" OFF)
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
include(AddGitSubmodule)
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
MESSAGE("Debug mode")
set(ENABLE_TESTING ON)
set(CODE_COVERAGE ON)
endif (CMAKE_BUILD_TYPE STREQUAL "Debug")
if (CODE_COVERAGE)
enable_testing()
include(CodeCoverage)
MESSAGE("Code coverage enabled")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
enable_testing()
include(CodeCoverage)
MESSAGE("Code coverage enabled")
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS} -fprofile-arcs -ftest-coverage -O0 -g")
SET(GCC_COVERAGE_LINK_FLAGS " ${GCC_COVERAGE_LINK_FLAGS} -lgcov --coverage")
endif (CODE_COVERAGE)
if (ENABLE_CLANG_TIDY)
@ -58,10 +65,9 @@ add_git_submodule("lib/json")
# --------------
add_subdirectory(config)
add_subdirectory(lib/Files)
add_subdirectory(sample)
add_subdirectory(src)
file(GLOB BayesNet_SOURCES CONFIGURE_DEPENDS ${BayesNet_SOURCE_DIR}/src/*.cc)
# Testing
# -------
if (ENABLE_TESTING)

View File

@ -1,11 +1,11 @@
SHELL := /bin/bash
.DEFAULT_GOAL := help
.PHONY: coverage setup help buildr buildd test clean debug release
.PHONY: coverage setup help buildr buildd test clean debug release sample
f_release = build_release
f_debug = build_debug
app_targets = BayesNet
test_targets = unit_tests_bayesnet
test_targets = unit_tests_bayesnet
n_procs = -j 16
define ClearTests
@ -59,6 +59,13 @@ release: ## Build a Release version of the project
@if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi
@mkdir $(f_release);
@cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release
@echo ">>> Done";
fname = "tests/data/iris.arff"
sample: ## Build sample
@echo ">>> Building Sample...";
cmake --build $(f_release) -t bayesnet_sample $(n_procs)
$(f_release)/sample/bayesnet_sample $(fname)
@echo ">>> Done";
opt = ""

View File

@ -19,4 +19,14 @@ make test
make coverage
```
## 1. Introduction
### Sample app
```bash
make release
make sample
make sample fname=tests/data/glass.arff
```
## Models
### [BoostAODE](docs/BoostAODE.md)

Binary file not shown.

71
docs/BoostAODE.md Normal file
View File

@ -0,0 +1,71 @@
# BoostAODE Algorithm Operation
The algorithm is based on the AdaBoost algorithm with some new proposals that can be activated using the following hyperparameters.
## Hyperparameters
The hyperparameters defined in the algorithm are:
- ***repeatSparent*** (*boolean*): Allows dataset variables to be repeated as parents of an *SPODE*. Default value: *false*.
- ***maxModels*** (*int*): Maximum number of models (*SPODEs*) to build. This hyperparameter is only taken into account if ***repeatSparent*** is set to *true*. Default value: *0*.
- ***order*** (*{"asc", "desc", "rand"}*): Sets the order (ascending/descending/random) in which dataset variables will be processed to choose the parents of the *SPODEs*. Default value: *"desc"*.
- ***convergence*** (*boolean*): Sets whether the convergence of the result will be used as a termination condition. If this hyperparameter is set to true, the training dataset passed to the model is divided into two sets, one serving as training data and the other as a test set (so the original test partition will become a validation partition in this case). The partition is made by taking the first partition generated by a process of generating a 5 fold partition with stratification using a predetermined seed. The exit condition used in this *convergence* is that the difference between the accuracy obtained by the current model and that obtained by the previous model is greater than *1e-4*; otherwise, one will be added to the number of models that worsen the result (see next hyperparameter). Default value: *false*.
- ***tolerance*** (*int*): Sets the maximum number of models that can worsen the result without constituting a termination condition. Default value: *0*.
- ***select_features*** (*{"IWSS", "FCBF", "CFS", ""}*): Selects the variable selection method to be used to build initial models for the ensemble that will be included without considering any of the other exit conditions. Once the models of the selected variables are built, the algorithm will update the weights using the ensemble and set the significance of all the models built with the same &alpha;<sub>t</sub>. Default value: *""*.
- ***threshold*** (*double*): Sets the necessary value for the IWSS and FCBF algorithms to function. Accepted values are:
- IWSS: $threshold \in [0, 0.5]$
- FCBF: $threshold \in [10^{-7}, 1]$
Default value is *-1* so every time any of those algorithms are called, the threshold has to be set to the desired value.
- ***predict_voting*** (*boolean*): Sets whether the algorithm will use *model voting* to predict the result. If set to false, the weighted average of the probabilities of each model's prediction will be used. Default value: *true*.
- ***predict_single*** (*boolean*): Sets whether the algorithm will use single-model prediction in the learning process. If set to *false*, all models trained up to that point will be used to calculate the prediction necessary to update the weights in the learning process. Default value: *true*.
## Operation
The algorithm performs the following steps:
1. **Initialization**
- If ***select_features*** is set, as many *SPODEs* are created as variables selected by the corresponding feature selection algorithm, and these variables are marked as used.
- Initial weights of the examples are set to *1/m*.
1. **Main Training Loop:**
- Variables are sorted by mutual information order with the class variable and processed in ascending, descending or random order, according to the value of the *order* hyperparameter. If it is random, the variables are shuffled.
- If the parent repetition is not established, the variable is marked as used.
- A *SPODE* is created using the selected variable as the parent.
- The model is trained, and the class variable corresponding to the training dataset is calculated. The calculation can be done using the last trained model or the set of models trained up to that point, according to the value of the *predict_single* hyperparameter.
- The weights associated with the examples are updated using this expression:
- w<sub>i</sub> · e<sup>&alpha;<sub>t</sub></sup> (if the example has been misclassified)
- w<sub>i</sub> · e<sup>-&alpha;<sub>t</sub></sup> (if the example has been correctly classified)
- The model significance is set to &alpha;<sub>t</sub>.
- If the ***convergence*** hyperparameter is set, the accuracy value on the test dataset that we separated in an initial step is calculated.
1. **Exit Conditions:**
- &epsilon;<sub>t</sub> > 0.5 => misclassified examples are penalized.
- Number of models with worse accuracy greater than ***tolerance*** and ***convergence*** established.
- There are no more variables to create models, and ***repeatSparent*** is not set.
- Number of models > ***maxModels*** if ***repeatSparent*** is set.
### [Proposal for *predict_single = false*](./BoostAODE_train_predict.pdf)

Binary file not shown.

Binary file not shown.

View File

@ -1,4 +1,4 @@
filter = src/
exclude-directories = build_debug/lib/
print-summary = yes
sort-percentage = yes
sort = uncovered-percent

14
sample/CMakeLists.txt Normal file
View File

@ -0,0 +1,14 @@
include_directories(
${BayesNet_SOURCE_DIR}/src
${BayesNet_SOURCE_DIR}/src/classifiers
${BayesNet_SOURCE_DIR}/src/ensembles
${BayesNet_SOURCE_DIR}/src/bayesian_network
${BayesNet_SOURCE_DIR}/src/utils
${BayesNet_SOURCE_DIR}/src/feature_selection
${BayesNet_SOURCE_DIR}/lib/Files
${BayesNet_SOURCE_DIR}/lib/mdlp
${BayesNet_SOURCE_DIR}/lib/json/include
${CMAKE_BINARY_DIR}/configured_files/include
)
add_executable(bayesnet_sample sample.cc)
target_link_libraries(bayesnet_sample ArffFiles BayesNet)

62
sample/sample.cc Normal file
View File

@ -0,0 +1,62 @@
#include "ArffFiles.h"
#include "CPPFImdlp.h"
#include "BoostAODE.h"
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
std::vector<mdlp::labels_t> Xd;
auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], y);
mdlp::labels_t& xd = fimdlp.transform(X[i]);
Xd.push_back(xd);
}
return Xd;
}
tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last)
{
auto handler = ArffFiles();
handler.load(name, class_last);
// Get Dataset X, y
std::vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
std::vector<std::string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
torch::Tensor Xd;
auto states = map<std::string, std::vector<int>>();
auto Xr = discretizeDataset(X, y);
Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
auto item = states.at(features[i]);
iota(begin(item), end(item), 0);
Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
}
states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
iota(begin(states.at(className)), end(states.at(className)), 0);
return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
}
int main(int argc, char* argv[])
{
if (argc < 2) {
std::cerr << "Usage: " << argv[0] << " <file_name>" << std::endl;
return 1;
}
std::string file_name = argv[1];
torch::Tensor X, y;
std::vector<std::string> features;
std::string className;
map<std::string, std::vector<int>> states;
auto clf = bayesnet::BoostAODE(false); // false for not using voting in predict
std::cout << "Library version: " << clf.getVersion() << std::endl;
tie(X, y, features, className, states) = loadDataset(file_name, true);
clf.fit(X, y, features, className, states);
auto score = clf.score(X, y);
std::cout << "File: " << file_name << " score: " << score << std::endl;
return 0;
}

View File

@ -4,10 +4,15 @@ include_directories(
${BayesNet_SOURCE_DIR}/lib/folding
${BayesNet_SOURCE_DIR}/lib/json/include
${BayesNet_SOURCE_DIR}/src
${BayesNet_SOURCE_DIR}/src/feature_selection
${BayesNet_SOURCE_DIR}/src/bayesian_network
${BayesNet_SOURCE_DIR}/src/classifiers
${BayesNet_SOURCE_DIR}/src/ensembles
${BayesNet_SOURCE_DIR}/src/utils
${CMAKE_BINARY_DIR}/configured_files/include
)
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc )
file(GLOB_RECURSE Sources "*.cc")
add_library(BayesNet ${Sources})
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

View File

@ -71,7 +71,7 @@ namespace bayesnet {
for (Node* child : nodes[nodeId]->getChildren()) {
if (visited.find(child->getName()) == visited.end() && isCyclic(child->getName(), visited, recStack))
return true;
else if (recStack.find(child->getName()) != recStack.end())
if (recStack.find(child->getName()) != recStack.end())
return true;
}
}

View File

@ -1,6 +1,7 @@
#include <set>
#include <functional>
#include <limits.h>
#include <tuple>
#include "BoostAODE.h"
#include "CFS.h"
#include "FCBF.h"
@ -8,9 +9,22 @@
#include "folding.hpp"
namespace bayesnet {
struct {
std::string CFS = "CFS";
std::string FCBF = "FCBF";
std::string IWSS = "IWSS";
}SelectFeatures;
struct {
std::string ASC = "asc";
std::string DESC = "desc";
std::string RAND = "rand";
}Orders;
BoostAODE::BoostAODE(bool predict_voting) : Ensemble(predict_voting)
{
validHyperparameters = { "repeatSparent", "maxModels", "order", "convergence", "threshold", "select_features", "tolerance", "predict_voting" };
validHyperparameters = {
"repeatSparent", "maxModels", "order", "convergence", "threshold",
"select_features", "tolerance", "predict_voting", "predict_single"
};
}
void BoostAODE::buildModel(const torch::Tensor& weights)
@ -58,10 +72,10 @@ namespace bayesnet {
hyperparameters.erase("maxModels");
}
if (hyperparameters.contains("order")) {
std::vector<std::string> algos = { "asc", "desc", "rand" };
std::vector<std::string> algos = { Orders.ASC, Orders.DESC, Orders.RAND };
order_algorithm = hyperparameters["order"];
if (std::find(algos.begin(), algos.end(), order_algorithm) == algos.end()) {
throw std::invalid_argument("Invalid order algorithm, valid values [asc, desc, rand]");
throw std::invalid_argument("Invalid order algorithm, valid values [" + Orders.ASC + ", " + Orders.DESC + ", " + Orders.RAND + "]");
}
hyperparameters.erase("order");
}
@ -69,6 +83,10 @@ namespace bayesnet {
convergence = hyperparameters["convergence"];
hyperparameters.erase("convergence");
}
if (hyperparameters.contains("predict_single")) {
predict_single = hyperparameters["predict_single"];
hyperparameters.erase("predict_single");
}
if (hyperparameters.contains("threshold")) {
threshold = hyperparameters["threshold"];
hyperparameters.erase("threshold");
@ -83,11 +101,11 @@ namespace bayesnet {
}
if (hyperparameters.contains("select_features")) {
auto selectedAlgorithm = hyperparameters["select_features"];
std::vector<std::string> algos = { "IWSS", "FCBF", "CFS" };
std::vector<std::string> algos = { SelectFeatures.IWSS, SelectFeatures.CFS, SelectFeatures.CFS };
selectFeatures = true;
select_features_algorithm = selectedAlgorithm;
if (std::find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
throw std::invalid_argument("Invalid selectFeatures value, valid values [IWSS, FCBF, CFS]");
throw std::invalid_argument("Invalid selectFeatures value, valid values [" + SelectFeatures.IWSS + ", " + SelectFeatures.CFS + ", " + SelectFeatures.FCBF + "]");
}
hyperparameters.erase("select_features");
}
@ -95,28 +113,54 @@ namespace bayesnet {
throw std::invalid_argument("Invalid hyperparameters" + hyperparameters.dump());
}
}
std::tuple<torch::Tensor&, double, bool> update_weights(torch::Tensor& ytrain, torch::Tensor& ypred, torch::Tensor& weights)
{
bool terminate = false;
double alpha_t = 0;
auto mask_wrong = ypred != ytrain;
auto mask_right = ypred == ytrain;
auto masked_weights = weights * mask_wrong.to(weights.dtype());
double epsilon_t = masked_weights.sum().item<double>();
if (epsilon_t > 0.5) {
// Inverse the weights policy (plot ln(wt))
// "In each round of AdaBoost, there is a sanity check to ensure that the current base
// learner is better than random guess" (Zhi-Hua Zhou, 2012)
terminate = true;
} else {
double wt = (1 - epsilon_t) / epsilon_t;
alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
weights += mask_wrong.to(weights.dtype()) * exp(alpha_t) * weights;
// Step 3.2.2: Update weights of right samples
weights += mask_right.to(weights.dtype()) * exp(-alpha_t) * weights;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights).item<double>();
weights = weights / totalWeights;
}
return { weights, alpha_t, terminate };
}
std::unordered_set<int> BoostAODE::initializeModels()
{
std::unordered_set<int> featuresUsed;
torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
int maxFeatures = 0;
if (select_features_algorithm == "CFS") {
if (select_features_algorithm == SelectFeatures.CFS) {
featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
} else if (select_features_algorithm == "IWSS") {
} else if (select_features_algorithm == SelectFeatures.IWSS) {
if (threshold < 0 || threshold >0.5) {
throw std::invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
throw std::invalid_argument("Invalid threshold value for " + SelectFeatures.IWSS + " [0, 0.5]");
}
featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
} else if (select_features_algorithm == "FCBF") {
} else if (select_features_algorithm == SelectFeatures.FCBF) {
if (threshold < 1e-7 || threshold > 1) {
throw std::invalid_argument("Invalid threshold value [1e-7, 1]");
throw std::invalid_argument("Invalid threshold value for " + SelectFeatures.FCBF + " [1e-7, 1]");
}
featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
}
featureSelector->fit();
auto cfsFeatures = featureSelector->getFeatures();
for (const int& feature : cfsFeatures) {
// std::cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << std::endl;
featuresUsed.insert(feature);
std::unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
@ -128,22 +172,46 @@ namespace bayesnet {
delete featureSelector;
return featuresUsed;
}
torch::Tensor BoostAODE::ensemble_predict(torch::Tensor& X, SPODE* model)
{
if (initialize_prob_table) {
initialize_prob_table = false;
prob_table = model->predict_proba(X) * 1.0;
} else {
prob_table += model->predict_proba(X) * 1.0;
}
// prob_table doesn't store probabilities but the sum of them
// to have them we need to divide by the sum of the "weights" used to
// consider the results obtanined in the model's predict_proba.
return prob_table.argmax(1);
}
void BoostAODE::trainModel(const torch::Tensor& weights)
{
fitted = true;
// Algorithm based on the adaboost algorithm for classification
// as explained in Ensemble methods (Zhi-Hua Zhou, 2012)
initialize_prob_table = true;
fitted = true;
double alpha_t = 0;
torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
bool exitCondition = false;
std::unordered_set<int> featuresUsed;
if (selectFeatures) {
featuresUsed = initializeModels();
auto ypred = predict(X_train);
std::tie(weights_, alpha_t, exitCondition) = update_weights(y_train, ypred, weights_);
// Update significance of the models
for (int i = 0; i < n_models; ++i) {
significanceModels[i] = alpha_t;
}
if (exitCondition) {
return;
}
}
bool resetMaxModels = false;
if (maxModels == 0) {
maxModels = .1 * n > 10 ? .1 * n : n;
resetMaxModels = true; // Flag to unset maxModels
}
torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
bool exitCondition = false;
// Variables to control the accuracy finish condition
double priorAccuracy = 0.0;
double delta = 1.0;
@ -154,12 +222,12 @@ namespace bayesnet {
// n_models == maxModels
// epsilon sub t > 0.5 => inverse the weights policy
// validation error is not decreasing
bool ascending = order_algorithm == "asc";
bool ascending = order_algorithm == Orders.ASC;
std::mt19937 g{ 173 };
while (!exitCondition) {
// Step 1: Build ranking with mutual information
auto featureSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
if (order_algorithm == "rand") {
if (order_algorithm == Orders.RAND) {
std::shuffle(featureSelection.begin(), featureSelection.end(), g);
}
auto feature = featureSelection[0];
@ -181,28 +249,17 @@ namespace bayesnet {
std::unique_ptr<Classifier> model;
model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
auto ypred = model->predict(X_train);
torch::Tensor ypred;
if (predict_single) {
ypred = model->predict(X_train);
} else {
ypred = ensemble_predict(X_train, dynamic_cast<SPODE*>(model.get()));
}
// Step 3.1: Compute the classifier amout of say
auto mask_wrong = ypred != y_train;
auto mask_right = ypred == y_train;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double epsilon_t = masked_weights.sum().item<double>();
if (epsilon_t > 0.5) {
// Inverse the weights policy (plot ln(wt))
// "In each round of AdaBoost, there is a sanity check to ensure that the current base
// learner is better than random guess" (Zhi-Hua Zhou, 2012)
std::tie(weights_, alpha_t, exitCondition) = update_weights(y_train, ypred, weights_);
if (exitCondition) {
break;
}
double wt = (1 - epsilon_t) / epsilon_t;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_;
// Step 3.2.2: Update weights of right samples
weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
weights_ = weights_ / totalWeights;
// Step 3.4: Store classifier and its accuracy to weigh its future vote
featuresUsed.insert(feature);
models.push_back(std::move(model));

View File

@ -15,17 +15,21 @@ namespace bayesnet {
void buildModel(const torch::Tensor& weights) override;
void trainModel(const torch::Tensor& weights) override;
private:
std::unordered_set<int> initializeModels();
torch::Tensor ensemble_predict(torch::Tensor& X, SPODE* model);
torch::Tensor dataset_;
torch::Tensor X_train, y_train, X_test, y_test;
std::unordered_set<int> initializeModels();
// Hyperparameters
bool repeatSparent = false; // if true, a feature can be selected more than once
int maxModels = 0;
int tolerance = 0;
bool predict_single = true; // wether the last model is used to predict in training or the whole ensemble
std::string order_algorithm; // order to process the KBest features asc, desc, rand
bool convergence = false; //if true, stop when the model does not improve
bool selectFeatures = false; // if true, use feature selection
std::string select_features_algorithm = ""; // Selected feature selection algorithm
std::string select_features_algorithm = "desc"; // Selected feature selection algorithm
bool initialize_prob_table; // if true, initialize the prob_table with the first model (used in train)
torch::Tensor prob_table; // Table of probabilities for ensemble predicting if predict_single is false
FeatureSelect* featureSelector = nullptr;
double threshold = -1;
};

View File

@ -50,7 +50,6 @@ namespace bayesnet {
}
double FeatureSelect::computeMeritCFS()
{
double result;
double rcf = 0;
for (auto feature : selectedFeatures) {
rcf += suLabels[feature];

View File

@ -28,7 +28,7 @@ namespace bayesnet {
selectedFeatures.push_back(feature);
// Compute merit with selectedFeatures
auto meritNew = computeMeritCFS();
double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0;
double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0;
if (meritNew > merit || delta < threshold) {
if (meritNew > merit) {
merit = meritNew;

View File

@ -2,13 +2,18 @@ if(ENABLE_TESTING)
set(TEST_BAYESNET "unit_tests_bayesnet")
include_directories(
${BayesNet_SOURCE_DIR}/src
${BayesNet_SOURCE_DIR}/src/Platform
${BayesNet_SOURCE_DIR}/src/feature_selection
${BayesNet_SOURCE_DIR}/src/bayesian_network
${BayesNet_SOURCE_DIR}/src/classifiers
${BayesNet_SOURCE_DIR}/src/utils
${BayesNet_SOURCE_DIR}/src/ensembles
${BayesNet_SOURCE_DIR}/lib/Files
${BayesNet_SOURCE_DIR}/lib/mdlp
${BayesNet_SOURCE_DIR}/lib/folding
${BayesNet_SOURCE_DIR}/lib/json/include
${CMAKE_BINARY_DIR}/configured_files/include
)
file(GLOB_RECURSE BayesNet_SOURCES "${BayesNet_SOURCE_DIR}/src/*.cc")
set(TEST_SOURCES_BAYESNET TestBayesModels.cc TestBayesNetwork.cc TestBayesMetrics.cc TestUtils.cc ${BayesNet_SOURCES})
add_executable(${TEST_BAYESNET} ${TEST_SOURCES_BAYESNET})
target_link_libraries(${TEST_BAYESNET} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain )

View File

@ -224,6 +224,8 @@ TEST_CASE("BoostAODE voting-proba", "[BayesNet]")
REQUIRE(score_voting == Catch::Approx(0.98).epsilon(raw.epsilon));
REQUIRE(pred_voting[83][2] == Catch::Approx(0.552091).epsilon(raw.epsilon));
REQUIRE(pred_proba[83][2] == Catch::Approx(0.546017).epsilon(raw.epsilon));
clf.dump_cpt();
REQUIRE(clf.topological_order() == std::vector<std::string>());
}
TEST_CASE("BoostAODE order asc, desc & random", "[BayesNet]")
{
@ -240,10 +242,28 @@ TEST_CASE("BoostAODE order asc, desc & random", "[BayesNet]")
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
auto scoret = clf.score(raw.Xt, raw.yt);
auto score2 = clf.score(raw.Xv, raw.yv);
auto scoret2 = clf.score(raw.Xt, raw.yt);
INFO("order: " + order);
REQUIRE(score == Catch::Approx(scores[order]).epsilon(raw.epsilon));
REQUIRE(scoret == Catch::Approx(scores[order]).epsilon(raw.epsilon));
}
}
TEST_CASE("BoostAODE predict_single", "[BayesNet]")
{
auto raw = RawDatasets("glass", true);
std::map<bool, double> scores{
{true, 0.84579f }, { false, 0.80841f }
};
for (const bool kind : { true, false}) {
auto clf = bayesnet::BoostAODE();
clf.setHyperparameters({
{"predict_single", kind}, {"order", "desc" },
});
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = clf.score(raw.Xv, raw.yv);
auto scoret = clf.score(raw.Xt, raw.yt);
INFO("kind: " + std::string(kind ? "true" : "false"));
REQUIRE(score == Catch::Approx(scores[kind]).epsilon(raw.epsilon));
REQUIRE(scoret == Catch::Approx(scores[kind]).epsilon(raw.epsilon));
}
}