Compare commits

..

10 Commits

15 changed files with 141 additions and 116 deletions

5
.gitmodules vendored
View File

@@ -8,11 +8,6 @@
main = v2.x
update = merge
url = https://github.com/catchorg/Catch2.git
[submodule "lib/argparse"]
path = lib/argparse
url = https://github.com/p-ranav/argparse
master = master
update = merge
[submodule "lib/json"]
path = lib/json
url = https://github.com/nlohmann/json.git

6
.vscode/launch.json vendored
View File

@@ -106,12 +106,12 @@
"type": "lldb",
"request": "launch",
"name": "test",
"program": "${workspaceFolder}/build_debug/tests/unit_tests",
"program": "${workspaceFolder}/build_debug/tests/unit_tests_bayesnet",
"args": [
"-c=\"Metrics Test\"",
//"-c=\"Metrics Test\"",
// "-s",
],
"cwd": "${workspaceFolder}/build/tests",
"cwd": "${workspaceFolder}/build_debug/tests",
},
{
"name": "Build & debug active file",

32
CHANGELOG.md Normal file
View File

@@ -0,0 +1,32 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.0.2] - 2024-02-20
### Fixed
- Fix bug in BoostAODE: do not include the model if epsilon sub t is greater than 0.5
- Fix bug in BoostAODE: compare accuracy with previous accuracy instead of the first of the ensemble if convergence true
## [1.0.1] - 2024-02-12
### Added
- Notes in Classifier class
- BoostAODE: Add note with used features in initialization with feature selection
- BoostAODE: Add note with the number of models
- BoostAODE: Add note with the number of features used to create models if not all features are used
- Test version number in TestBayesModels
- Add tests with feature_select and notes on BoostAODE
### Fixed
- Network predict test
- Network predict_proba test
- Network score test

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(BayesNet
VERSION 1.0.0
VERSION 1.0.2
DESCRIPTION "Bayesian Network and basic classifiers Library."
HOMEPAGE_URL "https://github.com/rmontanana/bayesnet"
LANGUAGES CXX
@@ -52,7 +52,6 @@ endif (ENABLE_CLANG_TIDY)
# ---------------------------------------------
# include(FetchContent)
add_git_submodule("lib/mdlp")
add_git_submodule("lib/argparse")
add_git_submodule("lib/json")
# Subdirectories

BIN
docs/BoostAODE.docx Normal file

Binary file not shown.

1
lib/libxlsxwriter Submodule

Submodule lib/libxlsxwriter added at 29355a0887

View File

@@ -26,6 +26,7 @@ namespace bayesnet {
std::vector<std::string> virtual graph(const std::string& title = "") const = 0;
virtual std::string getVersion() = 0;
std::vector<std::string> virtual topological_order() = 0;
std::vector<std::string> virtual getNotes() const = 0;
void virtual dump_cpt()const = 0;
virtual void setHyperparameters(const nlohmann::json& hyperparameters) = 0;
std::vector<std::string>& getValidHyperparameters() { return validHyperparameters; }

View File

@@ -115,11 +115,15 @@ namespace bayesnet {
significanceModels.push_back(1.0);
n_models++;
}
notes.push_back("Used features in initialization: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()) + " with " + algorithm);
delete featureSelector;
return featuresUsed;
}
void BoostAODE::trainModel(const torch::Tensor& weights)
{
fitted = true;
// Algorithm based on the adaboost algorithm for classification
// as explained in Ensemble methods (Zhi-Hua Zhou, 2012)
std::unordered_set<int> featuresUsed;
if (selectFeatures) {
featuresUsed = initializeModels();
@@ -131,9 +135,8 @@ namespace bayesnet {
// Variables to control the accuracy finish condition
double priorAccuracy = 0.0;
double delta = 1.0;
double threshold = 1e-4;
int count = 0; // number of times the accuracy is lower than the threshold
fitted = true; // to enable predict
double convergence_threshold = 1e-4;
int count = 0; // number of times the accuracy is lower than the convergence_threshold
// Step 0: Set the finish condition
// if not repeatSparent a finish condition is run out of features
// n_models == maxModels
@@ -159,7 +162,6 @@ namespace bayesnet {
continue;
}
}
featuresUsed.insert(feature);
model = std::make_unique<SPODE>(feature);
model->fit(dataset, features, className, states, weights_);
auto ypred = model->predict(X_train);
@@ -168,6 +170,12 @@ namespace bayesnet {
auto mask_right = ypred == y_train;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double epsilon_t = masked_weights.sum().item<double>();
if (epsilon_t > 0.5) {
// Inverse the weights policy (plot ln(wt))
// "In each round of AdaBoost, there is a sanity check to ensure that the current base
// learner is better than random guess" (Zhi-Hua Zhou, 2012)
break;
}
double wt = (1 - epsilon_t) / epsilon_t;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
// Step 3.2: Update weights for next classifier
@@ -179,6 +187,7 @@ namespace bayesnet {
double totalWeights = torch::sum(weights_).item<double>();
weights_ = weights_ / totalWeights;
// Step 3.4: Store classifier and its accuracy to weigh its future vote
featuresUsed.insert(feature);
models.push_back(std::move(model));
significanceModels.push_back(alpha_t);
n_models++;
@@ -190,15 +199,18 @@ namespace bayesnet {
} else {
delta = accuracy - priorAccuracy;
}
if (delta < threshold) {
if (delta < convergence_threshold) {
count++;
}
priorAccuracy = accuracy;
}
exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
exitCondition = n_models >= maxModels && repeatSparent || count > tolerance;
}
if (featuresUsed.size() != features.size()) {
notes.push_back("Used features in train: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()));
status = WARNING;
}
notes.push_back("Number of models: " + std::to_string(n_models));
}
std::vector<std::string> BoostAODE::graph(const std::string& title) const
{

View File

@@ -19,6 +19,7 @@ namespace bayesnet {
std::map<std::string, std::vector<int>> states;
torch::Tensor dataset; // (n+1)xm tensor
status_t status = NORMAL;
std::vector<std::string> notes; // Used to store messages occurred during the fit process
void checkFitParameters();
virtual void buildModel(const torch::Tensor& weights) = 0;
void trainModel(const torch::Tensor& weights) override;
@@ -36,12 +37,13 @@ namespace bayesnet {
int getNumberOfStates() const override;
torch::Tensor predict(torch::Tensor& X) override;
status_t getStatus() const override { return status; }
std::string getVersion() override { return "0.2.0"; };
std::string getVersion() override { return { project_version.begin(), project_version.end() }; };
std::vector<int> predict(std::vector<std::vector<int>>& X) override;
float score(torch::Tensor& X, torch::Tensor& y) override;
float score(std::vector<std::vector<int>>& X, std::vector<int>& y) override;
std::vector<std::string> show() const override;
std::vector<std::string> topological_order() override;
std::vector<std::string> getNotes() const override { return notes; }
void dump_cpt() const override;
void setHyperparameters(const nlohmann::json& hyperparameters) override; //For classifiers that don't have hyperparameters
};

View File

@@ -7,7 +7,6 @@ if(ENABLE_TESTING)
${BayesNet_SOURCE_DIR}/lib/mdlp
${BayesNet_SOURCE_DIR}/lib/folding
${BayesNet_SOURCE_DIR}/lib/json/include
${BayesNet_SOURCE_DIR}/lib/argparse/include
${CMAKE_BINARY_DIR}/configured_files/include
)
set(TEST_SOURCES_BAYESNET TestBayesModels.cc TestBayesNetwork.cc TestBayesMetrics.cc TestUtils.cc ${BayesNet_SOURCES})

View File

@@ -16,6 +16,11 @@
#include "AODELd.h"
#include "TestUtils.h"
TEST_CASE("Library check version", "[BayesNet]")
{
auto clf = bayesnet::KDB(2);
REQUIRE(clf.getVersion() == "1.0.2");
}
TEST_CASE("Test Bayesian Classifiers score", "[BayesNet]")
{
map <pair<std::string, std::string>, float> scores = {
@@ -138,4 +143,35 @@ TEST_CASE("Get num features & num edges", "[BayesNet]")
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
REQUIRE(clf.getNumberOfNodes() == 5);
REQUIRE(clf.getNumberOfEdges() == 8);
}
}
TEST_CASE("BoostAODE feature_select CFS", "[BayesNet]")
{
auto raw = RawDatasets("glass", true);
auto clf = bayesnet::BoostAODE();
clf.setHyperparameters({ {"select_features", "CFS"} });
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
REQUIRE(clf.getNumberOfNodes() == 90);
REQUIRE(clf.getNumberOfEdges() == 153);
REQUIRE(clf.getNotes().size() == 2);
REQUIRE(clf.getNotes()[0] == "Used features in initialization: 6 of 9 with CFS");
REQUIRE(clf.getNotes()[1] == "Number of models: 9");
}
TEST_CASE("BoostAODE test used features in train note", "[BayesNet]")
{
auto raw = RawDatasets("diabetes", true);
auto clf = bayesnet::BoostAODE();
clf.setHyperparameters({
{"ascending",true},
{"convergence", true},
{"repeatSparent",true},
{"select_features","CFS"},
{"tolerance", 3}
});
clf.fit(raw.Xv, raw.yv, raw.featuresv, raw.classNamev, raw.statesv);
REQUIRE(clf.getNumberOfNodes() == 72);
REQUIRE(clf.getNumberOfEdges() == 120);
REQUIRE(clf.getNotes().size() == 3);
REQUIRE(clf.getNotes()[0] == "Used features in initialization: 6 of 8 with CFS");
REQUIRE(clf.getNotes()[1] == "Used features in train: 7 of 8");
REQUIRE(clf.getNotes()[2] == "Number of models: 8");
}

View File

@@ -25,6 +25,7 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]")
auto raw = RawDatasets("iris", true);
auto net = bayesnet::Network();
double threshold = 1e-4;
SECTION("Test get features")
{
@@ -167,97 +168,44 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]")
REQUIRE(str[5] == "C [shape=circle] \n");
REQUIRE(str[6] == "}\n");
}
// SECTION("Test predict")
// {
// auto net = bayesnet::Network();
// net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
// std::vector<std::vector<int>> test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// std::vector<int> y_test = { 0, 1, 1, 0, 2 };
// auto y_pred = net.predict(test);
// REQUIRE(y_pred == y_test);
// }
// SECTION("Test predict_proba")
// {
// auto net = bayesnet::Network();
// net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
// std::vector<std::vector<int>> test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// auto y_test = { 0, 1, 1, 0, 2 };
// auto y_pred = net.predict(test);
// REQUIRE(y_pred == y_test);
// }
}
// SECTION("Test score")
// {
// auto net = bayesnet::Network();
// net.fit(Xd, y, weights, features, className, states);
// auto test = { {1, 2, 0, 1}, {0, 1, 2, 0}, {1, 1, 1, 1}, {0, 0, 0, 0}, {2, 2, 2, 2} };
// auto score = net.score(X, y);
// REQUIRE(score == Catch::Approx();
// }
//
//
// SECTION("Test graph")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// auto str = net.graph("Test Graph");
// REQUIRE(str.size() == 6);
// REQUIRE(str[0] == "digraph \"Test Graph\" {");
// REQUIRE(str[1] == " A -> B;");
// REQUIRE(str[2] == " A -> C;");
// REQUIRE(str[3] == " B [shape=ellipse];");
// REQUIRE(str[4] == " C [shape=ellipse];");
// REQUIRE(str[5] == "}");
// }
// SECTION("Test initialize")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// net.initialize();
// REQUIRE(net.getNodes().size() == 0);
// REQUIRE(net.getEdges().size() == 0);
// REQUIRE(net.getFeatures().size() == 0);
// REQUIRE(net.getClassNumStates() == 0);
// REQUIRE(net.getClassName().empty());
// REQUIRE(net.getStates() == 0);
// REQUIRE(net.getSamples().numel() == 0);
// }
// SECTION("Test dump_cpt")
// {
// auto net = bayesnet::Network();
// net.addNode("A");
// net.addNode("B");
// net.addNode("C");
// net.addEdge("A", "B");
// net.addEdge("A", "C");
// net.setClassName("C");
// net.setStates({ {"A", {0, 1}}, {"B", {0, 1}}, {"C", {0, 1, 2}} });
// net.fit({ {0, 0}, {0, 1}, {1, 0}, {1, 1} }, { 0, 1, 1, 2 }, {}, { "A", "B" }, "C", { {"A", {0, 1}}, {"B", {0, 1}}, {"C", {0, 1, 2}} });
// net.dump_cpt();
// // TODO: Check that the file was created and contains the expected data
// }
// SECTION("Test version")
// {
// auto net = bayesnet::Network();
// REQUIRE(net.version() == "0.2.0");
// }
// }
// }
SECTION("Test predict")
{
auto net = bayesnet::Network();
buildModel(net, raw.featuresv, raw.classNamev);
net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
std::vector<std::vector<int>> test = { {1, 2, 0, 1, 1}, {0, 1, 2, 0, 1}, {0, 0, 0, 0, 1}, {2, 2, 2, 2, 1} };
std::vector<int> y_test = { 2, 2, 0, 2, 1 };
auto y_pred = net.predict(test);
REQUIRE(y_pred == y_test);
}
SECTION("Test predict_proba")
{
auto net = bayesnet::Network();
buildModel(net, raw.featuresv, raw.classNamev);
net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
std::vector<std::vector<int>> test = { {1, 2, 0, 1, 1}, {0, 1, 2, 0, 1}, {0, 0, 0, 0, 1}, {2, 2, 2, 2, 1} };
std::vector<std::vector<double>> y_test = {
{0.450237, 0.0866621, 0.463101},
{0.244443, 0.0925922, 0.662964},
{0.913441, 0.0125857, 0.0739732},
{0.450237, 0.0866621, 0.463101},
{0.0135226, 0.971726, 0.0147519}
};
auto y_pred = net.predict_proba(test);
REQUIRE(y_pred.size() == 5);
REQUIRE(y_pred[0].size() == 3);
for (int i = 0; i < y_pred.size(); ++i) {
for (int j = 0; j < y_pred[i].size(); ++j) {
REQUIRE(y_pred[i][j] == Catch::Approx(y_test[i][j]).margin(threshold));
}
}
}
SECTION("Test score")
{
auto net = bayesnet::Network();
buildModel(net, raw.featuresv, raw.classNamev);
net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv);
auto score = net.score(raw.Xv, raw.yv);
REQUIRE(score == Catch::Approx(0.97333333).margin(threshold));
}
}