Implement 3 types of smoothing

This commit is contained in:
Ricardo Montañana Gómez 2024-06-10 15:49:01 +02:00
parent 684443a788
commit 27a3e5a5e0
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
11 changed files with 37 additions and 9 deletions

View File

@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add tests to check the correct version of the mdlp, folding and json libraries.
- Library documentation generated with Doxygen.
- Link to documentation in the README.md.
- Three types of smoothing for the Bayesian Network: OLD_LAPLACE, LAPLACE and CESTNIK.
### Internal

View File

@ -7,7 +7,7 @@
[![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_BayesNet&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=rmontanana_BayesNet)
[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=rmontanana_BayesNet&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=rmontanana_BayesNet)
![Gitea Last Commit](https://img.shields.io/gitea/last-commit/rmontanana/bayesnet?gitea_url=https://gitea.rmontanana.es:3000&logo=gitea)
[![Coverage Badge](https://img.shields.io/badge/Coverage-97,3%25-green)](html/index.html)
[![Coverage Badge](https://img.shields.io/badge/Coverage-97,2%25-green)](html/index.html)
Bayesian Network Classifiers using libtorch from scratch

View File

@ -8,10 +8,13 @@
#include <vector>
#include <torch/torch.h>
#include <nlohmann/json.hpp>
#include "bayesnet/network/Network.h"
namespace bayesnet {
// Status codes declared next to BaseClassifier (their use is not visible in this excerpt).
// Values are spelled out; they match the implicit numbering 0, 1, 2.
enum status_t {
    NORMAL = 0,
    WARNING = 1,
    ERROR = 2
};
class BaseClassifier {
public:
// Record the smoothing method on this classifier. To call before fit.
void setSmoothing(Smoothing_t smoothing)
{
    this->smoothing = smoothing;
}
// X is nxm std::vector, y is nx1 std::vector
virtual BaseClassifier& fit(std::vector<std::vector<int>>& X, std::vector<int>& y, const std::vector<std::string>& features, const std::string& className, std::map<std::string, std::vector<int>>& states) = 0;
// X is nxm tensor, y is nx1 tensor
@ -41,5 +44,6 @@ namespace bayesnet {
protected:
virtual void trainModel(const torch::Tensor& weights) = 0;
std::vector<std::string> validHyperparameters;
Smoothing_t smoothing = Smoothing_t::NONE;
};
}

View File

@ -22,6 +22,7 @@ namespace bayesnet {
auto n_classes = states.at(className).size();
metrics = Metrics(dataset, features, className, n_classes);
model.initialize();
model.setSmoothing(smoothing);
buildModel(weights);
trainModel(weights);
fitted = true;

View File

@ -8,7 +8,6 @@
#define CLASSIFIER_H
#include <torch/torch.h>
#include "bayesnet/utils/BayesMetrics.h"
#include "bayesnet/network/Network.h"
#include "bayesnet/BaseClassifier.h"
namespace bayesnet {

View File

@ -37,6 +37,7 @@ namespace bayesnet {
// Train every model of the ensemble: each one receives the ensemble's smoothing
// method and is then fitted on Xf / y with the shared features, class name and states.
// The weights argument is not used in this body.
void AODELd::trainModel(const torch::Tensor& weights)
{
    for (const auto& member : models) {
        auto& estimator = *member;
        estimator.setSmoothing(smoothing);
        estimator.fit(Xf, y, features, className, states);
    }
}

View File

@ -32,6 +32,7 @@ namespace bayesnet {
for (int j = i + 1; j < featuresSelected.size(); j++) {
auto parents = { featuresSelected[i], featuresSelected[j] };
std::unique_ptr<Classifier> model = std::make_unique<SPnDE>(parents);
model->setSmoothing(smoothing);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0); // They will be updated later in trainModel
@ -96,6 +97,7 @@ namespace bayesnet {
pairSelection.erase(pairSelection.begin());
std::unique_ptr<Classifier> model;
model = std::make_unique<SPnDE>(std::vector<int>({ feature_pair.first, feature_pair.second }));
model->setSmoothing(smoothing);
model->fit(dataset, features, className, states, weights_);
alpha_t = 0.0;
if (!block_update) {

View File

@ -22,6 +22,7 @@ namespace bayesnet {
std::vector<int> featuresSelected = featureSelection(weights_);
for (const int& feature : featuresSelected) {
std::unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
model->setSmoothing(smoothing);
model->fit(dataset, features, className, states, weights_);
models.push_back(std::move(model));
significanceModels.push_back(1.0); // They will be updated later in trainModel
@ -89,6 +90,7 @@ namespace bayesnet {
featureSelection.erase(featureSelection.begin());
std::unique_ptr<Classifier> model;
model = std::make_unique<SPODE>(feature);
model->setSmoothing(smoothing);
model->fit(dataset, features, className, states, weights_);
alpha_t = 0.0;
if (!block_update) {

View File

@ -18,6 +18,7 @@ namespace bayesnet {
n_models = models.size();
for (auto i = 0; i < n_models; ++i) {
// fit with std::vectors
models[i]->setSmoothing(smoothing);
models[i]->fit(dataset, features, className, states);
}
}

View File

@ -165,14 +165,14 @@ namespace bayesnet {
for (int i = 0; i < featureNames.size(); ++i) {
auto row_feature = X.index({ i, "..." });
}
completeFit(states, X.size(0), weights);
completeFit(states, weights);
}
// Fit overload that receives the whole dataset in a single tensor.
// As passed to checkFitData, samples.size(1) is the number of samples and
// samples.size(0) - 1 is the number of features, i.e. one row per variable plus one
// extra row (presumably the class labels in the last row — confirm against callers).
// Stores className and samples on the network, then delegates to completeFit.
void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
{
checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights);
this->className = className;
this->samples = samples;
// NOTE(review): the next two lines appear to be the removed/added pair of this diff hunk;
// the two-argument call is the one matching the completeFit(states, weights) signature.
completeFit(states, samples.size(1), weights);
completeFit(states, weights);
}
// input_data comes in nxm, where n is the number of features and m the number of samples
void Network::fit(const std::vector<std::vector<int>>& input_data, const std::vector<int>& labels, const std::vector<double>& weights_, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states)
@ -186,16 +186,30 @@ namespace bayesnet {
samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32));
}
samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
completeFit(states, input_data[0].size(), weights);
completeFit(states, weights);
}
void Network::completeFit(const std::map<std::string, std::vector<int>>& states, const int n_samples, const torch::Tensor& weights)
void Network::completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights)
{
setStates(states);
std::vector<std::thread> threads;
const double n_samples = static_cast<double>(samples.size(1));
for (auto& node : nodes) {
threads.emplace_back([this, &node, &weights, n_samples]() {
auto numStates = node.second->getNumStates();
double smoothing_factor = smoothing == Smoothing_t::CESTNIK ? static_cast<double>(n_samples) / numStates : 1.0 / static_cast<double>(n_samples);
double numStates = static_cast<double>(node.second->getNumStates());
double smoothing_factor = 0.0;
switch (smoothing) {
case Smoothing_t::OLD_LAPLACE:
smoothing_factor = 1.0 / n_samples;
break;
case Smoothing_t::LAPLACE:
smoothing_factor = 1.0;
break;
case Smoothing_t::CESTNIK:
smoothing_factor = n_samples / numStates;
break;
default:
throw std::invalid_argument("Smoothing method not recognized " + std::to_string(static_cast<int>(smoothing)));
}
node.second->computeCPT(samples, features, smoothing_factor, weights);
});
}

View File

@ -13,6 +13,8 @@
namespace bayesnet {
// Smoothing method from which Network::completeFit derives the smoothing factor
// it passes to each node's computeCPT.
enum class Smoothing_t {
    // No method selected: completeFit has no case for it, so it reaches the default
    // branch that throws std::invalid_argument.
    NONE = -1,
    // factor = 1 / n_samples
    OLD_LAPLACE = 0,
    // factor = 1
    LAPLACE = 1,
    // factor = n_samples / number of states of the node
    CESTNIK = 2
};
@ -36,6 +38,7 @@ namespace bayesnet {
/*
Notice: Nodes have to be inserted in the same order as they are in the dataset, i.e., first node is first column and so on.
*/
// Store the smoothing method on the network; completeFit switches on this value
// to choose the smoothing factor used when computing each node's CPT.
void setSmoothing(Smoothing_t smoothing)
{
    this->smoothing = smoothing;
}
void fit(const std::vector<std::vector<int>>& input_data, const std::vector<int>& labels, const std::vector<double>& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
void fit(const torch::Tensor& samples, const torch::Tensor& weights, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states);
@ -65,7 +68,7 @@ namespace bayesnet {
std::vector<double> predict_sample(const torch::Tensor&);
std::vector<double> exactInference(std::map<std::string, int>&);
double computeFactor(std::map<std::string, int>&);
void completeFit(const std::map<std::string, std::vector<int>>& states, const int n_samples, const torch::Tensor& weights);
void completeFit(const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
void checkFitData(int n_samples, int n_features, int n_samples_y, const std::vector<std::string>& featureNames, const std::string& className, const std::map<std::string, std::vector<int>>& states, const torch::Tensor& weights);
void setStates(const std::map<std::string, std::vector<int>>&);
};