Add status to classifier and Experiment

This commit is contained in:
Ricardo Montañana Gómez 2023-09-05 13:39:43 +02:00
parent 64fc7bd9dd
commit 5a7c8f1818
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
5 changed files with 256 additions and 180 deletions

View File

@ -58,180 +58,226 @@ pair<vector<vector<int>>, vector<int>> extract_indices(vector<int> indices, vect
int main(int argc, char** argv)
{
map<string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
auto valid_datasets = vector<string>();
transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
[](const pair<string, bool>& pair) { return pair.first; });
argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
.action([valid_datasets](const std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
program.add_argument("-p", "--path")
.help(" folder where the data files are located, default")
.default_value(string{ PATH }
);
program.add_argument("-m", "--model")
.help("Model to use " + platform::Models::instance()->toString())
.action([](const std::string& value) {
static const vector<string> choices = platform::Models::instance()->getNames();
if (find(choices.begin(), choices.end(), value) != choices.end()) {
return value;
}
throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
}
);
program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) {
try {
auto k = stoi(value);
if (k < 2) {
throw runtime_error("Number of folds must be greater than 1");
}
return k;
}
catch (const runtime_error& err) {
throw runtime_error(err.what());
}
catch (...) {
throw runtime_error("Number of folds must be an integer");
}});
program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
bool class_last, stratified, tensors, dump_cpt;
string model_name, file_name, path, complete_file_name;
int nFolds, seed;
try {
program.parse_args(argc, argv);
file_name = program.get<string>("dataset");
path = program.get<string>("path");
model_name = program.get<string>("model");
complete_file_name = path + file_name + ".arff";
stratified = program.get<bool>("stratified");
tensors = program.get<bool>("tensors");
nFolds = program.get<int>("folds");
seed = program.get<int>("seed");
dump_cpt = program.get<bool>("dumpcpt");
class_last = datasets[file_name];
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
torch::Tensor weights_ = torch::full({ 10 }, 1.0 / 10, torch::kFloat64);
torch::Tensor y_ = torch::tensor({ 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 }, torch::kInt32);
torch::Tensor ypred = torch::tensor({ 1, 1, 1, 0, 0, 1, 1, 1, 1, 0 }, torch::kInt32);
cout << "Initial weights_: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << ", ";
}
catch (const exception& err) {
cerr << err.what() << endl;
cerr << program;
exit(1);
cout << "end." << endl;
cout << "y_: " << endl;
for (int i = 0; i < 10; i++) {
cout << y_.index({ i }).item<int>() << ", ";
}
cout << "end." << endl;
cout << "ypred: " << endl;
for (int i = 0; i < 10; i++) {
cout << ypred.index({ i }).item<int>() << ", ";
}
cout << "end." << endl;
auto mask_wrong = ypred != y_;
auto mask_right = ypred == y_;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double epsilon_t = masked_weights.sum().item<double>();
cout << "epsilon_t: " << epsilon_t << endl;
double wt = (1 - epsilon_t) / epsilon_t;
cout << "wt: " << wt << endl;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
cout << "alpha_t: " << alpha_t << endl;
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
cout << "exp(alpha_t): " << exp(alpha_t) << endl;
cout << "exp(-alpha_t): " << exp(-alpha_t) << endl;
weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_;
// Step 3.2.2: Update weights of right samples
weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
cout << "totalWeights: " << totalWeights << endl;
cout << "Before normalization: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << endl;
}
weights_ = weights_ / totalWeights;
cout << "After normalization: " << endl;
for (int i = 0; i < 10; i++) {
cout << weights_.index({ i }).item<double>() << endl;
}
// map<string, bool> datasets = {
// {"diabetes", true},
// {"ecoli", true},
// {"glass", true},
// {"iris", true},
// {"kdd_JapaneseVowels", false},
// {"letter", true},
// {"liver-disorders", true},
// {"mfeat-factors", true},
// };
// auto valid_datasets = vector<string>();
// transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets),
// [](const pair<string, bool>& pair) { return pair.first; });
// argparse::ArgumentParser program("BayesNetSample");
// program.add_argument("-d", "--dataset")
// .help("Dataset file name")
// .action([valid_datasets](const std::string& value) {
// if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
// return value;
// }
// throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
// }
// );
// program.add_argument("-p", "--path")
// .help(" folder where the data files are located, default")
// .default_value(string{ PATH }
// );
// program.add_argument("-m", "--model")
// .help("Model to use " + platform::Models::instance()->toString())
// .action([](const std::string& value) {
// static const vector<string> choices = platform::Models::instance()->getNames();
// if (find(choices.begin(), choices.end(), value) != choices.end()) {
// return value;
// }
// throw runtime_error("Model must be one of " + platform::Models::instance()->toString());
// }
// );
// program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true);
// program.add_argument("--dumpcpt").help("Dump CPT Tables").default_value(false).implicit_value(true);
// program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true);
// program.add_argument("--tensors").help("Use tensors to store samples").default_value(false).implicit_value(true);
// program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) {
// try {
// auto k = stoi(value);
// if (k < 2) {
// throw runtime_error("Number of folds must be greater than 1");
// }
// return k;
// }
// catch (const runtime_error& err) {
// throw runtime_error(err.what());
// }
// catch (...) {
// throw runtime_error("Number of folds must be an integer");
// }});
// program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>();
// bool class_last, stratified, tensors, dump_cpt;
// string model_name, file_name, path, complete_file_name;
// int nFolds, seed;
// try {
// program.parse_args(argc, argv);
// file_name = program.get<string>("dataset");
// path = program.get<string>("path");
// model_name = program.get<string>("model");
// complete_file_name = path + file_name + ".arff";
// stratified = program.get<bool>("stratified");
// tensors = program.get<bool>("tensors");
// nFolds = program.get<int>("folds");
// seed = program.get<int>("seed");
// dump_cpt = program.get<bool>("dumpcpt");
// class_last = datasets[file_name];
// if (!file_exists(complete_file_name)) {
// throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
// }
// }
// catch (const exception& err) {
// cerr << err.what() << endl;
// cerr << program;
// exit(1);
// }
/*
* Begin Processing
*/
auto handler = ArffFiles();
handler.load(complete_file_name, class_last);
// Get Dataset X, y
vector<mdlp::samples_t>& X = handler.getX();
mdlp::labels_t& y = handler.getY();
// Get className & Features
auto className = handler.getClassName();
vector<string> features;
auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features),
[](const pair<string, string>& item) { return item.first; });
// Discretize Dataset
auto [Xd, maxes] = discretize(X, y, features);
maxes[className] = *max_element(y.begin(), y.end()) + 1;
map<string, vector<int>> states;
for (auto feature : features) {
states[feature] = vector<int>(maxes[feature]);
}
states[className] = vector<int>(maxes[className]);
auto clf = platform::Models::instance()->create(model_name);
clf->fit(Xd, y, features, className, states);
if (dump_cpt) {
cout << "--- CPT Tables ---" << endl;
clf->dump_cpt();
}
auto lines = clf->show();
for (auto line : lines) {
cout << line << endl;
}
cout << "--- Topological Order ---" << endl;
auto order = clf->topological_order();
for (auto name : order) {
cout << name << ", ";
}
cout << "end." << endl;
auto score = clf->score(Xd, y);
cout << "Score: " << score << endl;
auto graph = clf->graph();
auto dot_file = model_name + "_" + file_name;
ofstream file(dot_file + ".dot");
file << graph;
file.close();
cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
string stratified_string = stratified ? " Stratified" : "";
cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl;
cout << "==========================================" << endl;
torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
torch::Tensor yt = torch::tensor(y, torch::kInt32);
for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
}
float total_score = 0, total_score_train = 0, score_train, score_test;
platform::Fold* fold;
if (stratified)
fold = new platform::StratifiedKFold(nFolds, y, seed);
else
fold = new platform::KFold(nFolds, y.size(), seed);
for (auto i = 0; i < nFolds; ++i) {
auto [train, test] = fold->getFold(i);
cout << "Fold: " << i + 1 << endl;
if (tensors) {
auto ttrain = torch::tensor(train, torch::kInt64);
auto ttest = torch::tensor(test, torch::kInt64);
torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
torch::Tensor ytraint = yt.index({ ttrain });
torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
torch::Tensor ytestt = yt.index({ ttest });
clf->fit(Xtraint, ytraint, features, className, states);
auto temp = clf->predict(Xtraint);
score_train = clf->score(Xtraint, ytraint);
score_test = clf->score(Xtestt, ytestt);
} else {
auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
auto [Xtest, ytest] = extract_indices(test, Xd, y);
clf->fit(Xtrain, ytrain, features, className, states);
score_train = clf->score(Xtrain, ytrain);
score_test = clf->score(Xtest, ytest);
}
if (dump_cpt) {
cout << "--- CPT Tables ---" << endl;
clf->dump_cpt();
}
total_score_train += score_train;
total_score += score_test;
cout << "Score Train: " << score_train << endl;
cout << "Score Test : " << score_test << endl;
cout << "-------------------------------------------------------------------------------" << endl;
}
cout << "**********************************************************************************" << endl;
cout << "Average Score Train: " << total_score_train / nFolds << endl;
cout << "Average Score Test : " << total_score / nFolds << endl;return 0;
// auto handler = ArffFiles();
// handler.load(complete_file_name, class_last);
// // Get Dataset X, y
// vector<mdlp::samples_t>& X = handler.getX();
// mdlp::labels_t& y = handler.getY();
// // Get className & Features
// auto className = handler.getClassName();
// vector<string> features;
// auto attributes = handler.getAttributes();
// transform(attributes.begin(), attributes.end(), back_inserter(features),
// [](const pair<string, string>& item) { return item.first; });
// // Discretize Dataset
// auto [Xd, maxes] = discretize(X, y, features);
// maxes[className] = *max_element(y.begin(), y.end()) + 1;
// map<string, vector<int>> states;
// for (auto feature : features) {
// states[feature] = vector<int>(maxes[feature]);
// }
// states[className] = vector<int>(maxes[className]);
// auto clf = platform::Models::instance()->create(model_name);
// clf->fit(Xd, y, features, className, states);
// if (dump_cpt) {
// cout << "--- CPT Tables ---" << endl;
// clf->dump_cpt();
// }
// auto lines = clf->show();
// for (auto line : lines) {
// cout << line << endl;
// }
// cout << "--- Topological Order ---" << endl;
// auto order = clf->topological_order();
// for (auto name : order) {
// cout << name << ", ";
// }
// cout << "end." << endl;
// auto score = clf->score(Xd, y);
// cout << "Score: " << score << endl;
// auto graph = clf->graph();
// auto dot_file = model_name + "_" + file_name;
// ofstream file(dot_file + ".dot");
// file << graph;
// file.close();
// cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
// cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
// string stratified_string = stratified ? " Stratified" : "";
// cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl;
// cout << "==========================================" << endl;
// torch::Tensor Xt = torch::zeros({ static_cast<int>(Xd.size()), static_cast<int>(Xd[0].size()) }, torch::kInt32);
// torch::Tensor yt = torch::tensor(y, torch::kInt32);
// for (int i = 0; i < features.size(); ++i) {
// Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
// }
// float total_score = 0, total_score_train = 0, score_train, score_test;
// platform::Fold* fold;
// if (stratified)
// fold = new platform::StratifiedKFold(nFolds, y, seed);
// else
// fold = new platform::KFold(nFolds, y.size(), seed);
// for (auto i = 0; i < nFolds; ++i) {
// auto [train, test] = fold->getFold(i);
// cout << "Fold: " << i + 1 << endl;
// if (tensors) {
// auto ttrain = torch::tensor(train, torch::kInt64);
// auto ttest = torch::tensor(test, torch::kInt64);
// torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain);
// torch::Tensor ytraint = yt.index({ ttrain });
// torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest);
// torch::Tensor ytestt = yt.index({ ttest });
// clf->fit(Xtraint, ytraint, features, className, states);
// auto temp = clf->predict(Xtraint);
// score_train = clf->score(Xtraint, ytraint);
// score_test = clf->score(Xtestt, ytestt);
// } else {
// auto [Xtrain, ytrain] = extract_indices(train, Xd, y);
// auto [Xtest, ytest] = extract_indices(test, Xd, y);
// clf->fit(Xtrain, ytrain, features, className, states);
// score_train = clf->score(Xtrain, ytrain);
// score_test = clf->score(Xtest, ytest);
// }
// if (dump_cpt) {
// cout << "--- CPT Tables ---" << endl;
// clf->dump_cpt();
// }
// total_score_train += score_train;
// total_score += score_test;
// cout << "Score Train: " << score_train << endl;
// cout << "Score Test : " << score_test << endl;
// cout << "-------------------------------------------------------------------------------" << endl;
// }
// cout << "**********************************************************************************" << endl;
// cout << "Average Score Train: " << total_score_train / nFolds << endl;
// cout << "Average Score Test : " << total_score / nFolds << endl;return 0;
}

View File

@ -5,6 +5,7 @@
#include <vector>
namespace bayesnet {
using namespace std;
enum status_t { NORMAL, WARNING, ERROR };
class BaseClassifier {
protected:
virtual void trainModel(const torch::Tensor& weights) = 0;
@ -18,6 +19,7 @@ namespace bayesnet {
virtual ~BaseClassifier() = default;
torch::Tensor virtual predict(torch::Tensor& X) = 0;
vector<int> virtual predict(vector<vector<int>>& X) = 0;
status_t virtual getStatus() const = 0;
float virtual score(vector<vector<int>>& X, vector<int>& y) = 0;
float virtual score(torch::Tensor& X, torch::Tensor& y) = 0;
int virtual getNumberOfNodes()const = 0;

View File

@ -1,6 +1,7 @@
#include "BoostAODE.h"
#include <set>
#include "BayesMetrics.h"
#include "Colors.h"
namespace bayesnet {
BoostAODE::BoostAODE() : Ensemble() {}
@ -64,22 +65,26 @@ namespace bayesnet {
auto ypred = model->predict(X_);
// Step 3.1: Compute the classifier amout of say
auto mask_wrong = ypred != y_;
auto mask_right = ypred == y_;
auto masked_weights = weights_ * mask_wrong.to(weights_.dtype());
double wrongWeights = masked_weights.sum().item<double>();
double significance = wrongWeights == 0 ? 1 : 0.5 * log((1 - wrongWeights) / wrongWeights);
double epsilon_t = masked_weights.sum().item<double>();
double wt = (1 - epsilon_t) / epsilon_t;
double alpha_t = epsilon_t == 0 ? 1 : 0.5 * log(wt);
// Step 3.2: Update weights for next classifier
// Step 3.2.1: Update weights of wrong samples
weights_ += mask_wrong.to(weights_.dtype()) * exp(significance) * weights_;
weights_ += mask_wrong.to(weights_.dtype()) * exp(alpha_t) * weights_;
// Step 3.2.2: Update weights of right samples
weights_ += mask_right.to(weights_.dtype()) * exp(-alpha_t) * weights_;
// Step 3.3: Normalise the weights
double totalWeights = torch::sum(weights_).item<double>();
weights_ = weights_ / totalWeights;
// Step 3.4: Store classifier and its accuracy to weigh its future vote
models.push_back(std::move(model));
significanceModels.push_back(significance);
exitCondition = n_models == maxModels && repeatSparent;
significanceModels.push_back(alpha_t);
exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5;
}
if (featuresUsed.size() != features.size()) {
cout << "Warning: BoostAODE did not use all the features" << endl;
status = WARNING;
}
weights.copy_(weights_);
}

View File

@ -21,6 +21,7 @@ namespace bayesnet {
string className;
map<string, vector<int>> states;
Tensor dataset; // (n+1)xm tensor
status_t status = NORMAL;
void checkFitParameters();
virtual void buildModel(const torch::Tensor& weights) = 0;
void trainModel(const torch::Tensor& weights) override;
@ -37,6 +38,7 @@ namespace bayesnet {
int getNumberOfEdges() const override;
int getNumberOfStates() const override;
Tensor predict(Tensor& X) override;
status_t getStatus() const override { return status; }
vector<int> predict(vector<vector<int>>& X) override;
float score(Tensor& X, Tensor& y) override;
float score(vector<vector<int>>& X, vector<int>& y) override;

View File

@ -111,6 +111,26 @@ namespace platform {
}
}
string getColor(bayesnet::status_t status)
{
switch (status) {
case bayesnet::NORMAL:
return Colors::GREEN();
case bayesnet::WARNING:
return Colors::YELLOW();
case bayesnet::ERROR:
return Colors::RED();
default:
return Colors::RESET();
}
}
void showProgress(int fold, const string& color, const string& phase)
{
string prefix = phase == "a" ? "" : "\b\b\b\b";
cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
}
void Experiment::cross_validation(const string& path, const string& fileName)
{
auto datasets = platform::Datasets(path, discretized, platform::ARFF);
@ -159,23 +179,24 @@ namespace platform {
auto y_train = y.index({ train_t });
auto X_test = X.index({ "...", test_t });
auto y_test = y.index({ test_t });
cout << nfold + 1 << "(a)" << flush;
showProgress(nfold + 1, getColor(clf->getStatus()), "a");
// Train model
clf->fit(X_train, y_train, features, className, states);
cout << "\b\bb)" << flush;
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
nodes[item] = clf->getNumberOfNodes();
edges[item] = clf->getNumberOfEdges();
num_states[item] = clf->getNumberOfStates();
train_time[item] = train_timer.getDuration();
// Score train
auto accuracy_train_value = clf->score(X_train, y_train);
cout << "\b\bc)" << flush;
// Test model
showProgress(nfold + 1, getColor(clf->getStatus()), "c");
test_timer.start();
auto accuracy_test_value = clf->score(X_test, y_test);
cout << "\b\b\b, " << flush;
test_time[item] = test_timer.getDuration();
accuracy_train[item] = accuracy_train_value;
accuracy_test[item] = accuracy_test_value;
cout << "\b\b\b, " << flush;
// Store results and times in vector
result.addScoreTrain(accuracy_train_value);
result.addScoreTest(accuracy_test_value);