Complete Experiment

This commit is contained in:
Ricardo Montañana Gómez 2023-07-27 15:49:58 +02:00
parent bc214a496c
commit 3d8fea7a37
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
6 changed files with 80 additions and 67 deletions

8
.vscode/launch.json vendored
View File

@ -23,15 +23,15 @@
"name": "experiment",
"program": "${workspaceFolder}/build/src/Platform/main",
"args": [
"-d",
"iris",
"-m",
"TAN",
"-p",
"../../../data/",
"datasets",
"--discretize",
"-f",
"2"
"5",
"--title",
"Debug test"
],
"cwd": "${workspaceFolder}/build/src/Platform",
},

View File

@ -2,6 +2,7 @@
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
@ -118,6 +119,7 @@ void ArffFiles::generateDataset(int labelIndex)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), "");
auto removeLines = vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
@ -127,10 +129,20 @@ void ArffFiles::generateDataset(int labelIndex)
if (pos++ == labelIndex) {
yy[i] = value;
} else {
X[xIndex++][i] = stof(value);
if (value == "?") {
X[xIndex++][i] = -1;
removeLines.push_back(i);
} else
X[xIndex++][i] = stof(value);
}
}
}
for (auto i : removeLines) {
yy.erase(yy.begin() + i);
for (auto& x : X) {
x.erase(x.begin() + i);
}
}
y = factorize(yy);
}

View File

@ -28,13 +28,6 @@ namespace platform {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
}
}
// Returns a reference to the dataset stored in `datasets` under `name`.
// Throws invalid_argument when no entry with that name exists.
Dataset& Datasets::getDataset(string name)
{
    const auto entry = datasets.find(name);
    if (entry != datasets.end()) {
        return *(entry->second);
    }
    throw invalid_argument("Dataset not found.");
}
vector<string> Datasets::getNames()
{
vector<string> result;
@ -45,45 +38,56 @@ namespace platform {
}
// Returns the feature names of the dataset registered as `name`.
// Throws invalid_argument if the name is unknown ("Dataset not found.")
// or if the dataset has not been loaded yet ("Dataset not loaded.").
vector<string> Datasets::getFeatures(string name)
{
    // Look the entry up with find(): indexing the container with an unknown
    // name would create an empty entry and dereference it.
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    if (!entry->second->isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return entry->second->getFeatures();
}
// Returns the states map of the dataset registered as `name`.
// Throws invalid_argument if the name is unknown ("Dataset not found.")
// or if the dataset has not been loaded yet ("Dataset not loaded.").
map<string, vector<int>> Datasets::getStates(string name)
{
    // Look the entry up with find(): indexing the container with an unknown
    // name would create an empty entry and dereference it.
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    if (!entry->second->isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return entry->second->getStates();
}
// Returns the class (label) name of the dataset registered as `name`.
// Throws invalid_argument if the name is unknown ("Dataset not found.")
// or if the dataset has not been loaded yet ("Dataset not loaded.").
string Datasets::getClassName(string name)
{
    // Look the entry up with find(): indexing the container with an unknown
    // name would create an empty entry and dereference it.
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    if (!entry->second->isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return entry->second->getClassName();
}
// Returns the number of samples of the dataset registered as `name`.
// Throws invalid_argument if the name is unknown ("Dataset not found.")
// or if the dataset has not been loaded yet ("Dataset not loaded.").
int Datasets::getNSamples(string name)
{
    // Look the entry up with find(): indexing the container with an unknown
    // name would create an empty entry and dereference it.
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    if (!entry->second->isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return entry->second->getNSamples();
}
// Returns the vectors of the dataset registered as `name` (whatever
// Dataset::getVectors() yields), loading the dataset first if needed.
// Throws invalid_argument("Dataset not found.") for an unknown name.
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
{
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    // Bind by reference: the returned pair holds references, so they must
    // point into the stored dataset and not into a local copy of it.
    auto& dataset = *entry->second;
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getVectors();
}
// Returns the discretized vectors of the dataset registered as `name`
// (whatever Dataset::getVectorsDiscretized() yields), loading the dataset
// first if needed.
// Throws invalid_argument("Dataset not found.") for an unknown name.
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
{
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    // Bind by reference: the returned pair holds references, so they must
    // point into the stored dataset and not into a local copy of it.
    auto& dataset = *entry->second;
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getVectorsDiscretized();
}
// Returns the tensors of the dataset registered as `name` (whatever
// Dataset::getTensors() yields), loading the dataset first if needed.
// Throws invalid_argument("Dataset not found.") for an unknown name.
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
{
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    // Bind by reference: the returned pair holds references, so they must
    // point into the stored dataset and not into a local copy of it.
    auto& dataset = *entry->second;
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getTensors();
}
Dataset::Dataset(Dataset& dataset)
{
@ -195,11 +199,11 @@ namespace platform {
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()));
iota(Xd[i].begin(), Xd[i].end(), 0);
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
iota(begin(states[features[i]]), end(states[features[i]]), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()));
iota(yv.begin(), yv.end(), 0);
states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(begin(states[className]), end(states[className]), 0);
}
void Dataset::load_arff()
{
@ -209,8 +213,7 @@ namespace platform {
Xv = arff.getX();
yv = arff.getY();
// Get className & Features
auto className = arff.getClassName();
vector<string> features;
className = arff.getClassName();
for (auto feature : arff.getAttributes()) {
features.push_back(feature.first);
}
@ -246,7 +249,7 @@ namespace platform {
} else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
}
y = torch::tensor(yv, torch::kInt32);
}
y = torch::tensor(yv, torch::kInt32);
}
}

View File

@ -50,9 +50,10 @@ namespace platform {
void load(); // Loads the list of datasets
public:
Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
Dataset& getDataset(string name);
vector<string> getNames();
vector<string> getFeatures(string name);
int getNSamples(string name);
string getClassName(string name);
map<string, vector<int>> getStates(string name);
pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);

View File

@ -79,17 +79,16 @@ namespace platform {
file << data;
file.close();
}
Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states)
Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states)
{
auto classifiers = map<string, bayesnet::BaseClassifier*>({
{ "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
{ "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() }
}
);
auto Xt = torch::transpose(X, 0, 1);
auto result = Result();
auto [values, counts] = at::_unique(y);
result.setSamples(X.size(0)).setFeatures(X.size(1)).setClasses(values.size(0));
result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0));
auto k = fold->getNumberOfFolds();
auto accuracy_test = torch::zeros({ k }, torch::kFloat64);
auto accuracy_train = torch::zeros({ k }, torch::kFloat64);
@ -99,6 +98,7 @@ namespace platform {
auto edges = torch::zeros({ k }, torch::kFloat64);
auto num_states = torch::zeros({ k }, torch::kFloat64);
Timer train_timer, test_timer;
cout << "doing Fold: " << flush;
for (int i = 0; i < k; i++) {
bayesnet::BaseClassifier* model = classifiers[model_name];
result.setModelVersion(model->getVersion());
@ -110,15 +110,11 @@ namespace platform {
auto y_train = y.index({ train_t });
auto X_test = Xt.index({ "...", test_t });
auto y_test = y.index({ test_t });
cout << i + 1 << ", " << flush;
model->fit(X_train, y_train, features, className, states);
nodes[i] = model->getNumberOfNodes();
edges[i] = model->getNumberOfEdges();
num_states[i] = model->getNumberOfStates();
cout << "Training Fold " << i + 1 << endl;
cout << "X_train: " << X_train.sizes() << endl;
cout << "y_train: " << y_train.sizes() << endl;
cout << "X_test: " << X_test.sizes() << endl;
cout << "y_test: " << y_test.sizes() << endl;
train_time[i] = train_timer.getDuration();
auto accuracy_train_value = model->score(X_train, y_train);
test_timer.start();
@ -127,6 +123,7 @@ namespace platform {
accuracy_train[i] = accuracy_train_value;
accuracy_test[i] = accuracy_test_value;
}
cout << "end." << endl;
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());

View File

@ -6,12 +6,12 @@
using namespace std;
const string PATH_RESULTS = "results";
argparse::ArgumentParser manageArguments(int argc, char** argv)
{
argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name");
program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
program.add_argument("-p", "--path")
.help("folder where the data files are located, default")
.default_value(string{ PATH }
@ -59,9 +59,6 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
complete_file_name = path + file_name + ".arff";
class_last = false;//datasets[file_name];
title = program.get<string>("title");
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
}
catch (const exception& err) {
cerr << err.what() << endl;
@ -98,26 +95,29 @@ int main(int argc, char** argv)
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet");
experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy");
platform::Timer timer;
cout << "*** Starting experiment: " << title << " ***" << endl;
timer.start();
for (auto fileName : filesToProcess) {
cout << "Processing " << fileName << endl;
cout << "- " << fileName << " ";
auto [X, y] = datasets.getTensors(fileName);
// auto states = datasets.getStates(fileName);
// auto features = datasets.getFeatures(fileName);
// auto className = datasets.getDataset(fileName).getClassName();
// Fold* fold;
// if (stratified)
// fold = new StratifiedKFold(n_folds, y, seed);
// else
// fold = new KFold(n_folds, y.numel(), seed);
// auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
// result.setDataset(file_name);
// experiment.setModelVersion(result.getModelVersion());
// experiment.addResult(result);
// delete fold;
auto states = datasets.getStates(fileName);
auto features = datasets.getFeatures(fileName);
auto samples = datasets.getNSamples(fileName);
auto className = datasets.getClassName(fileName);
cout << " (" << samples << ", " << features.size() << ") " << flush;
Fold* fold;
if (stratified)
fold = new StratifiedKFold(n_folds, y, seed);
else
fold = new KFold(n_folds, samples, seed);
auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
result.setDataset(fileName); // fileName is the dataset processed in this iteration
experiment.setModelVersion(result.getModelVersion());
experiment.addResult(result);
delete fold;
}
experiment.setDuration(timer.getDuration());
experiment.save(path);
experiment.show();
experiment.save(PATH_RESULTS);
cout << "Done!" << endl;
return 0;
}