Complete Experiment

This commit is contained in:
Ricardo Montañana Gómez 2023-07-27 15:49:58 +02:00
parent bc214a496c
commit 3d8fea7a37
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
6 changed files with 80 additions and 67 deletions

8
.vscode/launch.json vendored
View File

@@ -23,15 +23,15 @@
"name": "experiment", "name": "experiment",
"program": "${workspaceFolder}/build/src/Platform/main", "program": "${workspaceFolder}/build/src/Platform/main",
"args": [ "args": [
"-d",
"iris",
"-m", "-m",
"TAN", "TAN",
"-p", "-p",
"../../../data/", "datasets",
"--discretize", "--discretize",
"-f", "-f",
"2" "5",
"--title",
"Debug test"
], ],
"cwd": "${workspaceFolder}/build/src/Platform", "cwd": "${workspaceFolder}/build/src/Platform",
}, },

View File

@@ -2,6 +2,7 @@
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include <map> #include <map>
#include <iostream>
using namespace std; using namespace std;
@@ -118,6 +119,7 @@ void ArffFiles::generateDataset(int labelIndex)
{ {
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size())); X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), ""); auto yy = vector<string>(lines.size(), "");
auto removeLines = vector<int>(); // Lines with missing values
for (size_t i = 0; i < lines.size(); i++) { for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]); stringstream ss(lines[i]);
string value; string value;
@@ -127,10 +129,20 @@ void ArffFiles::generateDataset(int labelIndex)
if (pos++ == labelIndex) { if (pos++ == labelIndex) {
yy[i] = value; yy[i] = value;
} else { } else {
X[xIndex++][i] = stof(value); if (value == "?") {
X[xIndex++][i] = -1;
removeLines.push_back(i);
} else
X[xIndex++][i] = stof(value);
} }
} }
} }
for (auto i : removeLines) {
yy.erase(yy.begin() + i);
for (auto& x : X) {
x.erase(x.begin() + i);
}
}
y = factorize(yy); y = factorize(yy);
} }

View File

@@ -28,13 +28,6 @@ namespace platform {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]"); throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
} }
} }
Dataset& Datasets::getDataset(string name)
{
if (datasets.find(name) == datasets.end()) {
throw invalid_argument("Dataset not found.");
}
return *datasets[name];
}
vector<string> Datasets::getNames() vector<string> Datasets::getNames()
{ {
vector<string> result; vector<string> result;
@@ -45,45 +38,56 @@ namespace platform {
} }
vector<string> Datasets::getFeatures(string name) vector<string> Datasets::getFeatures(string name)
{ {
auto dataset = getDataset(name); if (datasets[name]->isLoaded()) {
if (dataset.isLoaded()) { return datasets[name]->getFeatures();
return dataset.getFeatures();
} else { } else {
throw invalid_argument("Dataset not loaded."); throw invalid_argument("Dataset not loaded.");
} }
} }
map<string, vector<int>> Datasets::getStates(string name) map<string, vector<int>> Datasets::getStates(string name)
{ {
auto dataset = getDataset(name); if (datasets[name]->isLoaded()) {
if (dataset.isLoaded()) { return datasets[name]->getStates();
return dataset.getStates(); } else {
throw invalid_argument("Dataset not loaded.");
}
}
string Datasets::getClassName(string name)
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getClassName();
} else {
throw invalid_argument("Dataset not loaded.");
}
}
int Datasets::getNSamples(string name)
{
if (datasets[name]->isLoaded()) {
return datasets[name]->getNSamples();
} else { } else {
throw invalid_argument("Dataset not loaded."); throw invalid_argument("Dataset not loaded.");
} }
} }
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name) pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
{ {
auto dataset = getDataset(name); if (!datasets[name]->isLoaded()) {
if (!dataset.isLoaded()) { datasets[name]->load();
dataset.load();
} }
return dataset.getVectors(); return datasets[name]->getVectors();
} }
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name) pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
{ {
auto dataset = getDataset(name); if (!datasets[name]->isLoaded()) {
if (!dataset.isLoaded()) { datasets[name]->load();
dataset.load();
} }
return dataset.getVectorsDiscretized(); return datasets[name]->getVectorsDiscretized();
} }
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name) pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
{ {
auto dataset = getDataset(name); if (!datasets[name]->isLoaded()) {
if (!dataset.isLoaded()) { datasets[name]->load();
dataset.load();
} }
return dataset.getTensors(); return datasets[name]->getTensors();
} }
Dataset::Dataset(Dataset& dataset) Dataset::Dataset(Dataset& dataset)
{ {
@@ -195,11 +199,11 @@ namespace platform {
void Dataset::computeStates() void Dataset::computeStates()
{ {
for (int i = 0; i < features.size(); ++i) { for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end())); states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
iota(Xd[i].begin(), Xd[i].end(), 0); iota(begin(states[features[i]]), end(states[features[i]]), 0);
} }
states[className] = vector<int>(*max_element(yv.begin(), yv.end())); states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
iota(yv.begin(), yv.end(), 0); iota(begin(states[className]), end(states[className]), 0);
} }
void Dataset::load_arff() void Dataset::load_arff()
{ {
@@ -209,8 +213,7 @@ namespace platform {
Xv = arff.getX(); Xv = arff.getX();
yv = arff.getY(); yv = arff.getY();
// Get className & Features // Get className & Features
auto className = arff.getClassName(); className = arff.getClassName();
vector<string> features;
for (auto feature : arff.getAttributes()) { for (auto feature : arff.getAttributes()) {
features.push_back(feature.first); features.push_back(feature.first);
} }
@@ -246,7 +249,7 @@ namespace platform {
} else { } else {
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32)); X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
} }
y = torch::tensor(yv, torch::kInt32);
} }
y = torch::tensor(yv, torch::kInt32);
} }
} }

View File

@@ -50,9 +50,10 @@ namespace platform {
void load(); // Loads the list of datasets void load(); // Loads the list of datasets
public: public:
Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
Dataset& getDataset(string name);
vector<string> getNames(); vector<string> getNames();
vector<string> getFeatures(string name); vector<string> getFeatures(string name);
int getNSamples(string name);
string getClassName(string name);
map<string, vector<int>> getStates(string name); map<string, vector<int>> getStates(string name);
pair<vector<vector<float>>&, vector<int>&> getVectors(string name); pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name); pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);

View File

@@ -79,17 +79,16 @@ namespace platform {
file << data; file << data;
file.close(); file.close();
} }
Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states) Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states)
{ {
auto classifiers = map<string, bayesnet::BaseClassifier*>({ auto classifiers = map<string, bayesnet::BaseClassifier*>({
{ "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
{ "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() }
} }
); );
auto Xt = torch::transpose(X, 0, 1);
auto result = Result(); auto result = Result();
auto [values, counts] = at::_unique(y); auto [values, counts] = at::_unique(y);
result.setSamples(X.size(0)).setFeatures(X.size(1)).setClasses(values.size(0)); result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0));
auto k = fold->getNumberOfFolds(); auto k = fold->getNumberOfFolds();
auto accuracy_test = torch::zeros({ k }, torch::kFloat64); auto accuracy_test = torch::zeros({ k }, torch::kFloat64);
auto accuracy_train = torch::zeros({ k }, torch::kFloat64); auto accuracy_train = torch::zeros({ k }, torch::kFloat64);
@@ -99,6 +98,7 @@ namespace platform {
auto edges = torch::zeros({ k }, torch::kFloat64); auto edges = torch::zeros({ k }, torch::kFloat64);
auto num_states = torch::zeros({ k }, torch::kFloat64); auto num_states = torch::zeros({ k }, torch::kFloat64);
Timer train_timer, test_timer; Timer train_timer, test_timer;
cout << "doing Fold: " << flush;
for (int i = 0; i < k; i++) { for (int i = 0; i < k; i++) {
bayesnet::BaseClassifier* model = classifiers[model_name]; bayesnet::BaseClassifier* model = classifiers[model_name];
result.setModelVersion(model->getVersion()); result.setModelVersion(model->getVersion());
@@ -110,15 +110,11 @@ namespace platform {
auto y_train = y.index({ train_t }); auto y_train = y.index({ train_t });
auto X_test = Xt.index({ "...", test_t }); auto X_test = Xt.index({ "...", test_t });
auto y_test = y.index({ test_t }); auto y_test = y.index({ test_t });
cout << i + 1 << ", " << flush;
model->fit(X_train, y_train, features, className, states); model->fit(X_train, y_train, features, className, states);
nodes[i] = model->getNumberOfNodes(); nodes[i] = model->getNumberOfNodes();
edges[i] = model->getNumberOfEdges(); edges[i] = model->getNumberOfEdges();
num_states[i] = model->getNumberOfStates(); num_states[i] = model->getNumberOfStates();
cout << "Training Fold " << i + 1 << endl;
cout << "X_train: " << X_train.sizes() << endl;
cout << "y_train: " << y_train.sizes() << endl;
cout << "X_test: " << X_test.sizes() << endl;
cout << "y_test: " << y_test.sizes() << endl;
train_time[i] = train_timer.getDuration(); train_time[i] = train_timer.getDuration();
auto accuracy_train_value = model->score(X_train, y_train); auto accuracy_train_value = model->score(X_train, y_train);
test_timer.start(); test_timer.start();
@@ -127,6 +123,7 @@ namespace platform {
accuracy_train[i] = accuracy_train_value; accuracy_train[i] = accuracy_train_value;
accuracy_test[i] = accuracy_test_value; accuracy_test[i] = accuracy_test_value;
} }
cout << "end." << endl;
result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>()); result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>()); result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>()); result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());

View File

@@ -6,12 +6,12 @@
using namespace std; using namespace std;
const string PATH_RESULTS = "results";
argparse::ArgumentParser manageArguments(int argc, char** argv) argparse::ArgumentParser manageArguments(int argc, char** argv)
{ {
argparse::ArgumentParser program("BayesNetSample"); argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset") program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
.help("Dataset file name");
program.add_argument("-p", "--path") program.add_argument("-p", "--path")
.help("folder where the data files are located, default") .help("folder where the data files are located, default")
.default_value(string{ PATH } .default_value(string{ PATH }
@@ -59,9 +59,6 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
complete_file_name = path + file_name + ".arff"; complete_file_name = path + file_name + ".arff";
class_last = false;//datasets[file_name]; class_last = false;//datasets[file_name];
title = program.get<string>("title"); title = program.get<string>("title");
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
}
} }
catch (const exception& err) { catch (const exception& err) {
cerr << err.what() << endl; cerr << err.what() << endl;
@@ -98,26 +95,29 @@ int main(int argc, char** argv)
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet");
experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy"); experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy");
platform::Timer timer; platform::Timer timer;
cout << "*** Starting experiment: " << title << " ***" << endl;
timer.start(); timer.start();
for (auto fileName : filesToProcess) { for (auto fileName : filesToProcess) {
cout << "Processing " << fileName << endl; cout << "- " << fileName << " ";
auto [X, y] = datasets.getTensors(fileName); auto [X, y] = datasets.getTensors(fileName);
// auto states = datasets.getStates(fileName); auto states = datasets.getStates(fileName);
// auto features = datasets.getFeatures(fileName); auto features = datasets.getFeatures(fileName);
// auto className = datasets.getDataset(fileName).getClassName(); auto samples = datasets.getNSamples(fileName);
// Fold* fold; auto className = datasets.getClassName(fileName);
// if (stratified) cout << " (" << samples << ", " << features.size() << ") " << flush;
// fold = new StratifiedKFold(n_folds, y, seed); Fold* fold;
// else if (stratified)
// fold = new KFold(n_folds, y.numel(), seed); fold = new StratifiedKFold(n_folds, y, seed);
// auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); else
// result.setDataset(file_name); fold = new KFold(n_folds, samples, seed);
// experiment.setModelVersion(result.getModelVersion()); auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
// experiment.addResult(result); result.setDataset(file_name);
// delete fold; experiment.setModelVersion(result.getModelVersion());
experiment.addResult(result);
delete fold;
} }
experiment.setDuration(timer.getDuration()); experiment.setDuration(timer.getDuration());
experiment.save(path); experiment.save(PATH_RESULTS);
experiment.show(); cout << "Done!" << endl;
return 0; return 0;
} }