From 3d8fea7a379aceb8a0302fd064fa7be97ec04ee8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Thu, 27 Jul 2023 15:49:58 +0200
Subject: [PATCH] Complete Experiment

---
 .vscode/launch.json        |  8 ++---
 lib/Files/ArffFiles.cc     | 14 +++++++-
 src/Platform/Datasets.cc   | 67 ++++++++++++++++++++------------
 src/Platform/Datasets.h    |  3 +-
 src/Platform/Experiment.cc | 13 +++-----
 src/Platform/main.cc       | 42 ++++++++++++------------
 6 files changed, 80 insertions(+), 67 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index ff1e951..1b983e6 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -23,15 +23,15 @@
             "name": "experiment",
             "program": "${workspaceFolder}/build/src/Platform/main",
             "args": [
-                "-d",
-                "iris",
                 "-m",
                 "TAN",
                 "-p",
-                "../../../data/",
+                "datasets",
                 "--discretize",
                 "-f",
-                "2"
+                "5",
+                "--title",
+                "Debug test"
             ],
             "cwd": "${workspaceFolder}/build/src/Platform",
         },
diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc
index 8ae99ca..4039b0b 100644
--- a/lib/Files/ArffFiles.cc
+++ b/lib/Files/ArffFiles.cc
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include
 
 using namespace std;
 
@@ -118,6 +119,7 @@ void ArffFiles::generateDataset(int labelIndex)
 {
     X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
     auto yy = vector<string>(lines.size(), "");
+    auto removeLines = vector<int>(); // Lines with missing values
     for (size_t i = 0; i < lines.size(); i++) {
         stringstream ss(lines[i]);
         string value;
@@ -127,10 +129,20 @@ void ArffFiles::generateDataset(int labelIndex)
             if (pos++ == labelIndex) {
                 yy[i] = value;
             } else {
-                X[xIndex++][i] = stof(value);
+                if (value == "?") {
+                    X[xIndex++][i] = -1;
+                    removeLines.push_back(i);
+                } else
+                    X[xIndex++][i] = stof(value);
             }
         }
     }
+    for (auto i : removeLines) {
+        yy.erase(yy.begin() + i);
+        for (auto& x : X) {
+            x.erase(x.begin() + i);
+        }
+    }
     y = factorize(yy);
 }
 
diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc
index a9ee23c..7943bad 100644
--- a/src/Platform/Datasets.cc
+++ b/src/Platform/Datasets.cc
@@ -28,13 +28,6 @@ namespace platform {
            throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
         }
     }
-    Dataset& Datasets::getDataset(string name)
-    {
-        if (datasets.find(name) == datasets.end()) {
-            throw invalid_argument("Dataset not found.");
-        }
-        return *datasets[name];
-    }
     vector<string> Datasets::getNames()
     {
         vector<string> result;
@@ -45,45 +38,56 @@
     }
     vector<string> Datasets::getFeatures(string name)
     {
-        auto dataset = getDataset(name);
-        if (dataset.isLoaded()) {
-            return dataset.getFeatures();
+        if (datasets[name]->isLoaded()) {
+            return datasets[name]->getFeatures();
         } else {
             throw invalid_argument("Dataset not loaded.");
         }
     }
     map<string, vector<int>> Datasets::getStates(string name)
     {
-        auto dataset = getDataset(name);
-        if (dataset.isLoaded()) {
-            return dataset.getStates();
+        if (datasets[name]->isLoaded()) {
+            return datasets[name]->getStates();
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    string Datasets::getClassName(string name)
+    {
+        if (datasets[name]->isLoaded()) {
+            return datasets[name]->getClassName();
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    int Datasets::getNSamples(string name)
+    {
+        if (datasets[name]->isLoaded()) {
+            return datasets[name]->getNSamples();
         } else {
             throw invalid_argument("Dataset not loaded.");
         }
     }
     pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
     {
-        auto dataset = getDataset(name);
-        if (!dataset.isLoaded()) {
-            dataset.load();
+        if (!datasets[name]->isLoaded()) {
+            datasets[name]->load();
         }
-        return dataset.getVectors();
+        return datasets[name]->getVectors();
     }
     pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
     {
-        auto dataset = getDataset(name);
-        if (!dataset.isLoaded()) {
-            dataset.load();
+        if (!datasets[name]->isLoaded()) {
+            datasets[name]->load();
         }
-        return dataset.getVectorsDiscretized();
+        return datasets[name]->getVectorsDiscretized();
     }
     pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
     {
-        auto dataset = getDataset(name);
-        if (!dataset.isLoaded()) {
-            dataset.load();
+        if (!datasets[name]->isLoaded()) {
+            datasets[name]->load();
         }
-        return dataset.getTensors();
+        return datasets[name]->getTensors();
     }
     Dataset::Dataset(Dataset& dataset)
     {
@@ -195,11 +199,11 @@
     void Dataset::computeStates()
     {
         for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()));
-            iota(Xd[i].begin(), Xd[i].end(), 0);
+            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
+            iota(begin(states[features[i]]), end(states[features[i]]), 0);
         }
-        states[className] = vector<int>(*max_element(yv.begin(), yv.end()));
-        iota(yv.begin(), yv.end(), 0);
+        states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
+        iota(begin(states[className]), end(states[className]), 0);
     }
     void Dataset::load_arff()
     {
@@ -209,8 +213,7 @@
         Xv = arff.getX();
         yv = arff.getY();
         // Get className & Features
-        auto className = arff.getClassName();
-        vector<string> features;
+        className = arff.getClassName();
         for (auto feature : arff.getAttributes()) {
             features.push_back(feature.first);
         }
@@ -246,7 +249,7 @@
             } else {
                 X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
             }
-            y = torch::tensor(yv, torch::kInt32);
         }
+        y = torch::tensor(yv, torch::kInt32);
     }
 }
\ No newline at end of file
diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h
index ae54376..b593e24 100644
--- a/src/Platform/Datasets.h
+++ b/src/Platform/Datasets.h
@@ -50,9 +50,10 @@ namespace platform {
         void load(); // Loads the list of datasets
     public:
         Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
-        Dataset& getDataset(string name);
         vector<string> getNames();
         vector<string> getFeatures(string name);
+        int getNSamples(string name);
+        string getClassName(string name);
         map<string, vector<int>> getStates(string name);
         pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
         pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);
diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc
index 64c8134..406e907 100644
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@@ -79,17 +79,16 @@
         file << data;
         file.close();
     }
-    Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states)
+    Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector<string> features, string className, map<string, vector<int>> states)
     {
         auto classifiers = map<string, bayesnet::BaseClassifier*>({
             { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
             { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() }
             }
         );
-        auto Xt = torch::transpose(X, 0, 1);
         auto result = Result();
         auto [values, counts] = at::_unique(y);
-        result.setSamples(X.size(0)).setFeatures(X.size(1)).setClasses(values.size(0));
+        result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0));
         auto k = fold->getNumberOfFolds();
         auto accuracy_test = torch::zeros({ k }, torch::kFloat64);
         auto accuracy_train = torch::zeros({ k }, torch::kFloat64);
@@ -99,6 +98,7 @@
         auto edges = torch::zeros({ k }, torch::kFloat64);
         auto num_states = torch::zeros({ k }, torch::kFloat64);
         Timer train_timer, test_timer;
+        cout << "doing Fold: " << flush;
         for (int i = 0; i < k; i++) {
             bayesnet::BaseClassifier* model = classifiers[model_name];
             result.setModelVersion(model->getVersion());
@@ -110,15 +110,11 @@
             auto y_train = y.index({ train_t });
             auto X_test = Xt.index({ "...", test_t });
             auto y_test = y.index({ test_t });
+            cout << i + 1 << ", " << flush;
             model->fit(X_train, y_train, features, className, states);
             nodes[i] = model->getNumberOfNodes();
             edges[i] = model->getNumberOfEdges();
             num_states[i] = model->getNumberOfStates();
-            cout << "Training Fold " << i + 1 << endl;
-            cout << "X_train: " << X_train.sizes() << endl;
-            cout << "y_train: " << y_train.sizes() << endl;
-            cout << "X_test: " << X_test.sizes() << endl;
-            cout << "y_test: " << y_test.sizes() << endl;
             train_time[i] = train_timer.getDuration();
             auto accuracy_train_value = model->score(X_train, y_train);
             test_timer.start();
@@ -127,6 +123,7 @@
             accuracy_train[i] = accuracy_train_value;
             accuracy_test[i] = accuracy_test_value;
         }
+        cout << "end." << endl;
         result.setScoreTest(torch::mean(accuracy_test).item<double>()).setScoreTrain(torch::mean(accuracy_train).item<double>());
         result.setScoreTestStd(torch::std(accuracy_test).item<double>()).setScoreTrainStd(torch::std(accuracy_train).item<double>());
         result.setTrainTime(torch::mean(train_time).item<double>()).setTestTime(torch::mean(test_time).item<double>());
diff --git a/src/Platform/main.cc b/src/Platform/main.cc
index 00873ea..f1f6ed8 100644
--- a/src/Platform/main.cc
+++ b/src/Platform/main.cc
@@ -6,12 +6,12 @@
 
 using namespace std;
 
+const string PATH_RESULTS = "results";
 argparse::ArgumentParser manageArguments(int argc, char** argv)
 {
     argparse::ArgumentParser program("BayesNetSample");
-    program.add_argument("-d", "--dataset")
-        .help("Dataset file name");
+    program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
     program.add_argument("-p", "--path")
         .help("folder where the data files are located, default")
         .default_value(string{ PATH }
@@ -59,9 +59,6 @@
         complete_file_name = path + file_name + ".arff";
         class_last = false;//datasets[file_name];
         title = program.get("title");
-        if (!file_exists(complete_file_name)) {
-            throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
-        }
     }
     catch (const exception& err) {
         cerr << err.what() << endl;
@@ -98,26 +95,29 @@
     experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet");
     experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy");
     platform::Timer timer;
+    cout << "*** Starting experiment: " << title << " ***" << endl;
     timer.start();
     for (auto fileName : filesToProcess) {
-        cout << "Processing " << fileName << endl;
+        cout << "- " << fileName << " ";
         auto [X, y] = datasets.getTensors(fileName);
-        // auto states = datasets.getStates(fileName);
-        // auto features = datasets.getFeatures(fileName);
-        // auto className = datasets.getDataset(fileName).getClassName();
-        // Fold* fold;
-        // if (stratified)
-        //     fold = new StratifiedKFold(n_folds, y, seed);
-        // else
-        //     fold = new KFold(n_folds, y.numel(), seed);
-        // auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
-        // result.setDataset(file_name);
-        // experiment.setModelVersion(result.getModelVersion());
-        // experiment.addResult(result);
-        // delete fold;
+        auto states = datasets.getStates(fileName);
+        auto features = datasets.getFeatures(fileName);
+        auto samples = datasets.getNSamples(fileName);
+        auto className = datasets.getClassName(fileName);
+        cout << " (" << samples << ", " << features.size() << ") " << flush;
+        Fold* fold;
+        if (stratified)
+            fold = new StratifiedKFold(n_folds, y, seed);
+        else
+            fold = new KFold(n_folds, samples, seed);
+        auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
+        result.setDataset(file_name);
+        experiment.setModelVersion(result.getModelVersion());
+        experiment.addResult(result);
+        delete fold;
     }
     experiment.setDuration(timer.getDuration());
-    experiment.save(path);
-    experiment.show();
+    experiment.save(PATH_RESULTS);
+    cout << "Done!" << endl;
     return 0;
 }
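
Note: a minimal sketch of how the patched binary might be invoked, mirroring the updated .vscode/launch.json entry above; the build/src/Platform working directory and the "datasets" data folder are taken from that debug configuration and are assumptions, not something this patch defines:

    cd build/src/Platform
    ./main -m TAN -p datasets --discretize -f 5 --title "Debug test"

Since -d/--dataset now defaults to an empty string, such a run would presumably process every dataset in filesToProcess, print the per-fold progress added in cross_validation, and save the experiment under the "results" folder (PATH_RESULTS) instead of the data path.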