Adding Datasets management

This commit is contained in:
Ricardo Montañana Gómez 2023-07-27 01:56:06 +02:00
parent 3e954ba841
commit bc214a496c
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
13 changed files with 404 additions and 59 deletions

View File

@ -101,7 +101,8 @@
"*.ipp": "cpp",
"cassert": "cpp",
"charconv": "cpp",
"source_location": "cpp"
"source_location": "cpp",
"ranges": "cpp"
},
"cmake.configureOnOpen": false,
"C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"

View File

@ -42,7 +42,7 @@ vector<int>& ArffFiles::getY()
return y;
}
void ArffFiles::load(const string& fileName, bool classLast)
void ArffFiles::loadCommon(string fileName)
{
ifstream file(fileName);
if (!file.is_open()) {
@ -74,24 +74,50 @@ void ArffFiles::load(const string& fileName, bool classLast)
file.close();
if (attributes.empty())
throw invalid_argument("No attributes found");
}
// Loads an ARFF file and selects the class attribute by position.
// classLast == true  -> the last parsed attribute is the class;
// classLast == false -> the first parsed attribute is the class.
// The chosen attribute is removed from `attributes` and its name/type are
// stored in className/classType before the dataset is generated.
void ArffFiles::load(const string& fileName, bool classLast)
{
int labelIndex;
// Parses the file; throws invalid_argument when no attributes are found.
loadCommon(fileName);
if (classLast) {
className = get<0>(attributes.back());
classType = get<1>(attributes.back());
attributes.pop_back();
// After pop_back the size equals the column index the class occupied.
labelIndex = static_cast<int>(attributes.size());
} else {
className = get<0>(attributes.front());
classType = get<1>(attributes.front());
attributes.erase(attributes.begin());
labelIndex = 0;
}
// NOTE(review): two consecutive generateDataset calls appear here. The first
// passes the bool flag, the second the computed column index. In this diff
// view the first looks like the pre-change line -- confirm only the
// labelIndex call exists in the real file.
generateDataset(classLast);
generateDataset(labelIndex);
}
// Loads an ARFF file using the attribute called `name` as the class label.
// The matching attribute is removed from `attributes`, its name/type are kept
// in className/classType, and its original column index is handed to
// generateDataset. Throws invalid_argument when no attribute has that name.
void ArffFiles::load(const string& fileName, const string& name)
{
    loadCommon(fileName);
    // Walk the parsed attributes until the requested name is found.
    auto position = attributes.begin();
    while (position != attributes.end() && !(position->first == name)) {
        ++position;
    }
    if (position == attributes.end()) {
        throw invalid_argument("Class name not found");
    }
    // Remember the column index before the erase shifts the remaining attributes.
    int labelIndex = static_cast<int>(position - attributes.begin());
    className = get<0>(*position);
    classType = get<1>(*position);
    attributes.erase(position);
    generateDataset(labelIndex);
}
void ArffFiles::generateDataset(bool classLast)
void ArffFiles::generateDataset(int labelIndex)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
auto yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;

View File

@ -14,12 +14,12 @@ private:
string classType;
vector<vector<float>> X;
vector<int> y;
void generateDataset(bool);
void generateDataset(int);
void loadCommon(string);
public:
ArffFiles();
void load(const string&, bool = true);
void load(const string&, const string&);
vector<string> getLines() const;
unsigned long int getSize() const;
string getClassName() const;

View File

@ -17,6 +17,7 @@ namespace bayesnet {
vector<string> virtual show() = 0;
vector<string> virtual graph(string title = "") = 0;
virtual ~BaseClassifier() = default;
// Version string reported for the classifier implementation.
inline const string getVersion() const
{
    return "0.1.0";
}
};
}
#endif

View File

@ -7,7 +7,7 @@
namespace bayesnet {
class Network {
private:
map<string, std::unique_ptr<Node>> nodes;
map<string, unique_ptr<Node>> nodes;
map<string, vector<int>> dataset;
bool fitted;
float maxThreads;

View File

@ -3,7 +3,7 @@
namespace bayesnet {
using namespace torch;
TAN::TAN() : Classifier(Network(0.1)) {}
TAN::TAN() : Classifier(Network()) {}
void TAN::train()
{

View File

@ -4,5 +4,5 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc)
add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc)
target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES} ")

252
src/Platform/Datasets.cc Normal file
View File

@ -0,0 +1,252 @@
#include "Datasets.h"
#include "platformUtils.h"
#include "ArffFiles.h"
namespace platform {
// Splits `text` into the fields separated by `delimiter`.
// Empty fields between two delimiters are kept; a trailing delimiter does not
// produce a final empty field, and an empty input yields an empty vector.
vector<string> split(string text, char delimiter)
{
    vector<string> pieces;
    stringstream stream(text);
    for (string piece; getline(stream, piece, delimiter);) {
        pieces.push_back(piece);
    }
    return pieces;
}
void Datasets::load()
{
string line;
ifstream catalog(path + "/all.txt");
if (catalog.is_open()) {
while (getline(catalog, line)) {
vector<string> tokens = split(line, ',');
string name = tokens[0];
string className = tokens[1];
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
}
catalog.close();
} else {
throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt" + "]");
}
}
// Returns the catalog entry registered under `name`.
// Throws invalid_argument when the name is not in the catalog.
Dataset& Datasets::getDataset(string name)
{
    auto entry = datasets.find(name);
    if (entry == datasets.end()) {
        throw invalid_argument("Dataset not found.");
    }
    return *(entry->second);
}
// Returns the names of every dataset in the catalog, in the map's key order.
vector<string> Datasets::getNames()
{
    vector<string> names;
    names.reserve(datasets.size());
    for (auto entry = datasets.begin(); entry != datasets.end(); ++entry) {
        names.push_back(entry->first);
    }
    return names;
}
// Returns the feature names of the dataset registered under `name`.
// Throws invalid_argument when the name is unknown or the dataset has not
// been loaded yet.
vector<string> Datasets::getFeatures(string name)
{
    // Bind by reference: `auto dataset = ...` would copy the whole dataset
    // (all sample vectors) just to read one field.
    auto& dataset = getDataset(name);
    if (!dataset.isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return dataset.getFeatures();
}
// Returns the per-variable state lists of the dataset registered under `name`.
// Throws invalid_argument when the name is unknown or the dataset has not
// been loaded yet.
map<string, vector<int>> Datasets::getStates(string name)
{
    // Bind by reference: `auto dataset = ...` would copy the whole dataset
    // (all sample vectors) just to read one field.
    auto& dataset = getDataset(name);
    if (!dataset.isLoaded()) {
        throw invalid_argument("Dataset not loaded.");
    }
    return dataset.getStates();
}
// Returns references to the raw feature matrix and label vector of the
// dataset registered under `name`, loading it from disk on first use.
// Throws invalid_argument when the name is unknown.
pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
{
    // Must be a reference: the returned pair holds references into the
    // dataset, so working on a local copy would hand back dangling references
    // and leave the stored dataset unloaded.
    auto& dataset = getDataset(name);
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getVectors();
}
// Returns references to the discretized feature matrix and label vector of
// the dataset registered under `name`, loading it from disk on first use.
// Throws invalid_argument when the name is unknown.
pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
{
    // Must be a reference: the returned pair holds references into the
    // dataset, so working on a local copy would hand back dangling references
    // and leave the stored dataset unloaded.
    auto& dataset = getDataset(name);
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getVectorsDiscretized();
}
// Returns references to the X / y tensors of the dataset registered under
// `name`, loading it from disk on first use.
// Throws invalid_argument when the name is unknown.
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
{
    // Must be a reference: the returned pair holds references to the
    // dataset's tensor members, so working on a local copy would hand back
    // dangling references and leave the stored dataset unloaded.
    auto& dataset = getDataset(name);
    if (!dataset.isLoaded()) {
        dataset.load();
    }
    return dataset.getTensors();
}
// Copy constructor: duplicates every member of `dataset`.
// Members are listed in their declaration order.
Dataset::Dataset(Dataset& dataset)
    : path(dataset.path),
    name(dataset.name),
    fileType(dataset.fileType),
    className(dataset.className),
    n_samples(dataset.n_samples),
    n_features(dataset.n_features),
    features(dataset.features),
    states(dataset.states),
    loaded(dataset.loaded),
    discretize(dataset.discretize),
    X(dataset.X),
    y(dataset.y),
    Xv(dataset.Xv),
    Xd(dataset.Xd),
    yv(dataset.yv)
{
}
// Returns a copy of the dataset name given at construction.
string Dataset::getName()
{
    return this->name;
}
// Returns a copy of the name of the class (label) variable.
string Dataset::getClassName()
{
    return this->className;
}
// Returns a copy of the feature names.
// Throws invalid_argument when the dataset has not been loaded.
vector<string> Dataset::getFeatures()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return features;
}
// Returns the stored feature count.
// Throws invalid_argument when the dataset has not been loaded.
int Dataset::getNFeatures()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return n_features;
}
// Returns the stored sample count.
// Throws invalid_argument when the dataset has not been loaded.
int Dataset::getNSamples()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return n_samples;
}
// Returns a copy of the variable-name -> states map.
// Throws invalid_argument when the dataset has not been loaded.
map<string, vector<int>> Dataset::getStates()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return states;
}
// Returns references to the raw feature vectors (Xv) and the labels (yv).
// Throws invalid_argument when the dataset has not been loaded.
pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return { Xv, yv };
}
// Returns references to the discretized feature vectors (Xd) and the labels (yv).
// Throws invalid_argument when the dataset has not been loaded.
pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    return { Xd, yv };
}
// Rebuilds the X / y tensors from the stored vectors and returns references
// to them. Throws invalid_argument when the dataset has not been loaded.
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
    if (!loaded) {
        throw invalid_argument("Dataset not loaded.");
    }
    buildTensors();
    return { X, y };
}
// Loads "<path>/<name>.csv" into Xv (one vector per feature) and yv.
// The first line is a header: every column but the last names a feature and
// the last column names the class. Each following row supplies one float per
// feature and an integer label in its last column.
// Blank rows are skipped and trailing carriage returns (CRLF files) removed.
// Throws invalid_argument when the file cannot be opened, the header has
// fewer than two columns, or a row has fewer columns than the header.
// stof / stoi may additionally throw on non-numeric fields.
void Dataset::load_csv()
{
    ifstream file(path + "/" + name + ".csv");
    if (!file.is_open()) {
        throw invalid_argument("Unable to open dataset file.");
    }
    string line;
    getline(file, line);
    if (!line.empty() && line.back() == '\r') {
        line.pop_back();
    }
    vector<string> tokens = split(line, ',');
    // `tokens.end() - 1` and `tokens.back()` are undefined on an empty header.
    if (tokens.size() < 2) {
        throw invalid_argument("Dataset header must name at least one feature and the class.");
    }
    features = vector<string>(tokens.begin(), tokens.end() - 1);
    className = tokens.back();
    for (size_t i = 0; i < features.size(); ++i) {
        Xv.push_back(vector<float>());
    }
    while (getline(file, line)) {
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        if (line.empty()) {
            continue;
        }
        tokens = split(line, ',');
        // A short row would make tokens[i] read past the end of the vector.
        if (tokens.size() < features.size() + 1) {
            throw invalid_argument("Malformed row in dataset file. [" + line + "]");
        }
        for (size_t i = 0; i < features.size(); ++i) {
            Xv[i].push_back(stof(tokens[i]));
        }
        yv.push_back(stoi(tokens.back()));
    }
    // The ifstream closes itself when it goes out of scope.
}
void Dataset::computeStates()
{
for (int i = 0; i < features.size(); ++i) {
states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()));
iota(Xd[i].begin(), Xd[i].end(), 0);
}
states[className] = vector<int>(*max_element(yv.begin(), yv.end()));
iota(yv.begin(), yv.end(), 0);
}
// Loads "<path>/<name>.arff" through ArffFiles, using the configured
// className to pick the label column, and stores the samples, labels,
// class name and feature names in this object's members.
void Dataset::load_arff()
{
    auto arff = ArffFiles();
    arff.load(path + "/" + name + ".arff", className);
    // Get Dataset X, y
    Xv = arff.getX();
    yv = arff.getY();
    // Get className & Features.
    // Assign to the members: declaring locals with the same names here would
    // shadow them and leave `features` empty after loading.
    className = arff.getClassName();
    features.clear();
    for (const auto& attribute : arff.getAttributes()) {
        features.push_back(attribute.first);
    }
}
void Dataset::load()
{
if (loaded) {
return;
}
if (fileType == CSV) {
load_csv();
} else if (fileType == ARFF) {
load_arff();
}
if (discretize) {
Xd = discretizeDataset(Xv, yv);
computeStates();
n_samples = Xd[0].size();
n_features = Xd.size();
}
loaded = true;
}
// Builds the X tensor (n_features x n_samples) from the discretized vectors
// (int32) or the raw vectors (float32), and the y tensor (int32) from yv.
void Dataset::buildTensors()
{
    if (discretize) {
        X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
    } else {
        X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
    }
    // Iterate over the rows actually held in the data rather than over the
    // feature names, so X is filled even if the name list is incomplete.
    const int rows = static_cast<int>(discretize ? Xd.size() : Xv.size());
    for (int i = 0; i < rows; ++i) {
        if (discretize) {
            X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
        } else {
            X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
        }
    }
    // Built once, outside the loop: it does not depend on the row and must
    // exist even when there are no feature rows.
    y = torch::tensor(yv, torch::kInt32);
}
}

63
src/Platform/Datasets.h Normal file
View File

@ -0,0 +1,63 @@
#ifndef DATASETS_H
#define DATASETS_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
namespace platform {
using namespace std;
// On-disk format of a dataset file; Dataset::load uses it to choose between the CSV and ARFF loaders.
enum fileType_t { CSV, ARFF };
// A single dataset: where it lives on disk, how to read it, and (once
// load() has run) its samples as vectors and as torch tensors.
class Dataset {
private:
    string path;            // Folder that holds the data file
    string name;            // File name without extension
    fileType_t fileType;    // CSV or ARFF
    string className;       // Name of the label variable
    // Zero-initialised so that copying a dataset that was never loaded does
    // not read indeterminate values.
    int n_samples{ 0 };
    int n_features{ 0 };
    vector<string> features;            // Feature names
    map<string, vector<int>> states;    // Variable name -> list of states
    bool loaded{ false };               // True once load() has completed
    bool discretize{ false };           // Whether load() discretizes the features
    torch::Tensor X, y;                 // Rebuilt by buildTensors()
    vector<vector<float>> Xv;           // Raw samples, one vector per feature
    vector<vector<int>> Xd;             // Discretized samples, one vector per feature
    vector<int> yv;                     // Labels
    void buildTensors();
    void load_csv();
    void load_arff();
    void computeStates();
public:
    // Stores the description only; nothing is read until load() is called.
    // The initializers follow the member declaration order.
    Dataset(string path, string name, string className, bool discretize, fileType_t fileType) : path(path), name(name), fileType(fileType), className(className), loaded(false), discretize(discretize) {}
    Dataset(Dataset&);
    string getName();
    string getClassName();
    // The accessors below throw invalid_argument until load() has run.
    vector<string> getFeatures();
    map<string, vector<int>> getStates();
    pair<vector<vector<float>>&, vector<int>&> getVectors();
    pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
    pair<torch::Tensor&, torch::Tensor&> getTensors();
    int getNFeatures();
    int getNSamples();
    void load();
    bool isLoaded() const { return loaded; }
};
// Catalog of the datasets listed in "<path>/all.txt", keyed by name.
// The catalog is read on construction; individual datasets are loaded on
// demand by the vector / tensor accessors.
class Datasets {
private:
    string path;
    fileType_t fileType;
    map<string, unique_ptr<Dataset>> datasets;
    bool discretize;
    void load(); // Loads the list of datasets
public:
    Datasets(string path, bool discretize = false, fileType_t fileType = ARFF)
        : path(path), fileType(fileType), discretize(discretize)
    {
        load();
    }
    // Throws invalid_argument when `name` is not in the catalog.
    Dataset& getDataset(string name);
    vector<string> getNames();
    // These two throw invalid_argument when the dataset is not loaded yet.
    vector<string> getFeatures(string name);
    map<string, vector<int>> getStates(string name);
    // These three load the dataset first when needed.
    pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
    pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);
    pair<torch::Tensor&, torch::Tensor&> getTensors(string name);
};
};
#endif

View File

@ -101,6 +101,7 @@ namespace platform {
Timer train_timer, test_timer;
for (int i = 0; i < k; i++) {
bayesnet::BaseClassifier* model = classifiers[model_name];
result.setModelVersion(model->getVersion());
train_timer.start();
auto [train, test] = fold->getFold(i);
auto train_t = torch::tensor(train);

View File

@ -24,7 +24,7 @@ namespace platform {
};
class Result {
private:
string dataset, hyperparameters;
string dataset, hyperparameters, model_version;
int samples, features, classes;
float score_train, score_test, score_train_std, score_test_std, train_time, train_time_std, test_time, test_time_std;
float nodes, leaves, depth;
@ -46,6 +46,7 @@ namespace platform {
// Chainable setters: each stores its argument in the member of the same name and returns *this.
Result& setNodes(float nodes) { this->nodes = nodes; return *this; }
Result& setLeaves(float leaves) { this->leaves = leaves; return *this; }
Result& setDepth(float depth) { this->depth = depth; return *this; }
Result& setModelVersion(string model_version) { this->model_version = model_version; return *this; }
// Read accessors for the stored values.
const float get_score_train() const { return score_train; }
// NOTE(review): unlike the neighbouring getters this one is not const-qualified.
float get_score_test() { return score_test; }
const string& getDataset() const { return dataset; }
@ -64,6 +65,7 @@ namespace platform {
// Read accessors: each returns the member of the same name.
const float getNodes() const { return nodes; }
const float getLeaves() const { return leaves; }
const float getDepth() const { return depth; }
// Returns a reference to the stored model version string (set through setModelVersion).
const string& getModelVersion() const { return model_version; }
};
class Experiment {
private:

View File

@ -1,49 +1,17 @@
#include <iostream>
#include <string>
#include <torch/torch.h>
#include <thread>
#include <argparse/argparse.hpp>
#include "ArffFiles.h"
#include "Network.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "KDB.h"
#include "SPODE.h"
#include "AODE.h"
#include "TAN.h"
#include "platformUtils.h"
#include "Experiment.h"
#include "Folding.h"
#include "Datasets.h"
using namespace std;
int main(int argc, char** argv)
argparse::ArgumentParser manageArguments(int argc, char** argv)
{
map<string, bool> datasets = {
{"diabetes", true},
{"ecoli", true},
{"glass", true},
{"iris", true},
{"kdd_JapaneseVowels", false},
{"letter", true},
{"liver-disorders", true},
{"mfeat-factors", true},
};
auto valid_datasets = vector<string>();
for (auto dataset : datasets) {
valid_datasets.push_back(dataset.first);
}
argparse::ArgumentParser program("BayesNetSample");
program.add_argument("-d", "--dataset")
.help("Dataset file name")
.action([valid_datasets](const std::string& value) {
if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
return value;
}
throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
}
);
.help("Dataset file name");
program.add_argument("-p", "--path")
.help("folder where the data files are located, default")
.default_value(string{ PATH }
@ -89,7 +57,7 @@ int main(int argc, char** argv)
n_folds = program.get<int>("folds");
seed = program.get<int>("seed");
complete_file_name = path + file_name + ".arff";
class_last = datasets[file_name];
class_last = false;//datasets[file_name];
title = program.get<string>("title");
if (!file_exists(complete_file_name)) {
throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
@ -100,24 +68,54 @@ int main(int argc, char** argv)
cerr << program;
exit(1);
}
return program;
}
int main(int argc, char** argv)
{
auto program = manageArguments(argc, argv);
auto file_name = program.get<string>("dataset");
auto path = program.get<string>("path");
auto model_name = program.get<string>("model");
auto discretize_dataset = program.get<bool>("discretize");
auto stratified = program.get<bool>("stratified");
auto n_folds = program.get<int>("folds");
auto seed = program.get<int>("seed");
vector<string> filesToProcess;
auto datasets = platform::Datasets(path, true, platform::ARFF);
if (file_name != "") {
filesToProcess.push_back(file_name);
} else {
filesToProcess = platform::Datasets(path, true, platform::ARFF).getNames();
}
auto title = program.get<string>("title");
/*
* Begin Processing
*/
auto [X, y, features, className, states] = loadDataset(path, file_name, class_last, discretize_dataset);
Fold* fold;
if (stratified)
fold = new StratifiedKFold(n_folds, y, seed);
else
fold = new KFold(n_folds, y.numel(), seed);
auto experiment = platform::Experiment();
experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("1.0.0");
experiment.setDiscretized(discretize_dataset).setModel(model_name).setModelVersion("1...0").setPlatform("BayesNet");
experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet");
experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy");
platform::Timer timer;
timer.start();
auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
result.setDataset(file_name);
experiment.addResult(result);
for (auto fileName : filesToProcess) {
cout << "Processing " << fileName << endl;
auto [X, y] = datasets.getTensors(fileName);
// auto states = datasets.getStates(fileName);
// auto features = datasets.getFeatures(fileName);
// auto className = datasets.getDataset(fileName).getClassName();
// Fold* fold;
// if (stratified)
// fold = new StratifiedKFold(n_folds, y, seed);
// else
// fold = new KFold(n_folds, y.numel(), seed);
// auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
// result.setDataset(file_name);
// experiment.setModelVersion(result.getModelVersion());
// experiment.addResult(result);
// delete fold;
}
experiment.setDuration(timer.getDuration());
experiment.save(path);
experiment.show();

View File

@ -12,6 +12,7 @@ const string PATH = "../../data/";
bool file_exists(const std::string& name);
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className);
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name);
tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(string path, string name, bool class_last, bool discretize_dataset);