Remove platformUtils and split Datasets & Dataset
parent bb423da42f
commit 66ec1b343b
.vscode/launch.json (vendored, 16 lines changed)
@@ -22,26 +22,24 @@
            "type": "lldb",
            "request": "launch",
            "name": "experiment",
            "program": "${workspaceFolder}/build/src/Platform/main",
            "program": "${workspaceFolder}/build/src/Platform/b_main",
            "args": [
                "-m",
                "BoostAODE",
                "-p",
                "/Users/rmontanana/Code/discretizbench/datasets",
                "TAN",
                "--stratified",
                "-d",
                "mfeat-morphological",
                "zoo",
                "--discretize"
                // "--hyperparameters",
                // "{\"repeatSparent\": true, \"maxModels\": 12}"
            ],
            "cwd": "/Users/rmontanana/Code/discretizbench",
            "cwd": "/Users/rmontanana/Code/odtebench",
        },
        {
            "type": "lldb",
            "request": "launch",
            "name": "best",
            "program": "${workspaceFolder}/build/src/Platform/best",
            "program": "${workspaceFolder}/build/src/Platform/b_best",
            "args": [
                "-m",
                "BoostAODE",
@@ -55,7 +53,7 @@
            "type": "lldb",
            "request": "launch",
            "name": "manage",
            "program": "${workspaceFolder}/build/src/Platform/manage",
            "program": "${workspaceFolder}/build/src/Platform/b_manage",
            "args": [
                "-n",
                "20"
@@ -66,7 +64,7 @@
            "type": "lldb",
            "request": "launch",
            "name": "list",
            "program": "${workspaceFolder}/build/src/Platform/list",
            "program": "${workspaceFolder}/build/src/Platform/b_list",
            "args": [],
            "cwd": "/Users/rmontanana/Code/discretizbench",
        },
src/Platform/CMakeLists.txt
@@ -5,9 +5,9 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include)
add_executable(b_main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc platformUtils.cc ExcelFile.cc)
add_executable(b_list list.cc platformUtils Datasets.cc)
add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Models.cc ReportConsole.cc ReportBase.cc)
add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
add_executable(b_list list.cc Datasets.cc Dataset.cc)
add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
src/Platform/Dataset.cc (new file, 225 lines)
@@ -0,0 +1,225 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
    Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
    {
    }
    string Dataset::getName() const
    {
        return name;
    }
    string Dataset::getClassName() const
    {
        return className;
    }
    vector<string> Dataset::getFeatures() const
    {
        if (loaded) {
            return features;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    int Dataset::getNFeatures() const
    {
        if (loaded) {
            return n_features;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    int Dataset::getNSamples() const
    {
        if (loaded) {
            return n_samples;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    map<string, vector<int>> Dataset::getStates() const
    {
        if (loaded) {
            return states;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
    {
        if (loaded) {
            return { Xv, yv };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
    {
        if (loaded) {
            return { Xd, yv };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
    {
        if (loaded) {
            buildTensors();
            return { X, y };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    void Dataset::load_csv()
    {
        ifstream file(path + "/" + name + ".csv");
        if (file.is_open()) {
            string line;
            getline(file, line);
            vector<string> tokens = split(line, ',');
            features = vector<string>(tokens.begin(), tokens.end() - 1);
            if (className == "-1") {
                className = tokens.back();
            }
            for (auto i = 0; i < features.size(); ++i) {
                Xv.push_back(vector<float>());
            }
            while (getline(file, line)) {
                tokens = split(line, ',');
                for (auto i = 0; i < features.size(); ++i) {
                    Xv[i].push_back(stof(tokens[i]));
                }
                yv.push_back(stoi(tokens.back()));
            }
            file.close();
        } else {
            throw invalid_argument("Unable to open dataset file.");
        }
    }
    void Dataset::computeStates()
    {
        for (int i = 0; i < features.size(); ++i) {
            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
            auto item = states.at(features[i]);
            iota(begin(item), end(item), 0);
        }
        states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
        iota(begin(states.at(className)), end(states.at(className)), 0);
    }
    void Dataset::load_arff()
    {
        auto arff = ArffFiles();
        arff.load(path + "/" + name + ".arff", className);
        // Get Dataset X, y
        Xv = arff.getX();
        yv = arff.getY();
        // Get className & Features
        className = arff.getClassName();
        auto attributes = arff.getAttributes();
        transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
    }
    vector<string> tokenize(string line)
    {
        vector<string> tokens;
        for (auto i = 0; i < line.size(); ++i) {
            if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
                string token = line.substr(0, i);
                tokens.push_back(token);
                line.erase(line.begin(), line.begin() + i + 1);
                i = 0;
                while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
                    line.erase(line.begin(), line.begin() + i + 1);
            }
        }
        if (line.size() > 0) {
            tokens.push_back(line);
        }
        return tokens;
    }
    void Dataset::load_rdata()
    {
        ifstream file(path + "/" + name + "_R.dat");
        if (file.is_open()) {
            string line;
            getline(file, line);
            line = ArffFiles::trim(line);
            vector<string> tokens = tokenize(line);
            transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
            if (className == "-1") {
                className = ArffFiles::trim(tokens.back());
            }
            for (auto i = 0; i < features.size(); ++i) {
                Xv.push_back(vector<float>());
            }
            while (getline(file, line)) {
                tokens = tokenize(line);
                // We have to skip the first token, which is the instance number.
                for (auto i = 1; i < features.size() + 1; ++i) {
                    const float value = stof(tokens[i]);
                    Xv[i - 1].push_back(value);
                }
                yv.push_back(stoi(tokens.back()));
            }
            file.close();
        } else {
            throw invalid_argument("Unable to open dataset file.");
        }
    }
    void Dataset::load()
    {
        if (loaded) {
            return;
        }
        if (fileType == CSV) {
            load_csv();
        } else if (fileType == ARFF) {
            load_arff();
        } else if (fileType == RDATA) {
            load_rdata();
        }
        if (discretize) {
            Xd = discretizeDataset(Xv, yv);
            computeStates();
        }
        n_samples = Xv[0].size();
        n_features = Xv.size();
        loaded = true;
    }
    void Dataset::buildTensors()
    {
        if (discretize) {
            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
        } else {
            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
        }
        for (int i = 0; i < features.size(); ++i) {
            if (discretize) {
                X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
            } else {
                X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
            }
        }
        y = torch::tensor(yv, torch::kInt32);
    }
    vector<mdlp::labels_t> Dataset::discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
    {
        vector<mdlp::labels_t> Xd;
        auto fimdlp = mdlp::CPPFImdlp();
        for (int i = 0; i < X.size(); i++) {
            fimdlp.fit(X[i], y);
            mdlp::labels_t& xd = fimdlp.transform(X[i]);
            Xd.push_back(xd);
        }
        return Xd;
    }
    vector<string> Dataset::split(const string& text, char delimiter)
    {
        vector<string> result;
        stringstream ss(text);
        string token;
        while (getline(ss, token, delimiter)) {
            result.push_back(token);
        }
        return result;
    }
}
src/Platform/Dataset.h (new file, 80 lines)
@@ -0,0 +1,80 @@
#ifndef DATASET_H
#define DATASET_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
namespace platform {
    using namespace std;

    enum fileType_t { CSV, ARFF, RDATA };
    class SourceData {
    public:
        SourceData(string source)
        {
            if (source == "Surcov") {
                path = "datasets/";
                fileType = CSV;
            } else if (source == "Arff") {
                path = "datasets/";
                fileType = ARFF;
            } else if (source == "Tanveer") {
                path = "data/";
                fileType = RDATA;
            } else {
                throw invalid_argument("Unknown source.");
            }
        }
        string getPath()
        {
            return path;
        }
        fileType_t getFileType()
        {
            return fileType;
        }
    private:
        string path;
        fileType_t fileType;
    };
    class Dataset {
    private:
        string path;
        string name;
        fileType_t fileType;
        string className;
        int n_samples{ 0 }, n_features{ 0 };
        vector<string> features;
        map<string, vector<int>> states;
        bool loaded;
        bool discretize;
        torch::Tensor X, y;
        vector<vector<float>> Xv;
        vector<vector<int>> Xd;
        vector<int> yv;
        void buildTensors();
        void load_csv();
        void load_arff();
        void load_rdata();
        void computeStates();
        vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
    public:
        Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
        explicit Dataset(const Dataset&);
        static vector<string> split(const string& text, char delimiter);
        string getName() const;
        string getClassName() const;
        vector<string> getFeatures() const;
        map<string, vector<int>> getStates() const;
        pair<vector<vector<float>>&, vector<int>&> getVectors();
        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
        pair<torch::Tensor&, torch::Tensor&> getTensors();
        int getNFeatures() const;
        int getNSamples() const;
        void load();
        const bool inline isLoaded() const { return loaded; };
    };
};

#endif
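Editor's note: a minimal usage sketch of the SourceData/Dataset pair declared above, to show how the new split is meant to be consumed. It is based only on the declarations in this commit; the dataset name "iris", the class name "class", and the "Arff" source label are illustrative assumptions, not code from the repository.

    // Hypothetical usage sketch (not part of the commit); see note above.
    #include "Dataset.h"

    int main()
    {
        auto source = platform::SourceData("Arff");           // resolves "datasets/" and fileType ARFF
        auto ds = platform::Dataset(source.getPath(), "iris", "class",
                                    /*discretize=*/true, source.getFileType());
        ds.load();                                            // reads the file, discretizes, fills states
        auto [X, y] = ds.getTensors();                        // n_features x n_samples tensors
        auto states = ds.getStates();                         // per-feature cardinalities
        return ds.isLoaded() && ds.getNSamples() > 0 ? 0 : 1;
    }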
src/Platform/Datasets.cc
@@ -1,6 +1,4 @@
#include "Datasets.h"
#include "platformUtils.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
    void Datasets::load()
@@ -15,7 +13,7 @@ namespace platform {
            if (line.empty() || line[0] == '#') {
                continue;
            }
            vector<string> tokens = split(line, ',');
            vector<string> tokens = Dataset::split(line, ',');
            string name = tokens[0];
            string className;
            try {
@@ -129,203 +127,4 @@ namespace platform {
    {
        return datasets.find(name) != datasets.end();
    }
    Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
    {
    }
    string Dataset::getName() const
    {
        return name;
    }
    string Dataset::getClassName() const
    {
        return className;
    }
    vector<string> Dataset::getFeatures() const
    {
        if (loaded) {
            return features;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    int Dataset::getNFeatures() const
    {
        if (loaded) {
            return n_features;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    int Dataset::getNSamples() const
    {
        if (loaded) {
            return n_samples;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    map<string, vector<int>> Dataset::getStates() const
    {
        if (loaded) {
            return states;
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
    {
        if (loaded) {
            return { Xv, yv };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
    {
        if (loaded) {
            return { Xd, yv };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
    {
        if (loaded) {
            buildTensors();
            return { X, y };
        } else {
            throw invalid_argument("Dataset not loaded.");
        }
    }
    void Dataset::load_csv()
    {
        ifstream file(path + "/" + name + ".csv");
        if (file.is_open()) {
            string line;
            getline(file, line);
            vector<string> tokens = split(line, ',');
            features = vector<string>(tokens.begin(), tokens.end() - 1);
            if (className == "-1") {
                className = tokens.back();
            }
            for (auto i = 0; i < features.size(); ++i) {
                Xv.push_back(vector<float>());
            }
            while (getline(file, line)) {
                tokens = split(line, ',');
                for (auto i = 0; i < features.size(); ++i) {
                    Xv[i].push_back(stof(tokens[i]));
                }
                yv.push_back(stoi(tokens.back()));
            }
            file.close();
        } else {
            throw invalid_argument("Unable to open dataset file.");
        }
    }
    void Dataset::computeStates()
    {
        for (int i = 0; i < features.size(); ++i) {
            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
            auto item = states.at(features[i]);
            iota(begin(item), end(item), 0);
        }
        states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
        iota(begin(states.at(className)), end(states.at(className)), 0);
    }
    void Dataset::load_arff()
    {
        auto arff = ArffFiles();
        arff.load(path + "/" + name + ".arff", className);
        // Get Dataset X, y
        Xv = arff.getX();
        yv = arff.getY();
        // Get className & Features
        className = arff.getClassName();
        auto attributes = arff.getAttributes();
        transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
    }
    vector<string> tokenize(string line)
    {
        vector<string> tokens;
        for (auto i = 0; i < line.size(); ++i) {
            if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
                string token = line.substr(0, i);
                tokens.push_back(token);
                line.erase(line.begin(), line.begin() + i + 1);
                i = 0;
                while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
                    line.erase(line.begin(), line.begin() + i + 1);
            }
        }
        if (line.size() > 0) {
            tokens.push_back(line);
        }
        return tokens;
    }
    void Dataset::load_rdata()
    {
        ifstream file(path + "/" + name + "_R.dat");
        if (file.is_open()) {
            string line;
            getline(file, line);
            line = ArffFiles::trim(line);
            vector<string> tokens = tokenize(line);
            transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
            if (className == "-1") {
                className = ArffFiles::trim(tokens.back());
            }
            for (auto i = 0; i < features.size(); ++i) {
                Xv.push_back(vector<float>());
            }
            while (getline(file, line)) {
                tokens = tokenize(line);
                // We have to skip the first token, which is the instance number.
                for (auto i = 1; i < features.size() + 1; ++i) {
                    const float value = stof(tokens[i]);
                    Xv[i - 1].push_back(value);
                }
                yv.push_back(stoi(tokens.back()));
            }
            file.close();
        } else {
            throw invalid_argument("Unable to open dataset file.");
        }
    }
    void Dataset::load()
    {
        if (loaded) {
            return;
        }
        if (fileType == CSV) {
            load_csv();
        } else if (fileType == ARFF) {
            load_arff();
        } else if (fileType == RDATA) {
            load_rdata();
        }
        if (discretize) {
            Xd = discretizeDataset(Xv, yv);
            computeStates();
        }
        n_samples = Xv[0].size();
        n_features = Xv.size();
        loaded = true;
    }
    void Dataset::buildTensors()
    {
        if (discretize) {
            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
        } else {
            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
        }
        for (int i = 0; i < features.size(); ++i) {
            if (discretize) {
                X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
            } else {
                X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
            }
        }
        y = torch::tensor(yv, torch::kInt32);
    }
}
src/Platform/Datasets.h
@@ -1,76 +1,8 @@
#ifndef DATASETS_H
#define DATASETS_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "Dataset.h"
namespace platform {
    using namespace std;
    enum fileType_t { CSV, ARFF, RDATA };
    class SourceData {
    public:
        SourceData(string source)
        {
            if (source == "Surcov") {
                path = "datasets/";
                fileType = CSV;
            } else if (source == "Arff") {
                path = "datasets/";
                fileType = ARFF;
            } else if (source == "Tanveer") {
                path = "data/";
                fileType = RDATA;
            } else {
                throw invalid_argument("Unknown source.");
            }
        }
        string getPath()
        {
            return path;
        }
        fileType_t getFileType()
        {
            return fileType;
        }
    private:
        string path;
        fileType_t fileType;
    };
    class Dataset {
    private:
        string path;
        string name;
        fileType_t fileType;
        string className;
        int n_samples{ 0 }, n_features{ 0 };
        vector<string> features;
        map<string, vector<int>> states;
        bool loaded;
        bool discretize;
        torch::Tensor X, y;
        vector<vector<float>> Xv;
        vector<vector<int>> Xd;
        vector<int> yv;
        void buildTensors();
        void load_csv();
        void load_arff();
        void load_rdata();
        void computeStates();
    public:
        Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
        explicit Dataset(const Dataset&);
        string getName() const;
        string getClassName() const;
        vector<string> getFeatures() const;
        map<string, vector<int>> getStates() const;
        pair<vector<vector<float>>&, vector<int>&> getVectors();
        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
        pair<torch::Tensor&, torch::Tensor&> getTensors();
        int getNFeatures() const;
        int getNSamples() const;
        void load();
        const bool inline isLoaded() const { return loaded; };
    };
    class Datasets {
    private:
        string path;
src/Platform/DotEnv.h
@@ -4,7 +4,7 @@
#include <map>
#include <fstream>
#include <sstream>
#include "platformUtils.h"
#include "Dataset.h"
namespace platform {
    class DotEnv {
    private:
@@ -51,7 +51,7 @@ namespace platform {
            auto seeds_str = env["seeds"];
            seeds_str = trim(seeds_str);
            seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
            auto seeds_str_split = split(seeds_str, ',');
            auto seeds_str_split = Dataset::split(seeds_str, ',');
            transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
                return stoi(str);
            });
src/Platform/Experiment.cc
@@ -102,12 +102,12 @@
        cout << data.dump(4) << endl;
    }

    void Experiment::go(vector<string> filesToProcess, const string& path)
    void Experiment::go(vector<string> filesToProcess)
    {
        cout << "*** Starting experiment: " << title << " ***" << endl;
        for (auto fileName : filesToProcess) {
            cout << "- " << setw(20) << left << fileName << " " << right << flush;
            cross_validation(path, fileName);
            cross_validation(fileName);
            cout << endl;
        }
    }
@@ -132,7 +132,7 @@ namespace platform {
        cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;

    }
    void Experiment::cross_validation(const string& path, const string& fileName)
    void Experiment::cross_validation(const string& fileName)
    {
        auto env = platform::DotEnv();
        auto datasets = platform::Datasets(discretized, env.get("source_data"));
src/Platform/Experiment.h
@@ -108,8 +108,8 @@
        Experiment& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
        string get_file_name();
        void save(const string& path);
        void cross_validation(const string& path, const string& fileName);
        void go(vector<string> filesToProcess, const string& path);
        void cross_validation(const string& fileName);
        void go(vector<string> filesToProcess);
        void show();
        void report();
    };
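Editor's note: the two signature changes above drop the data-path parameter from the experiment API. As the Experiment.cc and DotEnv.h hunks in this commit suggest, the folder is now resolved through the .env "source_data" entry and SourceData, so callers only pass dataset names. A hedged caller-side sketch (the experiment object and the file list are placeholders, not code from the commit):

    // Hypothetical caller-side sketch (not part of the commit); see note above.
    #include "Experiment.h"

    void runSketch(platform::Experiment& experiment)
    {
        // before this commit: experiment.go(filesToTest, path);
        experiment.go({ "zoo", "mfeat-morphological" });  // dataset names only; the data folder is
                                                          // resolved via DotEnv ("source_data") + SourceData
    }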
src/Platform/Paths.h
@@ -4,7 +4,6 @@
namespace platform {
    class Paths {
    public:
        static std::string datasets() { return "datasets/"; }
        static std::string results() { return "results/"; }
        static std::string excel() { return "excel/"; }
    };
@@ -1,5 +1,4 @@
#include <filesystem>
#include "platformUtils.h"
#include "Results.h"
#include "ReportConsole.h"
#include "ReportExcel.h"
src/Platform/main.cc
@@ -1,7 +1,6 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include <nlohmann/json.hpp>
#include "platformUtils.h"
#include "Experiment.h"
#include "Datasets.h"
#include "DotEnv.h"
@@ -19,9 +18,6 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
    argparse::ArgumentParser program("main");
    program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
    program.add_argument("--hyperparameters").default_value("{}").help("Hyperparamters passed to the model in Experiment");
    program.add_argument("-p", "--path")
        .help("folder where the data files are located, default")
        .default_value(string{ platform::Paths::datasets() });
    program.add_argument("-m", "--model")
        .help("Model to use " + platform::Models::instance()->toString())
        .action([](const std::string& value) {
@@ -55,13 +51,11 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
    try {
        program.parse_args(argc, argv);
        auto file_name = program.get<string>("dataset");
        auto path = program.get<string>("path");
        auto model_name = program.get<string>("model");
        auto discretize_dataset = program.get<bool>("discretize");
        auto stratified = program.get<bool>("stratified");
        auto n_folds = program.get<int>("folds");
        auto seeds = program.get<vector<int>>("seeds");
        auto complete_file_name = path + file_name + ".arff";
        auto title = program.get<string>("title");
        auto hyperparameters = program.get<string>("hyperparameters");
        auto saveResults = program.get<bool>("save");
@@ -81,7 +75,6 @@ int main(int argc, char** argv)
{
    auto program = manageArguments(argc, argv);
    auto file_name = program.get<string>("dataset");
    auto path = program.get<string>("path");
    auto model_name = program.get<string>("model");
    auto discretize_dataset = program.get<bool>("discretize");
    auto stratified = program.get<bool>("stratified");
@@ -120,7 +113,7 @@ int main(int argc, char** argv)
    }
    platform::Timer timer;
    timer.start();
    experiment.go(filesToTest, path);
    experiment.go(filesToTest);
    experiment.setDuration(timer.getDuration());
    if (saveResults) {
        experiment.save(platform::Paths::results());
@@ -1,6 +1,5 @@
#include <iostream>
#include <argparse/argparse.hpp>
#include "platformUtils.h"
#include "Paths.h"
#include "Results.h"

src/Platform/platformUtils.cc (deleted file, 110 lines)
@@ -1,110 +0,0 @@
#include "platformUtils.h"
#include "Paths.h"

using namespace torch;

vector<string> split(const string& text, char delimiter)
{
    vector<string> result;
    stringstream ss(text);
    string token;
    while (getline(ss, token, delimiter)) {
        result.push_back(token);
    }
    return result;
}

pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    auto fimdlp = mdlp::CPPFImdlp();
    for (int i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        mdlp::labels_t& xd = fimdlp.transform(X[i]);
        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
        Xd.push_back(xd);
    }
    return { Xd, maxes };
}

vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    vector<mdlp::labels_t> Xd;
    auto fimdlp = mdlp::CPPFImdlp();
    for (int i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        mdlp::labels_t& xd = fimdlp.transform(X[i]);
        Xd.push_back(xd);
    }
    return Xd;
}

bool file_exists(const string& name)
{
    if (FILE* file = fopen(name.c_str(), "r")) {
        fclose(file);
        return true;
    } else {
        return false;
    }
}

tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset)
{
    auto handler = ArffFiles();
    handler.load(path + static_cast<string>(name) + ".arff", class_last);
    // Get Dataset X, y
    vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    vector<string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
    Tensor Xd;
    auto states = map<string, vector<int>>();
    if (discretize_dataset) {
        auto Xr = discretizeDataset(X, y);
        Xd = torch::zeros({ static_cast<int>(Xr[0].size()), static_cast<int>(Xr.size()) }, torch::kInt32);
        for (int i = 0; i < features.size(); ++i) {
            states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
            auto item = states.at(features[i]);
            iota(begin(item), end(item), 0);
            Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt32));
        }
        states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
        iota(begin(states.at(className)), end(states.at(className)), 0);
    } else {
        Xd = torch::zeros({ static_cast<int>(X[0].size()), static_cast<int>(X.size()) }, torch::kFloat32);
        for (int i = 0; i < features.size(); ++i) {
            Xd.index_put_({ "...", i }, torch::tensor(X[i]));
        }
    }
    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
}

tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
{
    auto handler = ArffFiles();
    handler.load(platform::Paths::datasets() + static_cast<string>(name) + ".arff");
    // Get Dataset X, y
    vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    vector<string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
    // Discretize Dataset
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    tie(Xd, maxes) = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
    map<string, vector<int>> states;
    for (auto feature : features) {
        states[feature] = vector<int>(maxes[feature]);
    }
    states[className] = vector<int>(maxes[className]);
    return { Xd, y, features, className, states };
}
@ -1,20 +0,0 @@
|
||||
#ifndef PLATFORM_UTILS_H
|
||||
#define PLATFORM_UTILS_H
|
||||
#include <torch/torch.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <tuple>
|
||||
#include "ArffFiles.h"
|
||||
#include "CPPFImdlp.h"
|
||||
using namespace std;
|
||||
|
||||
bool file_exists(const std::string& name);
|
||||
vector<string> split(const string& text, char delimiter);
|
||||
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
|
||||
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
|
||||
pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, const string& className);
|
||||
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name);
|
||||
tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset);
|
||||
map<string, vector<int>> get_states(vector<string>& features, string className, map<string, int>& maxes);
|
||||
#endif //PLATFORM_UTILS_H
|
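Editor's note: a rough map from the deleted platformUtils helpers to their new homes, as far as this diff shows it. split() becomes the static Dataset::split(), discretizeDataset() becomes a private Dataset method driven by the discretize flag, and the loadDataset()/loadFile() tuples give way to constructing a Dataset (or going through Datasets) and calling load()/getTensors()/getVectors(). The snippet below is illustrative only; the wrapper function, variable names, and the ARFF choice are assumptions, not code from the commit.

    // Hypothetical migration sketch (not part of the commit); see note above.
    #include "Dataset.h"

    void migrationSketch(const std::string& line, const std::string& path,
                         const std::string& name, const std::string& className)
    {
        // old: vector<string> tokens = split(line, ',');
        auto tokens = platform::Dataset::split(line, ',');

        // old: tie(Xd, y, features, cls, states) = loadDataset(path, name, true, true);
        platform::Dataset ds(path, name, className, /*discretize=*/true, platform::ARFF);
        ds.load();
        auto [X, y] = ds.getTensors();      // torch tensors
        auto states = ds.getStates();       // per-feature state counts
        auto features = ds.getFeatures();
    }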
@@ -9,7 +9,6 @@
#include "TAN.h"
#include "SPODE.h"
#include "AODE.h"
#include "platformUtils.h"

TEST_CASE("Test Bayesian Classifiers score", "[BayesNet]")
{
@@ -3,7 +3,6 @@
#include <catch2/generators/catch_generators.hpp>
#include <string>
#include "KDB.h"
#include "platformUtils.h"

TEST_CASE("Test Bayesian Network")
{
@@ -5,7 +5,7 @@ if(ENABLE_TESTING)
    include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
    include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
    include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
    set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCE_DIR}/src/Platform/platformUtils.cc ${BayesNet_SOURCES})
    set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCES})
    add_executable(${TEST_MAIN} ${TEST_SOURCES})
    target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
    add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})