diff --git a/.vscode/launch.json b/.vscode/launch.json
index 4e9b5f1..26ae62b 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -22,26 +22,24 @@
             "type": "lldb",
             "request": "launch",
             "name": "experiment",
-            "program": "${workspaceFolder}/build/src/Platform/main",
+            "program": "${workspaceFolder}/build/src/Platform/b_main",
             "args": [
                 "-m",
-                "BoostAODE",
-                "-p",
-                "/Users/rmontanana/Code/discretizbench/datasets",
+                "TAN",
                 "--stratified",
                 "-d",
-                "mfeat-morphological",
+                "zoo",
                 "--discretize"
                 // "--hyperparameters",
                 // "{\"repeatSparent\": true, \"maxModels\": 12}"
             ],
-            "cwd": "/Users/rmontanana/Code/discretizbench",
+            "cwd": "/Users/rmontanana/Code/odtebench",
         },
         {
             "type": "lldb",
             "request": "launch",
             "name": "best",
-            "program": "${workspaceFolder}/build/src/Platform/best",
+            "program": "${workspaceFolder}/build/src/Platform/b_best",
             "args": [
                 "-m",
                 "BoostAODE",
@@ -55,7 +53,7 @@
             "type": "lldb",
             "request": "launch",
             "name": "manage",
-            "program": "${workspaceFolder}/build/src/Platform/manage",
+            "program": "${workspaceFolder}/build/src/Platform/b_manage",
             "args": [
                 "-n",
                 "20"
@@ -66,7 +64,7 @@
             "type": "lldb",
             "request": "launch",
             "name": "list",
-            "program": "${workspaceFolder}/build/src/Platform/list",
+            "program": "${workspaceFolder}/build/src/Platform/b_list",
             "args": [],
             "cwd": "/Users/rmontanana/Code/discretizbench",
         },
diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt
index b36916b..4e62a1f 100644
--- a/src/Platform/CMakeLists.txt
+++ b/src/Platform/CMakeLists.txt
@@ -5,9 +5,9 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
 include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
 include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
 include_directories(${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/include)
-add_executable(b_main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc ReportConsole.cc ReportBase.cc)
-add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc platformUtils.cc ExcelFile.cc)
-add_executable(b_list list.cc platformUtils Datasets.cc)
+add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Models.cc ReportConsole.cc ReportBase.cc)
+add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
+add_executable(b_list list.cc Datasets.cc Dataset.cc)
 add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
 target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
 if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
diff --git a/src/Platform/Dataset.cc b/src/Platform/Dataset.cc
new file mode 100644
index 0000000..02a36f9
--- /dev/null
+++ b/src/Platform/Dataset.cc
@@ -0,0 +1,225 @@
+#include "Dataset.h"
+#include "ArffFiles.h"
+#include <fstream>
+namespace platform {
+    Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
+    {
+    }
+    string Dataset::getName() const
+    {
+        return name;
+    }
+    string Dataset::getClassName() const
+    {
+        return className;
+    }
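+    // Everything that depends on the file contents is guarded: the getters
+    // below throw invalid_argument until load() has been called.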
loaded."); + } + } + int Dataset::getNFeatures() const + { + if (loaded) { + return n_features; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + int Dataset::getNSamples() const + { + if (loaded) { + return n_samples; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + map> Dataset::getStates() const + { + if (loaded) { + return states; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + pair>&, vector&> Dataset::getVectors() + { + if (loaded) { + return { Xv, yv }; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + pair>&, vector&> Dataset::getVectorsDiscretized() + { + if (loaded) { + return { Xd, yv }; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + pair Dataset::getTensors() + { + if (loaded) { + buildTensors(); + return { X, y }; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + void Dataset::load_csv() + { + ifstream file(path + "/" + name + ".csv"); + if (file.is_open()) { + string line; + getline(file, line); + vector tokens = split(line, ','); + features = vector(tokens.begin(), tokens.end() - 1); + if (className == "-1") { + className = tokens.back(); + } + for (auto i = 0; i < features.size(); ++i) { + Xv.push_back(vector()); + } + while (getline(file, line)) { + tokens = split(line, ','); + for (auto i = 0; i < features.size(); ++i) { + Xv[i].push_back(stof(tokens[i])); + } + yv.push_back(stoi(tokens.back())); + } + file.close(); + } else { + throw invalid_argument("Unable to open dataset file."); + } + } + void Dataset::computeStates() + { + for (int i = 0; i < features.size(); ++i) { + states[features[i]] = vector(*max_element(Xd[i].begin(), Xd[i].end()) + 1); + auto item = states.at(features[i]); + iota(begin(item), end(item), 0); + } + states[className] = vector(*max_element(yv.begin(), yv.end()) + 1); + iota(begin(states.at(className)), end(states.at(className)), 0); + } + void Dataset::load_arff() + { + auto arff = ArffFiles(); + arff.load(path + "/" + name + ".arff", className); + // Get Dataset X, y + Xv = arff.getX(); + yv = arff.getY(); + // Get className & Features + className = arff.getClassName(); + auto attributes = arff.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; }); + } + vector tokenize(string line) + { + vector tokens; + for (auto i = 0; i < line.size(); ++i) { + if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') { + string token = line.substr(0, i); + tokens.push_back(token); + line.erase(line.begin(), line.begin() + i + 1); + i = 0; + while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') + line.erase(line.begin(), line.begin() + i + 1); + } + } + if (line.size() > 0) { + tokens.push_back(line); + } + return tokens; + } + void Dataset::load_rdata() + { + ifstream file(path + "/" + name + "_R.dat"); + if (file.is_open()) { + string line; + getline(file, line); + line = ArffFiles::trim(line); + vector tokens = tokenize(line); + transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); }); + if (className == "-1") { + className = ArffFiles::trim(tokens.back()); + } + for (auto i = 0; i < features.size(); ++i) { + Xv.push_back(vector()); + } + while (getline(file, line)) { + tokens = tokenize(line); + // We have to skip the first token, which is the instance number. 
+    void Dataset::load_rdata()
+    {
+        ifstream file(path + "/" + name + "_R.dat");
+        if (file.is_open()) {
+            string line;
+            getline(file, line);
+            line = ArffFiles::trim(line);
+            vector<string> tokens = tokenize(line);
+            transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
+            if (className == "-1") {
+                className = ArffFiles::trim(tokens.back());
+            }
+            for (auto i = 0; i < features.size(); ++i) {
+                Xv.push_back(vector<float>());
+            }
+            while (getline(file, line)) {
+                tokens = tokenize(line);
+                // We have to skip the first token, which is the instance number.
+                for (auto i = 1; i < features.size() + 1; ++i) {
+                    const float value = stof(tokens[i]);
+                    Xv[i - 1].push_back(value);
+                }
+                yv.push_back(stoi(tokens.back()));
+            }
+            file.close();
+        } else {
+            throw invalid_argument("Unable to open dataset file.");
+        }
+    }
+    void Dataset::load()
+    {
+        if (loaded) {
+            return;
+        }
+        if (fileType == CSV) {
+            load_csv();
+        } else if (fileType == ARFF) {
+            load_arff();
+        } else if (fileType == RDATA) {
+            load_rdata();
+        }
+        if (discretize) {
+            Xd = discretizeDataset(Xv, yv);
+            computeStates();
+        }
+        n_samples = Xv[0].size();
+        n_features = Xv.size();
+        loaded = true;
+    }
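+    // X is built features-first, i.e. with shape (n_features, n_samples);
+    // row i holds feature i, as int32 when discretized and float32 otherwise.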
+    void Dataset::buildTensors()
+    {
+        if (discretize) {
+            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
+        } else {
+            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
+        }
+        for (int i = 0; i < features.size(); ++i) {
+            if (discretize) {
+                X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
+            } else {
+                X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
+            }
+        }
+        y = torch::tensor(yv, torch::kInt32);
+    }
+    vector<mdlp::labels_t> Dataset::discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+    {
+        vector<mdlp::labels_t> Xd;
+        auto fimdlp = mdlp::CPPFImdlp();
+        for (int i = 0; i < X.size(); i++) {
+            fimdlp.fit(X[i], y);
+            mdlp::labels_t& xd = fimdlp.transform(X[i]);
+            Xd.push_back(xd);
+        }
+        return Xd;
+    }
+    vector<string> Dataset::split(const string& text, char delimiter)
+    {
+        vector<string> result;
+        stringstream ss(text);
+        string token;
+        while (getline(ss, token, delimiter)) {
+            result.push_back(token);
+        }
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/src/Platform/Dataset.h b/src/Platform/Dataset.h
new file mode 100644
index 0000000..fbc577e
--- /dev/null
+++ b/src/Platform/Dataset.h
@@ -0,0 +1,80 @@
+#ifndef DATASET_H
+#define DATASET_H
+#include <torch/torch.h>
+#include <map>
+#include <vector>
+#include <string>
+#include "CPPFImdlp.h"
+namespace platform {
+    using namespace std;
+
+    enum fileType_t { CSV, ARFF, RDATA };
+    class SourceData {
+    public:
+        SourceData(string source)
+        {
+            if (source == "Surcov") {
+                path = "datasets/";
+                fileType = CSV;
+            } else if (source == "Arff") {
+                path = "datasets/";
+                fileType = ARFF;
+            } else if (source == "Tanveer") {
+                path = "data/";
+                fileType = RDATA;
+            } else {
+                throw invalid_argument("Unknown source.");
+            }
+        }
+        string getPath()
+        {
+            return path;
+        }
+        fileType_t getFileType()
+        {
+            return fileType;
+        }
+    private:
+        string path;
+        fileType_t fileType;
+    };
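+    // Dataset represents a single dataset: it loads CSV, ARFF or RDATA files
+    // on demand, optionally discretizes them with mdlp, and hands the data
+    // out as STL vectors or as torch tensors.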
+    class Dataset {
+    private:
+        string path;
+        string name;
+        fileType_t fileType;
+        string className;
+        int n_samples{ 0 }, n_features{ 0 };
+        vector<string> features;
+        map<string, vector<int>> states;
+        bool loaded;
+        bool discretize;
+        torch::Tensor X, y;
+        vector<vector<float>> Xv;
+        vector<vector<int>> Xd;
+        vector<int> yv;
+        void buildTensors();
+        void load_csv();
+        void load_arff();
+        void load_rdata();
+        void computeStates();
+        vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
+    public:
+        Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
+        explicit Dataset(const Dataset&);
+        static vector<string> split(const string& text, char delimiter);
+        string getName() const;
+        string getClassName() const;
+        vector<string> getFeatures() const;
+        map<string, vector<int>> getStates() const;
+        pair<vector<vector<float>>&, vector<int>&> getVectors();
+        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
+        pair<torch::Tensor&, torch::Tensor&> getTensors();
+        int getNFeatures() const;
+        int getNSamples() const;
+        void load();
+        const bool inline isLoaded() const { return loaded; };
+    };
+};
+
+#endif
\ No newline at end of file
diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc
index 17b2ee1..fe04f41 100644
--- a/src/Platform/Datasets.cc
+++ b/src/Platform/Datasets.cc
@@ -1,6 +1,4 @@
 #include "Datasets.h"
-#include "platformUtils.h"
-#include "ArffFiles.h"
 #include <fstream>
 namespace platform {
     void Datasets::load()
@@ -15,7 +13,7 @@
             if (line.empty() || line[0] == '#') {
                 continue;
             }
-            vector<string> tokens = split(line, ',');
+            vector<string> tokens = Dataset::split(line, ',');
             string name = tokens[0];
             string className;
             try {
@@ -129,203 +127,4 @@
     {
         return datasets.find(name) != datasets.end();
     }
-    Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
-    {
-    }
-    string Dataset::getName() const
-    {
-        return name;
-    }
-    string Dataset::getClassName() const
-    {
-        return className;
-    }
-    vector<string> Dataset::getFeatures() const
-    {
-        if (loaded) {
-            return features;
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    int Dataset::getNFeatures() const
-    {
-        if (loaded) {
-            return n_features;
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    int Dataset::getNSamples() const
-    {
-        if (loaded) {
-            return n_samples;
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    map<string, vector<int>> Dataset::getStates() const
-    {
-        if (loaded) {
-            return states;
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
-    {
-        if (loaded) {
-            return { Xv, yv };
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
-    {
-        if (loaded) {
-            return { Xd, yv };
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
-    {
-        if (loaded) {
-            buildTensors();
-            return { X, y };
-        } else {
-            throw invalid_argument("Dataset not loaded.");
-        }
-    }
-    void Dataset::load_csv()
-    {
-        ifstream file(path + "/" + name + ".csv");
-        if (file.is_open()) {
-            string line;
-            getline(file, line);
-            vector<string> tokens = split(line, ',');
-            features = vector<string>(tokens.begin(), tokens.end() - 1);
-            if (className == "-1") {
-                className = tokens.back();
-            }
-            for (auto i = 0; i < features.size(); ++i) {
-                Xv.push_back(vector<float>());
-            }
-            while (getline(file, line)) {
-                tokens = split(line, ',');
-                for (auto i = 0; i < features.size(); ++i) {
-                    Xv[i].push_back(stof(tokens[i]));
-                }
-                yv.push_back(stoi(tokens.back()));
-            }
-            file.close();
-        } else {
-            throw invalid_argument("Unable to open dataset file.");
-        }
-    }
-    void Dataset::computeStates()
-    {
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-        }
-        states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    }
-    void Dataset::load_arff()
-    {
-        auto arff = ArffFiles();
-        arff.load(path + "/" + name + ".arff", className);
-        // Get Dataset X, y
-        Xv = arff.getX();
-        yv = arff.getY();
-        // Get className & Features
-        className = arff.getClassName();
-        auto attributes = arff.getAttributes();
-        transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; });
-    }
-    vector<string> tokenize(string line)
-    {
-        vector<string> tokens;
-        for (auto i = 0; i < line.size(); ++i) {
-            if (line[i] == ' ' || line[i] == '\t' || line[i] == '\n') {
-                string token = line.substr(0, i);
-                tokens.push_back(token);
-                line.erase(line.begin(), line.begin() + i + 1);
-                i = 0;
-                while (line[i] == ' ' || line[i] == '\t' || line[i] == '\n')
-                    line.erase(line.begin(), line.begin() + i + 1);
-            }
-        }
-        if (line.size() > 0) {
-            tokens.push_back(line);
-        }
-        return tokens;
-    }
-    void Dataset::load_rdata()
-    {
-        ifstream file(path + "/" + name + "_R.dat");
-        if (file.is_open()) {
-            string line;
-            getline(file, line);
-            line = ArffFiles::trim(line);
-            vector<string> tokens = tokenize(line);
-            transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
-            if (className == "-1") {
-                className = ArffFiles::trim(tokens.back());
-            }
-            for (auto i = 0; i < features.size(); ++i) {
-                Xv.push_back(vector<float>());
-            }
-            while (getline(file, line)) {
-                tokens = tokenize(line);
-                // We have to skip the first token, which is the instance number.
-                for (auto i = 1; i < features.size() + 1; ++i) {
-                    const float value = stof(tokens[i]);
-                    Xv[i - 1].push_back(value);
-                }
-                yv.push_back(stoi(tokens.back()));
-            }
-            file.close();
-        } else {
-            throw invalid_argument("Unable to open dataset file.");
-        }
-    }
-    void Dataset::load()
-    {
-        if (loaded) {
-            return;
-        }
-        if (fileType == CSV) {
-            load_csv();
-        } else if (fileType == ARFF) {
-            load_arff();
-        } else if (fileType == RDATA) {
-            load_rdata();
-        }
-        if (discretize) {
-            Xd = discretizeDataset(Xv, yv);
-            computeStates();
-        }
-        n_samples = Xv[0].size();
-        n_features = Xv.size();
-        loaded = true;
-    }
-    void Dataset::buildTensors()
-    {
-        if (discretize) {
-            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
-        } else {
-            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
-        }
-        for (int i = 0; i < features.size(); ++i) {
-            if (discretize) {
-                X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
-            } else {
-                X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
-            }
-        }
-        y = torch::tensor(yv, torch::kInt32);
-    }
 }
\ No newline at end of file
diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h
index aa3c109..1d16400 100644
--- a/src/Platform/Datasets.h
+++ b/src/Platform/Datasets.h
@@ -1,76 +1,8 @@
 #ifndef DATASETS_H
 #define DATASETS_H
-#include <torch/torch.h>
-#include <map>
-#include <vector>
-#include <string>
+#include "Dataset.h"
 namespace platform {
     using namespace std;
-    enum fileType_t { CSV, ARFF, RDATA };
-    class SourceData {
-    public:
-        SourceData(string source)
-        {
-            if (source == "Surcov") {
-                path = "datasets/";
-                fileType = CSV;
-            } else if (source == "Arff") {
-                path = "datasets/";
-                fileType = ARFF;
-            } else if (source == "Tanveer") {
-                path = "data/";
-                fileType = RDATA;
-            } else {
-                throw invalid_argument("Unknown source.");
-            }
-        }
-        string getPath()
-        {
-            return path;
-        }
-        fileType_t getFileType()
-        {
-            return fileType;
-        }
-    private:
-        string path;
-        fileType_t fileType;
-    };
-    class Dataset {
-    private:
-        string path;
-        string name;
-        fileType_t fileType;
-        string className;
-        int n_samples{ 0 }, n_features{ 0 };
-        vector<string> features;
-        map<string, vector<int>> states;
-        bool loaded;
-        bool discretize;
-        torch::Tensor X, y;
-        vector<vector<float>> Xv;
-        vector<vector<int>> Xd;
-        vector<int> yv;
-        void buildTensors();
-        void load_csv();
-        void load_arff();
-        void load_rdata();
-        void computeStates();
-    public:
-        Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
-        explicit Dataset(const Dataset&);
-        string getName() const;
-        string getClassName() const;
-        vector<string> getFeatures() const;
-        map<string, vector<int>> getStates() const;
-        pair<vector<vector<float>>&, vector<int>&> getVectors();
-        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
-        pair<torch::Tensor&, torch::Tensor&> getTensors();
-        int getNFeatures() const;
-        int getNSamples() const;
-        void load();
-        const bool inline isLoaded() const { return loaded; };
-    };
     class Datasets {
     private:
         string path;
diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h
index a7e3e36..401d5af 100644
--- a/src/Platform/DotEnv.h
+++ b/src/Platform/DotEnv.h
@@ -4,7 +4,7 @@
 #include <map>
 #include <fstream>
 #include <sstream>
-#include "platformUtils.h"
+#include "Dataset.h"
 namespace platform {
     class DotEnv {
     private:
@@ -51,7 +51,7 @@
         auto seeds_str = env["seeds"];
         seeds_str = trim(seeds_str);
         seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
-        auto seeds_str_split = split(seeds_str, ',');
+        auto seeds_str_split = Dataset::split(seeds_str, ',');
         transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
             return stoi(str);
         });
diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc
index f33e8d3..dced445 100644
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@@ -102,12 +102,12 @@
         cout << data.dump(4) << endl;
     }
 
-    void Experiment::go(vector<string> filesToProcess, const string& path)
+    void Experiment::go(vector<string> filesToProcess)
     {
         cout << "*** Starting experiment: " << title << " ***" << endl;
         for (auto fileName : filesToProcess) {
             cout << "- " << setw(20) << left << fileName << " " << right << flush;
-            cross_validation(path, fileName);
+            cross_validation(fileName);
             cout << endl;
         }
     }
@@ -132,7 +132,7 @@
         cout << prefix << color << fold << Colors::RESET() << "(" << color << phase << Colors::RESET() << ")" << flush;
     }
-    void Experiment::cross_validation(const string& path, const string& fileName)
+    void Experiment::cross_validation(const string& fileName)
     {
         auto env = platform::DotEnv();
         auto datasets = platform::Datasets(discretized, env.get("source_data"));
diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h
index fe4415e..5653e93 100644
--- a/src/Platform/Experiment.h
+++ b/src/Platform/Experiment.h
@@ -108,8 +108,8 @@
         Experiment& setHyperparameters(const json& hyperparameters) { this->hyperparameters = hyperparameters; return *this; }
         string get_file_name();
         void save(const string& path);
-        void cross_validation(const string& path, const string& fileName);
-        void go(vector<string> filesToProcess, const string& path);
+        void cross_validation(const string& fileName);
+        void go(vector<string> filesToProcess);
         void show();
         void report();
     };
diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h
index b19b09f..926568e 100644
--- a/src/Platform/Paths.h
+++ b/src/Platform/Paths.h
@@ -4,7 +4,6 @@
 namespace platform {
     class Paths {
     public:
-        static std::string datasets() { return "datasets/"; }
        static std::string results() { return "results/"; }
         static std::string excel() { return "excel/"; }
     };
diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc
index f5e3481..f313d91 100644
--- a/src/Platform/Results.cc
+++ b/src/Platform/Results.cc
@@ -1,5 +1,4 @@
 #include <filesystem>
-#include "platformUtils.h"
 #include "Results.h"
 #include "ReportConsole.h"
 #include "ReportExcel.h"
diff --git a/src/Platform/main.cc b/src/Platform/main.cc
index ccd4271..62470c5 100644
--- a/src/Platform/main.cc
+++ b/src/Platform/main.cc
@@ -1,7 +1,6 @@
 #include <iostream>
 #include <string>
 #include <argparse/argparse.hpp>
-#include "platformUtils.h"
 #include "Experiment.h"
 #include "Datasets.h"
 #include "DotEnv.h"
@@ -19,9 +18,6 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
     argparse::ArgumentParser program("main");
     program.add_argument("-d", "--dataset").default_value("").help("Dataset file name");
     program.add_argument("--hyperparameters").default_value("{}").help("Hyperparamters passed to the model in Experiment");
-    program.add_argument("-p", "--path")
-        .help("folder where the data files are located, default")
-        .default_value(string{ platform::Paths::datasets() });
     program.add_argument("-m", "--model")
         .help("Model to use " + platform::Models::instance()->toString())
         .action([](const std::string& value) {
@@ -55,13 +51,11 @@
     try {
         program.parse_args(argc, argv);
         auto file_name = program.get<string>("dataset");
-        auto path = program.get<string>("path");
         auto model_name = program.get<string>("model");
         auto discretize_dataset = program.get<bool>("discretize");
         auto stratified = program.get<bool>("stratified");
         auto n_folds = program.get<int>("folds");
         auto seeds = program.get<vector<int>>("seeds");
-        auto complete_file_name = path + file_name + ".arff";
         auto title = program.get<string>("title");
         auto hyperparameters = program.get<string>("hyperparameters");
         auto saveResults = program.get<bool>("save");
@@ -81,7 +75,6 @@ int main(int argc, char** argv)
 {
     auto program = manageArguments(argc, argv);
     auto file_name = program.get<string>("dataset");
-    auto path = program.get<string>("path");
     auto model_name = program.get<string>("model");
     auto discretize_dataset = program.get<bool>("discretize");
     auto stratified = program.get<bool>("stratified");
@@ -120,7 +113,7 @@
     }
     platform::Timer timer;
     timer.start();
-    experiment.go(filesToTest, path);
+    experiment.go(filesToTest);
     experiment.setDuration(timer.getDuration());
     if (saveResults) {
         experiment.save(platform::Paths::results());
diff --git a/src/Platform/manage.cc b/src/Platform/manage.cc
index cf699d6..7e95473 100644
--- a/src/Platform/manage.cc
+++ b/src/Platform/manage.cc
@@ -1,6 +1,5 @@
 #include <iostream>
 #include <argparse/argparse.hpp>
-#include "platformUtils.h"
 #include "Paths.h"
 #include "Results.h"
 
diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc
deleted file mode 100644
index f114636..0000000
--- a/src/Platform/platformUtils.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-#include "platformUtils.h"
-#include "Paths.h"
-
-using namespace torch;
-
-vector<string> split(const string& text, char delimiter)
-{
-    vector<string> result;
-    stringstream ss(text);
-    string token;
-    while (getline(ss, token, delimiter)) {
-        result.push_back(token);
-    }
-    return result;
-}
-
-pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
-{
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
-        Xd.push_back(xd);
-    }
-    return { Xd, maxes };
-}
-
-vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
-{
-    vector<mdlp::labels_t> Xd;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        Xd.push_back(xd);
-    }
-    return Xd;
-}
-
-bool file_exists(const string& name)
-{
-    if (FILE* file = fopen(name.c_str(), "r")) {
-        fclose(file);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset)
-{
-    auto handler = ArffFiles();
-    handler.load(path + static_cast<string>(name) + ".arff", class_last);
-    // Get Dataset X, y
-    vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    Tensor Xd;
-    auto states = map<string, vector<int>>();
-    if (discretize_dataset) {
-        auto Xr = discretizeDataset(X, y);
-        Xd = torch::zeros({ static_cast<int>(Xr[0].size()), static_cast<int>(Xr.size()) }, torch::kInt32);
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-            Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt32));
-        }
-        states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    } else {
-        Xd = torch::zeros({ static_cast<int>(X[0].size()), static_cast<int>(X.size()) }, torch::kFloat32);
-        for (int i = 0; i < features.size(); ++i) {
-            Xd.index_put_({ "...", i }, torch::tensor(X[i]));
-        }
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-
-tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
-{
-    auto handler = ArffFiles();
-    handler.load(platform::Paths::datasets() + static_cast<string>(name) + ".arff");
-    // Get Dataset X, y
-    vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    // Discretize Dataset
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<string, vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = vector<int>(maxes[feature]);
-    }
-    states[className] = vector<int>(maxes[className]);
-    return { Xd, y, features, className, states };
-}
\ No newline at end of file
diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h
deleted file mode 100644
index 213e28a..0000000
--- a/src/Platform/platformUtils.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef PLATFORM_UTILS_H
-#define PLATFORM_UTILS_H
-#include <torch/torch.h>
-#include <string>
-#include <vector>
-#include <map>
-#include <tuple>
-#include "ArffFiles.h"
-#include "CPPFImdlp.h"
-using namespace std;
-
-bool file_exists(const std::string& name);
-vector<string> split(const string& text, char delimiter);
-pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
-vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
-pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, const string& className);
-tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name);
-tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset);
-map<string, vector<int>> get_states(vector<string>& features, string className, map<string, int>& maxes);
-#endif //PLATFORM_UTILS_H
diff --git a/tests/BayesModels.cc b/tests/BayesModels.cc
index fb33166..98a8f25 100644
--- a/tests/BayesModels.cc
+++ b/tests/BayesModels.cc
@@ -9,7 +9,6 @@
 #include "TAN.h"
 #include "SPODE.h"
 #include "AODE.h"
-#include "platformUtils.h"
 
 TEST_CASE("Test Bayesian Classifiers score", "[BayesNet]")
 {
diff --git a/tests/BayesNetwork.cc b/tests/BayesNetwork.cc
index 613392e..b18a532 100644
--- a/tests/BayesNetwork.cc
+++ b/tests/BayesNetwork.cc
@@ -3,7 +3,6 @@
 #include <string>
 #include <vector>
 #include "KDB.h"
-#include "platformUtils.h"
 
 TEST_CASE("Test Bayesian Network")
 {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9267caa..2edbebd 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,7 +5,7 @@ if(ENABLE_TESTING)
     include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
     include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
     include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
-    set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCE_DIR}/src/Platform/platformUtils.cc ${BayesNet_SOURCES})
+    set(TEST_SOURCES BayesModels.cc BayesNetwork.cc ${BayesNet_SOURCES})
     add_executable(${TEST_MAIN} ${TEST_SOURCES})
     target_link_libraries(${TEST_MAIN} PUBLIC "${TORCH_LIBRARIES}" ArffFiles mdlp Catch2::Catch2WithMain)
     add_test(NAME ${TEST_MAIN} COMMAND ${TEST_MAIN})