diff --git a/.vscode/settings.json b/.vscode/settings.json
index 09ba4f0..d7af13f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -101,7 +101,8 @@
         "*.ipp": "cpp",
         "cassert": "cpp",
         "charconv": "cpp",
-        "source_location": "cpp"
+        "source_location": "cpp",
+        "ranges": "cpp"
     },
     "cmake.configureOnOpen": false,
     "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools"
diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc
index b3336fe..8ae99ca 100644
--- a/lib/Files/ArffFiles.cc
+++ b/lib/Files/ArffFiles.cc
@@ -42,7 +42,7 @@ vector<int>& ArffFiles::getY()
 {
     return y;
 }
-void ArffFiles::load(const string& fileName, bool classLast)
+void ArffFiles::loadCommon(string fileName)
 {
     ifstream file(fileName);
     if (!file.is_open()) {
@@ -74,24 +74,50 @@
     file.close();
     if (attributes.empty())
         throw invalid_argument("No attributes found");
+}
+
+void ArffFiles::load(const string& fileName, bool classLast)
+{
+    int labelIndex;
+    loadCommon(fileName);
     if (classLast) {
         className = get<0>(attributes.back());
         classType = get<1>(attributes.back());
         attributes.pop_back();
+        labelIndex = static_cast<int>(attributes.size());
     } else {
         className = get<0>(attributes.front());
         classType = get<1>(attributes.front());
         attributes.erase(attributes.begin());
+        labelIndex = 0;
     }
-    generateDataset(classLast);
-
+    generateDataset(labelIndex);
+}
+void ArffFiles::load(const string& fileName, const string& name)
+{
+    int labelIndex;
+    loadCommon(fileName);
+    bool found = false;
+    for (int i = 0; i < attributes.size(); ++i) {
+        if (attributes[i].first == name) {
+            className = get<0>(attributes[i]);
+            classType = get<1>(attributes[i]);
+            attributes.erase(attributes.begin() + i);
+            labelIndex = i;
+            found = true;
+            break;
+        }
+    }
+    if (!found) {
+        throw invalid_argument("Class name not found");
+    }
+    generateDataset(labelIndex);
 }
-void ArffFiles::generateDataset(bool classLast)
+void ArffFiles::generateDataset(int labelIndex)
 {
     X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
     auto yy = vector<string>(lines.size(), "");
-    int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
     for (size_t i = 0; i < lines.size(); i++) {
         stringstream ss(lines[i]);
         string value;
diff --git a/lib/Files/ArffFiles.h b/lib/Files/ArffFiles.h
index ff8bbc5..5cacb27 100644
--- a/lib/Files/ArffFiles.h
+++ b/lib/Files/ArffFiles.h
@@ -14,12 +14,12 @@ private:
     string classType;
     vector<vector<float>> X;
     vector<int> y;
-
-    void generateDataset(bool);
-
+    void generateDataset(int);
+    void loadCommon(string);
 public:
     ArffFiles();
     void load(const string&, bool = true);
+    void load(const string&, const string&);
     vector<string> getLines() const;
     unsigned long int getSize() const;
     string getClassName() const;
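
The new by-name overload selects the class attribute wherever it sits in the attribute list. A minimal sketch of the intended call pattern (the file path and the attribute name "class" are illustrative, not taken from this diff):

    auto arff = ArffFiles();
    arff.load("../../data/iris.arff", "class"); // pick the class attribute by name
    auto X = arff.getX();                       // one vector<float> per remaining attribute
    auto y = arff.getY();                       // integer-encoded labels
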
diff --git a/src/BayesNet/BaseClassifier.h b/src/BayesNet/BaseClassifier.h
index 9d0a404..00a60af 100644
--- a/src/BayesNet/BaseClassifier.h
+++ b/src/BayesNet/BaseClassifier.h
@@ -17,6 +17,7 @@ namespace bayesnet {
         vector<string> virtual show() = 0;
         vector<string> virtual graph(string title = "") = 0;
         virtual ~BaseClassifier() = default;
+        const string inline getVersion() const { return "0.1.0"; };
     };
 }
 #endif
\ No newline at end of file
diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h
index 8ef8be7..c8d832d 100644
--- a/src/BayesNet/Network.h
+++ b/src/BayesNet/Network.h
@@ -7,7 +7,7 @@ namespace bayesnet {
     class Network {
     private:
-        map<string, shared_ptr<Node>> nodes;
+        map<string, unique_ptr<Node>> nodes;
         map<string, vector<int>> dataset;
         bool fitted;
         float maxThreads;
diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc
index 0e87f44..9c8dfff 100644
--- a/src/BayesNet/TAN.cc
+++ b/src/BayesNet/TAN.cc
@@ -3,7 +3,7 @@
 namespace bayesnet {
     using namespace torch;
-    TAN::TAN() : Classifier(Network(0.1)) {}
+    TAN::TAN() : Classifier(Network()) {}
     void TAN::train()
     {
diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt
index 9d6293c..b0e135a 100644
--- a/src/Platform/CMakeLists.txt
+++ b/src/Platform/CMakeLists.txt
@@ -4,5 +4,5 @@
 include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
 include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
 include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include)
 include_directories(${BayesNet_SOURCE_DIR}/lib/json/include)
-add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc)
+add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc)
 target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES} ")
\ No newline at end of file
diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc
new file mode 100644
index 0000000..a9ee23c
--- /dev/null
+++ b/src/Platform/Datasets.cc
@@ -0,0 +1,252 @@
+#include "Datasets.h"
+#include "platformUtils.h"
+#include "ArffFiles.h"
+namespace platform {
+    vector<string> split(string text, char delimiter)
+    {
+        vector<string> result;
+        stringstream ss(text);
+        string token;
+        while (getline(ss, token, delimiter)) {
+            result.push_back(token);
+        }
+        return result;
+    }
+    void Datasets::load()
+    {
+        string line;
+        ifstream catalog(path + "/all.txt");
+        if (catalog.is_open()) {
+            while (getline(catalog, line)) {
+                vector<string> tokens = split(line, ',');
+                string name = tokens[0];
+                string className = tokens[1];
+                datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
+            }
+            catalog.close();
+        } else {
+            throw invalid_argument("Unable to open catalog file. [" + path + "/all.txt]");
+        }
+    }
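
Datasets::load() expects the catalog file all.txt to hold one dataset per line in the form name,className. A hypothetical catalog (dataset names echo the list removed from main.cc below; the class attribute names are assumptions):

    iris,class
    ecoli,class
    diabetes,class
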
+    Dataset& Datasets::getDataset(string name)
+    {
+        if (datasets.find(name) == datasets.end()) {
+            throw invalid_argument("Dataset not found.");
+        }
+        return *datasets[name];
+    }
+    vector<string> Datasets::getNames()
+    {
+        vector<string> result;
+        for (auto& d : datasets) {
+            result.push_back(d.first);
+        }
+        return result;
+    }
+    vector<string> Datasets::getFeatures(string name)
+    {
+        auto& dataset = getDataset(name);
+        if (dataset.isLoaded()) {
+            return dataset.getFeatures();
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    map<string, vector<int>> Datasets::getStates(string name)
+    {
+        auto& dataset = getDataset(name);
+        if (dataset.isLoaded()) {
+            return dataset.getStates();
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    pair<vector<vector<float>>&, vector<int>&> Datasets::getVectors(string name)
+    {
+        auto& dataset = getDataset(name);
+        if (!dataset.isLoaded()) {
+            dataset.load();
+        }
+        return dataset.getVectors();
+    }
+    pair<vector<vector<int>>&, vector<int>&> Datasets::getVectorsDiscretized(string name)
+    {
+        auto& dataset = getDataset(name);
+        if (!dataset.isLoaded()) {
+            dataset.load();
+        }
+        return dataset.getVectorsDiscretized();
+    }
+    pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(string name)
+    {
+        auto& dataset = getDataset(name);
+        if (!dataset.isLoaded()) {
+            dataset.load();
+        }
+        return dataset.getTensors();
+    }
+    Dataset::Dataset(Dataset& dataset)
+    {
+        path = dataset.path;
+        name = dataset.name;
+        className = dataset.className;
+        n_samples = dataset.n_samples;
+        n_features = dataset.n_features;
+        features = dataset.features;
+        states = dataset.states;
+        loaded = dataset.loaded;
+        discretize = dataset.discretize;
+        X = dataset.X;
+        y = dataset.y;
+        Xv = dataset.Xv;
+        Xd = dataset.Xd;
+        yv = dataset.yv;
+        fileType = dataset.fileType;
+    }
+    string Dataset::getName()
+    {
+        return name;
+    }
+    string Dataset::getClassName()
+    {
+        return className;
+    }
+    vector<string> Dataset::getFeatures()
+    {
+        if (loaded) {
+            return features;
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    int Dataset::getNFeatures()
+    {
+        if (loaded) {
+            return n_features;
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    int Dataset::getNSamples()
+    {
+        if (loaded) {
+            return n_samples;
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    map<string, vector<int>> Dataset::getStates()
+    {
+        if (loaded) {
+            return states;
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    pair<vector<vector<float>>&, vector<int>&> Dataset::getVectors()
+    {
+        if (loaded) {
+            return { Xv, yv };
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    pair<vector<vector<int>>&, vector<int>&> Dataset::getVectorsDiscretized()
+    {
+        if (loaded) {
+            return { Xd, yv };
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
+    {
+        if (loaded) {
+            buildTensors();
+            return { X, y };
+        } else {
+            throw invalid_argument("Dataset not loaded.");
+        }
+    }
+    void Dataset::load_csv()
+    {
+        string line;
+        ifstream file(path + "/" + name + ".csv");
+        if (file.is_open()) {
+            getline(file, line);
+            vector<string> tokens = split(line, ',');
+            features = vector<string>(tokens.begin(), tokens.end() - 1);
+            className = tokens.back();
+            for (auto i = 0; i < features.size(); ++i) {
+                Xv.push_back(vector<float>());
+            }
+            while (getline(file, line)) {
+                tokens = split(line, ',');
+                for (auto i = 0; i < features.size(); ++i) {
+                    Xv[i].push_back(stof(tokens[i]));
+                }
+                yv.push_back(stoi(tokens.back()));
+            }
+            file.close();
+        } else {
+            throw invalid_argument("Unable to open dataset file.");
+        }
+    }
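
load_csv() assumes a header row whose last column names the class, float-valued feature columns, and an integer-coded class column. An illustrative file matching that layout (values are made up):

    sepallength,sepalwidth,petallength,petalwidth,class
    5.1,3.5,1.4,0.2,0
    6.3,2.9,5.6,1.8,2
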
+    void Dataset::computeStates()
+    {
+        for (int i = 0; i < features.size(); ++i) {
+            states[features[i]] = vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
+            iota(states[features[i]].begin(), states[features[i]].end(), 0);
+        }
+        states[className] = vector<int>(*max_element(yv.begin(), yv.end()) + 1);
+        iota(states[className].begin(), states[className].end(), 0);
+    }
+    void Dataset::load_arff()
+    {
+        auto arff = ArffFiles();
+        arff.load(path + "/" + name + ".arff", className);
+        // Get Dataset X, y
+        Xv = arff.getX();
+        yv = arff.getY();
+        // Get className & Features
+        className = arff.getClassName();
+        for (auto feature : arff.getAttributes()) {
+            features.push_back(feature.first);
+        }
+    }
+    void Dataset::load()
+    {
+        if (loaded) {
+            return;
+        }
+        if (fileType == CSV) {
+            load_csv();
+        } else if (fileType == ARFF) {
+            load_arff();
+        }
+        n_samples = static_cast<int>(Xv[0].size());
+        n_features = static_cast<int>(Xv.size());
+        if (discretize) {
+            Xd = discretizeDataset(Xv, yv);
+            computeStates();
+        }
+        loaded = true;
+    }
+    void Dataset::buildTensors()
+    {
+        if (discretize) {
+            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
+        } else {
+            X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
+        }
+        for (int i = 0; i < features.size(); ++i) {
+            if (discretize) {
+                X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
+            } else {
+                X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
+            }
+        }
+        y = torch::tensor(yv, torch::kInt32);
+    }
+}
\ No newline at end of file
diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h
new file mode 100644
index 0000000..ae54376
--- /dev/null
+++ b/src/Platform/Datasets.h
@@ -0,0 +1,63 @@
+#ifndef DATASETS_H
+#define DATASETS_H
+#include <torch/torch.h>
+#include <map>
+#include <vector>
+#include <string>
+namespace platform {
+    using namespace std;
+    enum fileType_t { CSV, ARFF };
+    class Dataset {
+    private:
+        string path;
+        string name;
+        fileType_t fileType;
+        string className;
+        int n_samples, n_features;
+        vector<string> features;
+        map<string, vector<int>> states;
+        bool loaded;
+        bool discretize;
+        torch::Tensor X, y;
+        vector<vector<float>> Xv;
+        vector<vector<int>> Xd;
+        vector<int> yv;
+        void buildTensors();
+        void load_csv();
+        void load_arff();
+        void computeStates();
+    public:
+        Dataset(string path, string name, string className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
+        Dataset(Dataset&);
+        string getName();
+        string getClassName();
+        vector<string> getFeatures();
+        map<string, vector<int>> getStates();
+        pair<vector<vector<float>>&, vector<int>&> getVectors();
+        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized();
+        pair<torch::Tensor&, torch::Tensor&> getTensors();
+        int getNFeatures();
+        int getNSamples();
+        void load();
+        const bool inline isLoaded() const { return loaded; };
+    };
+    class Datasets {
+    private:
+        string path;
+        fileType_t fileType;
+        map<string, unique_ptr<Dataset>> datasets;
+        bool discretize;
+        void load(); // Loads the list of datasets
+    public:
+        Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); };
+        Dataset& getDataset(string name);
+        vector<string> getNames();
+        vector<string> getFeatures(string name);
+        map<string, vector<int>> getStates(string name);
+        pair<vector<vector<float>>&, vector<int>&> getVectors(string name);
+        pair<vector<vector<int>>&, vector<int>&> getVectorsDiscretized(string name);
+        pair<torch::Tensor&, torch::Tensor&> getTensors(string name);
+    };
+};
+
+#endif
\ No newline at end of file
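
Together the two classes form a lazily loaded catalog keyed by dataset name. A minimal usage sketch (the data path is a placeholder):

    auto datasets = platform::Datasets("../../data", true, platform::ARFF);
    for (const auto& name : datasets.getNames()) {
        auto [X, y] = datasets.getTensors(name); // loads the dataset on first access
        // X: (n_features, n_samples) tensor, kInt32 when discretized; y: kInt32 labels
    }
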
diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc
index 583f460..64c8134 100644
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@@ -101,6 +101,7 @@ namespace platform {
         Timer train_timer, test_timer;
         for (int i = 0; i < k; i++) {
             bayesnet::BaseClassifier* model = classifiers[model_name];
+            result.setModelVersion(model->getVersion());
             train_timer.start();
             auto [train, test] = fold->getFold(i);
             auto train_t = torch::tensor(train);
diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h
index bfd3b5f..a5687a5 100644
--- a/src/Platform/Experiment.h
+++ b/src/Platform/Experiment.h
@@ -24,7 +24,7 @@ namespace platform {
     };
     class Result {
     private:
-        string dataset, hyperparameters;
+        string dataset, hyperparameters, model_version;
         int samples, features, classes;
         float score_train, score_test, score_train_std, score_test_std, train_time, train_time_std, test_time, test_time_std;
         float nodes, leaves, depth;
@@ -46,6 +46,7 @@
         Result& setNodes(float nodes) { this->nodes = nodes; return *this; }
         Result& setLeaves(float leaves) { this->leaves = leaves; return *this; }
         Result& setDepth(float depth) { this->depth = depth; return *this; }
+        Result& setModelVersion(string model_version) { this->model_version = model_version; return *this; }
         const float get_score_train() const { return score_train; }
         float get_score_test() { return score_test; }
         const string& getDataset() const { return dataset; }
@@ -64,6 +65,7 @@
         const float getNodes() const { return nodes; }
         const float getLeaves() const { return leaves; }
         const float getDepth() const { return depth; }
+        const string& getModelVersion() const { return model_version; }
     };
     class Experiment {
     private:
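
With BaseClassifier::getVersion() and the new Result field, the version recorded during cross-validation can be copied up to the experiment record. A sketch of that flow (it mirrors the still-commented block in main.cc below):

    auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
    result.setDataset(file_name); // cross_validation() already stamped the model version
    experiment.setModelVersion(result.getModelVersion());
    experiment.addResult(result);
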
diff --git a/src/Platform/main.cc b/src/Platform/main.cc
index d54d8ea..00873ea 100644
--- a/src/Platform/main.cc
+++ b/src/Platform/main.cc
@@ -1,49 +1,17 @@
 #include <iostream>
-#include <torch/torch.h>
-#include <string>
-#include <map>
 #include <argparse/argparse.hpp>
-#include "ArffFiles.h"
-#include "Network.h"
-#include "BayesMetrics.h"
-#include "CPPFImdlp.h"
-#include "KDB.h"
-#include "SPODE.h"
-#include "AODE.h"
-#include "TAN.h"
 #include "platformUtils.h"
 #include "Experiment.h"
-#include "Folding.h"
+#include "Datasets.h"
 using namespace std;

-int main(int argc, char** argv)
+argparse::ArgumentParser manageArguments(int argc, char** argv)
 {
-    map<string, bool> datasets = {
-        {"diabetes", true},
-        {"ecoli", true},
-        {"glass", true},
-        {"iris", true},
-        {"kdd_JapaneseVowels", false},
-        {"letter", true},
-        {"liver-disorders", true},
-        {"mfeat-factors", true},
-    };
-    auto valid_datasets = vector<string>();
-    for (auto dataset : datasets) {
-        valid_datasets.push_back(dataset.first);
-    }
     argparse::ArgumentParser program("BayesNetSample");
     program.add_argument("-d", "--dataset")
-        .help("Dataset file name")
-        .action([valid_datasets](const std::string& value) {
-        if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
-            return value;
-        }
-        throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
-        }
-    );
+        .help("Dataset file name")
+        .default_value(string{ "" });
     program.add_argument("-p", "--path")
         .help("folder where the data files are located, default")
         .default_value(string{ PATH }
     );
@@ -89,7 +57,7 @@
     n_folds = program.get<int>("folds");
     seed = program.get<int>("seed");
     complete_file_name = path + file_name + ".arff";
-    class_last = datasets[file_name];
+    class_last = false;
     title = program.get<string>("title");
     if (!file_exists(complete_file_name)) {
         throw runtime_error("Data File " + path + file_name + ".arff" + " does not exist");
@@ -100,24 +68,54 @@
     cerr << program;
     exit(1);
 }
+    return program;
+}
+
+int main(int argc, char** argv)
+{
+    auto program = manageArguments(argc, argv);
+    auto file_name = program.get<string>("dataset");
+    auto path = program.get<string>("path");
+    auto model_name = program.get<string>("model");
+    auto discretize_dataset = program.get<bool>("discretize");
+    auto stratified = program.get<bool>("stratified");
+    auto n_folds = program.get<int>("folds");
+    auto seed = program.get<int>("seed");
+    vector<string> filesToProcess;
+    auto datasets = platform::Datasets(path, true, platform::ARFF);
+    if (file_name != "") {
+        filesToProcess.push_back(file_name);
+    } else {
+        filesToProcess = datasets.getNames();
+    }
+    auto title = program.get<string>("title");
+
     /*
     * Begin Processing
     */
-    auto [X, y, features, className, states] = loadDataset(path, file_name, class_last, discretize_dataset);
-    Fold* fold;
-    if (stratified)
-        fold = new StratifiedKFold(n_folds, y, seed);
-    else
-        fold = new KFold(n_folds, y.numel(), seed);
     auto experiment = platform::Experiment();
     experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("1.0.0");
-    experiment.setDiscretized(discretize_dataset).setModel(model_name).setModelVersion("1...0").setPlatform("BayesNet");
+    experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet");
     experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy");
     platform::Timer timer;
     timer.start();
-    auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
-    result.setDataset(file_name);
-    experiment.addResult(result);
+    for (auto fileName : filesToProcess) {
+        cout << "Processing " << fileName << endl;
+        auto [X, y] = datasets.getTensors(fileName);
+        // auto states = datasets.getStates(fileName);
+        // auto features = datasets.getFeatures(fileName);
+        // auto className = datasets.getDataset(fileName).getClassName();
+        // Fold* fold;
+        // if (stratified)
+        //     fold = new StratifiedKFold(n_folds, y, seed);
+        // else
+        //     fold = new KFold(n_folds, y.numel(), seed);
+        // auto result = platform::cross_validation(fold, model_name, X, y, features, className, states);
+        // result.setDataset(fileName);
+        // experiment.setModelVersion(result.getModelVersion());
+        // experiment.addResult(result);
+        // delete fold;
+    }
     experiment.setDuration(timer.getDuration());
     experiment.save(path);
     experiment.show();
diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h
index c17c855..abc69bd 100644
--- a/src/Platform/platformUtils.h
+++ b/src/Platform/platformUtils.h
@@ -12,6 +12,7 @@
 const string PATH = "../../data/";
 bool file_exists(const std::string& name);
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
+vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y);
 pair<torch::Tensor, map<string, vector<int>>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector<string>& features, string className);
 tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name);
 tuple<torch::Tensor, torch::Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(string path, string name, bool class_last, bool discretize_dataset);
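
discretizeDataset() is only declared here; its body is not part of this diff. A possible implementation, assuming the fit/transform interface of mdlp::CPPFImdlp used elsewhere in the project (treat the exact mdlp API as an assumption):

    #include "CPPFImdlp.h"

    vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
    {
        vector<mdlp::labels_t> Xd;
        auto fimdlp = mdlp::CPPFImdlp();
        for (auto& feature : X) {
            fimdlp.fit(feature, y);                  // compute MDLP cut points against the labels
            Xd.push_back(fimdlp.transform(feature)); // map raw values to bin indices
        }
        return Xd;
    }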