diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 4f9d087..000a88b 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -3,5 +3,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) -add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc) +add_executable(BayesNetSample sample.cc ${BayesNet_SOURCE_DIR}/src/Platform/Folding.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNetSample BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc index f515405..2d7efa4 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -4,16 +4,12 @@ #include #include #include -#include "BaseClassifier.h" #include "ArffFiles.h" -#include "Network.h" #include "BayesMetrics.h" #include "CPPFImdlp.h" -#include "KDB.h" -#include "SPODE.h" -#include "AODE.h" -#include "TAN.h" #include "Folding.h" +#include "Models.h" +#include "modelRegister.h" using namespace std; @@ -73,9 +69,8 @@ int main(int argc, char** argv) {"mfeat-factors", true}, }; auto valid_datasets = vector(); - for (auto dataset : datasets) { - valid_datasets.push_back(dataset.first); - } + transform(datasets.begin(), datasets.end(), back_inserter(valid_datasets), + [](const pair& pair) { return pair.first; }); argparse::ArgumentParser program("BayesNetSample"); program.add_argument("-d", "--dataset") .help("Dataset file name") @@ -91,13 +86,13 @@ int main(int argc, char** argv) .default_value(string{ PATH } ); program.add_argument("-m", "--model") - .help("Model to use {AODE, KDB, SPODE, TAN}") + .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { - static const vector choices = { "AODE", "KDB", "SPODE", "TAN" }; + static const vector choices = 
platform::Models::instance()->getNames(); if (find(choices.begin(), choices.end(), value) != choices.end()) { return value; } - throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}"); + throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); } ); program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); @@ -153,9 +148,9 @@ int main(int argc, char** argv) // Get className & Features auto className = handler.getClassName(); vector features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), + [](const pair& item) { return item.first; }); // Discretize Dataset auto [Xd, maxes] = discretize(X, y, features); maxes[className] = *max_element(y.begin(), y.end()) + 1; @@ -164,12 +159,7 @@ int main(int argc, char** argv) states[feature] = vector(maxes[feature]); } states[className] = vector(maxes[className]); - auto classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - } - ); - bayesnet::BaseClassifier* clf = classifiers[model_name]; + auto clf = platform::Models::instance()->create(model_name); clf->fit(Xd, y, features, className, states); auto score = clf->score(Xd, y); auto lines = clf->show(); diff --git a/src/BayesNet/AODE.h b/src/BayesNet/AODE.h index 84386d3..bc859e7 100644 --- a/src/BayesNet/AODE.h +++ b/src/BayesNet/AODE.h @@ -8,6 +8,7 @@ namespace bayesnet { void train() override; public: AODE(); + virtual ~AODE() {}; vector graph(string title = "AODE") override; }; } diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 6671995..be15f07 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -12,8 +12,8 @@ namespace bayesnet { : features(features) , 
className(className) , classNumStates(classNumStates) + , samples(torch::zeros({ static_cast(vsamples[0].size()), static_cast(vsamples.size() + 1) }, torch::kInt32)) { - samples = torch::zeros({ static_cast(vsamples[0].size()), static_cast(vsamples.size() + 1) }, torch::kInt32); for (int i = 0; i < vsamples.size(); ++i) { samples.index_put_({ "...", i }, torch::tensor(vsamples[i], torch::kInt32)); } @@ -123,7 +123,6 @@ namespace bayesnet { */ vector> Metrics::maximumSpanningTree(vector features, Tensor& weights, int root) { - auto result = vector>(); auto mst = MST(features, weights, root); return mst.maximumSpanningTree(); } diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index f8557a0..427b1d4 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -11,7 +11,7 @@ namespace bayesnet { Tensor samples; vector features; string className; - int classNumStates; + int classNumStates = 0; public: Metrics() = default; Metrics(Tensor&, vector&, string&, int); diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index d77c2c8..545525e 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -125,7 +125,6 @@ namespace bayesnet { } void Classifier::addNodes() { - auto test = model.getEdges(); // Add all nodes to the network for (auto feature : features) { model.addNode(feature, states[feature].size()); diff --git a/src/BayesNet/Ensemble.cc b/src/BayesNet/Ensemble.cc index 8aa2518..dce0d3d 100644 --- a/src/BayesNet/Ensemble.cc +++ b/src/BayesNet/Ensemble.cc @@ -148,10 +148,10 @@ namespace bayesnet { } int Ensemble::getNumberOfStates() { - int states = 0; + int nstates = 0; for (auto i = 0; i < n_models; ++i) { - states += models[i]->getNumberOfStates(); + nstates += models[i]->getNumberOfStates(); } - return states; + return nstates; } } \ No newline at end of file diff --git a/src/BayesNet/KDB.h b/src/BayesNet/KDB.h index 9683955..e3f257f 100644 --- a/src/BayesNet/KDB.h +++ b/src/BayesNet/KDB.h @@ 
-13,7 +13,8 @@ namespace bayesnet { protected: void train() override; public: - KDB(int k, float theta = 0.03); + explicit KDB(int k, float theta = 0.03); + virtual ~KDB() {}; vector graph(string name = "KDB") override; }; } diff --git a/src/BayesNet/Mst.cc b/src/BayesNet/Mst.cc index b86812b..3a48d05 100644 --- a/src/BayesNet/Mst.cc +++ b/src/BayesNet/Mst.cc @@ -7,9 +7,8 @@ namespace bayesnet { using namespace std; - Graph::Graph(int V) + Graph::Graph(int V) : V(V), parent(vector(V)) { - parent = vector(V); for (int i = 0; i < V; i++) parent[i] = i; G.clear(); @@ -34,10 +33,10 @@ namespace bayesnet { } void Graph::kruskal_algorithm() { - int i, uSt, vEd; // sort the edges ordered on decreasing weight - sort(G.begin(), G.end(), [](auto& left, auto& right) {return left.first > right.first;}); - for (i = 0; i < G.size(); i++) { + sort(G.begin(), G.end(), [](const auto& left, const auto& right) {return left.first > right.first;}); + for (int i = 0; i < G.size(); i++) { + int uSt, vEd; uSt = find_set(G[i].second.first); vEd = find_set(G[i].second.second); if (uSt != vEd) { diff --git a/src/BayesNet/Mst.h b/src/BayesNet/Mst.h index 15b0dbb..71a46a5 100644 --- a/src/BayesNet/Mst.h +++ b/src/BayesNet/Mst.h @@ -10,7 +10,7 @@ namespace bayesnet { private: Tensor weights; vector features; - int root; + int root = 0; public: MST() = default; MST(vector& features, Tensor& weights, int root); @@ -23,7 +23,7 @@ namespace bayesnet { vector >> T; // vector for mst vector parent; public: - Graph(int V); + explicit Graph(int V); void addEdge(int u, int v, float wt); int find_set(int i); void union_set(int u, int v); diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index eb3ffeb..35b3cc5 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -8,7 +8,7 @@ namespace bayesnet { Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} 
Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.getmaxThreads()), fitted(other.fitted) { - for (auto& pair : other.nodes) { + for (const auto& pair : other.nodes) { nodes[pair.first] = std::make_unique(*pair.second); } } @@ -20,7 +20,7 @@ namespace bayesnet { { return samples; } - void Network::addNode(string name, int numStates) + void Network::addNode(const string& name, int numStates) { if (find(features.begin(), features.end(), name) == features.end()) { features.push_back(name); @@ -69,7 +69,7 @@ namespace bayesnet { recStack.erase(nodeId); // remove node from recursion stack before function ends return false; } - void Network::addEdge(const string parent, const string child) + void Network::addEdge(const string& parent, const string& child) { if (nodes.find(parent) == nodes.end()) { throw invalid_argument("Parent node " + parent + " does not exist"); @@ -105,8 +105,8 @@ namespace bayesnet { for (int i = 0; i < featureNames.size(); ++i) { auto column = torch::flatten(X.index({ "...", i })); auto k = vector(); - for (auto i = 0; i < X.size(0); ++i) { - k.push_back(column[i].item()); + for (auto z = 0; z < X.size(0); ++z) { + k.push_back(column[z].item()); } dataset[featureNames[i]] = k; } @@ -145,9 +145,6 @@ namespace bayesnet { while (nextNodeIndex < nodes.size()) { unique_lock lock(mtx); cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; }); - if (nextNodeIndex >= nodes.size()) { - break; // No more work remaining - } threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() { while (true) { unique_lock lock(mtx); @@ -262,9 +259,7 @@ namespace bayesnet { // Normalize result double sum = accumulate(result.begin(), result.end(), 0.0); - for (double& value : result) { - value /= sum; - } + transform(result.begin(), result.end(), result.begin(), [sum](double& 
value) { return value / sum; }); return result; } vector Network::show() @@ -280,7 +275,7 @@ namespace bayesnet { } return result; } - vector Network::graph(string title) + vector Network::graph(const string& title) { auto output = vector(); auto prefix = "digraph BayesNet {\nlabel=>& getNodes(); vector getFeatures(); int getStates(); @@ -48,7 +48,7 @@ namespace bayesnet { vector> predict_proba(const vector>&); double score(const vector>&, const vector&); vector show(); - vector graph(string title); // Returns a vector of strings representing the graph in graphviz format + vector graph(const string& title); // Returns a vector of strings representing the graph in graphviz format inline string version() { return "0.1.0"; } }; } diff --git a/src/BayesNet/Node.cc b/src/BayesNet/Node.cc index d33fecf..095cff7 100644 --- a/src/BayesNet/Node.cc +++ b/src/BayesNet/Node.cc @@ -88,18 +88,15 @@ namespace bayesnet { { // Get dimensions of the CPT dimensions.push_back(numStates); - for (auto father : getParents()) { - dimensions.push_back(father->getNumStates()); - } + transform(parents.begin(), parents.end(), back_inserter(dimensions), [](const auto& parent) { return parent->getNumStates(); }); + // Create a tensor of zeros with the dimensions of the CPT cpTable = torch::zeros(dimensions, torch::kFloat) + laplaceSmoothing; // Fill table with counts for (int n_sample = 0; n_sample < dataset[name].size(); ++n_sample) { torch::List> coordinates; coordinates.push_back(torch::tensor(dataset[name][n_sample])); - for (auto father : getParents()) { - coordinates.push_back(torch::tensor(dataset[father->getName()][n_sample])); - } + transform(parents.begin(), parents.end(), back_inserter(coordinates), [&dataset, &n_sample](const auto& parent) { return torch::tensor(dataset[parent->getName()][n_sample]); }); // Increment the count of the corresponding coordinate cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1); } @@ -111,19 +108,15 @@ namespace bayesnet { 
torch::List> coordinates; // following predetermined order of indices in the cpTable (see Node.h) coordinates.push_back(torch::tensor(evidence[name])); - for (auto parent : getParents()) { - coordinates.push_back(torch::tensor(evidence[parent->getName()])); - } + transform(parents.begin(), parents.end(), back_inserter(coordinates), [&evidence](const auto& parent) { return torch::tensor(evidence[parent->getName()]); }); return cpTable.index({ coordinates }).item(); } - vector Node::graph(string className) + vector Node::graph(const string& className) { auto output = vector(); auto suffix = name == className ? ", fontcolor=red, fillcolor=lightblue, style=filled " : ""; output.push_back(name + " [shape=circle" + suffix + "] \n"); - for (auto& child : children) { - output.push_back(name + " -> " + child->getName()); - } + transform(children.begin(), children.end(), back_inserter(output), [this](const auto& child) { return name + " -> " + child->getName(); }); return output; } } \ No newline at end of file diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 5c5932a..3a5bbe6 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -16,7 +16,7 @@ namespace bayesnet { vector dimensions; // dimensions of the cpTable public: vector> combinations(const vector&); - Node(const std::string&, int); + Node(const string&, int); void clear(); void addParent(Node*); void addChild(Node*); @@ -30,7 +30,7 @@ namespace bayesnet { int getNumStates() const; void setNumStates(int); unsigned minFill(); - vector graph(string clasName); // Returns a vector of strings representing the graph in graphviz format + vector graph(const string& clasName); // Returns a vector of strings representing the graph in graphviz format float getFactorValue(map&); }; } diff --git a/src/BayesNet/SPODE.h b/src/BayesNet/SPODE.h index 668bbca..30f0b46 100644 --- a/src/BayesNet/SPODE.h +++ b/src/BayesNet/SPODE.h @@ -1,6 +1,7 @@ #ifndef SPODE_H #define SPODE_H #include "Classifier.h" + namespace 
bayesnet { class SPODE : public Classifier { private: @@ -8,7 +9,8 @@ namespace bayesnet { protected: void train() override; public: - SPODE(int root); + explicit SPODE(int root); + virtual ~SPODE() {}; vector graph(string name = "SPODE") override; }; } diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index 9c8dfff..51f0c1b 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -18,7 +18,7 @@ namespace bayesnet { auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset); mi.push_back({ i, mi_value }); } - sort(mi.begin(), mi.end(), [](auto& left, auto& right) {return left.second < right.second;}); + sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;}); auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class auto weights = metrics.conditionalEdge(); diff --git a/src/BayesNet/TAN.h b/src/BayesNet/TAN.h index 11e7421..ce9b10a 100644 --- a/src/BayesNet/TAN.h +++ b/src/BayesNet/TAN.h @@ -10,6 +10,7 @@ namespace bayesnet { void train() override; public: TAN(); + virtual ~TAN() {}; vector graph(string name = "TAN") override; }; } diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index f1fea17..7de4c29 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/json/include) -add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc) +add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 0c09c59..11b83ac 100644 --- a/src/Platform/Datasets.cc +++ 
b/src/Platform/Datasets.cc @@ -2,21 +2,11 @@ #include "platformUtils.h" #include "ArffFiles.h" namespace platform { - vector split(string text, char delimiter) - { - vector result; - stringstream ss(text); - string token; - while (getline(ss, token, delimiter)) { - result.push_back(token); - } - return result; - } void Datasets::load() { - string line; ifstream catalog(path + "/all.txt"); if (catalog.is_open()) { + string line; while (getline(catalog, line)) { vector tokens = split(line, ','); string name = tokens[0]; @@ -31,9 +21,7 @@ namespace platform { vector Datasets::getNames() { vector result; - for (auto& d : datasets) { - result.push_back(d.first); - } + transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; }); return result; } vector Datasets::getFeatures(string name) @@ -89,27 +77,12 @@ namespace platform { } return datasets[name]->getTensors(); } - bool Datasets::isDataset(string name) + bool Datasets::isDataset(const string& name) { return datasets.find(name) != datasets.end(); } - Dataset::Dataset(Dataset& dataset) + Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType) { - path = dataset.path; - name = dataset.name; - className = dataset.className; - n_samples = dataset.n_samples; - n_features = dataset.n_features; - features = dataset.features; - states = dataset.states; - loaded = dataset.loaded; - discretize = dataset.discretize; - X = dataset.X; - y = dataset.y; - Xv = dataset.Xv; - Xd = dataset.Xd; - yv = dataset.yv; - fileType = dataset.fileType; } string Dataset::getName() { @@ -178,9 +151,9 @@ namespace platform { } void Dataset::load_csv() { - string line; ifstream 
file(path + "/" + name + ".csv"); if (file.is_open()) { + string line; getline(file, line); vector tokens = split(line, ','); features = vector(tokens.begin(), tokens.end() - 1); @@ -218,9 +191,8 @@ namespace platform { yv = arff.getY(); // Get className & Features className = arff.getClassName(); - for (auto feature : arff.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = arff.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& attribute) { return attribute.first; }); } void Dataset::load() { diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h index f6a4c5b..4ccd1f0 100644 --- a/src/Platform/Datasets.h +++ b/src/Platform/Datasets.h @@ -13,7 +13,7 @@ namespace platform { string name; fileType_t fileType; string className; - int n_samples, n_features; + int n_samples{ 0 }, n_features{ 0 }; vector features; map> states; bool loaded; @@ -27,8 +27,8 @@ namespace platform { void load_arff(); void computeStates(); public: - Dataset(string path, string name, string className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; - Dataset(Dataset&); + Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; + explicit Dataset(const Dataset&); string getName(); string getClassName(); vector getFeatures(); @@ -49,7 +49,7 @@ namespace platform { bool discretize; void load(); // Loads the list of datasets public: - Datasets(string path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; + explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; vector 
getNames(); vector getFeatures(string name); int getNSamples(string name); @@ -58,7 +58,7 @@ namespace platform { pair>&, vector&> getVectors(string name); pair>&, vector&> getVectorsDiscretized(string name); pair getTensors(string name); - bool isDataset(string name); + bool isDataset(const string& name); }; }; diff --git a/src/Platform/DotEnv.h b/src/Platform/DotEnv.h new file mode 100644 index 0000000..a7e3e36 --- /dev/null +++ b/src/Platform/DotEnv.h @@ -0,0 +1,62 @@ +#ifndef DOTENV_H +#define DOTENV_H +#include +#include +#include +#include +#include "platformUtils.h" +namespace platform { + class DotEnv { + private: + std::map env; + std::string trim(const std::string& str) + { + std::string result = str; + result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) { + return !std::isspace(ch); + })); + result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) { + return !std::isspace(ch); + }).base(), result.end()); + return result; + } + public: + DotEnv() + { + std::ifstream file(".env"); + if (!file.is_open()) { + std::cerr << "File .env not found" << std::endl; + exit(1); + } + std::string line; + while (std::getline(file, line)) { + line = trim(line); + if (line.empty() || line[0] == '#') { + continue; + } + std::istringstream iss(line); + std::string key, value; + if (std::getline(iss, key, '=') && std::getline(iss, value)) { + env[key] = value; + } + } + } + std::string get(const std::string& key) + { + return env[key]; + } + std::vector getSeeds() + { + auto seeds = std::vector(); + auto seeds_str = env["seeds"]; + seeds_str = trim(seeds_str); + seeds_str = seeds_str.substr(1, seeds_str.size() - 2); + auto seeds_str_split = split(seeds_str, ','); + transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) { + return stoi(str); + }); + return seeds; + } + }; +} +#endif \ No newline at end of file diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc 
index c48e2be..97e7289 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -1,4 +1,6 @@ #include "Experiment.h" +#include "Datasets.h" +#include "Models.h" namespace platform { using json = nlohmann::json; @@ -43,10 +45,10 @@ namespace platform { result["discretized"] = discretized; result["stratified"] = stratified; result["folds"] = nfolds; - result["seeds"] = random_seeds; + result["seeds"] = randomSeeds; result["duration"] = duration; result["results"] = json::array(); - for (auto& r : results) { + for (const auto& r : results) { json j; j["dataset"] = r.getDataset(); j["hyperparameters"] = r.getHyperparameters(); @@ -65,6 +67,10 @@ namespace platform { j["test_time_std"] = r.getTestTimeStd(); j["time"] = r.getTestTime() + r.getTrainTime(); j["time_std"] = r.getTestTimeStd() + r.getTrainTimeStd(); + j["scores_train"] = r.getScoresTrain(); + j["scores_test"] = r.getScoresTest(); + j["times_train"] = r.getTimesTrain(); + j["times_test"] = r.getTimesTest(); j["nodes"] = r.getNodes(); j["leaves"] = r.getLeaves(); j["depth"] = r.getDepth(); @@ -72,62 +78,99 @@ namespace platform { } return result; } - void Experiment::save(string path) + void Experiment::save(const string& path) { json data = build_json(); ofstream file(path + "/" + get_file_name()); file << data; file.close(); } - Result cross_validation(Fold* fold, string model_name, torch::Tensor& Xt, torch::Tensor& y, vector features, string className, map> states) + + void Experiment::show() { - auto classifiers = map({ - { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, - { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } - } - ); - auto result = Result(); - auto [values, counts] = at::_unique(y); - result.setSamples(Xt.size(1)).setFeatures(Xt.size(0)).setClasses(values.size(0)); - auto k = fold->getNumberOfFolds(); - auto accuracy_test = torch::zeros({ k }, torch::kFloat64); - auto accuracy_train = torch::zeros({ k }, torch::kFloat64); - auto 
train_time = torch::zeros({ k }, torch::kFloat64); - auto test_time = torch::zeros({ k }, torch::kFloat64); - auto nodes = torch::zeros({ k }, torch::kFloat64); - auto edges = torch::zeros({ k }, torch::kFloat64); - auto num_states = torch::zeros({ k }, torch::kFloat64); - Timer train_timer, test_timer; - cout << "doing Fold: " << flush; - for (int i = 0; i < k; i++) { - bayesnet::BaseClassifier* model = classifiers[model_name]; - result.setModelVersion(model->getVersion()); - train_timer.start(); - auto [train, test] = fold->getFold(i); - auto train_t = torch::tensor(train); - auto test_t = torch::tensor(test); - auto X_train = Xt.index({ "...", train_t }); - auto y_train = y.index({ train_t }); - auto X_test = Xt.index({ "...", test_t }); - auto y_test = y.index({ test_t }); - cout << i + 1 << ", " << flush; - model->fit(X_train, y_train, features, className, states); - nodes[i] = model->getNumberOfNodes(); - edges[i] = model->getNumberOfEdges(); - num_states[i] = model->getNumberOfStates(); - train_time[i] = train_timer.getDuration(); - auto accuracy_train_value = model->score(X_train, y_train); - test_timer.start(); - auto accuracy_test_value = model->score(X_test, y_test); - test_time[i] = test_timer.getDuration(); - accuracy_train[i] = accuracy_train_value; - accuracy_test[i] = accuracy_test_value; + json data = build_json(); + cout << data.dump(4) << endl; + } + + void Experiment::go(vector filesToProcess, const string& path) + { + cout << "*** Starting experiment: " << title << " ***" << endl; + for (auto fileName : filesToProcess) { + cout << "- " << setw(20) << left << fileName << " " << right << flush; + cross_validation(path, fileName); + cout << endl; + } + } + + void Experiment::cross_validation(const string& path, const string& fileName) + { + auto datasets = platform::Datasets(path, true, platform::ARFF); + // Get dataset + auto [X, y] = datasets.getTensors(fileName); + auto states = datasets.getStates(fileName); + auto features = 
datasets.getFeatures(fileName); + auto samples = datasets.getNSamples(fileName); + auto className = datasets.getClassName(fileName); + cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << ") " << flush; + // Prepare Result + auto result = Result(); + auto [values, counts] = at::_unique(y); + result.setSamples(X.size(1)).setFeatures(X.size(0)).setClasses(values.size(0)); + int nResults = nfolds * static_cast(randomSeeds.size()); + auto accuracy_test = torch::zeros({ nResults }, torch::kFloat64); + auto accuracy_train = torch::zeros({ nResults }, torch::kFloat64); + auto train_time = torch::zeros({ nResults }, torch::kFloat64); + auto test_time = torch::zeros({ nResults }, torch::kFloat64); + auto nodes = torch::zeros({ nResults }, torch::kFloat64); + auto edges = torch::zeros({ nResults }, torch::kFloat64); + auto num_states = torch::zeros({ nResults }, torch::kFloat64); + Timer train_timer, test_timer; + int item = 0; + for (auto seed : randomSeeds) { + cout << "(" << seed << ") doing Fold: " << flush; + Fold* fold; + if (stratified) + fold = new StratifiedKFold(nfolds, y, seed); + else + fold = new KFold(nfolds, y.size(0), seed); + for (int nfold = 0; nfold < nfolds; nfold++) { + auto clf = Models::instance()->create(model); + setModelVersion(clf->getVersion()); + train_timer.start(); + auto [train, test] = fold->getFold(nfold); + auto train_t = torch::tensor(train); + auto test_t = torch::tensor(test); + auto X_train = X.index({ "...", train_t }); + auto y_train = y.index({ train_t }); + auto X_test = X.index({ "...", test_t }); + auto y_test = y.index({ test_t }); + cout << nfold + 1 << ", " << flush; + clf->fit(X_train, y_train, features, className, states); + nodes[item] = clf->getNumberOfNodes(); + edges[item] = clf->getNumberOfEdges(); + num_states[item] = clf->getNumberOfStates(); + train_time[item] = train_timer.getDuration(); + auto accuracy_train_value = clf->score(X_train, y_train); + test_timer.start(); + auto accuracy_test_value
= clf->score(X_test, y_test); + test_time[item] = test_timer.getDuration(); + accuracy_train[item] = accuracy_train_value; + accuracy_test[item] = accuracy_test_value; + // Store results and times in vector + result.addScoreTrain(accuracy_train_value); + result.addScoreTest(accuracy_test_value); + result.addTimeTrain(train_time[item].item()); + result.addTimeTest(test_time[item].item()); + item++; + } + cout << "end. " << flush; + delete fold; } - cout << "end." << endl; result.setScoreTest(torch::mean(accuracy_test).item()).setScoreTrain(torch::mean(accuracy_train).item()); result.setScoreTestStd(torch::std(accuracy_test).item()).setScoreTrainStd(torch::std(accuracy_train).item()); result.setTrainTime(torch::mean(train_time).item()).setTestTime(torch::mean(test_time).item()); result.setNodes(torch::mean(nodes).item()).setLeaves(torch::mean(edges).item()).setDepth(torch::mean(num_states).item()); - return result; + result.setDataset(fileName); + addResult(result); } } \ No newline at end of file diff --git a/src/Platform/Experiment.h b/src/Platform/Experiment.h index 8e0a677..4305316 100644 --- a/src/Platform/Experiment.h +++ b/src/Platform/Experiment.h @@ -30,13 +30,14 @@ namespace platform { class Result { private: string dataset, hyperparameters, model_version; - int samples, features, classes; - double score_train, score_test, score_train_std, score_test_std, train_time, train_time_std, test_time, test_time_std; - float nodes, leaves, depth; + int samples{ 0 }, features{ 0 }, classes{ 0 }; + double score_train{ 0 }, score_test{ 0 }, score_train_std{ 0 }, score_test_std{ 0 }, train_time{ 0 }, train_time_std{ 0 }, test_time{ 0 }, test_time_std{ 0 }; + float nodes{ 0 }, leaves{ 0 }, depth{ 0 }; + vector scores_train, scores_test, times_train, times_test; public: Result() = default; - Result& setDataset(string dataset) { this->dataset = dataset; return *this; } - Result& setHyperparameters(string hyperparameters) { this->hyperparameters = hyperparameters; return 
*this; } + Result& setDataset(const string& dataset) { this->dataset = dataset; return *this; } + Result& setHyperparameters(const string& hyperparameters) { this->hyperparameters = hyperparameters; return *this; } Result& setSamples(int samples) { this->samples = samples; return *this; } Result& setFeatures(int features) { this->features = features; return *this; } Result& setClasses(int classes) { this->classes = classes; return *this; } @@ -51,7 +52,10 @@ namespace platform { Result& setNodes(float nodes) { this->nodes = nodes; return *this; } Result& setLeaves(float leaves) { this->leaves = leaves; return *this; } Result& setDepth(float depth) { this->depth = depth; return *this; } - Result& setModelVersion(string model_version) { this->model_version = model_version; return *this; } + Result& addScoreTrain(double score) { scores_train.push_back(score); return *this; } + Result& addScoreTest(double score) { scores_test.push_back(score); return *this; } + Result& addTimeTrain(double time) { times_train.push_back(time); return *this; } + Result& addTimeTest(double time) { times_test.push_back(time); return *this; } const float get_score_train() const { return score_train; } float get_score_test() { return score_test; } const string& getDataset() const { return dataset; } @@ -70,36 +74,40 @@ namespace platform { const float getNodes() const { return nodes; } const float getLeaves() const { return leaves; } const float getDepth() const { return depth; } - const string& getModelVersion() const { return model_version; } + const vector& getScoresTrain() const { return scores_train; } + const vector& getScoresTest() const { return scores_test; } + const vector& getTimesTrain() const { return times_train; } + const vector& getTimesTest() const { return times_test; } }; class Experiment { private: string title, model, platform, score_name, model_version, language_version, language; - bool discretized, stratified; + bool discretized{ false }, stratified{ false }; vector 
results; - vector random_seeds; - int nfolds; - float duration; + vector randomSeeds; + int nfolds{ 0 }; + float duration{ 0 }; json build_json(); public: Experiment() = default; - Experiment& setTitle(string title) { this->title = title; return *this; } - Experiment& setModel(string model) { this->model = model; return *this; } - Experiment& setPlatform(string platform) { this->platform = platform; return *this; } - Experiment& setScoreName(string score_name) { this->score_name = score_name; return *this; } - Experiment& setModelVersion(string model_version) { this->model_version = model_version; return *this; } - Experiment& setLanguage(string language) { this->language = language; return *this; } - Experiment& setLanguageVersion(string language_version) { this->language_version = language_version; return *this; } + Experiment& setTitle(const string& title) { this->title = title; return *this; } + Experiment& setModel(const string& model) { this->model = model; return *this; } + Experiment& setPlatform(const string& platform) { this->platform = platform; return *this; } + Experiment& setScoreName(const string& score_name) { this->score_name = score_name; return *this; } + Experiment& setModelVersion(const string& model_version) { this->model_version = model_version; return *this; } + Experiment& setLanguage(const string& language) { this->language = language; return *this; } + Experiment& setLanguageVersion(const string& language_version) { this->language_version = language_version; return *this; } Experiment& setDiscretized(bool discretized) { this->discretized = discretized; return *this; } Experiment& setStratified(bool stratified) { this->stratified = stratified; return *this; } Experiment& setNFolds(int nfolds) { this->nfolds = nfolds; return *this; } Experiment& addResult(Result result) { results.push_back(result); return *this; } - Experiment& addRandomSeed(int random_seed) { random_seeds.push_back(random_seed); return *this; } + Experiment& 
addRandomSeed(int randomSeed) { randomSeeds.push_back(randomSeed); return *this; } Experiment& setDuration(float duration) { this->duration = duration; return *this; } string get_file_name(); - void save(string path); - void show() { cout << "Showing experiment..." << "Score Test: " << results[0].get_score_test() << " Score Train: " << results[0].get_score_train() << endl; } + void save(const string& path); + void cross_validation(const string& path, const string& fileName); + void go(vector filesToProcess, const string& path); + void show(); }; - Result cross_validation(Fold* fold, string model_name, torch::Tensor& X, torch::Tensor& y, vector features, string className, map> states); } #endif \ No newline at end of file diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc index ec7c4b5..7c59bce 100644 --- a/src/Platform/Folding.cc +++ b/src/Platform/Folding.cc @@ -7,9 +7,8 @@ Fold::Fold(int k, int n, int seed) : k(k), n(n), seed(seed) random_seed = default_random_engine(seed == -1 ? rd() : seed); srand(seed == -1 ? 
time(0) : seed); } -KFold::KFold(int k, int n, int seed) : Fold(k, n, seed) +KFold::KFold(int k, int n, int seed) : Fold(k, n, seed), indices(vector(n)) { - indices = vector(n); iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1 shuffle(indices.begin(), indices.end(), random_seed); } diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h index d7736d0..eaf0c4b 100644 --- a/src/Platform/Folding.h +++ b/src/Platform/Folding.h @@ -22,7 +22,7 @@ private: vector indices; public: KFold(int k, int n, int seed = -1); - pair, vector> getFold(int nFold); + pair, vector> getFold(int nFold) override; }; class StratifiedKFold : public Fold { private: @@ -32,6 +32,6 @@ private: public: StratifiedKFold(int k, const vector& y, int seed = -1); StratifiedKFold(int k, torch::Tensor& y, int seed = -1); - pair, vector> getFold(int nFold); + pair, vector> getFold(int nFold) override; }; #endif \ No newline at end of file diff --git a/src/Platform/Models.cc b/src/Platform/Models.cc new file mode 100644 index 0000000..1a66156 --- /dev/null +++ b/src/Platform/Models.cc @@ -0,0 +1,54 @@ +#include "Models.h" +namespace platform { + using namespace std; + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + Models* Models::factory = nullptr; + Models* Models::instance() + { + //manages singleton + if (factory == nullptr) + factory = new Models(); + return factory; + } + void Models::registerFactoryFunction(const string& name, + function classFactoryFunction) + { + // register the class factory function + functionRegistry[name] = classFactoryFunction; + } + shared_ptr Models::create(const string& name) + { + bayesnet::BaseClassifier* instance = nullptr; + + // find name in the registry and call factory method. 
+ auto it = functionRegistry.find(name); + if (it != functionRegistry.end()) + instance = it->second(); + // wrap instance in a shared ptr and return + if (instance != nullptr) + return shared_ptr(instance); + else + return nullptr; + } + vector Models::getNames() + { + vector names; + transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names), + [](const pair>& pair) { return pair.first; }); + return names; + } + string Models::toString() + { + string result = ""; + for (const auto& pair : functionRegistry) { + result += pair.first + ", "; + } + return "{" + result.substr(0, result.size() - 2) + "}"; + } + + Registrar::Registrar(const string& name, function classFactoryFunction) + { + // register the class factory function + Models::instance()->registerFactoryFunction(name, classFactoryFunction); + } +} \ No newline at end of file diff --git a/src/Platform/Models.h b/src/Platform/Models.h new file mode 100644 index 0000000..0bb8d51 --- /dev/null +++ b/src/Platform/Models.h @@ -0,0 +1,32 @@ +#ifndef MODELS_H +#define MODELS_H +#include +#include "BaseClassifier.h" +#include "AODE.h" +#include "TAN.h" +#include "KDB.h" +#include "SPODE.h" +namespace platform { + class Models { + private: + map> functionRegistry; + static Models* factory; //singleton + Models() {}; + public: + Models(Models&) = delete; + void operator=(const Models&) = delete; + // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory + static Models* instance(); + shared_ptr create(const string& name); + void registerFactoryFunction(const string& name, + function classFactoryFunction); + vector getNames(); + string toString(); + + }; + class Registrar { + public: + Registrar(const string& className, function classFactoryFunction); + }; +} +#endif \ No newline at end of file diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 9cce8ad..55c0cfe 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -3,33 +3,37 @@ #include 
"platformUtils.h" #include "Experiment.h" #include "Datasets.h" - +#include "DotEnv.h" +#include "Models.h" +#include "modelRegister.h" using namespace std; const string PATH_RESULTS = "results"; +const string PATH_DATASETS = "datasets"; argparse::ArgumentParser manageArguments(int argc, char** argv) { + auto env = platform::DotEnv(); argparse::ArgumentParser program("BayesNetSample"); program.add_argument("-d", "--dataset").default_value("").help("Dataset file name"); program.add_argument("-p", "--path") .help("folder where the data files are located, default") - .default_value(string{ PATH } + .default_value(string{ PATH_DATASETS } ); program.add_argument("-m", "--model") - .help("Model to use {AODE, KDB, SPODE, TAN}") + .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { - static const vector choices = { "AODE", "KDB", "SPODE", "TAN" }; + static const vector choices = platform::Models::instance()->getNames(); if (find(choices.begin(), choices.end(), value) != choices.end()) { return value; } - throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}"); + throw runtime_error("Model must be one of " + platform::Models::instance()->toString()); } ); - program.add_argument("--title").required().help("Experiment title"); - program.add_argument("--discretize").help("Discretize input dataset").default_value(false).implicit_value(true); - program.add_argument("--stratified").help("If Stratified KFold is to be done").default_value(false).implicit_value(true); - program.add_argument("-f", "--folds").help("Number of folds").default_value(5).scan<'i', int>().action([](const string& value) { + program.add_argument("--title").default_value("").help("Experiment title"); + program.add_argument("--discretize").help("Discretize input dataset").default_value((bool)stoi(env.get("discretize"))).implicit_value(true); + program.add_argument("--stratified").help("If Stratified KFold is to be 
done").default_value((bool)stoi(env.get("stratified"))).implicit_value(true); + program.add_argument("-f", "--folds").help("Number of folds").default_value(stoi(env.get("n_folds"))).scan<'i', int>().action([](const string& value) { try { auto k = stoi(value); if (k < 2) { @@ -43,22 +47,22 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) catch (...) { throw runtime_error("Number of folds must be an integer"); }}); - program.add_argument("-s", "--seed").help("Random seed").default_value(-1).scan<'i', int>(); - bool class_last, discretize_dataset, stratified; - int n_folds, seed; - string model_name, file_name, path, complete_file_name, title; + auto seed_values = env.getSeeds(); + program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values); try { program.parse_args(argc, argv); - file_name = program.get("dataset"); - path = program.get("path"); - model_name = program.get("model"); - discretize_dataset = program.get("discretize"); - stratified = program.get("stratified"); - n_folds = program.get("folds"); - seed = program.get("seed"); - complete_file_name = path + file_name + ".arff"; - class_last = false;//datasets[file_name]; - title = program.get("title"); + auto file_name = program.get("dataset"); + auto path = program.get("path"); + auto model_name = program.get("model"); + auto discretize_dataset = program.get("discretize"); + auto stratified = program.get("stratified"); + auto n_folds = program.get("folds"); + auto seeds = program.get>("seeds"); + auto complete_file_name = path + file_name + ".arff"; + auto title = program.get("title"); + if (title == "" && file_name == "") { + throw runtime_error("title is mandatory if dataset is not provided"); + } } catch (const exception& err) { cerr << err.what() << endl; @@ -71,25 +75,30 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) int main(int argc, char** argv) { auto program = 
manageArguments(argc, argv); + bool saveResults = false; auto file_name = program.get("dataset"); auto path = program.get("path"); auto model_name = program.get("model"); auto discretize_dataset = program.get("discretize"); auto stratified = program.get("stratified"); auto n_folds = program.get("folds"); - auto seed = program.get("seed"); - vector filesToProcess; + auto seeds = program.get>("seeds"); + vector filesToTest; auto datasets = platform::Datasets(path, true, platform::ARFF); + auto title = program.get("title"); if (file_name != "") { if (!datasets.isDataset(file_name)) { cerr << "Dataset " << file_name << " not found" << endl; exit(1); } - filesToProcess.push_back(file_name); + if (title == "") { + title = "Test " + file_name + " " + model_name + " " + to_string(n_folds) + " folds"; + } + filesToTest.push_back(file_name); } else { - filesToProcess = platform::Datasets(path, true, platform::ARFF).getNames(); + filesToTest = platform::Datasets(path, true, platform::ARFF).getNames(); + saveResults = true; } - auto title = program.get("title"); /* * Begin Processing @@ -97,31 +106,18 @@ int main(int argc, char** argv) auto experiment = platform::Experiment(); experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("1.0.0"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform("BayesNet"); - experiment.setStratified(stratified).setNFolds(n_folds).addRandomSeed(seed).setScoreName("accuracy"); - platform::Timer timer; - cout << "*** Starting experiment: " << title << " ***" << endl; - timer.start(); - for (auto fileName : filesToProcess) { - cout << "- " << setw(20) << left << fileName << " " << right << flush; - auto [X, y] = datasets.getTensors(fileName); - auto states = datasets.getStates(fileName); - auto features = datasets.getFeatures(fileName); - auto samples = datasets.getNSamples(fileName); - auto className = datasets.getClassName(fileName); - cout << " (" << setw(5) << samples << "," << setw(3) << features.size() << 
") " << flush; - Fold* fold; - if (stratified) - fold = new StratifiedKFold(n_folds, y, seed); - else - fold = new KFold(n_folds, samples, seed); - auto result = platform::cross_validation(fold, model_name, X, y, features, className, states); - result.setDataset(fileName); - experiment.setModelVersion(result.getModelVersion()); - experiment.addResult(result); - delete fold; + experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); + for (auto seed : seeds) { + experiment.addRandomSeed(seed); } + platform::Timer timer; + timer.start(); + experiment.go(filesToTest, path); experiment.setDuration(timer.getDuration()); - experiment.save(PATH_RESULTS); + if (saveResults) + experiment.save(PATH_RESULTS); + else + experiment.show(); cout << "Done!" << endl; return 0; } diff --git a/src/Platform/modelRegister.h b/src/Platform/modelRegister.h new file mode 100644 index 0000000..a4188bc --- /dev/null +++ b/src/Platform/modelRegister.h @@ -0,0 +1,11 @@ +#ifndef MODEL_REGISTER_H +#define MODEL_REGISTER_H +static platform::Registrar registrarT("TAN", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::TAN();}); +static platform::Registrar registrarS("SPODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::SPODE(2);}); +static platform::Registrar registrarK("KDB", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::KDB(2);}); +static platform::Registrar registrarA("AODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();}); +#endif \ No newline at end of file diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc index ea8fad3..6fca9d9 100644 --- a/src/Platform/platformUtils.cc +++ b/src/Platform/platformUtils.cc @@ -2,6 +2,17 @@ using namespace torch; +vector split(const string& text, char delimiter) +{ + vector result; + stringstream ss(text); + string token; + while (getline(ss, token, delimiter)) { + result.push_back(token); + } + return result; +} + pair, map> 
discretize(vector& X, mdlp::labels_t& y, vector features) { vector Xd; @@ -28,7 +39,7 @@ vector discretizeDataset(vector& X, mdlp::label return Xd; } -bool file_exists(const std::string& name) +bool file_exists(const string& name) { if (FILE* file = fopen(name.c_str(), "r")) { fclose(file); @@ -38,7 +49,7 @@ bool file_exists(const std::string& name) } } -tuple, string, map>> loadDataset(string path, string name, bool class_last, bool discretize_dataset) +tuple, string, map>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset) { auto handler = ArffFiles(); handler.load(path + static_cast(name) + ".arff", class_last); @@ -48,9 +59,8 @@ tuple, string, map>> loadData // Get className & Features auto className = handler.getClassName(); vector features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); Tensor Xd; auto states = map>(); if (discretize_dataset) { @@ -72,7 +82,7 @@ tuple, string, map>> loadData return { Xd, torch::tensor(y, torch::kInt32), features, className, states }; } -tuple>, vector, vector, string, map>> loadFile(string name) +tuple>, vector, vector, string, map>> loadFile(const string& name) { auto handler = ArffFiles(); handler.load(PATH + static_cast(name) + ".arff"); @@ -82,9 +92,8 @@ tuple>, vector, vector, string, map features; - for (auto feature : handler.getAttributes()) { - features.push_back(feature.first); - } + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); // Discretize Dataset vector Xd; map maxes; diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h index abc69bd..2b4ca54 100644 --- a/src/Platform/platformUtils.h +++ b/src/Platform/platformUtils.h @@ -11,10 
+11,11 @@ using namespace std; const string PATH = "../../data/"; bool file_exists(const std::string& name); +vector split(const string& text, char delimiter); pair, map> discretize(vector& X, mdlp::labels_t& y, vector features); vector discretizeDataset(vector& X, mdlp::labels_t& y); -pair>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector& features, string className); -tuple>, vector, vector, string, map>> loadFile(string name); -tuple, string, map>> loadDataset(string path, string name, bool class_last, bool discretize_dataset); +pair>> discretizeTorch(torch::Tensor& X, torch::Tensor& y, vector& features, const string& className); +tuple>, vector, vector, string, map>> loadFile(const string& name); +tuple, string, map>> loadDataset(const string& path, const string& name, bool class_last, bool discretize_dataset); map> get_states(vector& features, string className, map& maxes); #endif //PLATFORM_UTILS_H