From 644b6c9be05e00f1dfad02fd5c4aee7b4b4059b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Sun, 23 Jul 2023 01:47:57 +0200
Subject: [PATCH] Begin experiment

---
 sample/sample.cc              | 35 ++++---------
 src/BayesNet/BayesMetrics.cc  |  1 -
 src/BayesNet/Classifier.cc    |  1 -
 src/BayesNet/Ensemble.cc      |  1 -
 src/BayesNet/KDB.cc           |  1 -
 src/BayesNet/TAN.cc           |  1 -
 src/Platform/Experiment.cc    | 99 +++++++++---------------------
 src/Platform/Folding.cc       |  9 ++--
 src/Platform/Folding.h        | 18 ++++---
 src/Platform/platformUtils.cc | 56 ++++++++++++++++++++
 src/Platform/platformUtils.h  |  3 ++
 11 files changed, 107 insertions(+), 118 deletions(-)

diff --git a/sample/sample.cc b/sample/sample.cc
index f5c41f7..3b6de05 100644
--- a/sample/sample.cc
+++ b/sample/sample.cc
@@ -1,6 +1,5 @@
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -19,20 +18,6 @@
 using namespace std;
 const string PATH = "../../data/";
 
-inline constexpr auto hash_conv(const std::string_view sv)
-{
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
-    }
-    return hash;
-}
-
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
-}
-
 pair, map> discretize(vector& X, mdlp::labels_t& y, vector features)
 {
     vector Xd;
@@ -98,15 +83,13 @@ int main(int argc, char** argv)
             throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}");
         }
     );
-    program.add_argument("--discretize").default_value(false).implicit_value(true);
-    bool class_last, discretize_dataset;
+    bool class_last;
     string model_name, file_name, path, complete_file_name;
     try {
         program.parse_args(argc, argv);
         file_name = program.get("file");
         path = program.get("path");
         model_name = program.get("model");
-        discretize_dataset = program.get("discretize");
         complete_file_name = path + file_name + ".arff";
         class_last = datasets[file_name];
         if (!file_exists(complete_file_name)) {
@@ -134,21 +117,21 @@ int main(int argc, char** argv)
             features.push_back(feature.first);
         }
         // Discretize Dataset
-        vector Xd;
-        map maxes;
-        tie(Xd, maxes) = discretize(X, y, features);
+        auto [Xd, maxes] = discretize(X, y, features);
         maxes[className] = *max_element(y.begin(), y.end()) + 1;
         map> states;
         for (auto feature : features) {
             states[feature] = vector(maxes[feature]);
         }
-        states[className] = vector(
-            maxes[className]);
-        double score;
-        auto classifiers = map({ { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() } });
+        states[className] = vector(maxes[className]);
+        auto classifiers = map({
+            { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
+            { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() }
+            }
+        );
         bayesnet::BaseClassifier* clf = classifiers[model_name];
         clf->fit(Xd, y, features, className, states);
-        score = clf->score(Xd, y);
+        auto score = clf->score(Xd, y);
         auto lines = clf->show();
         auto graph = clf->graph();
         for (auto line : lines) {
diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc
index ce49fb1..a5abb6b 100644
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@@ -1,6 +1,5 @@
 #include "BayesMetrics.h"
 #include "Mst.h"
-using namespace std;
 namespace bayesnet {
     Metrics::Metrics(torch::Tensor& samples, vector& features, string& className, int classNumStates)
         : samples(samples)
diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc
index b4e3abc..cc11a5d 100644
--- a/src/BayesNet/Classifier.cc
+++ b/src/BayesNet/Classifier.cc
@@ -2,7 +2,6 @@
 #include "bayesnetUtils.h"
 
 namespace bayesnet {
-    using namespace std;
     using namespace torch;
 
     Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
diff --git a/src/BayesNet/Ensemble.cc b/src/BayesNet/Ensemble.cc
index 8a971c3..6f389bc 100644
--- a/src/BayesNet/Ensemble.cc
+++ b/src/BayesNet/Ensemble.cc
@@ -1,7 +1,6 @@
 #include "Ensemble.h"
 
 namespace bayesnet {
-    using namespace std;
     using namespace torch;
 
     Ensemble::Ensemble() : m(0), n(0), n_models(0), metrics(Metrics()), fitted(false) {}
diff --git a/src/BayesNet/KDB.cc b/src/BayesNet/KDB.cc
index 32f7184..b041dac 100644
--- a/src/BayesNet/KDB.cc
+++ b/src/BayesNet/KDB.cc
@@ -1,7 +1,6 @@
 #include "KDB.h"
 
 namespace bayesnet {
-    using namespace std;
     using namespace torch;
 
     KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {}
diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc
index dc3d4cd..9c8dfff 100644
--- a/src/BayesNet/TAN.cc
+++ b/src/BayesNet/TAN.cc
@@ -1,7 +1,6 @@
 #include "TAN.h"
 
 namespace bayesnet {
-    using namespace std;
     using namespace torch;
 
     TAN::TAN() : Classifier(Network()) {}
diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc
index 40d1da4..60b8dec 100644
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@@ -12,22 +12,25 @@
 #include "AODE.h"
 #include "TAN.h"
 #include "platformUtils.h"
+#include "Folding.h"
 
 using namespace std;
 
-inline constexpr auto hash_conv(const std::string_view sv)
+pair cross_validation(Fold* fold, bayesnet::BaseClassifier* model, Tensor& X, Tensor& y, int k)
 {
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
+    float accuracy = 0.0;
+    for (int i = 0; i < k; i++) {
+        auto [train, test] = fold->getFold(i);
+        auto X_train = X.indices{ train };
+        auto y_train = y.indices{ train };
+        auto X_test = X.indices{ test };
+        auto y_test = y.indices{ test };
+        model->fit(X_train, y_train);
+        auto acc = model->score(X_test, y_test);
+        accuracy += acc;
     }
-    return hash;
-}
-
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
+    return { accuracy / k, 0 };
 }
 
 int main(int argc, char** argv)
@@ -94,70 +97,18 @@ int main(int argc, char** argv)
     /*
     * Begin Processing
     */
-    auto handler = ArffFiles();
-    handler.load(complete_file_name, class_last);
-    // Get Dataset X, y
-    vector& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector features;
-    for (auto feature : handler.getAttributes()) {
-        features.push_back(feature.first);
+    auto [X, y, features] = loadDataset(file_name, discretize_dataset);
+    if (discretize_dataset) {
+        auto [discretized, maxes] = discretize(X, y, features);
     }
-    // Discretize Dataset
-    vector Xd;
-    map maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map> states;
-    for (auto feature : features) {
-        states[feature] = vector(maxes[feature]);
-    }
-    states[className] = vector(
-        maxes[className]);
-    double score;
-    vector lines;
-    vector graph;
-    auto kdb = bayesnet::KDB(2);
-    auto aode = bayesnet::AODE();
-    auto spode = bayesnet::SPODE(2);
-    auto tan = bayesnet::TAN();
-    switch (hash_conv(model_name)) {
-        case "AODE"_sh:
-            aode.fit(Xd, y, features, className, states);
-            lines = aode.show();
-            score = aode.score(Xd, y);
-            graph = aode.graph();
-            break;
-        case "KDB"_sh:
-            kdb.fit(Xd, y, features, className, states);
-            lines = kdb.show();
-            score = kdb.score(Xd, y);
-            graph = kdb.graph();
-            break;
-        case "SPODE"_sh:
-            spode.fit(Xd, y, features, className, states);
-            lines = spode.show();
-            score = spode.score(Xd, y);
-            graph = spode.graph();
-            break;
-        case "TAN"_sh:
-            tan.fit(Xd, y, features, className, states);
-            lines = tan.show();
-            score = tan.score(Xd, y);
-            graph = tan.graph();
-            break;
-    }
-    for (auto line : lines) {
-        cout << line << endl;
-    }
-    cout << "Score: " << score << endl;
-    auto dot_file = model_name + "_" + file_name;
-    ofstream file(dot_file + ".dot");
-    file << graph;
-    file.close();
-    cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
-    cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
+    auto fold = StratifiedKFold(5, y, -1);
+    auto classifiers = map({
+        { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
+        { "SPODE", new bayesnet::SPODE(2) }, { "TAN", new bayesnet::TAN() }
+        }
+    );
+    bayesnet::BaseClassifier* model = classifiers[model_name];
+    auto results = cross_validation(model, X, y, fold, 5);
+    cout << "Accuracy: " << results.first << endl;
     return 0;
 }
\ No newline at end of file
diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc
index 9d9f009..9fd1d03 100644
--- a/src/Platform/Folding.cc
+++ b/src/Platform/Folding.cc
@@ -2,10 +2,7 @@
 #include 
 #include 
 #include 
-
-using namespace std;
-
-KFold::KFold(int k, int n, int seed) : k(k), n(n), seed(seed)
+KFold::KFold(int k, int n, int seed) : Fold(k, n, seed)
 {
     indices = vector(n);
     iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
@@ -31,8 +28,8 @@ pair, vector> KFold::getFold(int nFold)
     }
     return { train, test };
 }
-StratifiedKFold::StratifiedKFold(int k, const vector& y, int seed) :
-    k(k), seed(seed)
+StratifiedKFold::StratifiedKFold(int k, const vector& y, int seed)
+    : Fold(k, y.size(), seed)
 {
     n = y.size();
     stratified_indices = vector>(k);
diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h
index 526d3bc..b12af07 100644
--- a/src/Platform/Folding.h
+++ b/src/Platform/Folding.h
@@ -2,21 +2,25 @@
 #define FOLDING_H
 #include 
 using namespace std;
-class KFold {
-private:
+
+class Fold {
+protected:
     int k;
     int n;
     int seed;
+public:
+    Fold(int k, int n, int seed = -1) : k(k), n(n), seed(seed) {}
+    virtual pair, vector> getFold(int nFold) = 0;
+    virtual ~Fold() = default;
+};
+class KFold : public Fold {
+private:
     vector indices;
 public:
     KFold(int k, int n, int seed = -1);
     pair, vector> getFold(int nFold);
 };
-class StratifiedKFold {
-private:
-    int k;
-    int n;
-    int seed;
+class StratifiedKFold : public Fold {
     vector> stratified_indices;
 public:
     StratifiedKFold(int k, const vector& y, int seed = -1);
diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc
index 555a285..2b18635 100644
--- a/src/Platform/platformUtils.cc
+++ b/src/Platform/platformUtils.cc
@@ -1,5 +1,7 @@
 #include "platformUtils.h"
 
+using namespace torch;
+
 pair, map> discretize(vector& X, mdlp::labels_t& y, vector features)
 {
     vector Xd;
@@ -14,6 +16,18 @@ pair, map> discretize(vector
+vector discretizeDataset(vector& X, mdlp::labels_t& y)
+{
+    vector Xd;
+    auto fimdlp = mdlp::CPPFImdlp();
+    for (int i = 0; i < X.size(); i++) {
+        fimdlp.fit(X[i], y);
+        mdlp::labels_t& xd = fimdlp.transform(X[i]);
+        Xd.push_back(xd);
+    }
+    return Xd;
+}
+
 bool file_exists(const std::string& name)
 {
     if (FILE* file = fopen(name.c_str(), "r")) {
@@ -24,6 +38,48 @@ bool file_exists(const std::string& name)
     }
 }
 
+tuple < Tensor, Tensor, vector> loadDataset(string name, bool discretize)
+{
+    auto handler = ArffFiles();
+    handler.load(PATH + static_cast(name) + ".arff");
+    // Get Dataset X, y
+    vector& X = handler.getX();
+    mdlp::labels_t& y = handler.getY();
+    // Get className & Features
+    auto className = handler.getClassName();
+    vector features;
+    for (auto feature : handler.getAttributes()) {
+        features.push_back(feature.first);
+    }
+    Tensor Xd;
+    if (discretize) {
+        auto Xr = discretizeDataset(X, y);
+        Xd = torch::zeros({ static_cast(Xr[0].size()), static_cast(Xr.size()) }, torch::kInt64);
+        for (int i = 0; i < features.size(); ++i) {
+            Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt64));
+        }
+    } else {
+        Xd = torch::zeros({ static_cast(X[0].size()), static_cast(X.size()) }, torch::kFloat64);
+        for (int i = 0; i < features.size(); ++i) {
+            Xd.index_put_({ "...", i }, torch::tensor(X[i], torch::kFloat64));
+        }
+    }
+    return { Xd, torch::tensor(y, torch::kInt64), features };
+}
+
+pair , map>> discretize_info(Tensor& X, Tensor& y, vector features, string className)
+{
+    map maxes;
+    map> states;
+    for (int i = 0; i < X.size(1); i++) {
+        maxes[features[i]] = X.select(1, i).max().item() + 1;
+        states[features[i]] = vector(maxes[features[i]]);
+    }
+    maxes[className] = y.max().item() + 1;
+    states[className] = vector(maxes[className]);
+    return { maxes, states };
+}
+
 tuple>, vector, vector, string, map>> loadFile(string name)
 {
     auto handler = ArffFiles();
diff --git a/src/Platform/platformUtils.h b/src/Platform/platformUtils.h
index 78d90bc..80b198f 100644
--- a/src/Platform/platformUtils.h
+++ b/src/Platform/platformUtils.h
@@ -1,5 +1,6 @@
 #ifndef PLATFORM_UTILS_H
 #define PLATFORM_UTILS_H
+#include 
 #include 
 #include 
 #include 
@@ -12,4 +13,6 @@ const string PATH = "../../data/";
 bool file_exists(const std::string& name);
 pair, map> discretize(vector& X, mdlp::labels_t& y, vector features);
 tuple>, vector, vector, string, map>> loadFile(string name);
+tuple> loadDataset(string name, bool discretize);
+pair , map>> discretize_info(torch::Tensor& X, torch::Tensor& y);
 #endif //PLATFORM_UTILS_H
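A minimal usage sketch (not part of the patch) of the Fold hierarchy introduced in src/Platform/Folding.h. It assumes Folding.h is on the include path, that the label vector is a vector of ints, that getFold(i) returns a pair of train/test index vectors as declared above, and that the toy labels and the seed value 17 are illustrative only.

// Hedged sketch: drives KFold and StratifiedKFold through the common Fold base class.
// Assumptions: Folding.h is reachable, labels are vector<int>, and
// getFold(i) returns { train_indices, test_indices }.
#include <iostream>
#include <vector>
#include "Folding.h"

int main()
{
    const int k = 5;
    std::vector<int> y(150);                                // toy labels: three balanced classes
    for (size_t i = 0; i < y.size(); ++i) {
        y[i] = static_cast<int>(i % 3);
    }
    KFold kfold(k, static_cast<int>(y.size()), 17);
    StratifiedKFold skfold(k, y, 17);
    std::vector<Fold*> folders = { &kfold, &skfold };       // both honour the Fold interface
    for (Fold* folder : folders) {
        for (int i = 0; i < k; ++i) {
            auto [train, test] = folder->getFold(i);        // train/test row indices for fold i
            std::cout << "fold " << i << ": train=" << train.size()
                      << " test=" << test.size() << std::endl;
        }
    }
    return 0;
}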