Begin experiment

2023-07-23 01:47:57 +02:00 · 2023-07-23 01:47:57 +02:00 · 644b6c9be0
commit 644b6c9be0
parent 9981ad1811
11 changed files with 107 additions and 118 deletions
--- a/sample/sample.cc
+++ b/sample/sample.cc
@@ -1,6 +1,5 @@
 #include <iostream>
 #include <string>
-#include <torch/torch.h>
 #include <thread>
 #include <map>
 #include <argparse/argparse.hpp>
@@ -19,20 +18,6 @@ using namespace std;

 const string PATH = "../../data/";

-inline constexpr auto hash_conv(const std::string_view sv)
-{
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
-    }
-    return hash;
-}
-
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
-}
-
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
 {
    vector<mdlp::labels_t>Xd;
@@ -98,15 +83,13 @@ int main(int argc, char** argv)
        throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}");
            }
    );
-    program.add_argument("--discretize").default_value(false).implicit_value(true);
-    bool class_last, discretize_dataset;
+    bool class_last;
    string model_name, file_name, path, complete_file_name;
    try {
        program.parse_args(argc, argv);
        file_name = program.get<string>("file");
        path = program.get<string>("path");
        model_name = program.get<string>("model");
-        discretize_dataset = program.get<bool>("discretize");
        complete_file_name = path + file_name + ".arff";
        class_last = datasets[file_name];
        if (!file_exists(complete_file_name)) {
@@ -134,21 +117,21 @@ int main(int argc, char** argv)
        features.push_back(feature.first);
    }
    // Discretize Dataset
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
+    auto [Xd, maxes] = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
    map<string, vector<int>> states;
    for (auto feature : features) {
        states[feature] = vector<int>(maxes[feature]);
    }
-    states[className] = vector<int>(
-        maxes[className]);
-    double score;
-    auto classifiers = map<string, bayesnet::BaseClassifier*>({ { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) }, { "SPODE",  new bayesnet::SPODE(2) }, { "TAN",  new bayesnet::TAN() } });
+    states[className] = vector<int>(maxes[className]);
+    auto classifiers = map<string, bayesnet::BaseClassifier*>({
+        { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
+        { "SPODE",  new bayesnet::SPODE(2) }, { "TAN",  new bayesnet::TAN() }
+        }
+    );
    bayesnet::BaseClassifier* clf = classifiers[model_name];
    clf->fit(Xd, y, features, className, states);
-    score = clf->score(Xd, y);
+    auto score = clf->score(Xd, y);
    auto lines = clf->show();
    auto graph = clf->graph();
    for (auto line : lines) {
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@@ -1,6 +1,5 @@
 #include "BayesMetrics.h"
 #include "Mst.h"
-using namespace std;
 namespace bayesnet {
    Metrics::Metrics(torch::Tensor& samples, vector<string>& features, string& className, int classNumStates)
        : samples(samples)
--- a/src/BayesNet/Classifier.cc
+++ b/src/BayesNet/Classifier.cc
@@ -2,7 +2,6 @@
 #include "bayesnetUtils.h"

 namespace bayesnet {
-    using namespace std;
    using namespace torch;

    Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {}
--- a/src/BayesNet/Ensemble.cc
+++ b/src/BayesNet/Ensemble.cc
@@ -1,7 +1,6 @@
 #include "Ensemble.h"

 namespace bayesnet {
-    using namespace std;
    using namespace torch;

    Ensemble::Ensemble() : m(0), n(0), n_models(0), metrics(Metrics()), fitted(false) {}
--- a/src/BayesNet/KDB.cc
+++ b/src/BayesNet/KDB.cc
@@ -1,7 +1,6 @@
 #include "KDB.h"

 namespace bayesnet {
-    using namespace std;
    using namespace torch;

    KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {}
--- a/src/BayesNet/TAN.cc
+++ b/src/BayesNet/TAN.cc
@@ -1,7 +1,6 @@
 #include "TAN.h"

 namespace bayesnet {
-    using namespace std;
    using namespace torch;

    TAN::TAN() : Classifier(Network()) {}
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@@ -12,22 +12,25 @@
 #include "AODE.h"
 #include "TAN.h"
 #include "platformUtils.h"
+#include "Folding.h"


 using namespace std;

-inline constexpr auto hash_conv(const std::string_view sv)
+// k-fold CV: for folds 0..k-1 fits `model` on the train rows of (X, y), scores it on the test rows; returns {mean accuracy, 0}.
+pair<float, float> cross_validation(Fold* fold, bayesnet::BaseClassifier* model, Tensor& X, Tensor& y, int k)
 {
-    unsigned long hash{ 5381 };
-    for (unsigned char c : sv) {
-        hash = ((hash << 5) + hash) ^ c;
+    float accuracy = 0.0;
+    for (int i = 0; i < k; i++) {
+        auto [train, test] = fold->getFold(i); // sample indices of the train / test split
+        auto X_train = X.index_select(0, torch::tensor(train, torch::kInt64)); // assumes dim 0 of X is samples, as built by loadDataset
+        auto y_train = y.index_select(0, torch::tensor(train, torch::kInt64));
+        auto X_test = X.index_select(0, torch::tensor(test, torch::kInt64));
+        auto y_test = y.index_select(0, torch::tensor(test, torch::kInt64));
+        model->fit(X_train, y_train);
+        accuracy += model->score(X_test, y_test);
     }
-    return hash;
-}
-
-inline constexpr auto operator"" _sh(const char* str, size_t len)
-{
-    return hash_conv(std::string_view{ str, len });
+    return { accuracy / k, 0 }; // second component is a placeholder, always 0
 }

 int main(int argc, char** argv)
@@ -94,70 +97,18 @@ int main(int argc, char** argv)
    /*
    * Begin Processing
    */
-    auto handler = ArffFiles();
-    handler.load(complete_file_name, class_last);
-    // Get Dataset X, y
-    vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    vector<string> features;
-    for (auto feature : handler.getAttributes()) {
-        features.push_back(feature.first);
+    auto [X, y, features] = loadDataset(file_name, discretize_dataset);
+    if (discretize_dataset) {
+        auto [discretized, maxes] = discretize(X, y, features);
    }
-    // Discretize Dataset
-    vector<mdlp::labels_t> Xd;
-    map<string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<string, vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = vector<int>(maxes[feature]);
+    auto fold = StratifiedKFold(5, y, -1);
+    auto classifiers = map<string, bayesnet::BaseClassifier*>({
+        { "AODE", new bayesnet::AODE() }, { "KDB", new bayesnet::KDB(2) },
+        { "SPODE",  new bayesnet::SPODE(2) }, { "TAN",  new bayesnet::TAN() }
        }
-    states[className] = vector<int>(
-        maxes[className]);
-    double score;
-    vector<string> lines;
-    vector<string> graph;
-    auto kdb = bayesnet::KDB(2);
-    auto aode = bayesnet::AODE();
-    auto spode = bayesnet::SPODE(2);
-    auto tan = bayesnet::TAN();
-    switch (hash_conv(model_name)) {
-        case "AODE"_sh:
-            aode.fit(Xd, y, features, className, states);
-            lines = aode.show();
-            score = aode.score(Xd, y);
-            graph = aode.graph();
-            break;
-        case "KDB"_sh:
-            kdb.fit(Xd, y, features, className, states);
-            lines = kdb.show();
-            score = kdb.score(Xd, y);
-            graph = kdb.graph();
-            break;
-        case "SPODE"_sh:
-            spode.fit(Xd, y, features, className, states);
-            lines = spode.show();
-            score = spode.score(Xd, y);
-            graph = spode.graph();
-            break;
-        case "TAN"_sh:
-            tan.fit(Xd, y, features, className, states);
-            lines = tan.show();
-            score = tan.score(Xd, y);
-            graph = tan.graph();
-            break;
-    }
-    for (auto line : lines) {
-        cout << line << endl;
-    }
-    cout << "Score: " << score << endl;
-    auto dot_file = model_name + "_" + file_name;
-    ofstream file(dot_file + ".dot");
-    file << graph;
-    file.close();
-    cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl;
-    cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
+    );
+    bayesnet::BaseClassifier* model = classifiers[model_name];
+    auto results = cross_validation(model, X, y, fold, 5);
+    cout << "Accuracy: " << results.first << endl;
    return 0;
 }
--- a/src/Platform/Folding.cc
+++ b/src/Platform/Folding.cc
@@ -2,10 +2,7 @@
 #include <algorithm>
 #include <map>
 #include <random>
-
-using namespace std;
-
-KFold::KFold(int k, int n, int seed) : k(k), n(n), seed(seed)
+KFold::KFold(int k, int n, int seed) : Fold(k, n, seed)
 {
    indices = vector<int>(n);
    iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1
@@ -31,8 +28,8 @@ pair<vector<int>, vector<int>> KFold::getFold(int nFold)
    }
    return { train, test };
 }
-StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed) :
-    k(k), seed(seed)
+StratifiedKFold::StratifiedKFold(int k, const vector<int>& y, int seed)
+    : Fold(k, y.size(), seed)
 {
    n = y.size();
    stratified_indices = vector<vector<int>>(k);
--- a/src/Platform/Folding.h
+++ b/src/Platform/Folding.h
@@ -2,21 +2,25 @@
 #define FOLDING_H
 #include <vector>
 using namespace std;
-class KFold {
-private:
+
+class Fold { // Abstract base shared by the fold generators (KFold, StratifiedKFold); holds their common parameters
+protected:
+    int k; // number of folds (StratifiedKFold allocates k index groups)
+    int n; // number of items being split (StratifiedKFold passes y.size())
+    int seed; // defaults to -1; NOTE(review): presumably -1 means "no fixed seed" — confirm in Folding.cc
+public:
+    Fold(int k, int n, int seed = -1) : k(k), n(n), seed(seed) {}
+    virtual pair<vector<int>, vector<int>> getFold(int nFold) = 0; // index lists for fold nFold; KFold returns { train, test }
+    virtual ~Fold() = default; // virtual so derived folds can be destroyed through a Fold*
+};
+class KFold : public Fold { // Fold over the index list 0..n-1 that the constructor builds; k, n and seed live in Fold
+private:
    vector<int> indices;
 public:
    KFold(int k, int n, int seed = -1);
    pair<vector<int>, vector<int>> getFold(int nFold);
 };
-class StratifiedKFold {
-private:
-    int k;
-    int n;
-    int seed;
+class StratifiedKFold : public Fold {
    vector<vector<int>> stratified_indices;
 public:
    StratifiedKFold(int k, const vector<int>& y, int seed = -1);
--- a/src/Platform/platformUtils.cc
+++ b/src/Platform/platformUtils.cc
@@ -1,5 +1,7 @@
 #include "platformUtils.h"

+using namespace torch;
+
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
 {
    vector<mdlp::labels_t> Xd;
@@ -14,6 +16,18 @@ pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t
    return { Xd, maxes };
 }

+vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+{
+    // Fit the discretizer on each entry of X against y and collect a copy of its transformed labels, in order.
+    auto discretizer = mdlp::CPPFImdlp();
+    vector<mdlp::labels_t> result;
+    for (auto& column : X) {
+        discretizer.fit(column, y);
+        result.push_back(discretizer.transform(column));
+    }
+    return result;
+}
+
 bool file_exists(const std::string& name)
 {
    if (FILE* file = fopen(name.c_str(), "r")) {
@@ -24,6 +38,48 @@ bool file_exists(const std::string& name)
    }
 }

+// Loads PATH + name + ".arff" and returns {X, y, features}: X is (samples x features), kInt64 from discretizeDataset when `discretize` is set, else kFloat64; y is kInt64.
+tuple < Tensor, Tensor, vector<string>> loadDataset(string name, bool discretize)
+{
+    auto handler = ArffFiles();
+    handler.load(PATH + name + ".arff");
+    // X[i] holds the values of feature i (one entry per sample); y holds the class labels
+    vector<mdlp::samples_t>& X = handler.getX();
+    mdlp::labels_t& y = handler.getY();
+    vector<string> features;
+    for (const auto& feature : handler.getAttributes()) {
+        features.push_back(feature.first);
+    }
+    const int n_features = static_cast<int>(features.size()); // kept as int: the loop index is also the tensor column index
+    Tensor Xd;
+    if (discretize) {
+        auto Xr = discretizeDataset(X, y);
+        Xd = torch::zeros({ static_cast<int64_t>(Xr[0].size()), static_cast<int64_t>(Xr.size()) }, torch::kInt64);
+        for (int i = 0; i < n_features; ++i) {
+            Xd.index_put_({ "...", i }, torch::tensor(Xr[i], torch::kInt64)); // fill column i
+        }
+    } else {
+        Xd = torch::zeros({ static_cast<int64_t>(X[0].size()), static_cast<int64_t>(X.size()) }, torch::kFloat64);
+        for (int i = 0; i < n_features; ++i) {
+            Xd.index_put_({ "...", i }, torch::tensor(X[i], torch::kFloat64)); // fill column i
+        }
+    }
+    return { Xd, torch::tensor(y, torch::kInt64), features };
+}
+
+pair <map<string, int>, map<string, vector<int>>> discretize_info(Tensor& X, Tensor& y, vector<string> features, string className)
+{   // Returns {maxes, states}: for each column of X (keyed by features[col]) and for y (keyed by className), max value + 1 and a zero vector of that size.
+    map<string, vector<int>> states;
+    map<string, int> maxes;
+    for (int col = 0; col < X.size(1); ++col) {
+        const int cardinality = maxes[features[col]] = X.select(1, col).max().item<int>() + 1;
+        states[features[col]] = vector<int>(cardinality);
+    }
+    const int n_classes = maxes[className] = y.max().item<int>() + 1;
+    states[className] = vector<int>(n_classes);
+    return { maxes, states };
+}
+
 tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name)
 {
    auto handler = ArffFiles();
--- a/src/Platform/platformUtils.h
+++ b/src/Platform/platformUtils.h
@@ -1,5 +1,6 @@
 #ifndef PLATFORM_UTILS_H
 #define PLATFORM_UTILS_H
+#include <torch/torch.h>
 #include <string>
 #include <vector>
 #include <map>
@@ -12,4 +13,6 @@ const string PATH = "../../data/";
 bool file_exists(const std::string& name);
 pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features);
 tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(string name);
+tuple<torch::Tensor, torch::Tensor, vector<string>> loadDataset(string name, bool discretize);
+pair <map<string, int>, map<string, vector<int>>> discretize_info(torch::Tensor& X, torch::Tensor& y, vector<string> features, string className); // parameter list must match the definition in platformUtils.cc
 #endif //PLATFORM_UTILS_H