#include "Folding.h"
|
|
|
|
|
#include <torch/torch.h>
|
|
|
|
|
#include "map"
|
|
|
|
|
#include "Datasets.h"
|
|
|
|
|
#include <map>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <sstream>
|
|
|
|
|
#include "Datasets.h"
|
|
|
|
|
#include "Network.h"
|
|
|
|
|
#include "ArffFiles.h"
|
|
|
|
|
#include "CPPFImdlp.h"
|
|
|
|
|
|
|
|
|
|
using namespace std;
|
|
|
|
|
using namespace platform;
|
|
|
|
|
using namespace torch;
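
// Return the class distribution of the samples selected by indices,
// formatted as "percentage (count)" entries separated by " / ",
// in the same shape as the balance strings used in main() below.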
string counts(const vector<int>& y, const vector<int>& indices)
{
    auto result = map<int, int>();
    ostringstream oss;
    for (auto idx : indices) {
        result[y[idx]]++;
    }
    oss << fixed << setprecision(2);
    string sep;
    for (const auto& [label, count] : result) {
        oss << sep << 100.0 * count / indices.size() << "% (" << count << ")";
        sep = " / ";
    }
    oss << endl;
    return oss.str();
}
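
// Location of the ARFF datasets, relative to the working directory.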
class Paths {
public:
    static string datasets()
    {
        return "datasets/";
    }
};
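
// Discretize every feature of X with MDLP, returning the discretized
// columns and, per feature, the number of discrete states (max label + 1).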
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, const vector<string>& features)
{
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    auto fimdlp = mdlp::CPPFImdlp();
    for (size_t i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        // Copy the result instead of binding a non-const reference,
        // which is ill-formed if transform() returns by value
        mdlp::labels_t xd = fimdlp.transform(X[i]);
        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
        Xd.push_back(xd);
    }
    return { Xd, maxes };
}
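
// Same as discretize(), but only returns the discretized columns.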
vector<mdlp::labels_t> discretizeDataset(vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    vector<mdlp::labels_t> Xd;
    auto fimdlp = mdlp::CPPFImdlp();
    for (size_t i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        mdlp::labels_t xd = fimdlp.transform(X[i]);
        Xd.push_back(xd);
    }
    return Xd;
}
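
// Portable existence check; with C++17, std::filesystem::exists would do the same.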
bool file_exists(const string& name)
{
    if (FILE* file = fopen(name.c_str(), "r")) {
        fclose(file);
        return true;
    } else {
        return false;
    }
}
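
// Load an ARFF dataset as torch tensors. Returns X (features x samples),
// y, the feature names, the class name, and the states map (the list of
// values each discrete variable can take; empty when not discretizing).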
tuple<Tensor, Tensor, vector<string>, string, map<string, vector<int>>> loadDataset(const string& name, bool class_last, bool discretize_dataset)
{
    auto handler = ArffFiles();
    handler.load(Paths::datasets() + name + ".arff", class_last);
    // Get Dataset X, y
    vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    vector<string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
    Tensor Xd;
    auto states = map<string, vector<int>>();
    if (discretize_dataset) {
        auto Xr = discretizeDataset(X, y);
        Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
        for (int i = 0; i < static_cast<int>(features.size()); ++i) {
            states[features[i]] = vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
            // Take a reference: a copy would leave the map entry unfilled
            auto& item = states.at(features[i]);
            iota(begin(item), end(item), 0);
            Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
        }
        states[className] = vector<int>(*max_element(y.begin(), y.end()) + 1);
        iota(begin(states.at(className)), end(states.at(className)), 0);
    } else {
        Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
        for (int i = 0; i < static_cast<int>(features.size()); ++i) {
            Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
        }
    }
    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
}
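
// Load an ARFF dataset as STL vectors, discretizing every feature with MDLP.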
tuple<vector<vector<int>>, vector<int>, vector<string>, string, map<string, vector<int>>> loadFile(const string& name)
{
    auto handler = ArffFiles();
    handler.load(Paths::datasets() + name + ".arff");
    // Get Dataset X, y
    vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    vector<string> features;
    auto attributes = handler.getAttributes();
    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
    // Discretize Dataset
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    tie(Xd, maxes) = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
    map<string, vector<int>> states;
    for (const auto& feature : features) {
        states[feature] = vector<int>(maxes[feature]);
    }
    states[className] = vector<int>(maxes[className]);
    return { Xd, y, features, className, states };
}
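
// Bundle of the same dataset in both representations: Xt/yt as tensors
// (optionally discretized) and Xv/yv as discretized STL vectors.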
class RawDatasets {
public:
    RawDatasets(const string& file_name, bool discretize)
    {
        // Xt can be either discretized or not
        tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
        // Xv is always discretized
        tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
        auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
        dataset = torch::cat({ Xt, yresized }, 0);
        nSamples = dataset.size(1);
        weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
        weightsv = vector<double>(nSamples, 1.0 / nSamples);
        classNumStates = discretize ? static_cast<int>(statest.at(classNamet).size()) : 0;
    }
    torch::Tensor Xt, yt, dataset, weights;
    vector<vector<int>> Xv;
    vector<double> weightsv;
    vector<int> yv;
    vector<string> featurest, featuresv;
    map<string, vector<int>> statest, statesv;
    string classNamet, classNamev;
    int nSamples, classNumStates;
    double epsilon = 1e-5;
};
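
// Exercise StratifiedKFold on the sample datasets, then fit a
// bayesnet::Network on iris.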
int main()
{
    map<string, string> balance = {
        {"iris", "33,33% (50) / 33,33% (50) / 33,33% (50)"},
        {"diabetes", "34,90% (268) / 65,10% (500)"},
        {"ecoli", "42,56% (143) / 22,92% (77) / 0,60% (2) / 0,60% (2) / 10,42% (35) / 5,95% (20) / 1,49% (5) / 15,48% (52)"},
        {"glass", "32,71% (70) / 7,94% (17) / 4,21% (9) / 35,51% (76) / 13,55% (29) / 6,07% (13)"}
    };
    for (const auto& file_name : { "iris", "glass", "ecoli", "diabetes" }) {
        auto dt = Datasets(true, "Arff");
        auto [X, y] = dt.getVectors(file_name);
        //auto fold = KFold(5, 150);
        auto fold = StratifiedKFold(5, y, -1);
        cout << "***********************************************************************************************" << endl;
        cout << "Dataset: " << file_name << endl;
        cout << "Nº Samples: " << dt.getNSamples(file_name) << endl;
        cout << "Class states: " << dt.getNClasses(file_name) << endl;
        cout << "Balance: " << balance.at(file_name) << endl;
        for (int i = 0; i < 5; ++i) {
            cout << "Fold: " << i << endl;
            auto [train, test] = fold.getFold(i);
            cout << "Train: ";
            cout << "(" << train.size() << "): ";
            // for (auto j = 0; j < static_cast<int>(train.size()); j++)
            //     cout << train[j] << ", ";
            cout << endl;
            cout << "Train Statistics : " << counts(y, train);
            cout << "-------------------------------------------------------------------------------" << endl;
            cout << "Test: ";
            cout << "(" << test.size() << "): ";
            // for (auto j = 0; j < static_cast<int>(test.size()); j++)
            //     cout << test[j] << ", ";
            cout << endl;
            cout << "Test Statistics: " << counts(y, test);
            cout << "==============================================================================" << endl;
        }
        cout << "***********************************************************************************************" << endl;
    }
// map<string, string> balance = {
|
|
|
|
|
// {"iris", "33,33% (50) / 33,33% (50) / 33,33% (50)"},
|
|
|
|
|
// {"diabetes", "34,90% (268) / 65,10% (500)"},
|
|
|
|
|
// {"ecoli", "42,56% (143) / 22,92% (77) / 0,60% (2) / 0,60% (2) / 10,42% (35) / 5,95% (20) / 1,49% (5) / 15,48% (52)"},
|
|
|
|
|
// {"glass", "32,71% (70) / 7,94% (17) / 4,21% (9) / 35,51% (76) / 13,55% (29) / 6,07% (13)"}
|
|
|
|
|
// };
|
|
|
|
|
// for (const auto& file_name : { "iris", "glass", "ecoli", "diabetes" }) {
|
|
|
|
|
// auto dt = Datasets(true, "Arff");
|
|
|
|
|
// auto [X, y] = dt.getVectors(file_name);
|
|
|
|
|
// //auto fold = KFold(5, 150);
|
|
|
|
|
// auto fold = StratifiedKFold(5, y, -1);
|
|
|
|
|
// cout << "***********************************************************************************************" << endl;
|
|
|
|
|
// cout << "Dataset: " << file_name << endl;
|
|
|
|
|
// cout << "Nº Samples: " << dt.getNSamples(file_name) << endl;
|
|
|
|
|
// cout << "Class states: " << dt.getNClasses(file_name) << endl;
|
|
|
|
|
// cout << "Balance: " << balance.at(file_name) << endl;
|
|
|
|
|
// for (int i = 0; i < 5; ++i) {
|
|
|
|
|
// cout << "Fold: " << i << endl;
|
|
|
|
|
// auto [train, test] = fold.getFold(i);
|
|
|
|
|
// cout << "Train: ";
|
|
|
|
|
// cout << "(" << train.size() << "): ";
|
|
|
|
|
// // for (auto j = 0; j < static_cast<int>(train.size()); j++)
|
|
|
|
|
// // cout << train[j] << ", ";
|
|
|
|
|
// cout << endl;
|
|
|
|
|
// cout << "Train Statistics : " << counts(y, train);
|
|
|
|
|
// cout << "-------------------------------------------------------------------------------" << endl;
|
|
|
|
|
// cout << "Test: ";
|
|
|
|
|
// cout << "(" << test.size() << "): ";
|
|
|
|
|
// // for (auto j = 0; j < static_cast<int>(test.size()); j++)
|
|
|
|
|
// // cout << test[j] << ", ";
|
|
|
|
|
// cout << endl;
|
|
|
|
|
// cout << "Test Statistics: " << counts(y, test);
|
|
|
|
|
// cout << "==============================================================================" << endl;
|
|
|
|
|
// }
|
|
|
|
|
// cout << "***********************************************************************************************" << endl;
|
|
|
|
|
// }
|
|
|
|
|
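
    // Build a Network over iris: one node per feature plus the class node,
    // then fit it with uniform sample weights.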
    const string file_name = "iris";
    auto net = bayesnet::Network();
    auto dt = Datasets(true, "Arff");
    auto raw = RawDatasets("iris", true);
    auto [X, y] = dt.getVectors(file_name);
    cout << "Dataset dims " << raw.dataset.sizes() << endl;
    cout << "weights dims " << raw.weights.sizes() << endl;
    cout << "States dims " << raw.statest.size() << endl;
    cout << "features: ";
    for (const auto& feature : raw.featurest) {
        cout << feature << ", ";
        net.addNode(feature);
    }
    net.addNode(raw.classNamet);
    cout << endl;
    net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
}