#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <torch/torch.h>
#include <thread>
#include <map>
#include <argparse/argparse.hpp>
#include "BaseClassifier.h"
#include "ArffFiles.h"
#include "Network.h"
#include "BayesMetrics.h"
#include "CPPFImdlp.h"
#include "KDB.h"
#include "SPODE.h"
#include "AODE.h"
#include "TAN.h"

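// Sample program: loads an ARFF dataset, discretizes it with MDLP and
// fits/scores the selected Bayesian network classifier. Typical run
// (assuming the binary is named after the parser id declared in main()):
//   ./BayesNetSample -f iris -m TAN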
using namespace std;
const string PATH = "../../data/";
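// djb2-style compile-time string hash (xor variant) with a ""_sh literal
// suffix; not referenced in this sample, but usable to switch on strings.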
inline constexpr auto hash_conv(const std::string_view sv)
{
    unsigned long hash{ 5381 };
    for (unsigned char c : sv) {
        hash = ((hash << 5) + hash) ^ c;
    }
    return hash;
}

inline constexpr auto operator"" _sh(const char* str, size_t len)
{
    return hash_conv(std::string_view{ str, len });
}
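// Discretize every continuous feature with MDLP against the class labels and
// record, per feature, the number of discrete states produced (max label + 1).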
pair<vector<mdlp::labels_t>, map<string, int>> discretize(vector<mdlp::samples_t>& X, mdlp::labels_t& y, vector<string> features)
{
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    auto fimdlp = mdlp::CPPFImdlp();
    for (size_t i = 0; i < X.size(); i++) {
        fimdlp.fit(X[i], y);
        mdlp::labels_t xd = fimdlp.transform(X[i]);
        maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
        Xd.push_back(xd);
    }
    return { Xd, maxes };
}
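// Check for file existence by attempting to open it for reading.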
bool file_exists(const std::string& name)
{
    if (FILE* file = fopen(name.c_str(), "r")) {
        fclose(file);
        return true;
    } else {
        return false;
    }
}
int main(int argc, char** argv)
{
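    // Known datasets; the value flags whether the class attribute is the
    // last column of the ARFF file (passed to ArffFiles::load below).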
    map<string, bool> datasets = {
        {"diabetes", true},
        {"ecoli", true},
        {"glass", true},
        {"iris", true},
        {"kdd_JapaneseVowels", false},
        {"letter", true},
        {"liver-disorders", true},
        {"mfeat-factors", true},
    };
    auto valid_datasets = vector<string>();
    for (const auto& dataset : datasets) {
        valid_datasets.push_back(dataset.first);
    }
    argparse::ArgumentParser program("BayesNetSample");
    program.add_argument("-f", "--file")
        .help("Dataset file name")
        .action([valid_datasets](const std::string& value) {
            if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) {
                return value;
            }
            throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}");
        });
    program.add_argument("-p", "--path")
        .help("folder where the data files are located, default " + PATH)
        .default_value(string{ PATH });
    program.add_argument("-m", "--model")
        .help("Model to use {AODE, KDB, SPODE, TAN}")
        .action([](const std::string& value) {
            static const vector<string> choices = { "AODE", "KDB", "SPODE", "TAN" };
            if (find(choices.begin(), choices.end(), value) != choices.end()) {
                return value;
            }
            throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}");
        });
    program.add_argument("--discretize").default_value(false).implicit_value(true);
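    // Parse the command line; any validation error prints the message and
    // the usage text, then exits.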
    bool class_last, discretize_dataset;
    string model_name, file_name, path, complete_file_name;
    try {
        program.parse_args(argc, argv);
        file_name = program.get<string>("file");
        path = program.get<string>("path");
        model_name = program.get<string>("model");
        discretize_dataset = program.get<bool>("discretize");
        complete_file_name = path + file_name + ".arff";
        class_last = datasets[file_name];
        if (!file_exists(complete_file_name)) {
            throw runtime_error("Data File " + complete_file_name + " does not exist");
        }
    }
    catch (const exception& err) {
        cerr << err.what() << endl;
        cerr << program;
        exit(1);
    }

    /*
    * Begin Processing
    */
    auto handler = ArffFiles();
    handler.load(complete_file_name, class_last);
    // Get Dataset X, y
    vector<mdlp::samples_t>& X = handler.getX();
    mdlp::labels_t& y = handler.getY();
    // Get className & Features
    auto className = handler.getClassName();
    vector<string> features;
    for (const auto& feature : handler.getAttributes()) {
        features.push_back(feature.first);
    }
    // Discretize Dataset
    vector<mdlp::labels_t> Xd;
    map<string, int> maxes;
    tie(Xd, maxes) = discretize(X, y, features);
    maxes[className] = *max_element(y.begin(), y.end()) + 1;
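    // Build the states map: one entry per variable, sized to its number of
    // discrete states, which is passed to fit() below.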
    map<string, vector<int>> states;
    for (const auto& feature : features) {
        states[feature] = vector<int>(maxes[feature]);
    }
    states[className] = vector<int>(maxes[className]);
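    // Instantiate all four models and select the one requested with --model.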
    double score;
    auto classifiers = map<string, bayesnet::BaseClassifier*>({
        { "AODE", new bayesnet::AODE() },
        { "KDB", new bayesnet::KDB(2) },
        { "SPODE", new bayesnet::SPODE(2) },
        { "TAN", new bayesnet::TAN() }
        });
    bayesnet::BaseClassifier* clf = classifiers[model_name];
    clf->fit(Xd, y, features, className, states);
    score = clf->score(Xd, y);
    auto lines = clf->show();
    auto graph = clf->graph();
    for (const auto& line : lines) {
        cout << line << endl;
    }
    cout << "Score: " << score << endl;
    auto dot_file = model_name + "_" + file_name;
    ofstream file(dot_file + ".dot");
    file << graph;
    file.close();
    cout << "Graph saved in " << dot_file << ".dot" << endl;
    cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl;
    // Free the classifiers allocated above (all four were created, only
    // one was used).
    for (const auto& entry : classifiers) {
        delete entry.second;
    }
    return 0;
}