diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index b15bbe7..426efea 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -2,5 +2,6 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) -add_executable(main Experiment.cc platformUtils.cc) +include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) +add_executable(main Experiment.cc Folding.cc platformUtils.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Experiment.cc b/src/Platform/Experiment.cc index 7564645..d2191f7 100644 --- a/src/Platform/Experiment.cc +++ b/src/Platform/Experiment.cc @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include "ArffFiles.h" #include "Network.h" #include "BayesMetrics.h" @@ -11,71 +11,11 @@ #include "SPODE.h" #include "AODE.h" #include "TAN.h" -#include "platformUtils.h" using namespace std; -/* print a description of all supported options */ -void usage(const char* path) -{ - /* take only the last portion of the path */ - const char* basename = strrchr(path, '/'); - basename = basename ? basename + 1 : path; - - cout << "usage: " << basename << "[OPTION]" << endl; - cout << " -h, --help\t\t Print this help and exit." << endl; - cout - << " -f, --file[=FILENAME]\t {diabetes, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}." 
- << endl; - cout << " -p, --path[=FILENAME]\t folder where the data files are located, default " << PATH << endl; - cout << " -m, --model={AODE, KDB, SPODE, TAN}\t " << endl; -} - -tuple parse_arguments(int argc, char** argv) -{ - string file_name; - string model_name; - string path = PATH; - const vector long_options = { - {"help", no_argument, nullptr, 'h'}, - {"file", required_argument, nullptr, 'f'}, - {"path", required_argument, nullptr, 'p'}, - {"model", required_argument, nullptr, 'm'}, - {nullptr, no_argument, nullptr, 0} - }; - while (true) { - const auto c = getopt_long(argc, argv, "hf:p:m:", long_options.data(), nullptr); - if (c == -1) - break; - switch (c) { - case 'h': - usage(argv[0]); - exit(0); - case 'f': - file_name = string(optarg); - break; - case 'm': - model_name = string(optarg); - break; - case 'p': - path = optarg; - if (path.back() != '/') - path += '/'; - break; - case '?': - usage(argv[0]); - exit(1); - default: - abort(); - } - } - if (file_name.empty()) { - usage(argv[0]); - exit(1); - } - return make_tuple(file_name, path, model_name); -} +const string PATH = "../../data/"; inline constexpr auto hash_conv(const std::string_view sv) { @@ -91,9 +31,32 @@ inline constexpr auto operator"" _sh(const char* str, size_t len) return hash_conv(std::string_view{ str, len }); } +pair, map> discretize(vector& X, mdlp::labels_t& y, vector features) +{ + vectorXd; + map maxes; + auto fimdlp = mdlp::CPPFImdlp(); + for (int i = 0; i < X.size(); i++) { + fimdlp.fit(X[i], y); + mdlp::labels_t& xd = fimdlp.transform(X[i]); + maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1; + Xd.push_back(xd); + } + return { Xd, maxes }; +} -tuple get_options(int argc, char** argv) +bool file_exists(const std::string& name) +{ + if (FILE* file = fopen(name.c_str(), "r")) { + fclose(file); + return true; + } else { + return false; + } +} + +int main(int argc, char** argv) { map datasets = { {"diabetes", true}, @@ -105,35 +68,60 @@ tuple get_options(int argc, 
char** argv) {"liver-disorders", true}, {"mfeat-factors", true}, }; - vector models = { "AODE", "KDB", "SPODE", "TAN" }; - string file_name; - string path; - string model_name; - tie(file_name, path, model_name) = parse_arguments(argc, argv); - if (datasets.find(file_name) == datasets.end()) { - cout << "Invalid file name: " << file_name << endl; - usage(argv[0]); + auto valid_datasets = vector(); + for (auto dataset : datasets) { + valid_datasets.push_back(dataset.first); + } + argparse::ArgumentParser program("BayesNetSample"); + program.add_argument("-f", "--file") + .help("Dataset file name") + .action([valid_datasets](const std::string& value) { + if (find(valid_datasets.begin(), valid_datasets.end(), value) != valid_datasets.end()) { + return value; + } + throw runtime_error("file must be one of {diabetes, ecoli, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors}"); + } + ); + program.add_argument("-p", "--path") + .help(" folder where the data files are located, default") + .default_value(string{ PATH } + ); + program.add_argument("-m", "--model") + .help("Model to use {AODE, KDB, SPODE, TAN}") + .action([](const std::string& value) { + static const vector choices = { "AODE", "KDB", "SPODE", "TAN" }; + if (find(choices.begin(), choices.end(), value) != choices.end()) { + return value; + } + throw runtime_error("Model must be one of {AODE, KDB, SPODE, TAN}"); + } + ); + program.add_argument("--discretize").default_value(false).implicit_value(true); + bool class_last, discretize_dataset; + string model_name, file_name, path, complete_file_name; + try { + program.parse_args(argc, argv); + file_name = program.get("file"); + path = program.get("path"); + model_name = program.get("model"); + discretize_dataset = program.get("discretize"); + complete_file_name = path + file_name + ".arff"; + class_last = datasets[file_name]; + if (!file_exists(complete_file_name)) { + throw runtime_error("Data File " + path + file_name + ".arff" + " does not 
exist"); + } + } + catch (const exception& err) { + cerr << err.what() << endl; + cerr << program; exit(1); } - if (!file_exists(path + file_name + ".arff")) { - cout << "Data File " << path + file_name + ".arff" << " does not exist" << endl; - usage(argv[0]); - exit(1); - } - if (find(models.begin(), models.end(), model_name) == models.end()) { - cout << "Invalid model name: " << model_name << endl; - usage(argv[0]); - exit(1); - } - return { file_name, path, model_name }; -} -int main(int argc, char** argv) -{ - string file_name, path, model_name; - tie(file_name, path, model_name) = get_options(argc, argv); + /* + * Begin Processing + */ auto handler = ArffFiles(); - handler.load(path + file_name + ".arff"); + handler.load(complete_file_name, class_last); // Get Dataset X, y vector& X = handler.getX(); mdlp::labels_t& y = handler.getY(); diff --git a/src/Platform/Folding.cc b/src/Platform/Folding.cc new file mode 100644 index 0000000..11fca29 --- /dev/null +++ b/src/Platform/Folding.cc @@ -0,0 +1,31 @@ +#include "Folding.h" +#include +#include + +using namespace std; + +KFold::KFold(int k, int n, int seed) +{ + this->k = k; + this->n = n; + indices = vector(n); + iota(begin(indices), end(indices), 0); // fill with 0, 1, ..., n - 1 + shuffle(indices.begin(), indices.end(), default_random_engine(seed)); +} +pair, vector> KFold::getFold(int nFold) +{ + if (nFold >= k || nFold < 0) { + throw invalid_argument("nFold (" + to_string(nFold) + ") must be less than k (" + to_string(k) + ")"); + } + int nTest = n / k; + auto train = vector(); + auto test = vector(); + for (int i = 0; i < n; i++) { + if (i >= nTest * nFold && i < nTest * (nFold + 1)) { + test.push_back(indices[i]); + } else { + train.push_back(indices[i]); + } + } + return { train, test }; +} \ No newline at end of file diff --git a/src/Platform/Folding.h b/src/Platform/Folding.h new file mode 100644 index 0000000..f851b2f --- /dev/null +++ b/src/Platform/Folding.h @@ -0,0 +1,18 @@ +#ifndef FOLDING_H 
+#define FOLDING_H +#include +using namespace std; +class KFold { +private: + int k; + int n; + vector indices; + +public: + KFold(int k, int n, int seed); + pair, vector> getFold(int); +}; +class KStratifiedFold { + +}; +#endif \ No newline at end of file diff --git a/src/Platform/m b/src/Platform/m new file mode 100755 index 0000000..8b81161 Binary files /dev/null and b/src/Platform/m differ diff --git a/src/Platform/main.cpp b/src/Platform/main.cpp new file mode 100644 index 0000000..5adcf48 --- /dev/null +++ b/src/Platform/main.cpp @@ -0,0 +1,51 @@ +#include "Folding.h" +#include +using namespace std; +class A { +private: + int a; +public: + A(int a) : a(a) {} + int getA() { return a; } +}; +class B : public A { +private: + int b; +public: + B(int a, int b) : A(a), b(b) {} + int getB() { return b; } +}; +class C : public A { +private: + int c; +public: + C(int a, int c) : A(a), c(c) {} + int getC() { return c; } +}; +int main() +{ + auto fold = KFold(5, 100, 1); + for (int i = 0; i < 5; ++i) { + cout << "Fold: " << i << endl; + auto [train, test] = fold.getFold(i); + cout << "Train: "; + cout << "(" << train.size() << "): "; + for (auto j = 0; j < static_cast(train.size()); j++) + cout << train[j] << ", "; + cout << endl; + cout << "Test: "; + cout << "(" << test.size() << "): "; + for (auto j = 0; j < static_cast(test.size()); j++) + cout << test[j] << ", "; + cout << endl; + cout << "Vector poly" << endl; + auto some = vector(); + auto cx = C(5, 4); + auto bx = B(7, 6); + some.push_back(cx); + some.push_back(bx); + for (auto& obj : some) { + cout << "Obj :" << obj.getA() << endl; + } + } +}