From 8fdad78a8cd047b900f0568aa473c7f34bbd1605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Mon, 9 Oct 2023 11:25:30 +0200 Subject: [PATCH] Continue Test Network --- Makefile | 14 +-- src/BayesNet/Network.cc | 3 +- src/BayesNet/Network.h | 3 + src/Platform/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 221 ++++++++++++++++++++++++++++++------ tests/TestBayesNetwork.cc | 163 ++++++++++++++++++++------ 6 files changed, 324 insertions(+), 82 deletions(-) diff --git a/Makefile b/Makefile index 0aae4a2..c289e16 100644 --- a/Makefile +++ b/Makefile @@ -47,10 +47,10 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc cd $(f_debug) && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png buildd: ## Build the debug targets - cmake --build $(f_debug) -t $(app_targets) $(n_procs) + cmake --build $(f_debug) -t $(app_targets) -j $(n_procs) buildr: ## Build the release targets - cmake --build $(f_release) -t $(app_targets) $(n_procs) + cmake --build $(f_release) -t $(app_targets) -j $(n_procs) clean: ## Clean the tests info @echo ">>> Cleaning Debug BayesNet tests..."; @@ -64,21 +64,21 @@ debug: ## Build a debug version of the project @echo ">>> Building Debug BayesNet..."; @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi @mkdir $(f_debug); - @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON $(n_procs) ; + @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON @echo ">>> Done"; release: ## Build a Release version of the project @echo ">>> Building Release BayesNet..."; @if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi @mkdir $(f_release); - @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release $(n_procs); + @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release @echo ">>> Done"; opt = "" test: ## Run tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section @echo ">>> Running BayesNet & Platform tests..."; @$(MAKE) clean - @cmake --build $(f_debug) -t $(test_targets) $(n_procs) + @cmake --build $(f_debug) -t $(test_targets) -j $(n_procs) @for t in $(test_targets); do \ if [ -f $(f_debug)/tests/$$t ]; then \ cd $(f_debug)/tests ; \ @@ -91,7 +91,7 @@ opt = "" testp: ## Run platform tests (opt="-s") to verbose output the tests, (opt="-c='Stratified Fold Test'") to run only that section @echo ">>> Running Platform tests..."; @$(MAKE) clean - @cmake --build $(f_debug) --target unit_tests_platform $(n_procs) ; + @cmake --build $(f_debug) --target unit_tests_platform -j $(n_procs) @if [ -f $(f_debug)/tests/unit_tests_platform ]; then cd $(f_debug)/tests ; ./unit_tests_platform $(opt) ; fi ; @echo ">>> Done"; @@ -99,7 +99,7 @@ opt = "" testb: ## Run BayesNet tests (opt="-s") to verbose output the tests, (opt="-c='Test Maximum Spanning Tree'") to run only that section @echo ">>> Running BayesNet tests..."; @$(MAKE) clean - @cmake --build $(f_debug) --target unit_tests_bayesnet $(n_procs) ; + @cmake --build $(f_debug) --target unit_tests_bayesnet -j $(n_procs) @if [ -f $(f_debug)/tests/unit_tests_bayesnet ]; then cd $(f_debug)/tests ; ./unit_tests_bayesnet $(opt) ; fi ; @echo ">>> Done"; diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index 6434d5d..88f3610 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -201,8 +201,7 @@ namespace bayesnet { } if (proba) return result; - else - return result.argmax(1); + return result.argmax(1); } // Return mxn tensor of probabilities Tensor Network::predict_proba(const Tensor& samples) diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h index e720c52..2b89a47 100644 --- a/src/BayesNet/Network.h +++ b/src/BayesNet/Network.h @@ -39,6 +39,9 @@ namespace bayesnet { int getNumEdges() const; int getClassNumStates() const; string getClassName() const; + /* + Notice: Nodes have to be inserted in the same order as they are in the dataset, i.e., first node is first column and so on. + */ void fit(const vector>& input_data, const vector& labels, const vector& weights, const vector& featureNames, const string& className, const map>& states); void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states); void fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states); diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 05b8804..4111c34 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -19,4 +19,4 @@ else() target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}") endif() target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}") -target_link_libraries(testx ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(testx ArffFiles mdlp BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 7bc392b..43ab29c 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -1,11 +1,16 @@ #include "Folding.h" +#include #include "map" -#include "Datasets.h" -#include #include #include +#include "Datasets.h" +#include "Network.h" +#include "ArffFiles.h" +#include "CPPFImdlp.h" + using namespace std; using namespace platform; +using namespace torch; string counts(vector y, vector indices) { @@ -21,45 +26,187 @@ string counts(vector y, vector indices) oss << endl; return oss.str(); } +class Paths { +public: + static string datasets() + { + return "datasets/"; + } +}; +pair, map> discretize(vector& X, mdlp::labels_t& y, vector features) +{ + vector Xd; + map maxes; + auto fimdlp = mdlp::CPPFImdlp(); + for (int i = 0; i < X.size(); i++) { + fimdlp.fit(X[i], y); + mdlp::labels_t& xd = fimdlp.transform(X[i]); + maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1; + Xd.push_back(xd); + } + return { Xd, maxes }; +} + +vector discretizeDataset(vector& X, mdlp::labels_t& y) +{ + vector Xd; + auto fimdlp = mdlp::CPPFImdlp(); + for (int i = 0; i < X.size(); i++) { + fimdlp.fit(X[i], y); + mdlp::labels_t& xd = fimdlp.transform(X[i]); + Xd.push_back(xd); + } + return Xd; +} + +bool file_exists(const string& name) +{ + if (FILE* file = fopen(name.c_str(), "r")) { + fclose(file); + return true; + } else { + return false; + } +} + +tuple, string, map>> loadDataset(const string& name, bool class_last, bool discretize_dataset) +{ + auto handler = ArffFiles(); + handler.load(Paths::datasets() + static_cast(name) + ".arff", class_last); + // Get Dataset X, y + vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features + auto className = handler.getClassName(); + vector features; + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); + Tensor Xd; + auto states = map>(); + if (discretize_dataset) { + auto Xr = discretizeDataset(X, y); + Xd = torch::zeros({ static_cast(Xr.size()), static_cast(Xr[0].size()) }, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + states[features[i]] = vector(*max_element(Xr[i].begin(), Xr[i].end()) + 1); + auto item = states.at(features[i]); + iota(begin(item), end(item), 0); + Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32)); + } + states[className] = vector(*max_element(y.begin(), y.end()) + 1); + iota(begin(states.at(className)), end(states.at(className)), 0); + } else { + Xd = torch::zeros({ static_cast(X.size()), static_cast(X[0].size()) }, torch::kFloat32); + for (int i = 0; i < features.size(); ++i) { + Xd.index_put_({ i, "..." }, torch::tensor(X[i])); + } + } + return { Xd, torch::tensor(y, torch::kInt32), features, className, states }; +} + +tuple>, vector, vector, string, map>> loadFile(const string& name) +{ + auto handler = ArffFiles(); + handler.load(Paths::datasets() + static_cast(name) + ".arff"); + // Get Dataset X, y + vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features + auto className = handler.getClassName(); + vector features; + auto attributes = handler.getAttributes(); + transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); + // Discretize Dataset + vector Xd; + map maxes; + tie(Xd, maxes) = discretize(X, y, features); + maxes[className] = *max_element(y.begin(), y.end()) + 1; + map> states; + for (auto feature : features) { + states[feature] = vector(maxes[feature]); + } + states[className] = vector(maxes[className]); + return { Xd, y, features, className, states }; +} +class RawDatasets { +public: + RawDatasets(const string& file_name, bool discretize) + { + // Xt can be either discretized or not + tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize); + // Xv is always discretized + tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name); + auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1); + dataset = torch::cat({ Xt, yresized }, 0); + nSamples = dataset.size(1); + weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble); + weightsv = vector(nSamples, 1.0 / nSamples); + classNumStates = discretize ? statest.at(classNamet).size() : 0; + } + torch::Tensor Xt, yt, dataset, weights; + vector> Xv; + vector weightsv; + vector yv; + vector featurest, featuresv; + map> statest, statesv; + string classNamet, classNamev; + int nSamples, classNumStates; + double epsilon = 1e-5; +}; int main() { - map balance = { - {"iris", "33,33% (50) / 33,33% (50) / 33,33% (50)"}, - {"diabetes", "34,90% (268) / 65,10% (500)"}, - {"ecoli", "42,56% (143) / 22,92% (77) / 0,60% (2) / 0,60% (2) / 10,42% (35) / 5,95% (20) / 1,49% (5) / 15,48% (52)"}, - {"glass", "32,71% (70) / 7,94% (17) / 4,21% (9) / 35,51% (76) / 13,55% (29) / 6,07% (13)"} - }; - for (const auto& file_name : { "iris", "glass", "ecoli", "diabetes" }) { - auto dt = Datasets(true, "Arff"); - auto [X, y] = dt.getVectors(file_name); - //auto fold = KFold(5, 150); - auto fold = StratifiedKFold(5, y, -1); - cout << "***********************************************************************************************" << endl; - cout << "Dataset: " << file_name << endl; - cout << "Nº Samples: " << dt.getNSamples(file_name) << endl; - cout << "Class states: " << dt.getNClasses(file_name) << endl; - cout << "Balance: " << balance.at(file_name) << endl; - for (int i = 0; i < 5; ++i) { - cout << "Fold: " << i << endl; - auto [train, test] = fold.getFold(i); - cout << "Train: "; - cout << "(" << train.size() << "): "; - // for (auto j = 0; j < static_cast(train.size()); j++) - // cout << train[j] << ", "; - cout << endl; - cout << "Train Statistics : " << counts(y, train); - cout << "-------------------------------------------------------------------------------" << endl; - cout << "Test: "; - cout << "(" << test.size() << "): "; - // for (auto j = 0; j < static_cast(test.size()); j++) - // cout << test[j] << ", "; - cout << endl; - cout << "Test Statistics: " << counts(y, test); - cout << "==============================================================================" << endl; - } - cout << "***********************************************************************************************" << endl; + // map balance = { + // {"iris", "33,33% (50) / 33,33% (50) / 33,33% (50)"}, + // {"diabetes", "34,90% (268) / 65,10% (500)"}, + // {"ecoli", "42,56% (143) / 22,92% (77) / 0,60% (2) / 0,60% (2) / 10,42% (35) / 5,95% (20) / 1,49% (5) / 15,48% (52)"}, + // {"glass", "32,71% (70) / 7,94% (17) / 4,21% (9) / 35,51% (76) / 13,55% (29) / 6,07% (13)"} + // }; + // for (const auto& file_name : { "iris", "glass", "ecoli", "diabetes" }) { + // auto dt = Datasets(true, "Arff"); + // auto [X, y] = dt.getVectors(file_name); + // //auto fold = KFold(5, 150); + // auto fold = StratifiedKFold(5, y, -1); + // cout << "***********************************************************************************************" << endl; + // cout << "Dataset: " << file_name << endl; + // cout << "Nº Samples: " << dt.getNSamples(file_name) << endl; + // cout << "Class states: " << dt.getNClasses(file_name) << endl; + // cout << "Balance: " << balance.at(file_name) << endl; + // for (int i = 0; i < 5; ++i) { + // cout << "Fold: " << i << endl; + // auto [train, test] = fold.getFold(i); + // cout << "Train: "; + // cout << "(" << train.size() << "): "; + // // for (auto j = 0; j < static_cast(train.size()); j++) + // // cout << train[j] << ", "; + // cout << endl; + // cout << "Train Statistics : " << counts(y, train); + // cout << "-------------------------------------------------------------------------------" << endl; + // cout << "Test: "; + // cout << "(" << test.size() << "): "; + // // for (auto j = 0; j < static_cast(test.size()); j++) + // // cout << test[j] << ", "; + // cout << endl; + // cout << "Test Statistics: " << counts(y, test); + // cout << "==============================================================================" << endl; + // } + // cout << "***********************************************************************************************" << endl; + // } + const string file_name = "iris"; + auto net = bayesnet::Network(); + auto dt = Datasets(true, "Arff"); + auto raw = RawDatasets("iris", true); + auto [X, y] = dt.getVectors(file_name); + cout << "Dataset dims " << raw.dataset.sizes() << endl; + cout << "weights dims " << raw.weights.sizes() << endl; + cout << "States dims " << raw.statest.size() << endl; + cout << "features: "; + for (const auto& feature : raw.featurest) { + cout << feature << ", "; + net.addNode(feature); } + net.addNode(raw.classNamet); + cout << endl; + net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); } diff --git a/tests/TestBayesNetwork.cc b/tests/TestBayesNetwork.cc index 128bd16..d680e50 100644 --- a/tests/TestBayesNetwork.cc +++ b/tests/TestBayesNetwork.cc @@ -5,14 +5,29 @@ #include "TestUtils.h" #include "Network.h" +void buildModel(bayesnet::Network& net, const vector& features, const string& className) +{ + vector> network = { {0, 1}, {0, 2}, {1, 3} }; + for (const auto& feature : features) { + net.addNode(feature); + } + net.addNode(className); + for (const auto& edge : network) { + net.addEdge(features.at(edge.first), features.at(edge.second)); + } + for (const auto& feature : features) { + net.addEdge(className, feature); + } +} + TEST_CASE("Test Bayesian Network", "[BayesNet]") { auto raw = RawDatasets("iris", true); + auto net = bayesnet::Network(); SECTION("Test get features") { - auto net = bayesnet::Network(); net.addNode("A"); net.addNode("B"); REQUIRE(net.getFeatures() == vector{"A", "B"}); @@ -21,7 +36,6 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]") } SECTION("Test get edges") { - auto net = bayesnet::Network(); net.addNode("A"); net.addNode("B"); net.addNode("C"); @@ -35,7 +49,6 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]") } SECTION("Test getNodes") { - auto net = bayesnet::Network(); net.addNode("A"); net.addNode("B"); auto& nodes = net.getNodes(); @@ -43,13 +56,119 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]") REQUIRE(nodes.count("B") == 1); } - SECTION("Test fit") + SECTION("Test fit Network") + { + auto net2 = bayesnet::Network(); + auto net3 = bayesnet::Network(); + net3.initialize(); + net2.initialize(); + net.initialize(); + buildModel(net, raw.featuresv, raw.classNamev); + buildModel(net2, raw.featurest, raw.classNamet); + buildModel(net3, raw.featurest, raw.classNamet); + vector> edges = { + {"class", "sepallength"}, {"class", "sepalwidth"}, {"class", "petallength"}, + {"class", "petalwidth" }, {"sepallength", "sepalwidth"}, {"sepallength", "petallength"}, + {"sepalwidth", "petalwidth"} + }; + REQUIRE(net.getEdges() == edges); + REQUIRE(net2.getEdges() == edges); + REQUIRE(net3.getEdges() == edges); + vector features = { "sepallength", "sepalwidth", "petallength", "petalwidth", "class" }; + REQUIRE(net.getFeatures() == features); + REQUIRE(net2.getFeatures() == features); + REQUIRE(net3.getFeatures() == features); + auto& nodes = net.getNodes(); + auto& nodes2 = net2.getNodes(); + auto& nodes3 = net3.getNodes(); + // Check Nodes parents & children + for (const auto& feature : features) { + // Parents + vector parents, parents2, parents3, children, children2, children3; + auto nodeParents = nodes[feature]->getParents(); + auto nodeParents2 = nodes2[feature]->getParents(); + auto nodeParents3 = nodes3[feature]->getParents(); + transform(nodeParents.begin(), nodeParents.end(), back_inserter(parents), [](const auto& p) { return p->getName(); }); + transform(nodeParents2.begin(), nodeParents2.end(), back_inserter(parents2), [](const auto& p) { return p->getName(); }); + transform(nodeParents3.begin(), nodeParents3.end(), back_inserter(parents3), [](const auto& p) { return p->getName(); }); + REQUIRE(parents == parents2); + REQUIRE(parents == parents3); + // Children + auto nodeChildren = nodes[feature]->getChildren(); + auto nodeChildren2 = nodes2[feature]->getChildren(); + auto nodeChildren3 = nodes2[feature]->getChildren(); + transform(nodeChildren.begin(), nodeChildren.end(), back_inserter(children), [](const auto& p) { return p->getName(); }); + transform(nodeChildren2.begin(), nodeChildren2.end(), back_inserter(children2), [](const auto& p) { return p->getName(); }); + transform(nodeChildren3.begin(), nodeChildren3.end(), back_inserter(children3), [](const auto& p) { return p->getName(); }); + REQUIRE(children == children2); + REQUIRE(children == children3); + } + // Fit networks + net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv); + net2.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); + net3.fit(raw.Xt, raw.yt, raw.weights, raw.featurest, raw.classNamet, raw.statest); + REQUIRE(net.getStates() == net2.getStates()); + REQUIRE(net.getStates() == net3.getStates()); + // Check Conditional Probabilities tables + for (int i = 0; i < features.size(); ++i) { + auto feature = features.at(i); + for (const auto& feature : features) { + auto cpt = nodes[feature]->getCPT(); + auto cpt2 = nodes2[feature]->getCPT(); + auto cpt3 = nodes3[feature]->getCPT(); + REQUIRE(cpt.equal(cpt2)); + REQUIRE(cpt.equal(cpt3)); + } + } + } + SECTION("Test show") { auto net = bayesnet::Network(); - // net.fit(raw.Xv, raw.yv, raw.weightsv, raw.featuresv, raw.classNamev, raw.statesv); - net.fit(raw.Xt, raw.yt, raw.weights, raw.featurest, raw.classNamet, raw.statest); - REQUIRE(net.getClassName() == "class"); + net.addNode("A"); + net.addNode("B"); + net.addNode("C"); + net.addEdge("A", "B"); + net.addEdge("A", "C"); + auto str = net.show(); + REQUIRE(str.size() == 3); + REQUIRE(str[0] == "A -> B, C, "); + REQUIRE(str[1] == "B -> "); + REQUIRE(str[2] == "C -> "); } + SECTION("Test topological_sort") + { + auto net = bayesnet::Network(); + net.addNode("A"); + net.addNode("B"); + net.addNode("C"); + net.addEdge("A", "B"); + net.addEdge("A", "C"); + auto sorted = net.topological_sort(); + REQUIRE(sorted.size() == 3); + REQUIRE(sorted[0] == "A"); + bool result = sorted[1] == "B" && sorted[2] == "C"; + REQUIRE(result); + } + SECTION("Test graph") + { + auto net = bayesnet::Network(); + net.addNode("A"); + net.addNode("B"); + net.addNode("C"); + net.addEdge("A", "B"); + net.addEdge("A", "C"); + auto str = net.graph("Test Graph"); + REQUIRE(str.size() == 7); + cout << str << endl; + REQUIRE(str[0] == "digraph BayesNet {\nlabel=\nfontsize=30\nfontcolor=blue\nlabelloc=t\nlayout=circo\n"); + REQUIRE(str[1] == "A [shape=circle] \n"); + REQUIRE(str[2] == "A -> B"); + REQUIRE(str[3] == "A -> C"); + REQUIRE(str[4] == "B [shape=circle] \n"); + REQUIRE(str[5] == "C [shape=circle] \n"); + REQUIRE(str[6] == "}\n"); + } + // SECTION("Test predict") // { @@ -81,34 +200,8 @@ TEST_CASE("Test Bayesian Network", "[BayesNet]") // REQUIRE(score == Catch::Approx(); // } -// SECTION("Test topological_sort") -// { -// auto net = bayesnet::Network(); -// net.addNode("A"); -// net.addNode("B"); -// net.addNode("C"); -// net.addEdge("A", "B"); -// net.addEdge("A", "C"); -// auto sorted = net.topological_sort(); -// REQUIRE(sorted.size() == 3); -// REQUIRE(sorted[0] == "A"); -// REQUIRE((sorted[1] == "B" && sorted[2] == "C") || (sorted[1] == "C" && sorted[2] == "B")); -// } - -// SECTION("Test show") -// { -// auto net = bayesnet::Network(); -// net.addNode("A"); -// net.addNode("B"); -// net.addNode("C"); -// net.addEdge("A", "B"); -// net.addEdge("A", "C"); -// auto str = net.show(); -// REQUIRE(str.size() == 3); -// REQUIRE(str[0] == "A"); -// REQUIRE(str[1] == "B -> C"); -// REQUIRE(str[2] == "C"); -// } +// +// // SECTION("Test graph") // {