From 5022a4dc90345c27bf373d351b039f9a0d83285c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 13 Oct 2023 12:29:25 +0200 Subject: [PATCH] Complete CFS tested with Python mufs --- .vscode/c_cpp_properties.json | 18 +++++++++++++ src/BayesNet/BayesMetrics.cc | 20 +++++--------- src/BayesNet/BayesMetrics.h | 12 ++++++++- src/BayesNet/BoostAODE.cc | 6 +++-- src/BayesNet/CFS.cc | 49 +++++++++++++++++++++++++++++++--- src/BayesNet/CFS.h | 1 + src/Platform/CMakeLists.txt | 2 +- src/Platform/testx.cpp | 50 +++++++++++++++++++++++++---------- 8 files changed, 123 insertions(+), 35 deletions(-) create mode 100644 .vscode/c_cpp_properties.json diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..6faaf51 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,18 @@ +{ + "configurations": [ + { + "name": "Mac", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "macFrameworkPath": [ + "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks" + ], + "cStandard": "c17", + "cppStandard": "c++17", + "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 86de9ea..e98f41a 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -60,24 +60,13 @@ namespace bayesnet { { return scoresKBest; } - template - vector> Metrics::doCombinations(const vector& source) - { - vector> result; - for (int i = 0; i < source.size(); ++i) { - T temp = source[i]; - for (int j = i + 1; j < source.size(); ++j) { - result.push_back({ temp, source[j] }); - } - } - return result; - } + torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights) { auto result = vector(); auto source = vector(features); source.push_back(className); - auto combinations = doCombinations(source); + auto combinations = doCombinations(source); // Compute class prior auto margin = torch::zeros({ classNumStates }, torch::kFloat); for (int value = 0; value < classNumStates; ++value) { @@ -123,6 +112,11 @@ namespace bayesnet { torch::Tensor counts = feature.bincount(weights); double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; + // cout << "Probs: "; + // for (int i = 0; i < probs.size(0); ++i) { + // cout << probs[i].item() << ", "; + // } + // cout << endl; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; return entropy.nansum().item(); diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 30606c0..341951e 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -18,7 +18,17 @@ namespace bayesnet { double entropy(const Tensor& feature, const Tensor& weights); vector features; template - vector> doCombinations(const vector& source); + vector> doCombinations(const vector& source) + { + vector> result; + for (int i = 0; i < source.size(); ++i) { + T temp = source[i]; + for (int j = i + 1; j < source.size(); ++j) { + result.push_back({ temp, source[j] }); + } + } + return result; + } public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index a9120a0..a95d6e2 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -2,11 +2,11 @@ #include #include #include "BoostAODE.h" -#include "BayesMetrics.h" #include "Colors.h" #include "Folding.h" #include "Paths.h" #include +#include "CFS.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -98,13 +98,15 @@ namespace bayesnet { } } output += "]"; + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + int maxFeatures = 0; + auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); // std::size_t str_hash = std::hash{}(output); string str_hash = sha256(output); stringstream oss; oss << platform::Paths::cfs() << str_hash << ".json"; string name = oss.str(); ifstream file(name); - Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); if (file.is_open()) { nlohmann::json cfsFeatures = nlohmann::json::parse(file); file.close(); diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index b3473cd..51e30dc 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -17,14 +17,22 @@ namespace bayesnet { */ auto x = samples.index({ a, "..." }); auto y = samples.index({ b, "..." }); - return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights)); + auto mu = mutualInformation(x, y, weights); + // cout << "Mutual Information: (" << a << ", " << b << ") = " << mu << endl; + auto hx = entropy(x, weights); + // cout << "Entropy X: " << hx << endl; + auto hy = entropy(y, weights); + // cout << "Entropy Y: " << hy << endl; + return 2.0 * mu / (hx + hy); } void CFS::computeSuLabels() { // Compute Simmetrical Uncertainty between features and labels // https://en.wikipedia.org/wiki/Symmetric_uncertainty + // cout << "SuLabels" << endl; for (int i = 0; i < features.size(); ++i) { - suLabels[i] = symmetricalUncertainty(i, -1); + suLabels.push_back(symmetricalUncertainty(i, -1)); + // cout << i << " -> " << suLabels[i] << endl; } } @@ -44,7 +52,7 @@ namespace bayesnet { } double rff = 0; int n = cfsFeatures.size(); - for (const auto& item : doCombinations(cfsFeatures)) { + for (const auto& item : doCombinations(cfsFeatures)) { rff += computeSuFeatures(item.first, item.second); } return rcf / sqrt(n + (n * n - n) * rff); @@ -58,25 +66,58 @@ namespace bayesnet { auto feature = featureOrder[0]; cfsFeatures.push_back(feature); cfsScores.push_back(suLabels[feature]); + cfsFeatures.erase(cfsFeatures.begin()); while (continueCondition) { double merit = numeric_limits::lowest(); int bestFeature = -1; for (auto feature : featureOrder) { cfsFeatures.push_back(feature); auto meritNew = computeMerit(); // Compute merit with cfsFeatures + //cout << "MeritNew: " << meritNew << " Merit: " << merit << endl; if (meritNew > merit) { merit = meritNew; bestFeature = feature; } cfsFeatures.pop_back(); } + if (bestFeature == -1) { + throw runtime_error("Feature not found"); + } cfsFeatures.push_back(bestFeature); cfsScores.push_back(merit); - featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), feature), featureOrder.end()); + featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end()); continueCondition = computeContinueCondition(featureOrder); } fitted = true; } + void CFS::test() + { + cout << "H(y): " << entropy(samples.index({ -1, "..." }), weights) << endl; + cout << "y: "; + auto y = samples.index({ -1, "..." }); + for (int i = 0; i < y.size(0); ++i) { + cout << y[i].item() << ", "; + } + cout << endl; + computeSuLabels(); + // cout << "Probabilites of features: " << endl; + // for (const auto& featureName : features) { + // int featureIdx = find(features.begin(), features.end(), featureName) - features.begin(); + // cout << featureName << "(" << featureIdx << "): "; + // auto feature = samples.index({ featureIdx, "..." }); + // torch::Tensor counts = feature.bincount(weights); + // double totalWeight = counts.sum().item(); + // torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; + // for (int i = 0; i < probs.size(0); ++i) { + // cout << probs[i].item() << ", "; + // } + // cout << endl; + // // for (int i = 0; i < x.size(0); ++i) { + // // cout << x[i].item() << ", "; + // // } + // // cout << endl; + // } + } bool CFS::computeContinueCondition(const vector& featureOrder) { if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h index 1cf621d..556659a 100644 --- a/src/BayesNet/CFS.h +++ b/src/BayesNet/CFS.h @@ -11,6 +11,7 @@ namespace bayesnet { CFS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); virtual ~CFS() {}; void fit(); + void test(); vector getFeatures() const; vector getScores() const; private: diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 4111c34..75e846f 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -9,7 +9,7 @@ add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Mo add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc) add_executable(b_list list.cc Datasets.cc Dataset.cc) add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc) -add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc) +add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc ) target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs) diff --git a/src/Platform/testx.cpp b/src/Platform/testx.cpp index 43ab29c..c6b733e 100644 --- a/src/Platform/testx.cpp +++ b/src/Platform/testx.cpp @@ -7,6 +7,7 @@ #include "Network.h" #include "ArffFiles.h" #include "CPPFImdlp.h" +#include "CFS.h" using namespace std; using namespace platform; @@ -191,22 +192,43 @@ int main() // } // cout << "***********************************************************************************************" << endl; // } - const string file_name = "iris"; - auto net = bayesnet::Network(); + // const string file_name = "iris"; + // auto net = bayesnet::Network(); + // auto dt = Datasets(true, "Arff"); + // auto raw = RawDatasets("iris", true); + // auto [X, y] = dt.getVectors(file_name); + // cout << "Dataset dims " << raw.dataset.sizes() << endl; + // cout << "weights dims " << raw.weights.sizes() << endl; + // cout << "States dims " << raw.statest.size() << endl; + // cout << "features: "; + // for (const auto& feature : raw.featurest) { + // cout << feature << ", "; + // net.addNode(feature); + // } + // net.addNode(raw.classNamet); + // cout << endl; + // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); auto dt = Datasets(true, "Arff"); - auto raw = RawDatasets("iris", true); - auto [X, y] = dt.getVectors(file_name); - cout << "Dataset dims " << raw.dataset.sizes() << endl; - cout << "weights dims " << raw.weights.sizes() << endl; - cout << "States dims " << raw.statest.size() << endl; - cout << "features: "; - for (const auto& feature : raw.featurest) { - cout << feature << ", "; - net.addNode(feature); + for (const auto& name : dt.getNames()) { + //for (const auto& name : { "iris" }) { + auto [X, y] = dt.getTensors(name); + auto features = dt.getFeatures(name); + auto states = dt.getStates(name); + auto className = dt.getClassName(name); + int maxFeatures = 0; + auto classNumStates = states.at(className).size(); + torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble); + auto dataset = X; + auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); + dataset = torch::cat({ dataset, yresized }, 0); + auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights); + cfs.fit(); + cout << "Dataset: " << name << " CFS features: "; + for (const auto& feature : cfs.getFeatures()) { + cout << feature << ", "; + } + cout << "end." << endl; } - net.addNode(raw.classNamet); - cout << endl; - net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest); }