From 40d1dad5d827f3729e38c7b3fe448e1b511bb880 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Wed, 11 Oct 2023 21:17:26 +0200
Subject: [PATCH] Begin CFS implementation

---
 src/BayesNet/BayesMetrics.cc |   9 +--
 src/BayesNet/BayesMetrics.h  |  14 ++--
 src/BayesNet/CFS.cc          | 127 +++++++++++++++++++++++++++++++++++
 src/BayesNet/CFS.h           |  31 +++++++++
 src/BayesNet/CMakeLists.txt  |   2 +-
 src/BayesNet/Node.h          |   2 +-
 6 files changed, 173 insertions(+), 12 deletions(-)
 create mode 100644 src/BayesNet/CFS.cc
 create mode 100644 src/BayesNet/CFS.h

diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc
index 623656e..86de9ea 100644
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@@ -60,11 +60,12 @@ namespace bayesnet {
     {
         return scoresKBest;
     }
-    vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
+    template <class T>
+    vector<pair<T, T>> Metrics::doCombinations(const vector<T>& source)
     {
-        vector<pair<string, string>> result;
+        vector<pair<T, T>> result;
         for (int i = 0; i < source.size(); ++i) {
-            string temp = source[i];
+            T temp = source[i];
             for (int j = i + 1; j < source.size(); ++j) {
                 result.push_back({ temp, source[j] });
             }
         }
@@ -76,7 +77,7 @@
         auto result = vector<double>();
         auto source = vector<string>(features);
         source.push_back(className);
-        auto combinations = doCombinations(source);
+        auto combinations = doCombinations<string>(source);
         // Compute class prior
         auto margin = torch::zeros({ classNumStates }, torch::kFloat);
         for (int value = 0; value < classNumStates; ++value) {
diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h
index 01841a7..30606c0 100644
--- a/src/BayesNet/BayesMetrics.h
+++ b/src/BayesNet/BayesMetrics.h
@@ -8,20 +8,22 @@ namespace bayesnet {
     using namespace torch;
     class Metrics {
     private:
-        Tensor samples; // nxm tensor used to fit the model
-        vector<string> features;
-        string className;
         int classNumStates = 0;
         vector<double> scoresKBest;
         vector<int> featuresKBest; // sorted indices of the features
-        double entropy(const Tensor& feature, const Tensor& weights);
         double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
-        vector<pair<string, string>> doCombinations(const vector<string>&);
+    protected:
+        Tensor samples; // (n+1) x m tensor used to fit the model where samples[-1] is the y vector
+        string className;
+        double entropy(const Tensor& feature, const Tensor& weights);
+        vector<string> features;
+        template <class T>
+        vector<pair<T, T>> doCombinations(const vector<T>& source);
     public:
         Metrics() = default;
         Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
         Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
-        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
         vector<double> getScoresKBest() const;
         double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc
new file mode 100644
index 0000000..b3473cd
--- /dev/null
+++ b/src/BayesNet/CFS.cc
@@ -0,0 +1,127 @@
+#include "CFS.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
+
+    {
+    }
+    double CFS::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Compute symmetrical uncertainty. Normalizes the information gain (mutual
+        information) with the entropies of the two features in order to compensate
+        for the bias caused by high-cardinality features. Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        */
+        auto x = samples.index({ a, "..." });
+        auto y = samples.index({ b, "..." });
+        return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights));
+    }
+    void CFS::computeSuLabels()
+    {
+        // Compute Symmetrical Uncertainty between each feature and the labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        for (int i = 0; i < features.size(); ++i) {
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+
+    }
+    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Symmetrical Uncertainty between a pair of features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        // TODO: Implement a cache in this function
+        return symmetricalUncertainty(firstFeature, secondFeature);
+    }
+    double CFS::computeMerit()
+    {
+        double rcf = 0;
+        for (auto feature : cfsFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        int n = cfsFeatures.size();
+        for (const auto& item : doCombinations(cfsFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    void CFS::fit()
+    {
+        cfsFeatures.clear();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        auto continueCondition = true;
+        auto feature = featureOrder[0];
+        cfsFeatures.push_back(feature);
+        cfsScores.push_back(suLabels[feature]);
+        featureOrder.erase(featureOrder.begin()); // the selected feature is no longer a candidate
+        while (continueCondition) {
+            double merit = numeric_limits<double>::lowest();
+            int bestFeature = -1;
+            for (auto feature : featureOrder) {
+                cfsFeatures.push_back(feature);
+                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
+                if (meritNew > merit) {
+                    merit = meritNew;
+                    bestFeature = feature;
+                }
+                cfsFeatures.pop_back();
+            }
+            cfsFeatures.push_back(bestFeature);
+            cfsScores.push_back(merit);
+            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
+            continueCondition = computeContinueCondition(featureOrder);
+        }
+        fitted = true;
+    }
+    bool CFS::computeContinueCondition(const vector<int>& featureOrder)
+    {
+        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+            return false;
+        }
+        if (cfsScores.size() >= 5) {
+            /*
+            "To prevent the best first search from exploring the entire
+            feature subset search space, a stopping criterion is imposed.
+            The search will terminate if five consecutive fully expanded
+            subsets show no improvement over the current best subset."
+            as stated in Mark A. Hall's thesis
+            */
+            double item_ant = numeric_limits<double>::lowest();
+            int num = 0;
+            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
+            for (auto item : lastFive) {
+                if (item_ant == numeric_limits<double>::lowest()) {
+                    item_ant = item;
+                }
+                if (item > item_ant) {
+                    break;
+                } else {
+                    num++;
+                    item_ant = item;
+                }
+            }
+            if (num == 5) {
+                return false;
+            }
+        }
+        return true;
+    }
+    vector<int> CFS::getFeatures() const
+    {
+        if (!fitted) {
+            throw runtime_error("CFS not fitted");
+        }
+        return cfsFeatures;
+    }
+    vector<double> CFS::getScores() const
+    {
+        if (!fitted) {
+            throw runtime_error("CFS not fitted");
+        }
+        return cfsScores;
+    }
+}
\ No newline at end of file
diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h
new file mode 100644
index 0000000..1cf621d
--- /dev/null
+++ b/src/BayesNet/CFS.h
@@ -0,0 +1,31 @@
+#ifndef CFS_H
+#define CFS_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    class CFS : public Metrics {
+    public:
+        // dataset is an (n+1) x m tensor of integers where dataset[-1] is the y vector
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~CFS() {};
+        void fit();
+        vector<int> getFeatures() const;
+        vector<double> getScores() const;
+    private:
+        void computeSuLabels();
+        double computeSuFeatures(const int a, const int b);
+        double symmetricalUncertainty(int a, int b);
+        double computeMerit();
+        bool computeContinueCondition(const vector<int>& featureOrder);
+        vector<pair<int, int>> combinations(const vector<int>& features);
+        const torch::Tensor& weights;
+        int maxFeatures;
+        vector<int> cfsFeatures;
+        vector<double> cfsScores;
+        vector<double> suLabels;
+        bool fitted = false;
+    };
+}
+#endif
\ No newline at end of file
diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt
index 6ca1238..e22827e 100644
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
-    Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto)
\ No newline at end of file
diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h
index 6758c5c..4979007 100644
--- a/src/BayesNet/Node.h
+++ b/src/BayesNet/Node.h
@@ -14,8 +14,8 @@ namespace bayesnet {
         int numStates; // number of states of the variable
         torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
         vector<int64_t> dimensions; // dimensions of the cpTable
-    public:
         vector<pair<string, string>> combinations(const vector<string>&);
+    public:
         explicit Node(const string&);
         void clear();
         void addParent(Node*);
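
A minimal usage sketch (outside the diff, not part of the patch) of the new CFS class, showing the intended call sequence. It assumes a discretized dataset already laid out as the (n+1) x m integer tensor described in CFS.h (one row per feature, the class vector in the last row); the tensor contents, feature names, class name and uniform weights below are illustrative only, while the CFS calls themselves come from this patch.

    // Usage sketch for bayesnet::CFS (illustrative data, hypothetical names)
    #include <iostream>
    #include <torch/torch.h>
    #include "CFS.h"

    int main()
    {
        // 3 discretized features plus the class row, 6 samples
        torch::Tensor dataset = torch::randint(0, 3, { 4, 6 }, torch::kInt32);
        std::vector<std::string> features = { "f0", "f1", "f2" };
        int classNumStates = 3;
        // Uniform sample weights; they must outlive the selector because
        // CFS keeps a const reference to them
        torch::Tensor weights = torch::full({ 6 }, 1.0 / 6.0, torch::kDouble);
        // maxFeatures == 0 lets the selector use up to samples.size(0) - 1 features
        bayesnet::CFS cfs(dataset, features, "class", 0, classNumStates, weights);
        cfs.fit();
        for (auto f : cfs.getFeatures()) {
            std::cout << "selected feature index: " << f << std::endl;
        }
        return 0;
    }

One detail worth double-checking before this lands: doCombinations<T> is declared in BayesMetrics.h but defined in BayesMetrics.cc, so the int instantiation that CFS.cc needs may require an explicit template instantiation in BayesMetrics.cc (or moving the definition into the header) to link.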