From 6d5a25cdc8c9d44b1b13c7c155f15cf029f9a55c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sat, 14 Oct 2023 11:27:46 +0200
Subject: [PATCH] Refactor CFS class creating abstract base class

---
 src/BayesNet/CFS.cc           | 94 ++++++-----------------------------
 src/BayesNet/CFS.h            | 26 +++-------
 src/BayesNet/CMakeLists.txt   |  2 +-
 src/BayesNet/FeatureSelect.cc | 74 +++++++++++++++++++++++++++
 src/BayesNet/FeatureSelect.h  | 31 ++++++++++++
 5 files changed, 127 insertions(+), 100 deletions(-)
 create mode 100644 src/BayesNet/FeatureSelect.cc
 create mode 100644 src/BayesNet/FeatureSelect.h

diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc
index 6b64220..50c0ea8 100644
--- a/src/BayesNet/CFS.cc
+++ b/src/BayesNet/CFS.cc
@@ -2,90 +2,38 @@
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
-    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
-        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
-    {
-    }
-    double CFS::symmetricalUncertainty(int a, int b)
-    {
-        /*
-        Compute symmetrical uncertainty. Normalize* information gain (mutual
-        information) with the entropies of the features in order to compensate
-        the bias due to high cardinality features. *Range [0, 1]
-        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
-        */
-        auto x = samples.index({ a, "..." });
-        auto y = samples.index({ b, "..." });
-        auto mu = mutualInformation(x, y, weights);
-        auto hx = entropy(x, weights);
-        auto hy = entropy(y, weights);
-        return 2.0 * mu / (hx + hy);
-    }
-    void CFS::computeSuLabels()
-    {
-        // Compute Simmetrical Uncertainty between features and labels
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        for (int i = 0; i < features.size(); ++i) {
-            suLabels.push_back(symmetricalUncertainty(i, -1));
-        }
-    }
-    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
-    {
-        // Compute Simmetrical Uncertainty between features
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        try {
-            return suFeatures.at({ firstFeature, secondFeature });
-        }
-        catch (const out_of_range& e) {
-            auto result = symmetricalUncertainty(firstFeature, secondFeature);
-            suFeatures[{firstFeature, secondFeature}] = result;
-            return result;
-        }
-    }
-    double CFS::computeMerit()
-    {
-        double result;
-        double rcf = 0;
-        for (auto feature : cfsFeatures) {
-            rcf += suLabels[feature];
-        }
-        double rff = 0;
-        int n = cfsFeatures.size();
-        for (const auto& item : doCombinations(cfsFeatures)) {
-            rff += computeSuFeatures(item.first, item.second);
-        }
-        return rcf / sqrt(n + (n * n - n) * rff);
-    }
+
+
     void CFS::fit()
     {
-        cfsFeatures.clear();
+        selectedFeatures.clear();
         computeSuLabels();
         auto featureOrder = argsort(suLabels); // sort descending order
         auto continueCondition = true;
         auto feature = featureOrder[0];
-        cfsFeatures.push_back(feature);
-        cfsScores.push_back(suLabels[feature]);
-        cfsFeatures.erase(cfsFeatures.begin());
+        selectedFeatures.push_back(feature);
+        selectedScores.push_back(suLabels[feature]);
+        selectedFeatures.erase(selectedFeatures.begin());
         while (continueCondition) {
             double merit = numeric_limits<double>::lowest();
             int bestFeature = -1;
             for (auto feature : featureOrder) {
-                cfsFeatures.push_back(feature);
-                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
+                selectedFeatures.push_back(feature);
+                auto meritNew = computeMeritCFS(); // Compute merit with selectedFeatures
                 if (meritNew > merit) {
                     merit = meritNew;
                     bestFeature = feature;
                 }
-                cfsFeatures.pop_back();
+                selectedFeatures.pop_back();
             }
             if (bestFeature == -1) {
                 // meritNew has to be nan due to constant features
                 break;
             }
-            cfsFeatures.push_back(bestFeature);
-            cfsScores.push_back(merit);
+            selectedFeatures.push_back(bestFeature);
+            selectedScores.push_back(merit);
             featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
             continueCondition = computeContinueCondition(featureOrder);
         }
@@ -93,10 +41,10 @@ namespace bayesnet {
     }
     bool CFS::computeContinueCondition(const vector<int>& featureOrder)
    {
-        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+        if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
             return false;
         }
-        if (cfsScores.size() >= 5) {
+        if (selectedScores.size() >= 5) {
             /*
             "To prevent the best first search from exploring the entire
             feature subset search space, a stopping criterion is imposed.
@@ -106,7 +54,7 @@ namespace bayesnet {
             */
             double item_ant = numeric_limits<double>::lowest();
             int num = 0;
-            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
+            vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
             for (auto item : lastFive) {
                 if (item_ant == numeric_limits<double>::lowest()) {
                     item_ant = item;
@@ -124,18 +72,4 @@ namespace bayesnet {
         }
         return true;
     }
-    vector<int> CFS::getFeatures() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsFeatures;
-    }
-    vector<double> CFS::getScores() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsScores;
-    }
 }
\ No newline at end of file
diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h
index eff5da6..36b7c52 100644
--- a/src/BayesNet/CFS.h
+++ b/src/BayesNet/CFS.h
@@ -2,32 +2,20 @@
 #define CFS_H
 #include <torch/torch.h>
 #include <vector>
-#include "BayesMetrics.h"
+#include "FeatureSelect.h"
 using namespace std;
 namespace bayesnet {
-    class CFS : public Metrics {
+    class CFS : public FeatureSelect {
     public:
         // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
-        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+            FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
+        {
+        }
         virtual ~CFS() {};
-        void fit();
-        void test();
-        vector<int> getFeatures() const;
-        vector<double> getScores() const;
+        void fit() override;
     private:
-        void computeSuLabels();
-        double computeSuFeatures(const int a, const int b);
-        double symmetricalUncertainty(int a, int b);
-        double computeMerit();
         bool computeContinueCondition(const vector<int>& featureOrder);
-        vector<pair<int, int>> combinations(const vector<int>& features);
-        const torch::Tensor& weights;
-        int maxFeatures;
-        vector<int> cfsFeatures;
-        vector<double> cfsScores;
-        vector<double> suLabels;
-        map<pair<int, int>, double> suFeatures;
-        bool fitted = false;
     };
 }
 #endif
\ No newline at end of file
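For context, a minimal sketch (not part of the patch) of how a caller might drive the refactored selector. The helper name selectWithCFS and the uniform-weights choice are illustrative assumptions; the constructor and getter signatures come from CFS.h above.

// Illustrative driver only; assumes the (n+1) x m integer tensor layout described in CFS.h,
// with rows 0..n-1 holding the discretized features and the last row holding the class vector.
#include "CFS.h"

std::vector<int> selectWithCFS(const torch::Tensor& samples,
                               const std::vector<std::string>& features,
                               const std::string& className,
                               int classNumStates)
{
    // Uniform per-sample weights; maxFeatures == 0 lets the base class use every feature as the cap.
    auto weights = torch::full({ samples.size(1) }, 1.0 / samples.size(1), torch::kDouble);
    bayesnet::CFS cfs(samples, features, className, 0, classNumStates, weights);
    cfs.fit();                  // greedy forward search guided by the CFS merit
    return cfs.getFeatures();   // indices of the selected features; getScores() gives the merits
}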
diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt
index 27a2d3a..c9543ea 100644
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc 
     TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
-    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
\ No newline at end of file
diff --git a/src/BayesNet/FeatureSelect.cc b/src/BayesNet/FeatureSelect.cc
new file mode 100644
index 0000000..4eb45fe
--- /dev/null
+++ b/src/BayesNet/FeatureSelect.cc
@@ -0,0 +1,74 @@
+#include "FeatureSelect.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
+
+    {
+    }
+    double FeatureSelect::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Compute symmetrical uncertainty. Normalize* information gain (mutual
+        information) with the entropies of the features in order to compensate
+        the bias due to high cardinality features. *Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        */
+        auto x = samples.index({ a, "..." });
+        auto y = samples.index({ b, "..." });
+        auto mu = mutualInformation(x, y, weights);
+        auto hx = entropy(x, weights);
+        auto hy = entropy(y, weights);
+        return 2.0 * mu / (hx + hy);
+    }
+    void FeatureSelect::computeSuLabels()
+    {
+        // Compute Symmetrical Uncertainty between features and labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        for (int i = 0; i < features.size(); ++i) {
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+    }
+    double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Symmetrical Uncertainty between features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        try {
+            return suFeatures.at({ firstFeature, secondFeature });
+        }
+        catch (const out_of_range& e) {
+            double result = symmetricalUncertainty(firstFeature, secondFeature);
+            suFeatures[{firstFeature, secondFeature}] = result;
+            return result;
+        }
+    }
+    double FeatureSelect::computeMeritCFS()
+    {
+        double result;
+        double rcf = 0;
+        for (auto feature : selectedFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        int n = selectedFeatures.size();
+        for (const auto& item : doCombinations(selectedFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    vector<int> FeatureSelect::getFeatures() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedFeatures;
+    }
+    vector<double> FeatureSelect::getScores() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedScores;
+    }
+}
\ No newline at end of file
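For reference (not part of the patch): symmetricalUncertainty() above implements the symmetrical uncertainty

    SU(X, Y) = \frac{2\, I(X; Y)}{H(X) + H(Y)}

i.e. mutual information normalized by the two entropies, which lies in [0, 1]. computeMeritCFS() follows the shape of the usual CFS merit (Hall, 1999) for a subset S of k features,

    M_S = \frac{k\, \overline{r_{cf}}}{\sqrt{k + k(k - 1)\, \overline{r_{ff}}}}

except that in the code rcf and rff are accumulated as plain sums of SU values (feature against the class, and feature against feature over all pairs) rather than as the averages of the textbook formula.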
diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h
new file mode 100644
index 0000000..c1e280c
--- /dev/null
+++ b/src/BayesNet/FeatureSelect.h
@@ -0,0 +1,31 @@
+#ifndef FEATURE_SELECT_H
+#define FEATURE_SELECT_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    class FeatureSelect : public Metrics {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~FeatureSelect() {};
+        virtual void fit() = 0;
+        vector<int> getFeatures() const;
+        vector<double> getScores() const;
+    protected:
+        void computeSuLabels();
+        double computeSuFeatures(const int a, const int b);
+        double symmetricalUncertainty(int a, int b);
+        double computeMeritCFS();
+        vector<pair<int, int>> combinations(const vector<int>& features);
+        const torch::Tensor& weights;
+        int maxFeatures;
+        vector<int> selectedFeatures;
+        vector<double> selectedScores;
+        vector<double> suLabels;
+        map<pair<int, int>, double> suFeatures;
+        bool fitted = false;
+    };
+}
+#endif
\ No newline at end of file
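The point of the refactor is that a new selector only has to derive from FeatureSelect and override fit(); the SU caching, merit computation and getters come for free. A rough sketch (not part of the patch; the class name and its trivial top-k ranking rule are invented for illustration), reusing argsort() from bayesnetUtils.h the same way CFS::fit() does:

// Hypothetical selector built on the new base class; illustration only.
#include "FeatureSelect.h"
#include "bayesnetUtils.h"

namespace bayesnet {
    class TopKSU : public FeatureSelect {
    public:
        using FeatureSelect::FeatureSelect; // reuse the base class constructor
        void fit() override
        {
            selectedFeatures.clear();
            computeSuLabels();                     // SU of every feature against the class
            auto featureOrder = argsort(suLabels); // feature indices, highest SU first
            for (int i = 0; i < maxFeatures && i < static_cast<int>(featureOrder.size()); ++i) {
                selectedFeatures.push_back(featureOrder[i]);
                selectedScores.push_back(suLabels[featureOrder[i]]);
            }
            fitted = true;
        }
    };
}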