Refactor the CFS class, extracting an abstract base class (FeatureSelect)

This commit is contained in:
Ricardo Montañana Gómez 2023-10-14 11:27:46 +02:00
parent d00b08cbe8
commit 6d5a25cdc8
Signed by: rmontanana
GPG Key ID: 46064262FD9A7ADE
5 changed files with 127 additions and 100 deletions

View File

@ -2,90 +2,38 @@
#include <limits> #include <limits>
#include "bayesnetUtils.h" #include "bayesnetUtils.h"
namespace bayesnet { namespace bayesnet {
// CFS (Correlation-based Feature Selection) over a discrete dataset.
// samples: (n+1) x m tensor where the last row holds the class labels.
// maxFeatures == 0 means "no limit": use all n feature rows (samples.size(0) - 1).
// NOTE(review): weights is held by reference — the caller must keep the tensor alive.
CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
    Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
{
}
double CFS::symmetricalUncertainty(int a, int b)
{
    /*
        Symmetrical uncertainty: mutual information of the two variables
        normalized by the sum of their entropies, which compensates the bias
        of plain information gain towards high-cardinality features.
        Range [0, 1].
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
    */
    const auto first = samples.index({ a, "..." });
    const auto second = samples.index({ b, "..." });
    const auto mutual = mutualInformation(first, second, weights);
    const auto entropySum = entropy(first, weights) + entropy(second, weights);
    return 2.0 * mutual / entropySum;
}
void CFS::computeSuLabels()
{
    // Compute Symmetrical Uncertainty between each feature and the labels.
    // https://en.wikipedia.org/wiki/Symmetric_uncertainty
    // Index -1 selects the last row of samples, i.e. the class vector.
    // NOTE(review): appends without clearing, so a second call would
    // duplicate the scores — confirm fit() is only invoked once per object.
    for (int i = 0; i < features.size(); ++i) {
        suLabels.push_back(symmetricalUncertainty(i, -1));
    }
}
double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
{
// Compute Simmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const out_of_range& e) {
auto result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
double CFS::computeMerit()
{
    // Merit of the current cfsFeatures subset:
    //   rcf / sqrt(n + (n^2 - n) * rff)
    // where rcf sums feature-class SU scores and rff sums pairwise
    // feature-feature SU scores (Hall's CFS criterion).
    // Fixed: removed unused local `double result;`.
    double rcf = 0;
    for (auto feature : cfsFeatures) {
        rcf += suLabels[feature];
    }
    double rff = 0;
    int n = cfsFeatures.size();
    for (const auto& item : doCombinations(cfsFeatures)) {
        rff += computeSuFeatures(item.first, item.second);
    }
    return rcf / sqrt(n + (n * n - n) * rff);
}
void CFS::fit() void CFS::fit()
{ {
cfsFeatures.clear(); selectedFeatures.clear();
computeSuLabels(); computeSuLabels();
auto featureOrder = argsort(suLabels); // sort descending order auto featureOrder = argsort(suLabels); // sort descending order
auto continueCondition = true; auto continueCondition = true;
auto feature = featureOrder[0]; auto feature = featureOrder[0];
cfsFeatures.push_back(feature); selectedFeatures.push_back(feature);
cfsScores.push_back(suLabels[feature]); selectedScores.push_back(suLabels[feature]);
cfsFeatures.erase(cfsFeatures.begin()); selectedFeatures.erase(selectedFeatures.begin());
while (continueCondition) { while (continueCondition) {
double merit = numeric_limits<double>::lowest(); double merit = numeric_limits<double>::lowest();
int bestFeature = -1; int bestFeature = -1;
for (auto feature : featureOrder) { for (auto feature : featureOrder) {
cfsFeatures.push_back(feature); selectedFeatures.push_back(feature);
auto meritNew = computeMerit(); // Compute merit with cfsFeatures auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures
if (meritNew > merit) { if (meritNew > merit) {
merit = meritNew; merit = meritNew;
bestFeature = feature; bestFeature = feature;
} }
cfsFeatures.pop_back(); selectedFeatures.pop_back();
} }
if (bestFeature == -1) { if (bestFeature == -1) {
// meritNew has to be nan due to constant features // meritNew has to be nan due to constant features
break; break;
} }
cfsFeatures.push_back(bestFeature); selectedFeatures.push_back(bestFeature);
cfsScores.push_back(merit); selectedScores.push_back(merit);
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end()); featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
continueCondition = computeContinueCondition(featureOrder); continueCondition = computeContinueCondition(featureOrder);
} }
@ -93,10 +41,10 @@ namespace bayesnet {
} }
bool CFS::computeContinueCondition(const vector<int>& featureOrder) bool CFS::computeContinueCondition(const vector<int>& featureOrder)
{ {
if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) { if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
return false; return false;
} }
if (cfsScores.size() >= 5) { if (selectedScores.size() >= 5) {
/* /*
"To prevent the best first search from exploring the entire "To prevent the best first search from exploring the entire
feature subset search space, a stopping criterion is imposed. feature subset search space, a stopping criterion is imposed.
@ -106,7 +54,7 @@ namespace bayesnet {
*/ */
double item_ant = numeric_limits<double>::lowest(); double item_ant = numeric_limits<double>::lowest();
int num = 0; int num = 0;
vector<double> lastFive(cfsScores.end() - 5, cfsScores.end()); vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
for (auto item : lastFive) { for (auto item : lastFive) {
if (item_ant == numeric_limits<double>::lowest()) { if (item_ant == numeric_limits<double>::lowest()) {
item_ant = item; item_ant = item;
@ -124,18 +72,4 @@ namespace bayesnet {
} }
return true; return true;
} }
vector<int> CFS::getFeatures() const
{
    // Indices of the selected features; only valid after fit() has run.
    if (fitted) {
        return cfsFeatures;
    }
    throw runtime_error("CFS not fitted");
}
vector<double> CFS::getScores() const
{
    // Merit score recorded at each selection step; only valid after fit().
    if (fitted) {
        return cfsScores;
    }
    throw runtime_error("CFS not fitted");
}
} }

View File

@ -2,32 +2,20 @@
#define CFS_H #define CFS_H
#include <torch/torch.h> #include <torch/torch.h>
#include <vector> #include <vector>
#include "BayesMetrics.h" #include "FeatureSelect.h"
using namespace std; using namespace std;
namespace bayesnet { namespace bayesnet {
class CFS : public Metrics { class CFS : public FeatureSelect {
public: public:
// dataset is a n+1xm tensor of integers where dataset[-1] is the y vector // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights); CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
{
}
virtual ~CFS() {}; virtual ~CFS() {};
void fit(); void fit() override;
void test();
vector<int> getFeatures() const;
vector<double> getScores() const;
private: private:
void computeSuLabels();
double computeSuFeatures(const int a, const int b);
double symmetricalUncertainty(int a, int b);
double computeMerit();
bool computeContinueCondition(const vector<int>& featureOrder); bool computeContinueCondition(const vector<int>& featureOrder);
vector<pair<int, int>> combinations(const vector<int>& features);
const torch::Tensor& weights;
int maxFeatures;
vector<int> cfsFeatures;
vector<double> cfsScores;
vector<double> suLabels;
map<pair<int, int>, double> suFeatures;
bool fitted = false;
}; };
} }
#endif #endif

View File

@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
include_directories(${BayesNet_SOURCE_DIR}/src/Platform) include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")

View File

@ -0,0 +1,74 @@
#include "FeatureSelect.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
// Base class for feature-selection algorithms over a discrete dataset.
// samples: (n+1) x m tensor where the last row holds the class labels.
// maxFeatures == 0 means "no limit": use all n feature rows (samples.size(0) - 1).
// NOTE(review): weights is held by reference — the caller must keep the tensor alive.
FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
    Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
{
}
double FeatureSelect::symmetricalUncertainty(int a, int b)
{
    /*
        Symmetrical uncertainty: mutual information of the two variables
        normalized by the sum of their entropies, which compensates the bias
        of plain information gain towards high-cardinality features.
        Range [0, 1].
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
    */
    const auto first = samples.index({ a, "..." });
    const auto second = samples.index({ b, "..." });
    const auto mutual = mutualInformation(first, second, weights);
    const auto entropySum = entropy(first, weights) + entropy(second, weights);
    return 2.0 * mutual / entropySum;
}
void FeatureSelect::computeSuLabels()
{
    // Compute Symmetrical Uncertainty between every feature and the labels.
    // https://en.wikipedia.org/wiki/Symmetric_uncertainty
    // Clear first: the original appended unconditionally, so a second fit()
    // on the same object would duplicate the scores.
    suLabels.clear();
    suLabels.reserve(features.size());
    for (int i = 0; i < static_cast<int>(features.size()); ++i) {
        // -1 indexes the last row of samples, i.e. the class vector.
        suLabels.push_back(symmetricalUncertainty(i, -1));
    }
}
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
{
// Compute Simmetrical Uncertainty between features
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
try {
return suFeatures.at({ firstFeature, secondFeature });
}
catch (const out_of_range& e) {
double result = symmetricalUncertainty(firstFeature, secondFeature);
suFeatures[{firstFeature, secondFeature}] = result;
return result;
}
}
double FeatureSelect::computeMeritCFS()
{
    // Merit of the current selectedFeatures subset:
    //   rcf / sqrt(n + (n^2 - n) * rff)
    // where rcf sums feature-class SU scores and rff sums pairwise
    // feature-feature SU scores (Hall's CFS criterion).
    // Fixed: removed unused local `double result;`.
    double rcf = 0;
    for (auto feature : selectedFeatures) {
        rcf += suLabels[feature];
    }
    double rff = 0;
    int n = selectedFeatures.size();
    for (const auto& item : doCombinations(selectedFeatures)) {
        rff += computeSuFeatures(item.first, item.second);
    }
    return rcf / sqrt(n + (n * n - n) * rff);
}
vector<int> FeatureSelect::getFeatures() const
{
    // Indices of the selected features; only valid after fit() has run.
    if (fitted) {
        return selectedFeatures;
    }
    throw runtime_error("FeatureSelect not fitted");
}
vector<double> FeatureSelect::getScores() const
{
    // Merit score recorded at each selection step; only valid after fit().
    if (fitted) {
        return selectedScores;
    }
    throw runtime_error("FeatureSelect not fitted");
}
}

View File

@ -0,0 +1,31 @@
#ifndef FEATURE_SELECT_H
#define FEATURE_SELECT_H
#include <torch/torch.h>
#include <vector>
#include "BayesMetrics.h"
using namespace std;
namespace bayesnet {
    // Abstract base class for feature-selection algorithms (e.g. CFS).
    // Provides symmetrical-uncertainty helpers and the selected-feature
    // state; concrete subclasses implement fit().
    class FeatureSelect : public Metrics {
    public:
        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
        // maxFeatures == 0 means "use all features".
        FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
        virtual ~FeatureSelect() {};
        virtual void fit() = 0;
        // Accessors throw runtime_error if called before fit() has completed.
        vector<int> getFeatures() const;
        vector<double> getScores() const;
    protected:
        void computeSuLabels();
        double computeSuFeatures(const int a, const int b);
        double symmetricalUncertainty(int a, int b);
        double computeMeritCFS();
        // Removed dead declaration `vector<pair<int, int>> combinations(...)`:
        // it has no definition in FeatureSelect.cc and no callers.
        const torch::Tensor& weights;            // non-owning reference; caller keeps tensor alive
        int maxFeatures;                         // selection stops once this many features are chosen
        vector<int> selectedFeatures;            // chosen feature indices, in selection order
        vector<double> selectedScores;           // merit recorded at each selection step
        vector<double> suLabels;                 // SU(feature, class) per feature index
        map<pair<int, int>, double> suFeatures;  // memoized SU(feature, feature)
        bool fitted = false;                     // set by subclasses when fit() succeeds
    };
}
#endif