Refactor CFS class creating abstract base class
commit 6d5a25cdc8
parent d00b08cbe8
src/BayesNet/CFS.cc
@@ -2,90 +2,38 @@
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
-    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
-        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
-
-    {
-    }
-    double CFS::symmetricalUncertainty(int a, int b)
-    {
-        /*
-        Compute symmetrical uncertainty. Normalize* information gain (mutual
-        information) with the entropies of the features in order to compensate
-        the bias due to high cardinality features. *Range [0, 1]
-        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
-        */
-        auto x = samples.index({ a, "..." });
-        auto y = samples.index({ b, "..." });
-        auto mu = mutualInformation(x, y, weights);
-        auto hx = entropy(x, weights);
-        auto hy = entropy(y, weights);
-        return 2.0 * mu / (hx + hy);
-    }
-    void CFS::computeSuLabels()
-    {
-        // Compute Simmetrical Uncertainty between features and labels
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        for (int i = 0; i < features.size(); ++i) {
-            suLabels.push_back(symmetricalUncertainty(i, -1));
-        }
-
-    }
-    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
-    {
-        // Compute Simmetrical Uncertainty between features
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        try {
-            return suFeatures.at({ firstFeature, secondFeature });
-        }
-        catch (const out_of_range& e) {
-            auto result = symmetricalUncertainty(firstFeature, secondFeature);
-            suFeatures[{firstFeature, secondFeature}] = result;
-            return result;
-        }
-    }
-    double CFS::computeMerit()
-    {
-        double result;
-        double rcf = 0;
-        for (auto feature : cfsFeatures) {
-            rcf += suLabels[feature];
-        }
-        double rff = 0;
-        int n = cfsFeatures.size();
-        for (const auto& item : doCombinations(cfsFeatures)) {
-            rff += computeSuFeatures(item.first, item.second);
-        }
-        return rcf / sqrt(n + (n * n - n) * rff);
-    }
     void CFS::fit()
     {
-        cfsFeatures.clear();
+        selectedFeatures.clear();
         computeSuLabels();
         auto featureOrder = argsort(suLabels); // sort descending order
         auto continueCondition = true;
         auto feature = featureOrder[0];
-        cfsFeatures.push_back(feature);
-        cfsScores.push_back(suLabels[feature]);
-        cfsFeatures.erase(cfsFeatures.begin());
+        selectedFeatures.push_back(feature);
+        selectedScores.push_back(suLabels[feature]);
+        selectedFeatures.erase(selectedFeatures.begin());
         while (continueCondition) {
             double merit = numeric_limits<double>::lowest();
             int bestFeature = -1;
             for (auto feature : featureOrder) {
-                cfsFeatures.push_back(feature);
-                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
+                selectedFeatures.push_back(feature);
+                auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures
                 if (meritNew > merit) {
                     merit = meritNew;
                     bestFeature = feature;
                 }
-                cfsFeatures.pop_back();
+                selectedFeatures.pop_back();
             }
             if (bestFeature == -1) {
                 // meritNew has to be nan due to constant features
                 break;
             }
-            cfsFeatures.push_back(bestFeature);
-            cfsScores.push_back(merit);
+            selectedFeatures.push_back(bestFeature);
+            selectedScores.push_back(merit);
             featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
             continueCondition = computeContinueCondition(featureOrder);
         }
@@ -93,10 +41,10 @@ namespace bayesnet {
     }
     bool CFS::computeContinueCondition(const vector<int>& featureOrder)
    {
-        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+        if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
             return false;
         }
-        if (cfsScores.size() >= 5) {
+        if (selectedScores.size() >= 5) {
            /*
            "To prevent the best first search from exploring the entire
            feature subset search space, a stopping criterion is imposed.
@@ -106,7 +54,7 @@ namespace bayesnet {
            */
            double item_ant = numeric_limits<double>::lowest();
            int num = 0;
-            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
+            vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
            for (auto item : lastFive) {
                if (item_ant == numeric_limits<double>::lowest()) {
                    item_ant = item;
@@ -124,18 +72,4 @@ namespace bayesnet {
        }
        return true;
    }
-    vector<int> CFS::getFeatures() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsFeatures;
-    }
-    vector<double> CFS::getScores() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsScores;
-    }
 }
src/BayesNet/CFS.h
@@ -2,32 +2,20 @@
 #define CFS_H
 #include <torch/torch.h>
 #include <vector>
-#include "BayesMetrics.h"
+#include "FeatureSelect.h"
 using namespace std;
 namespace bayesnet {
-    class CFS : public Metrics {
+    class CFS : public FeatureSelect {
     public:
         // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
-        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+            FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
+        {
+        }
         virtual ~CFS() {};
-        void fit();
+        void fit() override;
-        void test();
-        vector<int> getFeatures() const;
-        vector<double> getScores() const;
     private:
-        void computeSuLabels();
-        double computeSuFeatures(const int a, const int b);
-        double symmetricalUncertainty(int a, int b);
-        double computeMerit();
         bool computeContinueCondition(const vector<int>& featureOrder);
-        vector<pair<int, int>> combinations(const vector<int>& features);
-        const torch::Tensor& weights;
-        int maxFeatures;
-        vector<int> cfsFeatures;
-        vector<double> cfsScores;
-        vector<double> suLabels;
-        map<pair<int, int>, double> suFeatures;
-        bool fitted = false;
     };
 }
 #endif
src/BayesNet/CMakeLists.txt
@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
-    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
src/BayesNet/FeatureSelect.cc (new file, 74 lines)
@@ -0,0 +1,74 @@
+#include "FeatureSelect.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
+
+    {
+    }
+    double FeatureSelect::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Compute symmetrical uncertainty. Normalize* information gain (mutual
+        information) with the entropies of the features in order to compensate
+        the bias due to high cardinality features. *Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        */
+        auto x = samples.index({ a, "..." });
+        auto y = samples.index({ b, "..." });
+        auto mu = mutualInformation(x, y, weights);
+        auto hx = entropy(x, weights);
+        auto hy = entropy(y, weights);
+        return 2.0 * mu / (hx + hy);
+    }
+    void FeatureSelect::computeSuLabels()
+    {
+        // Compute Simmetrical Uncertainty between features and labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        for (int i = 0; i < features.size(); ++i) {
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+    }
+    double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Simmetrical Uncertainty between features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        try {
+            return suFeatures.at({ firstFeature, secondFeature });
+        }
+        catch (const out_of_range& e) {
+            double result = symmetricalUncertainty(firstFeature, secondFeature);
+            suFeatures[{firstFeature, secondFeature}] = result;
+            return result;
+        }
+    }
+    double FeatureSelect::computeMeritCFS()
+    {
+        double result;
+        double rcf = 0;
+        for (auto feature : selectedFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        int n = selectedFeatures.size();
+        for (const auto& item : doCombinations(selectedFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    vector<int> FeatureSelect::getFeatures() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedFeatures;
+    }
+    vector<double> FeatureSelect::getScores() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedScores;
+    }
+}
src/BayesNet/FeatureSelect.h (new file, 31 lines)
@@ -0,0 +1,31 @@
+#ifndef FEATURE_SELECT_H
+#define FEATURE_SELECT_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    class FeatureSelect : public Metrics {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~FeatureSelect() {};
+        virtual void fit() = 0;
+        vector<int> getFeatures() const;
+        vector<double> getScores() const;
+    protected:
+        void computeSuLabels();
+        double computeSuFeatures(const int a, const int b);
+        double symmetricalUncertainty(int a, int b);
+        double computeMeritCFS();
+        vector<pair<int, int>> combinations(const vector<int>& features);
+        const torch::Tensor& weights;
+        int maxFeatures;
+        vector<int> selectedFeatures;
+        vector<double> selectedScores;
+        vector<double> suLabels;
+        map<pair<int, int>, double> suFeatures;
+        bool fitted = false;
+    };
+}
+#endif
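(Editorial aside, not part of the commit.) A minimal sketch of how the refactored hierarchy is meant to be driven; the helper name selectWithCFS and the uniform-weight construction are hypothetical, everything else uses the names introduced in the diff above:

    #include <string>
    #include <utility>
    #include <vector>
    #include <torch/torch.h>
    #include "CFS.h"

    // Returns the selected feature indices and the merit recorded at each greedy step.
    std::pair<std::vector<int>, std::vector<double>> selectWithCFS(
        const torch::Tensor& samples,                  // (n+1) x m tensor, last row holds the labels
        const std::vector<std::string>& features,
        const std::string& className,
        int classNumStates)
    {
        // Uniform sample weights, chosen only to keep the example self-contained.
        auto weights = torch::full({ samples.size(1) }, 1.0 / samples.size(1), torch::kDouble);
        // maxFeatures == 0 makes FeatureSelect fall back to samples.size(0) - 1 features.
        bayesnet::CFS cfs(samples, features, className, 0, classNumStates, weights);
        cfs.fit();                                      // CFS::fit() overrides the pure virtual FeatureSelect::fit()
        return { cfs.getFeatures(), cfs.getScores() };  // both accessors are inherited from FeatureSelect
    }

The point of the split is that getFeatures(), getScores() and the SU/merit helpers now live in FeatureSelect, so another selection strategy only has to derive from it and override fit().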