Refactor library structure
This commit is contained in:
72
bayesnet/feature_selection/CFS.cc
Normal file
72
bayesnet/feature_selection/CFS.cc
Normal file
@@ -0,0 +1,72 @@
|
||||
#include <limits>
|
||||
#include "bayesnet/utils/bayesnetUtils.h"
|
||||
#include "CFS.h"
|
||||
namespace bayesnet {
|
||||
void CFS::fit()
|
||||
{
|
||||
initialize();
|
||||
computeSuLabels();
|
||||
auto featureOrder = argsort(suLabels); // sort descending order
|
||||
auto continueCondition = true;
|
||||
auto feature = featureOrder[0];
|
||||
selectedFeatures.push_back(feature);
|
||||
selectedScores.push_back(suLabels[feature]);
|
||||
selectedFeatures.erase(selectedFeatures.begin());
|
||||
while (continueCondition) {
|
||||
double merit = std::numeric_limits<double>::lowest();
|
||||
int bestFeature = -1;
|
||||
for (auto feature : featureOrder) {
|
||||
selectedFeatures.push_back(feature);
|
||||
// Compute merit with selectedFeatures
|
||||
auto meritNew = computeMeritCFS();
|
||||
if (meritNew > merit) {
|
||||
merit = meritNew;
|
||||
bestFeature = feature;
|
||||
}
|
||||
selectedFeatures.pop_back();
|
||||
}
|
||||
if (bestFeature == -1) {
|
||||
// meritNew has to be nan due to constant features
|
||||
break;
|
||||
}
|
||||
selectedFeatures.push_back(bestFeature);
|
||||
selectedScores.push_back(merit);
|
||||
featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
|
||||
continueCondition = computeContinueCondition(featureOrder);
|
||||
}
|
||||
fitted = true;
|
||||
}
|
||||
bool CFS::computeContinueCondition(const std::vector<int>& featureOrder)
|
||||
{
|
||||
if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
if (selectedScores.size() >= 5) {
|
||||
/*
|
||||
"To prevent the best first search from exploring the entire
|
||||
feature subset search space, a stopping criterion is imposed.
|
||||
The search will terminate if five consecutive fully expanded
|
||||
subsets show no improvement over the current best subset."
|
||||
as stated in Mark A.Hall Thesis
|
||||
*/
|
||||
double item_ant = std::numeric_limits<double>::lowest();
|
||||
int num = 0;
|
||||
std::vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
|
||||
for (auto item : lastFive) {
|
||||
if (item_ant == std::numeric_limits<double>::lowest()) {
|
||||
item_ant = item;
|
||||
}
|
||||
if (item > item_ant) {
|
||||
break;
|
||||
} else {
|
||||
num++;
|
||||
item_ant = item;
|
||||
}
|
||||
}
|
||||
if (num == 5) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
20
bayesnet/feature_selection/CFS.h
Normal file
20
bayesnet/feature_selection/CFS.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#ifndef CFS_H
|
||||
#define CFS_H
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
#include "bayesnet/feature_selection/FeatureSelect.h"
|
||||
namespace bayesnet {
|
||||
class CFS : public FeatureSelect {
|
||||
public:
|
||||
// dataset is a n+1xm tensor of integers where dataset[-1] is the y std::vector
|
||||
CFS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
|
||||
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
|
||||
{
|
||||
}
|
||||
virtual ~CFS() {};
|
||||
void fit() override;
|
||||
private:
|
||||
bool computeContinueCondition(const std::vector<int>& featureOrder);
|
||||
};
|
||||
}
|
||||
#endif
|
44
bayesnet/feature_selection/FCBF.cc
Normal file
44
bayesnet/feature_selection/FCBF.cc
Normal file
@@ -0,0 +1,44 @@
|
||||
#include "bayesnet/utils/bayesnetUtils.h"
|
||||
#include "FCBF.h"
|
||||
namespace bayesnet {
|
||||
|
||||
FCBF::FCBF(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
|
||||
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
|
||||
{
|
||||
if (threshold < 1e-7) {
|
||||
throw std::invalid_argument("Threshold cannot be less than 1e-7");
|
||||
}
|
||||
}
|
||||
void FCBF::fit()
|
||||
{
|
||||
initialize();
|
||||
computeSuLabels();
|
||||
auto featureOrder = argsort(suLabels); // sort descending order
|
||||
auto featureOrderCopy = featureOrder;
|
||||
for (const auto& feature : featureOrder) {
|
||||
// Don't self compare
|
||||
featureOrderCopy.erase(featureOrderCopy.begin());
|
||||
if (suLabels.at(feature) == 0.0) {
|
||||
// The feature has been removed from the list
|
||||
continue;
|
||||
}
|
||||
if (suLabels.at(feature) < threshold) {
|
||||
break;
|
||||
}
|
||||
// Remove redundant features
|
||||
for (const auto& featureCopy : featureOrderCopy) {
|
||||
double value = computeSuFeatures(feature, featureCopy);
|
||||
if (value >= suLabels.at(featureCopy)) {
|
||||
// Remove feature from list
|
||||
suLabels[featureCopy] = 0.0;
|
||||
}
|
||||
}
|
||||
selectedFeatures.push_back(feature);
|
||||
selectedScores.push_back(suLabels[feature]);
|
||||
if (selectedFeatures.size() == maxFeatures) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
fitted = true;
|
||||
}
|
||||
}
|
17
bayesnet/feature_selection/FCBF.h
Normal file
17
bayesnet/feature_selection/FCBF.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef FCBF_H
|
||||
#define FCBF_H
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
#include "bayesnet/feature_selection/FeatureSelect.h"
|
||||
namespace bayesnet {
|
||||
class FCBF : public FeatureSelect {
|
||||
public:
|
||||
// dataset is a n+1xm tensor of integers where dataset[-1] is the y std::vector
|
||||
FCBF(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
|
||||
virtual ~FCBF() {};
|
||||
void fit() override;
|
||||
private:
|
||||
double threshold = -1;
|
||||
};
|
||||
}
|
||||
#endif
|
78
bayesnet/feature_selection/FeatureSelect.cc
Normal file
78
bayesnet/feature_selection/FeatureSelect.cc
Normal file
@@ -0,0 +1,78 @@
|
||||
#include <limits>
|
||||
#include "bayesnet/utils/bayesnetUtils.h"
|
||||
#include "FeatureSelect.h"
|
||||
namespace bayesnet {
|
||||
FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
|
||||
Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
|
||||
|
||||
{
|
||||
}
|
||||
void FeatureSelect::initialize()
|
||||
{
|
||||
selectedFeatures.clear();
|
||||
selectedScores.clear();
|
||||
}
|
||||
double FeatureSelect::symmetricalUncertainty(int a, int b)
|
||||
{
|
||||
/*
|
||||
Compute symmetrical uncertainty. Normalize* information gain (mutual
|
||||
information) with the entropies of the features in order to compensate
|
||||
the bias due to high cardinality features. *Range [0, 1]
|
||||
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
|
||||
*/
|
||||
auto x = samples.index({ a, "..." });
|
||||
auto y = samples.index({ b, "..." });
|
||||
auto mu = mutualInformation(x, y, weights);
|
||||
auto hx = entropy(x, weights);
|
||||
auto hy = entropy(y, weights);
|
||||
return 2.0 * mu / (hx + hy);
|
||||
}
|
||||
void FeatureSelect::computeSuLabels()
|
||||
{
|
||||
// Compute Simmetrical Uncertainty between features and labels
|
||||
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
suLabels.push_back(symmetricalUncertainty(i, -1));
|
||||
}
|
||||
}
|
||||
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
|
||||
{
|
||||
// Compute Simmetrical Uncertainty between features
|
||||
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
||||
try {
|
||||
return suFeatures.at({ firstFeature, secondFeature });
|
||||
}
|
||||
catch (const std::out_of_range& e) {
|
||||
double result = symmetricalUncertainty(firstFeature, secondFeature);
|
||||
suFeatures[{firstFeature, secondFeature}] = result;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
double FeatureSelect::computeMeritCFS()
|
||||
{
|
||||
double rcf = 0;
|
||||
for (auto feature : selectedFeatures) {
|
||||
rcf += suLabels[feature];
|
||||
}
|
||||
double rff = 0;
|
||||
int n = selectedFeatures.size();
|
||||
for (const auto& item : doCombinations(selectedFeatures)) {
|
||||
rff += computeSuFeatures(item.first, item.second);
|
||||
}
|
||||
return rcf / sqrt(n + (n * n - n) * rff);
|
||||
}
|
||||
std::vector<int> FeatureSelect::getFeatures() const
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error("FeatureSelect not fitted");
|
||||
}
|
||||
return selectedFeatures;
|
||||
}
|
||||
std::vector<double> FeatureSelect::getScores() const
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error("FeatureSelect not fitted");
|
||||
}
|
||||
return selectedScores;
|
||||
}
|
||||
}
|
30
bayesnet/feature_selection/FeatureSelect.h
Normal file
30
bayesnet/feature_selection/FeatureSelect.h
Normal file
@@ -0,0 +1,30 @@
|
||||
#ifndef FEATURE_SELECT_H
|
||||
#define FEATURE_SELECT_H
|
||||
#include <torch/torch.h>
|
||||
#include <vector>
|
||||
#include "bayesnet/utils/BayesMetrics.h"
|
||||
namespace bayesnet {
|
||||
class FeatureSelect : public Metrics {
|
||||
public:
|
||||
// dataset is a n+1xm tensor of integers where dataset[-1] is the y std::vector
|
||||
FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
|
||||
virtual ~FeatureSelect() {};
|
||||
virtual void fit() = 0;
|
||||
std::vector<int> getFeatures() const;
|
||||
std::vector<double> getScores() const;
|
||||
protected:
|
||||
void initialize();
|
||||
void computeSuLabels();
|
||||
double computeSuFeatures(const int a, const int b);
|
||||
double symmetricalUncertainty(int a, int b);
|
||||
double computeMeritCFS();
|
||||
const torch::Tensor& weights;
|
||||
int maxFeatures;
|
||||
std::vector<int> selectedFeatures;
|
||||
std::vector<double> selectedScores;
|
||||
std::vector<double> suLabels;
|
||||
std::map<std::pair<int, int>, double> suFeatures;
|
||||
bool fitted = false;
|
||||
};
|
||||
}
|
||||
#endif
|
47
bayesnet/feature_selection/IWSS.cc
Normal file
47
bayesnet/feature_selection/IWSS.cc
Normal file
@@ -0,0 +1,47 @@
|
||||
#include <limits>
|
||||
#include "bayesnet/utils/bayesnetUtils.h"
|
||||
#include "IWSS.h"
|
||||
namespace bayesnet {
|
||||
IWSS::IWSS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
|
||||
FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
|
||||
{
|
||||
if (threshold < 0 || threshold > .5) {
|
||||
throw std::invalid_argument("Threshold has to be in [0, 0.5]");
|
||||
}
|
||||
}
|
||||
void IWSS::fit()
|
||||
{
|
||||
initialize();
|
||||
computeSuLabels();
|
||||
auto featureOrder = argsort(suLabels); // sort descending order
|
||||
auto featureOrderCopy = featureOrder;
|
||||
// Add first and second features to result
|
||||
// First with its own score
|
||||
auto first_feature = pop_first(featureOrderCopy);
|
||||
selectedFeatures.push_back(first_feature);
|
||||
selectedScores.push_back(suLabels.at(first_feature));
|
||||
// Second with the score of the candidates
|
||||
selectedFeatures.push_back(pop_first(featureOrderCopy));
|
||||
auto merit = computeMeritCFS();
|
||||
selectedScores.push_back(merit);
|
||||
for (const auto feature : featureOrderCopy) {
|
||||
selectedFeatures.push_back(feature);
|
||||
// Compute merit with selectedFeatures
|
||||
auto meritNew = computeMeritCFS();
|
||||
double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0;
|
||||
if (meritNew > merit || delta < threshold) {
|
||||
if (meritNew > merit) {
|
||||
merit = meritNew;
|
||||
}
|
||||
selectedScores.push_back(meritNew);
|
||||
} else {
|
||||
selectedFeatures.pop_back();
|
||||
break;
|
||||
}
|
||||
if (selectedFeatures.size() == maxFeatures) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
fitted = true;
|
||||
}
|
||||
}
|
17
bayesnet/feature_selection/IWSS.h
Normal file
17
bayesnet/feature_selection/IWSS.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef IWSS_H
|
||||
#define IWSS_H
|
||||
#include <vector>
|
||||
#include <torch/torch.h>
|
||||
#include "FeatureSelect.h"
|
||||
namespace bayesnet {
|
||||
class IWSS : public FeatureSelect {
|
||||
public:
|
||||
// dataset is a n+1xm tensor of integers where dataset[-1] is the y std::vector
|
||||
IWSS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
|
||||
virtual ~IWSS() {};
|
||||
void fit() override;
|
||||
private:
|
||||
double threshold = -1;
|
||||
};
|
||||
}
|
||||
#endif
|
Reference in New Issue
Block a user