Begin CFS implementation
parent 47e2b138c5
commit 40d1dad5d8
@@ -60,11 +60,12 @@ namespace bayesnet {
     {
         return scoresKBest;
     }
-    vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
+    template <class T>
+    vector<pair<T, T>> Metrics::doCombinations(const vector<T>& source)
     {
-        vector<pair<string, string>> result;
+        vector<pair<T, T>> result;
         for (int i = 0; i < source.size(); ++i) {
-            string temp = source[i];
+            T temp = source[i];
             for (int j = i + 1; j < source.size(); ++j) {
                 result.push_back({ temp, source[j] });
             }
@@ -76,7 +77,7 @@ namespace bayesnet {
         auto result = vector<double>();
         auto source = vector<string>(features);
         source.push_back(className);
-        auto combinations = doCombinations(source);
+        auto combinations = doCombinations<string>(source);
         // Compute class prior
         auto margin = torch::zeros({ classNumStates }, torch::kFloat);
         for (int value = 0; value < classNumStates; ++value) {
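Making doCombinations a template means the same pairing logic now serves both the string pairs built in conditionalEdge() above and the integer feature indices CFS needs. A minimal standalone sketch of that behaviour, with illustrative names that are not part of the library:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Sketch of what doCombinations<T> produces: every unordered pair
// (source[i], source[j]) with i < j, in the order they appear.
template <class T>
std::vector<std::pair<T, T>> pairCombinations(const std::vector<T>& source)
{
    std::vector<std::pair<T, T>> result;
    for (size_t i = 0; i < source.size(); ++i) {
        for (size_t j = i + 1; j < source.size(); ++j) {
            result.push_back({ source[i], source[j] });
        }
    }
    return result;
}

int main()
{
    std::vector<std::string> source = { "A", "B", "C", "class" };
    for (const auto& [x, y] : pairCombinations(source)) {
        std::cout << x << "-" << y << " ";   // A-B A-C A-class B-C B-class C-class
    }
    std::cout << std::endl;
}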
@@ -8,20 +8,22 @@ namespace bayesnet {
     using namespace torch;
     class Metrics {
     private:
-        Tensor samples; // nxm tensor used to fit the model
-        vector<string> features;
-        string className;
         int classNumStates = 0;
         vector<double> scoresKBest;
         vector<int> featuresKBest; // sorted indices of the features
-        double entropy(const Tensor& feature, const Tensor& weights);
         double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
-        vector<pair<string, string>> doCombinations(const vector<string>&);
+    protected:
+        Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector
+        string className;
+        double entropy(const Tensor& feature, const Tensor& weights);
+        vector<string> features;
+        template <class T>
+        vector<pair<T, T>> doCombinations(const vector<T>& source);
     public:
         Metrics() = default;
         Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
         Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
-        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
         vector<double> getScoresKBest() const;
         double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
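The header change also redefines samples as a protected (n+1) x m tensor whose last row holds the class vector, so subclasses such as CFS can address features and labels with the same indexing (samples[-1] is y). A hedged sketch of building a tensor in that layout, with illustrative names only:

#include <torch/torch.h>
#include <vector>

// Stack n feature rows and one label row into the (n+1) x m layout described
// above; after this, samples.index({ -1, "..." }) is the y vector.
torch::Tensor buildSamples(const std::vector<torch::Tensor>& featureRows, const torch::Tensor& y)
{
    std::vector<torch::Tensor> rows = featureRows; // each row: one feature, length m
    rows.push_back(y);                             // last row: class labels, length m
    return torch::stack(rows);                     // shape: (n + 1) x m
}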
127 src/BayesNet/CFS.cc Normal file
@@ -0,0 +1,127 @@
#include "CFS.h"
#include <limits>
#include "bayesnetUtils.h"
namespace bayesnet {
    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
    {
    }
    double CFS::symmetricalUncertainty(int a, int b)
    {
        /*
        Compute symmetrical uncertainty. Normalize* information gain (mutual
        information) with the entropies of the features in order to compensate
        the bias due to high cardinality features. *Range [0, 1]
        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
        */
        auto x = samples.index({ a, "..." });
        auto y = samples.index({ b, "..." });
        return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights));
    }
    void CFS::computeSuLabels()
    {
        // Compute Symmetrical Uncertainty between each feature and the labels
        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
        for (int i = 0; i < features.size(); ++i) {
            suLabels.push_back(symmetricalUncertainty(i, -1));
        }
    }
    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
    {
        // Compute Symmetrical Uncertainty between a pair of features
        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
        // TODO: Implement Cache in this function
        return symmetricalUncertainty(firstFeature, secondFeature);
    }
    double CFS::computeMerit()
    {
        double rcf = 0;
        for (auto feature : cfsFeatures) {
            rcf += suLabels[feature];
        }
        double rff = 0;
        int n = cfsFeatures.size();
        for (const auto& item : doCombinations<int>(cfsFeatures)) {
            rff += computeSuFeatures(item.first, item.second);
        }
        return rcf / sqrt(n + (n * n - n) * rff);
    }
    void CFS::fit()
    {
        cfsFeatures.clear();
        computeSuLabels();
        auto featureOrder = argsort(suLabels); // sort in descending order of SU with the labels
        auto continueCondition = true;
        auto feature = featureOrder[0];
        cfsFeatures.push_back(feature);
        cfsScores.push_back(suLabels[feature]);
        featureOrder.erase(featureOrder.begin()); // the best single feature is already selected
        while (continueCondition) {
            double merit = numeric_limits<double>::lowest();
            int bestFeature = -1;
            for (auto feature : featureOrder) {
                cfsFeatures.push_back(feature);
                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
                if (meritNew > merit) {
                    merit = meritNew;
                    bestFeature = feature;
                }
                cfsFeatures.pop_back();
            }
            cfsFeatures.push_back(bestFeature);
            cfsScores.push_back(merit);
            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
            continueCondition = computeContinueCondition(featureOrder);
        }
        fitted = true;
    }
    bool CFS::computeContinueCondition(const vector<int>& featureOrder)
    {
        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
            return false;
        }
        if (cfsScores.size() >= 5) {
            /*
            "To prevent the best first search from exploring the entire
            feature subset search space, a stopping criterion is imposed.
            The search will terminate if five consecutive fully expanded
            subsets show no improvement over the current best subset."
            as stated in Mark A. Hall's thesis
            */
            double item_ant = numeric_limits<double>::lowest();
            int num = 0;
            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
            for (auto item : lastFive) {
                if (item_ant == numeric_limits<double>::lowest()) {
                    item_ant = item;
                }
                if (item > item_ant) {
                    break;
                } else {
                    num++;
                    item_ant = item;
                }
            }
            if (num == 5) {
                return false;
            }
        }
        return true;
    }
    vector<int> CFS::getFeatures() const
    {
        if (!fitted) {
            throw runtime_error("CFS not fitted");
        }
        return cfsFeatures;
    }
    vector<double> CFS::getScores() const
    {
        if (!fitted) {
            throw runtime_error("CFS not fitted");
        }
        return cfsScores;
    }
}
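Two formulas drive the selection above: symmetricalUncertainty() computes SU(X, Y) = 2 * I(X;Y) / (H(X) + H(Y)), and computeMerit() combines the summed feature-class SU (rcf) with the summed pairwise feature-feature SU (rff) as rcf / sqrt(n + (n*n - n) * rff). A small numeric sketch of that merit expression, with illustrative values that are not taken from the repository:

#include <cmath>
#include <iostream>
#include <vector>

// Mirror of the merit expression used in computeMerit(): summed relevance to
// the class divided by a redundancy-penalised normaliser.
double meritSketch(const std::vector<double>& suToClass, const std::vector<double>& suBetweenPairs)
{
    double rcf = 0;
    for (double v : suToClass) rcf += v;        // relevance of the selected subset
    double rff = 0;
    for (double v : suBetweenPairs) rff += v;   // redundancy inside the subset
    double n = static_cast<double>(suToClass.size());
    return rcf / std::sqrt(n + (n * n - n) * rff);
}

int main()
{
    // Three selected features: SU with the class {0.5, 0.4, 0.3},
    // SU between the three feature pairs {0.1, 0.2, 0.05}.
    std::cout << meritSketch({ 0.5, 0.4, 0.3 }, { 0.1, 0.2, 0.05 }) << std::endl;
    // 1.2 / sqrt(3 + 6 * 0.35) ≈ 0.53
}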
31 src/BayesNet/CFS.h Normal file
@@ -0,0 +1,31 @@
#ifndef CFS_H
#define CFS_H
#include <torch/torch.h>
#include <vector>
#include "BayesMetrics.h"
using namespace std;
namespace bayesnet {
    class CFS : public Metrics {
    public:
        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
        virtual ~CFS() {};
        void fit();
        vector<int> getFeatures() const;
        vector<double> getScores() const;
    private:
        void computeSuLabels();
        double computeSuFeatures(const int a, const int b);
        double symmetricalUncertainty(int a, int b);
        double computeMerit();
        bool computeContinueCondition(const vector<int>& featureOrder);
        vector<pair<int, int>> combinations(const vector<int>& features);
        const torch::Tensor& weights;
        int maxFeatures;
        vector<int> cfsFeatures;
        vector<double> cfsScores;
        vector<double> suLabels;
        bool fitted = false;
    };
}
#endif
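With the header in place, a caller could drive the selector roughly as follows. The dataset below is a hypothetical discretised (n+1) x m tensor with the labels in the last row, and the uniform weight vector is an assumption for illustration; only the CFS constructor, fit(), getFeatures() and getScores() calls come from this commit:

#include <torch/torch.h>
#include <iostream>
#include <string>
#include <vector>
#include "CFS.h"

int main()
{
    // Hypothetical discretised dataset: 3 feature rows plus 1 label row, 6 samples.
    torch::Tensor samples = torch::tensor({ { 0, 1, 0, 1, 2, 2 },
                                            { 1, 1, 0, 0, 2, 1 },
                                            { 0, 0, 1, 1, 0, 1 },
                                            { 0, 1, 0, 1, 1, 1 } });  // last row = class labels
    std::vector<std::string> features = { "f0", "f1", "f2" };
    int classNumStates = 2;
    // Uniform sample weights, one entry per column (an assumption for this sketch).
    torch::Tensor weights = torch::full({ samples.size(1) }, 1.0 / samples.size(1));

    // maxFeatures = 0 lets the constructor cap the selection at samples.size(0) - 1 features.
    bayesnet::CFS cfs(samples, features, "class", 0, classNumStates, weights);
    cfs.fit();
    auto selected = cfs.getFeatures(); // indices of the selected features, best first
    auto scores = cfs.getScores();     // merit recorded after each selection step
    for (size_t i = 0; i < selected.size(); ++i) {
        std::cout << features[selected[i]] << " -> " << scores[i] << std::endl;
    }
}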
@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
-    Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto)
@@ -14,8 +14,8 @@ namespace bayesnet {
         int numStates; // number of states of the variable
         torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
         vector<int64_t> dimensions; // dimensions of the cpTable
-    public:
         vector<pair<string, string>> combinations(const vector<string>&);
+    public:
         explicit Node(const string&);
         void clear();
         void addParent(Node*);