2024-04-11 16:02:49 +00:00
|
|
|
// ***************************************************************
|
|
|
|
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
|
|
|
// SPDX-FileType: SOURCE
|
|
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
// ***************************************************************
|
|
|
|
|
2023-10-14 09:27:46 +00:00
|
|
|
#include <limits>
|
2024-03-08 21:20:54 +00:00
|
|
|
#include "bayesnet/utils/bayesnetUtils.h"
|
|
|
|
#include "FeatureSelect.h"
|
2023-10-14 09:27:46 +00:00
|
|
|
namespace bayesnet {
|
2023-11-08 17:45:35 +00:00
|
|
|
/**
 * Set up the state shared by the feature-selection algorithms.
 *
 * samples, features, className and classNumStates are forwarded unchanged to
 * the Metrics base class; weights is stored in the member of the same name.
 *
 * @param samples        data tensor handed to Metrics.
 * @param features       feature names handed to Metrics.
 * @param className      name of the class variable handed to Metrics.
 * @param maxFeatures    limit on the number of features to select. A value of
 *                       0 is replaced by samples.size(0) - 1 (presumably every
 *                       row except the class-label row -- confirm).
 * @param classNumStates number of states of the class variable, handed to Metrics.
 * @param weights        weights later passed to entropy / mutualInformation.
 */
FeatureSelect::FeatureSelect(const torch::Tensor& samples,
                             const std::vector<std::string>& features,
                             const std::string& className,
                             const int maxFeatures,
                             const int classNumStates,
                             const torch::Tensor& weights)
    : Metrics(samples, features, className, classNumStates),
      maxFeatures(maxFeatures != 0 ? maxFeatures : samples.size(0) - 1),
      weights(weights)
{
}
|
2023-10-14 11:12:04 +00:00
|
|
|
void FeatureSelect::initialize()
|
|
|
|
{
|
|
|
|
selectedFeatures.clear();
|
|
|
|
selectedScores.clear();
|
|
|
|
}
|
2023-10-14 09:27:46 +00:00
|
|
|
double FeatureSelect::symmetricalUncertainty(int a, int b)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
Compute symmetrical uncertainty. Normalize* information gain (mutual
|
|
|
|
information) with the entropies of the features in order to compensate
|
|
|
|
the bias due to high cardinality features. *Range [0, 1]
|
|
|
|
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
|
|
|
|
*/
|
|
|
|
auto x = samples.index({ a, "..." });
|
|
|
|
auto y = samples.index({ b, "..." });
|
|
|
|
auto mu = mutualInformation(x, y, weights);
|
|
|
|
auto hx = entropy(x, weights);
|
|
|
|
auto hy = entropy(y, weights);
|
|
|
|
return 2.0 * mu / (hx + hy);
|
|
|
|
}
|
|
|
|
void FeatureSelect::computeSuLabels()
|
|
|
|
{
|
|
|
|
// Compute Simmetrical Uncertainty between features and labels
|
|
|
|
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
|
|
|
for (int i = 0; i < features.size(); ++i) {
|
|
|
|
suLabels.push_back(symmetricalUncertainty(i, -1));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
|
|
|
|
{
|
|
|
|
// Compute Simmetrical Uncertainty between features
|
|
|
|
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
|
|
|
try {
|
|
|
|
return suFeatures.at({ firstFeature, secondFeature });
|
|
|
|
}
|
2023-11-08 17:45:35 +00:00
|
|
|
catch (const std::out_of_range& e) {
|
2023-10-14 09:27:46 +00:00
|
|
|
double result = symmetricalUncertainty(firstFeature, secondFeature);
|
|
|
|
suFeatures[{firstFeature, secondFeature}] = result;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
double FeatureSelect::computeMeritCFS()
|
|
|
|
{
|
|
|
|
double rcf = 0;
|
|
|
|
for (auto feature : selectedFeatures) {
|
|
|
|
rcf += suLabels[feature];
|
|
|
|
}
|
|
|
|
double rff = 0;
|
|
|
|
int n = selectedFeatures.size();
|
|
|
|
for (const auto& item : doCombinations(selectedFeatures)) {
|
|
|
|
rff += computeSuFeatures(item.first, item.second);
|
|
|
|
}
|
|
|
|
return rcf / sqrt(n + (n * n - n) * rff);
|
|
|
|
}
|
2023-11-08 17:45:35 +00:00
|
|
|
std::vector<int> FeatureSelect::getFeatures() const
|
2023-10-14 09:27:46 +00:00
|
|
|
{
|
|
|
|
if (!fitted) {
|
2023-11-08 17:45:35 +00:00
|
|
|
throw std::runtime_error("FeatureSelect not fitted");
|
2023-10-14 09:27:46 +00:00
|
|
|
}
|
|
|
|
return selectedFeatures;
|
|
|
|
}
|
2023-11-08 17:45:35 +00:00
|
|
|
std::vector<double> FeatureSelect::getScores() const
|
2023-10-14 09:27:46 +00:00
|
|
|
{
|
|
|
|
if (!fitted) {
|
2023-11-08 17:45:35 +00:00
|
|
|
throw std::runtime_error("FeatureSelect not fitted");
|
2023-10-14 09:27:46 +00:00
|
|
|
}
|
|
|
|
return selectedScores;
|
|
|
|
}
|
|
|
|
}
|