Compare commits
1 Commits
v1.1.0
...
FixSelectF
Author | SHA1 | Date | |
---|---|---|---|
3615a1463c
|
@@ -1,84 +1,141 @@
|
||||
// ***************************************************************
|
||||
// **
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
|
||||
// SPDX-FileType: SOURCE
|
||||
// SPDX-License-Identifier: MIT
|
||||
// ***************************************************************
|
||||
// **
|
||||
|
||||
#include <limits>
|
||||
#include "bayesnet/utils/bayesnetUtils.h"
|
||||
#include "FeatureSelect.h"
|
||||
namespace bayesnet {
|
||||
FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
|
||||
Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
|
||||
|
||||
namespace bayesnet {
|
||||
|
||||
using namespace torch::indexing; // for Ellipsis constant
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// ctor
|
||||
//---------------------------------------------------------------------
|
||||
FeatureSelect::FeatureSelect(const torch::Tensor& samples,
|
||||
const std::vector<std::string>& features,
|
||||
const std::string& className,
|
||||
int maxFeatures,
|
||||
int classNumStates,
|
||||
const torch::Tensor& weights)
|
||||
: Metrics(samples, features, className, classNumStates),
|
||||
maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures),
|
||||
weights(weights)
|
||||
{
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// public helpers
|
||||
//---------------------------------------------------------------------
|
||||
void FeatureSelect::initialize()
|
||||
{
|
||||
selectedFeatures.clear();
|
||||
selectedScores.clear();
|
||||
suLabels.clear();
|
||||
suFeatures.clear();
|
||||
|
||||
fitted = false;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Symmetrical Uncertainty (SU)
|
||||
//---------------------------------------------------------------------
|
||||
double FeatureSelect::symmetricalUncertainty(int a, int b)
|
||||
{
|
||||
/*
|
||||
Compute symmetrical uncertainty. Normalize* information gain (mutual
|
||||
information) with the entropies of the features in order to compensate
|
||||
the bias due to high cardinality features. *Range [0, 1]
|
||||
(https://www.sciencedirect.com/science/article/pii/S0020025519303603)
|
||||
* Compute symmetrical uncertainty. Normalises the information gain
|
||||
* (mutual information) with the entropies of the variables to compensate
|
||||
* the bias due to high‑cardinality features. Range: [0, 1]
|
||||
* See: https://www.sciencedirect.com/science/article/pii/S0020025519303603
|
||||
*/
|
||||
auto x = samples.index({ a, "..." });
|
||||
auto y = samples.index({ b, "..." });
|
||||
auto mu = mutualInformation(x, y, weights);
|
||||
auto hx = entropy(x, weights);
|
||||
auto hy = entropy(y, weights);
|
||||
return 2.0 * mu / (hx + hy);
|
||||
|
||||
auto x = samples.index({ a, Ellipsis }); // row a => feature a
|
||||
auto y = (b >= 0) ? samples.index({ b, Ellipsis }) // row b (>=0) => feature b
|
||||
: samples.index({ -1, Ellipsis }); // ‑1 treated as last row = labels
|
||||
|
||||
double mu = mutualInformation(x, y, weights);
|
||||
double hx = entropy(x, weights);
|
||||
double hy = entropy(y, weights);
|
||||
|
||||
const double denom = hx + hy;
|
||||
if (denom == 0.0) return 0.0; // perfectly pure variables
|
||||
|
||||
return 2.0 * mu / denom;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// SU feature–class
|
||||
//---------------------------------------------------------------------
|
||||
void FeatureSelect::computeSuLabels()
|
||||
{
|
||||
// Compute Simmetrical Uncertainty between features and labels
|
||||
// Compute Symmetrical Uncertainty between each feature and the class labels
|
||||
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
suLabels.push_back(symmetricalUncertainty(i, -1));
|
||||
const int classIdx = static_cast<int>(samples.size(0)) - 1; // labels in last row
|
||||
suLabels.reserve(features.size());
|
||||
for (int i = 0; i < static_cast<int>(features.size()); ++i) {
|
||||
suLabels.emplace_back(symmetricalUncertainty(i, classIdx));
|
||||
}
|
||||
}
|
||||
double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// SU feature–feature with cache
|
||||
//---------------------------------------------------------------------
|
||||
double FeatureSelect::computeSuFeatures(int firstFeature, int secondFeature)
|
||||
{
|
||||
// Compute Symmetrical Uncertainty between features
|
||||
// https://en.wikipedia.org/wiki/Symmetric_uncertainty
|
||||
try {
|
||||
return suFeatures.at({ firstFeature, secondFeature });
|
||||
}
|
||||
catch (const std::out_of_range& e) {
|
||||
double result = symmetricalUncertainty(firstFeature, secondFeature);
|
||||
suFeatures[{firstFeature, secondFeature}] = result;
|
||||
// Order the pair to exploit symmetry => only one entry in the map
|
||||
auto ordered = std::minmax(firstFeature, secondFeature);
|
||||
const std::pair<int, int> key{ ordered.first, ordered.second };
|
||||
|
||||
auto it = suFeatures.find(key);
|
||||
if (it != suFeatures.end()) return it->second;
|
||||
|
||||
double result = symmetricalUncertainty(key.first, key.second);
|
||||
suFeatures[key] = result; // store once (symmetry handled by ordering)
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// Correlation‑based Feature Selection (CFS) merit
|
||||
//---------------------------------------------------------------------
|
||||
double FeatureSelect::computeMeritCFS()
|
||||
{
|
||||
double rcf = 0;
|
||||
for (auto feature : selectedFeatures) {
|
||||
rcf += suLabels[feature];
|
||||
}
|
||||
double rff = 0;
|
||||
int n = selectedFeatures.size();
|
||||
for (const auto& item : doCombinations(selectedFeatures)) {
|
||||
rff += computeSuFeatures(item.first, item.second);
|
||||
}
|
||||
return rcf / sqrt(n + (n * n - n) * rff);
|
||||
const int n = static_cast<int>(selectedFeatures.size());
|
||||
if (n == 0) return 0.0;
|
||||
|
||||
// average r_cf (feature–class)
|
||||
double rcf_sum = 0.0;
|
||||
for (int f : selectedFeatures) rcf_sum += suLabels[f];
|
||||
const double rcf_avg = rcf_sum / n;
|
||||
|
||||
// average r_ff (feature–feature)
|
||||
double rff_sum = 0.0;
|
||||
const auto& pairs = doCombinations(selectedFeatures); // generates each unordered pair once
|
||||
for (const auto& p : pairs) rff_sum += computeSuFeatures(p.first, p.second);
|
||||
|
||||
const double numPairs = n * (n - 1) * 0.5;
|
||||
const double rff_avg = (numPairs > 0) ? rff_sum / numPairs : 0.0;
|
||||
|
||||
// Merit_S = k * r_cf / sqrt( k + k*(k‑1) * r_ff ) (Hall, 1999)
|
||||
const double k = static_cast<double>(n);
|
||||
return (k * rcf_avg) / std::sqrt(k + k * (k - 1) * rff_avg);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
// getters
|
||||
//---------------------------------------------------------------------
|
||||
std::vector<int> FeatureSelect::getFeatures() const
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error("FeatureSelect not fitted");
|
||||
}
|
||||
if (!fitted) throw std::runtime_error("FeatureSelect not fitted");
|
||||
return selectedFeatures;
|
||||
}
|
||||
|
||||
std::vector<double> FeatureSelect::getScores() const
|
||||
{
|
||||
if (!fitted) {
|
||||
throw std::runtime_error("FeatureSelect not fitted");
|
||||
}
|
||||
if (!fitted) throw std::runtime_error("FeatureSelect not fitted");
|
||||
return selectedScores;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace bayesnet
|
||||
|
1
lib/catch2
Submodule
1
lib/catch2
Submodule
Submodule lib/catch2 added at 029fe3b460
1
lib/folding
Submodule
1
lib/folding
Submodule
Submodule lib/folding added at 2ac43e32ac
1
lib/json
Submodule
1
lib/json
Submodule
Submodule lib/json added at 620034ecec
1
lib/mdlp
Submodule
1
lib/mdlp
Submodule
Submodule lib/mdlp added at 7d62d6af4a
1
tests/lib/Files
Submodule
1
tests/lib/Files
Submodule
Submodule tests/lib/Files added at a4329f5f9d
1
tests/lib/catch2
Submodule
1
tests/lib/catch2
Submodule
Submodule tests/lib/catch2 added at 506276c592
Reference in New Issue
Block a user