Compare commits

...

1 Commits

Author | SHA1       | Message                          | Date
       | 3615a1463c | Fix some issues in FeatureSelect | 2025-05-31 14:36:51 +02:00
7 changed files with 111 additions and 48 deletions


@@ -1,84 +1,141 @@
// ***************************************************************
// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
// SPDX-FileType: SOURCE
// SPDX-License-Identifier: MIT
// ***************************************************************
#include <algorithm>
#include <cmath>
#include <limits>
#include "bayesnet/utils/bayesnetUtils.h"
#include "FeatureSelect.h"
namespace bayesnet {
    using namespace torch::indexing; // for Ellipsis constant

    //---------------------------------------------------------------------
    // ctor
    //---------------------------------------------------------------------
    FeatureSelect::FeatureSelect(const torch::Tensor& samples,
                                 const std::vector<std::string>& features,
                                 const std::string& className,
                                 int maxFeatures,
                                 int classNumStates,
                                 const torch::Tensor& weights)
        : Metrics(samples, features, className, classNumStates),
          maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures),
          weights(weights)
    {
    }
    //---------------------------------------------------------------------
    // public helpers
    //---------------------------------------------------------------------
    void FeatureSelect::initialize()
    {
        selectedFeatures.clear();
        selectedScores.clear();
        suLabels.clear();
        suFeatures.clear();
        fitted = false;
    }
    //---------------------------------------------------------------------
    // Symmetrical Uncertainty (SU)
    //---------------------------------------------------------------------
    double FeatureSelect::symmetricalUncertainty(int a, int b)
    {
        /*
         * Compute symmetrical uncertainty. Normalises the information gain
         * (mutual information) with the entropies of the variables to compensate
         * for the bias due to high-cardinality features. Range: [0, 1]
         * See: https://www.sciencedirect.com/science/article/pii/S0020025519303603
         */
        auto x = samples.index({ a, Ellipsis });             // row a => feature a
        auto y = (b >= 0) ? samples.index({ b, Ellipsis })   // row b (>= 0) => feature b
                          : samples.index({ -1, Ellipsis }); // -1 treated as last row = labels

        double mu = mutualInformation(x, y, weights);
        double hx = entropy(x, weights);
        double hy = entropy(y, weights);

        const double denom = hx + hy;
        if (denom == 0.0) return 0.0; // perfectly pure variables
        return 2.0 * mu / denom;
    }
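For reference, SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y)). A minimal self-contained sketch of the same quantity, assuming discrete values and uniform weights; the helper names entropyOf, mutualInfoOf and symmetricalUncertaintyOf are illustrative, not the Metrics API used above:

#include <cmath>
#include <map>
#include <vector>

static double entropyOf(const std::vector<int>& v)
{
    std::map<int, double> counts;
    for (int s : v) counts[s] += 1.0;
    double h = 0.0;
    for (const auto& [value, c] : counts) {
        const double p = c / v.size();
        h -= p * std::log2(p);
    }
    return h;
}

static double mutualInfoOf(const std::vector<int>& x, const std::vector<int>& y)
{
    // I(X;Y) = H(X) + H(Y) - H(X,Y); joint state encoded as one int (assumes y values < 1000)
    std::vector<int> joint(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) joint[i] = x[i] * 1000 + y[i];
    return entropyOf(x) + entropyOf(y) - entropyOf(joint);
}

double symmetricalUncertaintyOf(const std::vector<int>& x, const std::vector<int>& y)
{
    const double denom = entropyOf(x) + entropyOf(y);
    if (denom == 0.0) return 0.0; // same guard as in FeatureSelect
    return 2.0 * mutualInfoOf(x, y) / denom;
}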
    //---------------------------------------------------------------------
    // SU feature-class
    //---------------------------------------------------------------------
    void FeatureSelect::computeSuLabels()
    {
        // Compute Symmetrical Uncertainty between each feature and the class labels
        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
        const int classIdx = static_cast<int>(samples.size(0)) - 1; // labels in last row
        suLabels.reserve(features.size());
        for (int i = 0; i < static_cast<int>(features.size()); ++i) {
            suLabels.emplace_back(symmetricalUncertainty(i, classIdx));
        }
    }
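The loop relies on the layout used throughout the class: samples is an (nFeatures + 1) x nSamples tensor with one feature per row and the class labels in the last row. A hedged sketch of building such a tensor; buildSamples is an illustrative helper, not part of the library:

#include <torch/torch.h>
#include <vector>

torch::Tensor buildSamples(const std::vector<torch::Tensor>& featureRows,
                           const torch::Tensor& labels)
{
    std::vector<torch::Tensor> rows = featureRows; // one 1-D tensor per feature
    rows.push_back(labels);                        // class row goes last (matches classIdx)
    return torch::stack(rows);                     // shape: (nFeatures + 1, nSamples)
}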
    //---------------------------------------------------------------------
    // SU feature-feature with cache
    //---------------------------------------------------------------------
    double FeatureSelect::computeSuFeatures(int firstFeature, int secondFeature)
    {
        // Compute Symmetrical Uncertainty between features
        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
        // Order the pair to exploit symmetry => only one entry in the map
        auto ordered = std::minmax(firstFeature, secondFeature);
        const std::pair<int, int> key{ ordered.first, ordered.second };

        auto it = suFeatures.find(key);
        if (it != suFeatures.end()) return it->second;

        double result = symmetricalUncertainty(key.first, key.second);
        suFeatures[key] = result; // store once (symmetry handled by ordering)
        return result;
    }
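The same find-or-compute pattern in isolation, with the score function injected so the sketch stands alone; cachedSu and its std::function parameter are illustrative, assuming the cache is a std::map keyed by the ordered pair, as suFeatures is above:

#include <algorithm>
#include <functional>
#include <map>
#include <utility>

double cachedSu(int i, int j,
                const std::function<double(int, int)>& su, // e.g. symmetricalUncertainty
                std::map<std::pair<int, int>, double>& cache)
{
    const auto [lo, hi] = std::minmax(i, j);   // canonical order: (i,j) and (j,i) share a key
    const std::pair<int, int> key{ lo, hi };
    if (auto it = cache.find(key); it != cache.end()) return it->second;
    return cache[key] = su(lo, hi);
}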
    //---------------------------------------------------------------------
    // Correlation-based Feature Selection (CFS) merit
    //---------------------------------------------------------------------
    double FeatureSelect::computeMeritCFS()
    {
        const int n = static_cast<int>(selectedFeatures.size());
        if (n == 0) return 0.0;

        // average r_cf (feature-class)
        double rcf_sum = 0.0;
        for (int f : selectedFeatures) rcf_sum += suLabels[f];
        const double rcf_avg = rcf_sum / n;

        // average r_ff (feature-feature)
        double rff_sum = 0.0;
        const auto& pairs = doCombinations(selectedFeatures); // generates each unordered pair once
        for (const auto& p : pairs) rff_sum += computeSuFeatures(p.first, p.second);
        const double numPairs = n * (n - 1) * 0.5;
        const double rff_avg = (numPairs > 0) ? rff_sum / numPairs : 0.0;

        // Merit_S = k * r_cf / sqrt(k + k * (k - 1) * r_ff)   (Hall, 1999)
        const double k = static_cast<double>(n);
        return (k * rcf_avg) / std::sqrt(k + k * (k - 1) * rff_avg);
    }
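A quick numeric check of the merit formula with made-up averages (k = 3 selected features, r_cf = 0.5, r_ff = 0.2):

#include <cassert>
#include <cmath>

int main()
{
    const double k = 3.0, rcf = 0.5, rff = 0.2; // hypothetical values
    const double merit = (k * rcf) / std::sqrt(k + k * (k - 1.0) * rff);
    // sqrt(3 + 6 * 0.2) = sqrt(4.2) ~= 2.049, so merit ~= 1.5 / 2.049 ~= 0.732
    assert(std::abs(merit - 0.732) < 1e-3);
    return 0;
}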
    //---------------------------------------------------------------------
    // getters
    //---------------------------------------------------------------------
    std::vector<int> FeatureSelect::getFeatures() const
    {
        if (!fitted) throw std::runtime_error("FeatureSelect not fitted");
        return selectedFeatures;
    }

    std::vector<double> FeatureSelect::getScores() const
    {
        if (!fitted) throw std::runtime_error("FeatureSelect not fitted");
        return selectedScores;
    }
} // namespace bayesnet
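FeatureSelect is a base class; a concrete selector drives the search and sets fitted before the getters are usable. A hedged usage sketch, assuming a derived class named CFS with a no-argument fit() (both are assumptions, not confirmed by this diff):

#include <string>
#include <vector>
#include <torch/torch.h>
#include "CFS.h" // hypothetical concrete selector derived from FeatureSelect

void selectFeaturesExample(const torch::Tensor& samples,
                           const std::vector<std::string>& features,
                           const std::string& className,
                           int classNumStates,
                           const torch::Tensor& weights)
{
    // maxFeatures == 0 defaults to samples.size(0) - 1, i.e. all features
    bayesnet::CFS selector(samples, features, className,
                           /*maxFeatures=*/0, classNumStates, weights);
    selector.fit();                       // assumed API: runs the search, sets fitted
    auto chosen = selector.getFeatures(); // throws std::runtime_error if not fitted
    auto scores = selector.getScores();
}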

Submodule lib/catch2 added at 029fe3b460
Submodule lib/folding added at 2ac43e32ac
Submodule lib/json added at 620034ecec
Submodule lib/mdlp added at 7d62d6af4a
Submodule tests/lib/Files added at a4329f5f9d
Submodule tests/lib/catch2 added at 506276c592