diff --git a/bayesnet/feature_selection/FeatureSelect.cc b/bayesnet/feature_selection/FeatureSelect.cc index 8e70591..130bd3e 100644 --- a/bayesnet/feature_selection/FeatureSelect.cc +++ b/bayesnet/feature_selection/FeatureSelect.cc @@ -1,84 +1,141 @@ -// *************************************************************** +// ** // SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez // SPDX-FileType: SOURCE // SPDX-License-Identifier: MIT -// *************************************************************** +// ** -#include #include "bayesnet/utils/bayesnetUtils.h" #include "FeatureSelect.h" -namespace bayesnet { - FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) : - Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights) +namespace bayesnet { + + using namespace torch::indexing; // for Ellipsis constant + + //--------------------------------------------------------------------- + // ctor + //--------------------------------------------------------------------- + FeatureSelect::FeatureSelect(const torch::Tensor& samples, + const std::vector& features, + const std::string& className, + int maxFeatures, + int classNumStates, + const torch::Tensor& weights) + : Metrics(samples, features, className, classNumStates), + maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), + weights(weights) { } + + //--------------------------------------------------------------------- + // public helpers + //--------------------------------------------------------------------- void FeatureSelect::initialize() { selectedFeatures.clear(); selectedScores.clear(); + suLabels.clear(); + suFeatures.clear(); + + fitted = false; } + + //--------------------------------------------------------------------- + // Symmetrical Uncertainty (SU) + //--------------------------------------------------------------------- double FeatureSelect::symmetricalUncertainty(int a, int b) { /* - Compute symmetrical uncertainty. Normalize* information gain (mutual - information) with the entropies of the features in order to compensate - the bias due to high cardinality features. *Range [0, 1] - (https://www.sciencedirect.com/science/article/pii/S0020025519303603) - */ - auto x = samples.index({ a, "..." }); - auto y = samples.index({ b, "..." }); - auto mu = mutualInformation(x, y, weights); - auto hx = entropy(x, weights); - auto hy = entropy(y, weights); - return 2.0 * mu / (hx + hy); + * Compute symmetrical uncertainty. Normalises the information gain + * (mutual information) with the entropies of the variables to compensate + * the bias due to high‑cardinality features. Range: [0, 1] + * See: https://www.sciencedirect.com/science/article/pii/S0020025519303603 + */ + + auto x = samples.index({ a, Ellipsis }); // row a => feature a + auto y = (b >= 0) ? samples.index({ b, Ellipsis }) // row b (>=0) => feature b + : samples.index({ -1, Ellipsis }); // ‑1 treated as last row = labels + + double mu = mutualInformation(x, y, weights); + double hx = entropy(x, weights); + double hy = entropy(y, weights); + + const double denom = hx + hy; + if (denom == 0.0) return 0.0; // perfectly pure variables + + return 2.0 * mu / denom; } + + //--------------------------------------------------------------------- + // SU feature–class + //--------------------------------------------------------------------- void FeatureSelect::computeSuLabels() { - // Compute Simmetrical Uncertainty between features and labels + // Compute Symmetrical Uncertainty between each feature and the class labels // https://en.wikipedia.org/wiki/Symmetric_uncertainty - for (int i = 0; i < features.size(); ++i) { - suLabels.push_back(symmetricalUncertainty(i, -1)); + const int classIdx = static_cast(samples.size(0)) - 1; // labels in last row + suLabels.reserve(features.size()); + for (int i = 0; i < static_cast(features.size()); ++i) { + suLabels.emplace_back(symmetricalUncertainty(i, classIdx)); } } - double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature) + + //--------------------------------------------------------------------- + // SU feature–feature with cache + //--------------------------------------------------------------------- + double FeatureSelect::computeSuFeatures(int firstFeature, int secondFeature) { - // Compute Simmetrical Uncertainty between features - // https://en.wikipedia.org/wiki/Symmetric_uncertainty - try { - return suFeatures.at({ firstFeature, secondFeature }); - } - catch (const std::out_of_range& e) { - double result = symmetricalUncertainty(firstFeature, secondFeature); - suFeatures[{firstFeature, secondFeature}] = result; - return result; - } + // Order the pair to exploit symmetry => only one entry in the map + auto ordered = std::minmax(firstFeature, secondFeature); + const std::pair key{ ordered.first, ordered.second }; + + auto it = suFeatures.find(key); + if (it != suFeatures.end()) return it->second; + + double result = symmetricalUncertainty(key.first, key.second); + suFeatures[key] = result; // store once (symmetry handled by ordering) + return result; } + + //--------------------------------------------------------------------- + // Correlation‑based Feature Selection (CFS) merit + //--------------------------------------------------------------------- double FeatureSelect::computeMeritCFS() { - double rcf = 0; - for (auto feature : selectedFeatures) { - rcf += suLabels[feature]; - } - double rff = 0; - int n = selectedFeatures.size(); - for (const auto& item : doCombinations(selectedFeatures)) { - rff += computeSuFeatures(item.first, item.second); - } - return rcf / sqrt(n + (n * n - n) * rff); + const int n = static_cast(selectedFeatures.size()); + if (n == 0) return 0.0; + + // average r_cf (feature–class) + double rcf_sum = 0.0; + for (int f : selectedFeatures) rcf_sum += suLabels[f]; + const double rcf_avg = rcf_sum / n; + + // average r_ff (feature–feature) + double rff_sum = 0.0; + const auto& pairs = doCombinations(selectedFeatures); // generates each unordered pair once + for (const auto& p : pairs) rff_sum += computeSuFeatures(p.first, p.second); + + const double numPairs = n * (n - 1) * 0.5; + const double rff_avg = (numPairs > 0) ? rff_sum / numPairs : 0.0; + + // Merit_S = k * r_cf / sqrt( k + k*(k‑1) * r_ff ) (Hall, 1999) + const double k = static_cast(n); + return (k * rcf_avg) / std::sqrt(k + k * (k - 1) * rff_avg); } + + //--------------------------------------------------------------------- + // getters + //--------------------------------------------------------------------- std::vector FeatureSelect::getFeatures() const { - if (!fitted) { - throw std::runtime_error("FeatureSelect not fitted"); - } + if (!fitted) throw std::runtime_error("FeatureSelect not fitted"); return selectedFeatures; } + std::vector FeatureSelect::getScores() const { - if (!fitted) { - throw std::runtime_error("FeatureSelect not fitted"); - } + if (!fitted) throw std::runtime_error("FeatureSelect not fitted"); return selectedScores; } -} \ No newline at end of file + +} // namespace bayesnet + \ No newline at end of file diff --git a/lib/catch2 b/lib/catch2 new file mode 160000 index 0000000..029fe3b --- /dev/null +++ b/lib/catch2 @@ -0,0 +1 @@ +Subproject commit 029fe3b4609dd84cd939b73357f37bbb75bcf82f diff --git a/lib/folding b/lib/folding new file mode 160000 index 0000000..2ac43e3 --- /dev/null +++ b/lib/folding @@ -0,0 +1 @@ +Subproject commit 2ac43e32ac1eac0c986702ec526cf5367a565ef0 diff --git a/lib/json b/lib/json new file mode 160000 index 0000000..620034e --- /dev/null +++ b/lib/json @@ -0,0 +1 @@ +Subproject commit 620034ececc93991c5c1183b73c3768d81ca84b3 diff --git a/lib/mdlp b/lib/mdlp new file mode 160000 index 0000000..7d62d6a --- /dev/null +++ b/lib/mdlp @@ -0,0 +1 @@ +Subproject commit 7d62d6af4a6ca944a3bbde0b61f651fd4b2d3f57 diff --git a/tests/lib/Files b/tests/lib/Files new file mode 160000 index 0000000..a4329f5 --- /dev/null +++ b/tests/lib/Files @@ -0,0 +1 @@ +Subproject commit a4329f5f9dfdb18ee3faa63bd5b665f2f253b8d2 diff --git a/tests/lib/catch2 b/tests/lib/catch2 new file mode 160000 index 0000000..506276c --- /dev/null +++ b/tests/lib/catch2 @@ -0,0 +1 @@ +Subproject commit 506276c59217429c93abd2fe9507c7f45eb81072