Line data Source code
1 : // ***************************************************************
2 : // SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
3 : // SPDX-FileType: SOURCE
4 : // SPDX-License-Identifier: MIT
5 : // ***************************************************************
6 :
7 : #include <limits>
8 : #include "bayesnet/utils/bayesnetUtils.h"
9 : #include "FeatureSelect.h"
10 : namespace bayesnet {
11 22 : FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
12 22 : Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
13 :
14 : {
15 22 : }
16 16 : void FeatureSelect::initialize()
17 : {
18 16 : selectedFeatures.clear();
19 16 : selectedScores.clear();
20 16 : }
21 411 : double FeatureSelect::symmetricalUncertainty(int a, int b)
22 : {
23 : /*
24 : Compute symmetrical uncertainty. Normalize* information gain (mutual
25 : information) with the entropies of the features in order to compensate
26 : the bias due to high cardinality features. *Range [0, 1]
27 : (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
28 : */
29 1233 : auto x = samples.index({ a, "..." });
30 1233 : auto y = samples.index({ b, "..." });
31 411 : auto mu = mutualInformation(x, y, weights);
32 411 : auto hx = entropy(x, weights);
33 411 : auto hy = entropy(y, weights);
34 411 : return 2.0 * mu / (hx + hy);
35 1233 : }
36 16 : void FeatureSelect::computeSuLabels()
37 : {
38 : // Compute Simmetrical Uncertainty between features and labels
39 : // https://en.wikipedia.org/wiki/Symmetric_uncertainty
40 135 : for (int i = 0; i < features.size(); ++i) {
41 119 : suLabels.push_back(symmetricalUncertainty(i, -1));
42 : }
43 16 : }
44 980 : double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
45 : {
46 : // Compute Simmetrical Uncertainty between features
47 : // https://en.wikipedia.org/wiki/Symmetric_uncertainty
48 : try {
49 980 : return suFeatures.at({ firstFeature, secondFeature });
50 : }
51 292 : catch (const std::out_of_range& e) {
52 292 : double result = symmetricalUncertainty(firstFeature, secondFeature);
53 292 : suFeatures[{firstFeature, secondFeature}] = result;
54 292 : return result;
55 292 : }
56 : }
57 158 : double FeatureSelect::computeMeritCFS()
58 : {
59 158 : double rcf = 0;
60 727 : for (auto feature : selectedFeatures) {
61 569 : rcf += suLabels[feature];
62 : }
63 158 : double rff = 0;
64 158 : int n = selectedFeatures.size();
65 1043 : for (const auto& item : doCombinations(selectedFeatures)) {
66 885 : rff += computeSuFeatures(item.first, item.second);
67 158 : }
68 158 : return rcf / sqrt(n + (n * n - n) * rff);
69 : }
70 16 : std::vector<int> FeatureSelect::getFeatures() const
71 : {
72 16 : if (!fitted) {
73 0 : throw std::runtime_error("FeatureSelect not fitted");
74 : }
75 16 : return selectedFeatures;
76 : }
77 16 : std::vector<double> FeatureSelect::getScores() const
78 : {
79 16 : if (!fitted) {
80 0 : throw std::runtime_error("FeatureSelect not fitted");
81 : }
82 16 : return selectedScores;
83 : }
84 : }
|