Line data Source code
1 : // ***************************************************************
2 : // SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
3 : // SPDX-FileType: SOURCE
4 : // SPDX-License-Identifier: MIT
5 : // ***************************************************************
6 :
7 : #include <limits>
8 : #include "bayesnet/utils/bayesnetUtils.h"
9 : #include "FeatureSelect.h"
10 : namespace bayesnet {
11 46 : FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
12 46 : Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
13 :
14 : {
15 46 : }
16 32 : void FeatureSelect::initialize()
17 : {
18 32 : selectedFeatures.clear();
19 32 : selectedScores.clear();
20 32 : }
21 822 : double FeatureSelect::symmetricalUncertainty(int a, int b)
22 : {
23 : /*
24 : Compute symmetrical uncertainty. Normalize* information gain (mutual
25 : information) with the entropies of the features in order to compensate
26 : the bias due to high cardinality features. *Range [0, 1]
27 : (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
28 : */
29 2466 : auto x = samples.index({ a, "..." });
30 2466 : auto y = samples.index({ b, "..." });
31 822 : auto mu = mutualInformation(x, y, weights);
32 822 : auto hx = entropy(x, weights);
33 822 : auto hy = entropy(y, weights);
34 822 : return 2.0 * mu / (hx + hy);
35 2466 : }
36 32 : void FeatureSelect::computeSuLabels()
37 : {
38 : // Compute Simmetrical Uncertainty between features and labels
39 : // https://en.wikipedia.org/wiki/Symmetric_uncertainty
40 270 : for (int i = 0; i < features.size(); ++i) {
41 238 : suLabels.push_back(symmetricalUncertainty(i, -1));
42 : }
43 32 : }
44 1960 : double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
45 : {
46 : // Compute Simmetrical Uncertainty between features
47 : // https://en.wikipedia.org/wiki/Symmetric_uncertainty
48 : try {
49 1960 : return suFeatures.at({ firstFeature, secondFeature });
50 : }
51 584 : catch (const std::out_of_range& e) {
52 584 : double result = symmetricalUncertainty(firstFeature, secondFeature);
53 584 : suFeatures[{firstFeature, secondFeature}] = result;
54 584 : return result;
55 584 : }
56 : }
57 316 : double FeatureSelect::computeMeritCFS()
58 : {
59 316 : double rcf = 0;
60 1454 : for (auto feature : selectedFeatures) {
61 1138 : rcf += suLabels[feature];
62 : }
63 316 : double rff = 0;
64 316 : int n = selectedFeatures.size();
65 2086 : for (const auto& item : doCombinations(selectedFeatures)) {
66 1770 : rff += computeSuFeatures(item.first, item.second);
67 316 : }
68 316 : return rcf / sqrt(n + (n * n - n) * rff);
69 : }
70 36 : std::vector<int> FeatureSelect::getFeatures() const
71 : {
72 36 : if (!fitted) {
73 4 : throw std::runtime_error("FeatureSelect not fitted");
74 : }
75 32 : return selectedFeatures;
76 : }
77 36 : std::vector<double> FeatureSelect::getScores() const
78 : {
79 36 : if (!fitted) {
80 4 : throw std::runtime_error("FeatureSelect not fitted");
81 : }
82 32 : return selectedScores;
83 : }
84 : }
|