Line data Source code
1 : // ***************************************************************
2 : // SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez
3 : // SPDX-FileType: SOURCE
4 : // SPDX-License-Identifier: MIT
5 : // ***************************************************************
6 :
7 : #include "Mst.h"
8 : #include "BayesMetrics.h"
9 : namespace bayesnet {
10 : //samples is n+1xm tensor used to fit the model
11 338 : Metrics::Metrics(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int classNumStates)
12 338 : : samples(samples)
13 338 : , features(features)
14 338 : , className(className)
15 338 : , classNumStates(classNumStates)
16 : {
17 338 : }
18 : //samples is n+1xm std::vector used to fit the model
19 16 : Metrics::Metrics(const std::vector<std::vector<int>>& vsamples, const std::vector<int>& labels, const std::vector<std::string>& features, const std::string& className, const int classNumStates)
20 16 : : features(features)
21 16 : , className(className)
22 16 : , classNumStates(classNumStates)
23 32 : , samples(torch::zeros({ static_cast<int>(vsamples.size() + 1), static_cast<int>(vsamples[0].size()) }, torch::kInt32))
24 : {
25 128 : for (int i = 0; i < vsamples.size(); ++i) {
26 448 : samples.index_put_({ i, "..." }, torch::tensor(vsamples[i], torch::kInt32));
27 : }
28 64 : samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
29 144 : }
30 90 : std::vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, bool ascending, unsigned k)
31 : {
32 : // Return the K Best features
33 90 : auto n = features.size();
34 90 : if (k == 0) {
35 0 : k = n;
36 : }
37 : // compute scores
38 90 : scoresKBest.clear();
39 90 : featuresKBest.clear();
40 270 : auto label = samples.index({ -1, "..." });
41 2837 : for (int i = 0; i < n; ++i) {
42 8241 : scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
43 2747 : featuresKBest.push_back(i);
44 : }
45 : // sort & reduce scores and features
46 90 : if (ascending) {
47 19 : sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
48 453 : { return scoresKBest[i] < scoresKBest[j]; });
49 19 : sort(scoresKBest.begin(), scoresKBest.end(), std::less<double>());
50 19 : if (k < n) {
51 28 : for (int i = 0; i < n - k; ++i) {
52 20 : featuresKBest.erase(featuresKBest.begin());
53 20 : scoresKBest.erase(scoresKBest.begin());
54 : }
55 : }
56 : } else {
57 71 : sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
58 12619 : { return scoresKBest[i] > scoresKBest[j]; });
59 71 : sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
60 71 : featuresKBest.resize(k);
61 71 : scoresKBest.resize(k);
62 : }
63 180 : return featuresKBest;
64 2927 : }
65 8 : std::vector<double> Metrics::getScoresKBest() const
66 : {
67 8 : return scoresKBest;
68 : }
69 :
70 34 : torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
71 : {
72 34 : auto result = std::vector<double>();
73 34 : auto source = std::vector<std::string>(features);
74 34 : source.push_back(className);
75 34 : auto combinations = doCombinations(source);
76 : // Compute class prior
77 34 : auto margin = torch::zeros({ classNumStates }, torch::kFloat);
78 184 : for (int value = 0; value < classNumStates; ++value) {
79 600 : auto mask = samples.index({ -1, "..." }) == value;
80 150 : margin[value] = mask.sum().item<double>() / samples.size(1);
81 150 : }
82 918 : for (auto [first, second] : combinations) {
83 884 : int index_first = find(features.begin(), features.end(), first) - features.begin();
84 884 : int index_second = find(features.begin(), features.end(), second) - features.begin();
85 884 : double accumulated = 0;
86 5240 : for (int value = 0; value < classNumStates; ++value) {
87 17424 : auto mask = samples.index({ -1, "..." }) == value;
88 13068 : auto first_dataset = samples.index({ index_first, mask });
89 13068 : auto second_dataset = samples.index({ index_second, mask });
90 8712 : auto weights_dataset = weights.index({ mask });
91 8712 : auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset);
92 4356 : auto pb = margin[value].item<double>();
93 4356 : accumulated += pb * mi;
94 4356 : }
95 884 : result.push_back(accumulated);
96 884 : }
97 34 : long n_vars = source.size();
98 34 : auto matrix = torch::zeros({ n_vars, n_vars });
99 34 : auto indices = torch::triu_indices(n_vars, n_vars, 1);
100 918 : for (auto i = 0; i < result.size(); ++i) {
101 884 : auto x = indices[0][i];
102 884 : auto y = indices[1][i];
103 884 : matrix[x][y] = result[i];
104 884 : matrix[y][x] = result[i];
105 884 : }
106 68 : return matrix;
107 21964 : }
108 : // To use in Python
109 0 : std::vector<float> Metrics::conditionalEdgeWeights(std::vector<float>& weights_)
110 : {
111 0 : const torch::Tensor weights = torch::tensor(weights_);
112 0 : auto matrix = conditionalEdge(weights);
113 0 : std::vector<float> v(matrix.data_ptr<float>(), matrix.data_ptr<float>() + matrix.numel());
114 0 : return v;
115 0 : }
116 8506 : double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights)
117 : {
118 8506 : torch::Tensor counts = feature.bincount(weights);
119 8506 : double totalWeight = counts.sum().item<double>();
120 8506 : torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
121 8506 : torch::Tensor logProbs = torch::log(probs);
122 8506 : torch::Tensor entropy = -probs * logProbs;
123 17012 : return entropy.nansum().item<double>();
124 8506 : }
125 : // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
126 7684 : double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
127 : {
128 7684 : int numSamples = firstFeature.sizes()[0];
129 7684 : torch::Tensor featureCounts = secondFeature.bincount(weights);
130 7684 : std::unordered_map<int, std::unordered_map<int, double>> jointCounts;
131 7684 : double totalWeight = 0;
132 991784 : for (auto i = 0; i < numSamples; i++) {
133 984100 : jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += weights[i].item<double>();
134 984100 : totalWeight += weights[i].item<float>();
135 : }
136 7684 : if (totalWeight == 0)
137 0 : return 0;
138 7684 : double entropyValue = 0;
139 26852 : for (int value = 0; value < featureCounts.sizes()[0]; ++value) {
140 19168 : double p_f = featureCounts[value].item<double>() / totalWeight;
141 19168 : double entropy_f = 0;
142 57129 : for (auto& [label, jointCount] : jointCounts[value]) {
143 37961 : double p_l_f = jointCount / featureCounts[value].item<double>();
144 37961 : if (p_l_f > 0) {
145 37961 : entropy_f -= p_l_f * log(p_l_f);
146 : } else {
147 0 : entropy_f = 0;
148 : }
149 : }
150 19168 : entropyValue += p_f * entropy_f;
151 : }
152 7684 : return entropyValue;
153 7684 : }
154 : // I(X;Y) = H(Y) - H(Y|X)
155 7684 : double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights)
156 : {
157 7684 : return entropy(firstFeature, weights) - conditionalEntropy(firstFeature, secondFeature, weights);
158 : }
159 : /*
160 : Compute the maximum spanning tree considering the weights as distances
161 : and the indices of the weights as nodes of this square matrix using
162 : Kruskal algorithm
163 : */
164 29 : std::vector<std::pair<int, int>> Metrics::maximumSpanningTree(const std::vector<std::string>& features, const torch::Tensor& weights, const int root)
165 : {
166 29 : auto mst = MST(features, weights, root);
167 58 : return mst.maximumSpanningTree();
168 29 : }
169 : }
|