Fix mistakes in the weights computation

This commit is contained in:
2023-08-16 12:32:51 +02:00
parent 4d4780c1d5
commit 80b20f35b4
16 changed files with 262 additions and 75 deletions

View File

@@ -1,2 +1 @@
add_library(ArffFiles ArffFiles.cc)
#target_link_libraries(BayesNet "${TORCH_LIBRARIES}")
add_library(ArffFiles ArffFiles.cc)

View File

@@ -0,0 +1 @@
add_library(FeatureSelect FeatureSelect.cpp)

View File

@@ -0,0 +1,119 @@
#include "FeatureSelect.h"
namespace features {
// Validates inputs and stores copies of the data; call fit() before querying results.
//   samples: rows = observations, columns = discrete feature values (non-empty matrix)
//   labels : one class label per sample
//   weights: one weight per sample (values assumed non-negative — not validated here)
//   k      : number of top-scoring features to keep, 1..numFeatures
//   nat    : true -> entropies in nats (ln), false -> bits (log2)
// Throws invalid_argument on empty input, size mismatches, or k out of range.
SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
    : samples(samples), labels(labels), weights(weights), k(k), nat(nat),
    numFeatures(0), numClasses(0), numSamples(0), fitted(false)
{
    if (samples.size() == 0 || samples[0].size() == 0)
        throw invalid_argument("features must be a non-empty matrix");
    if (samples.size() != labels.size())
        throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal");
    if (samples.size() != weights.size())
        throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and weights (" + to_string(weights.size()) + ") must be equal");
    if (k < 1 || k > static_cast<int>(samples[0].size()))
        throw invalid_argument("k must be between 1 and number of features");
}
// Scores every feature by its weighted mutual information with the labels and
// keeps the k best (indices in `features`, matching scores in `scores`, both
// sorted by descending score). Returns *this for chaining.
// Safe to call more than once: scores/features are reset each time so repeated
// fit() calls no longer accumulate stale entries.
SelectKBestWeighted& SelectKBestWeighted::fit()
{
    auto labelsCopy = labels;
    numFeatures = samples[0].size();
    numSamples = samples.size();
    // compute number of classes: sort + unique leaves the distinct label values
    sort(labelsCopy.begin(), labelsCopy.end());
    auto last = unique(labelsCopy.begin(), labelsCopy.end());
    labelsCopy.erase(last, labelsCopy.end());
    numClasses = labelsCopy.size();
    // compute scores (reset first so fit() is idempotent)
    scores.clear();
    features.clear();
    scores.reserve(numFeatures);
    features.reserve(numFeatures);
    for (int i = 0; i < numFeatures; ++i) {
        scores.push_back(MutualInformation(i));
        features.push_back(i);
    }
    // sort feature indices by their score (descending), then sort the scores
    // themselves the same way so scores[r] pairs with features[r]
    sort(features.begin(), features.end(), [&](int i, int j)
        { return scores[i] > scores[j]; });
    sort(scores.begin(), scores.end(), greater<precision_t>());
    features.resize(k);
    scores.resize(k);
    fitted = true;
    return *this;
}
// Weighted entropy of the label distribution, H(Y).
precision_t SelectKBestWeighted::entropyLabel()
{
    return entropy(labels);
}
// Weighted Shannon entropy of a discrete value vector.
// counts is indexed directly by value, so this assumes data values lie in
// [0, numClasses] — true for labels after fit(); TODO confirm for other callers.
// Uses nats (ln) when nat is true, bits (log2) otherwise.
// Throws invalid_argument when the total weight is zero (would divide by zero),
// matching the behaviour of conditionalEntropy().
precision_t SelectKBestWeighted::entropy(const sample_t& data)
{
    precision_t ventropy = 0, totalWeight = 0;
    score_t counts(numClasses + 1, 0);
    for (auto i = 0; i < static_cast<int>(data.size()); ++i) {
        counts[data[i]] += weights[i];
        totalWeight += weights[i];
    }
    // guard the division below; keeps entropy() consistent with conditionalEntropy()
    if (totalWeight == 0)
        throw invalid_argument("Total weight should not be zero");
    for (auto count : counts) {
        precision_t p = count / totalWeight;
        if (p > 0) {
            if (nat) {
                ventropy -= p * log(p);
            } else {
                ventropy -= p * log2(p);
            }
        }
    }
    return ventropy;
}
// H(Y|X) = sum_{x in X} p(x) H(Y|X=x), with all probabilities weighted by the
// per-sample weights. `feature` is the column index into samples.
// Throws invalid_argument when the total weight is zero.
precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
{
    // Freshly constructed maps start empty, so no explicit clear() is needed.
    unordered_map<value_t, precision_t> featureCounts;                          // weight mass per feature value x
    unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;    // weight mass per (x, label) pair
    precision_t totalWeight = 0;
    for (auto i = 0; i < numSamples; i++) {
        featureCounts[samples[i][feature]] += weights[i];
        jointCounts[samples[i][feature]][labels[i]] += weights[i];
        totalWeight += weights[i];
    }
    if (totalWeight == 0)
        throw invalid_argument("Total weight should not be zero");
    precision_t entropy = 0;
    for (auto& [feat, count] : featureCounts) {
        auto p_f = count / totalWeight;     // p(x)
        precision_t entropy_f = 0;          // H(Y|X=x)
        for (auto& [label, jointCount] : jointCounts[feat]) {
            auto p_l_f = jointCount / count;    // p(y|x)
            if (p_l_f > 0) {
                if (nat) {
                    entropy_f -= p_l_f * log(p_l_f);
                } else {
                    entropy_f -= p_l_f * log2(p_l_f);
                }
            }
        }
        entropy += p_f * entropy_f;
    }
    return entropy;
}
// I(X;Y) = H(Y) - H(Y|X)
// Weighted mutual information between feature column i and the labels.
// NOTE(review): entropyLabel() is recomputed on every call; fit() could cache it
// once per fit, since H(Y) does not depend on i.
precision_t SelectKBestWeighted::MutualInformation(const int i)
{
    return entropyLabel() - conditionalEntropy(i);
}
// Scores of the k selected features, in descending order.
// Throws logic_error unless fit() has been called.
score_t SelectKBestWeighted::getScores() const
{
    if (fitted)
        return scores;
    throw logic_error("score not fitted");
}
// Indices of the k selected features, ordered by descending score.
// Throws logic_error unless fit() has been called.
labels_t SelectKBestWeighted::getFeatures() const
{
    if (fitted)
        return features;
    throw logic_error("score not fitted");
}
}

View File

@@ -0,0 +1,38 @@
#ifndef SELECT_K_BEST_WEIGHTED_H
#define SELECT_K_BEST_WEIGHTED_H
#include <map>
#include <vector>
#include <string>
using namespace std;
namespace features {
    // Type aliases shared by the feature-selection code.
    using precision_t = float;              // floating type for weights and scores
    using value_t = int;                    // discrete feature / label value
    using sample_t = vector<value_t>;       // one row of feature values
    using samples_t = vector<sample_t>;     // whole sample matrix
    using labels_t = vector<value_t>;       // class label per sample
    using score_t = vector<precision_t>;    // per-feature scores
    using weights_t = vector<precision_t>;  // per-sample weights
    // Selects the k features with the highest weighted mutual information
    // against the labels. Construct, call fit(), then query getScores() /
    // getFeatures().
    class SelectKBestWeighted {
    private:
        const samples_t samples;
        const labels_t labels;
        const weights_t weights;
        const int k;
        bool nat; // true: entropies in nats (ln); false: bits (log2)
        int numFeatures, numClasses, numSamples;
        bool fitted;
        score_t scores;    // scores of the selected features (descending)
        labels_t features; // indices of the selected features
        precision_t entropyLabel();
        precision_t entropy(const sample_t&);
        precision_t conditionalEntropy(const int);
        precision_t MutualInformation(const int);
    public:
        SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
        SelectKBestWeighted& fit();
        score_t getScores() const;
        labels_t getFeatures() const; // indices of the selected features
        static inline string version() { return "0.1.0"; }
    };
}
#endif