Remove FeatureSelect, add SelectKBest to BayesMetrics

parent a3e665eed6
commit 704dc937be
CMakeLists.txt
@@ -60,7 +60,6 @@ add_git_submodule("lib/json")
 # --------------
 add_subdirectory(config)
 add_subdirectory(lib/Files)
-add_subdirectory(lib/FeatureSelect)
 add_subdirectory(src/BayesNet)
 add_subdirectory(src/Platform)
 add_subdirectory(sample)
lib/FeatureSelect/CMakeLists.txt
@@ -1 +0,0 @@
-add_library(FeatureSelect FeatureSelect.cpp)
lib/FeatureSelect/FeatureSelect.cpp
@@ -1,119 +0,0 @@
-#include "FeatureSelect.h"
-namespace features {
-    SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
-        : samples(samples), labels(labels), weights(weights), k(k), nat(nat)
-    {
-        if (samples.size() == 0 || samples[0].size() == 0)
-            throw invalid_argument("features must be a non-empty matrix");
-        if (samples.size() != labels.size())
-            throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal");
-        if (samples.size() != weights.size())
-            throw invalid_argument("number of samples and weights must be equal");
-        if (k < 1 || k > static_cast<int>(samples[0].size()))
-            throw invalid_argument("k must be between 1 and number of features");
-        numFeatures = 0;
-        numClasses = 0;
-        numSamples = 0;
-        fitted = false;
-    }
-    SelectKBestWeighted& SelectKBestWeighted::fit()
-    {
-        auto labelsCopy = labels;
-        numFeatures = samples[0].size();
-        numSamples = samples.size();
-        // compute number of classes
-        sort(labelsCopy.begin(), labelsCopy.end());
-        auto last = unique(labelsCopy.begin(), labelsCopy.end());
-        labelsCopy.erase(last, labelsCopy.end());
-        numClasses = labelsCopy.size();
-        // compute scores
-        scores.reserve(numFeatures);
-        for (int i = 0; i < numFeatures; ++i) {
-            scores.push_back(MutualInformation(i));
-            features.push_back(i);
-        }
-        // sort & reduce scores and features
-        sort(features.begin(), features.end(), [&](int i, int j)
-            { return scores[i] > scores[j]; });
-        sort(scores.begin(), scores.end(), greater<precision_t>());
-        features.resize(k);
-        scores.resize(k);
-        fitted = true;
-        return *this;
-    }
-    precision_t SelectKBestWeighted::entropyLabel()
-    {
-        return entropy(labels);
-    }
-    precision_t SelectKBestWeighted::entropy(const sample_t& data)
-    {
-        precision_t ventropy = 0, totalWeight = 0;
-        score_t counts(numClasses + 1, 0);
-        for (auto i = 0; i < static_cast<int>(data.size()); ++i) {
-            counts[data[i]] += weights[i];
-            totalWeight += weights[i];
-        }
-        for (auto count : counts) {
-            precision_t p = count / totalWeight;
-            if (p > 0) {
-                if (nat) {
-                    ventropy -= p * log(p);
-                } else {
-                    ventropy -= p * log2(p);
-                }
-            }
-        }
-        return ventropy;
-    }
-    // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
-    precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
-    {
-        unordered_map<value_t, precision_t> featureCounts;
-        unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
-        featureCounts.clear();
-        jointCounts.clear();
-        precision_t totalWeight = 0;
-        for (auto i = 0; i < numSamples; i++) {
-            featureCounts[samples[i][feature]] += weights[i];
-            jointCounts[samples[i][feature]][labels[i]] += weights[i];
-            totalWeight += weights[i];
-        }
-        if (totalWeight == 0)
-            throw invalid_argument("Total weight should not be zero");
-        precision_t entropy = 0;
-        for (auto& [feat, count] : featureCounts) {
-            auto p_f = count / totalWeight;
-            precision_t entropy_f = 0;
-            for (auto& [label, jointCount] : jointCounts[feat]) {
-                auto p_l_f = jointCount / count;
-                if (p_l_f > 0) {
-                    if (nat) {
-                        entropy_f -= p_l_f * log(p_l_f);
-                    } else {
-                        entropy_f -= p_l_f * log2(p_l_f);
-                    }
-                }
-            }
-            entropy += p_f * entropy_f;
-        }
-        return entropy;
-    }
-    // I(X;Y) = H(Y) - H(Y|X)
-    precision_t SelectKBestWeighted::MutualInformation(const int i)
-    {
-        return entropyLabel() - conditionalEntropy(i);
-    }
-    score_t SelectKBestWeighted::getScores() const
-    {
-        if (!fitted)
-            throw logic_error("score not fitted");
-        return scores;
-    }
-    // Return the indices of the selected features
-    labels_t SelectKBestWeighted::getFeatures() const
-    {
-        if (!fitted)
-            throw logic_error("score not fitted");
-        return features;
-    }
-}
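Note: the selector deleted above ranks each feature by weighted mutual information, I(X;Y) = H(Y) - H(Y|X), with summed sample weights standing in for raw counts. A minimal standalone sketch of that scoring, kept here for reference (toy data and all names are illustrative, not repository code):

// example-only sketch of weighted mutual information on discrete data
#include <cmath>
#include <iostream>
#include <unordered_map>
#include <vector>

// Weighted entropy H(Y): probabilities come from summed weights, not counts.
double weightedEntropy(const std::vector<int>& y, const std::vector<double>& w)
{
    std::unordered_map<int, double> counts;
    double total = 0;
    for (size_t i = 0; i < y.size(); ++i) {
        counts[y[i]] += w[i];
        total += w[i];
    }
    double h = 0;
    for (const auto& [label, weight] : counts) {
        double p = weight / total;
        if (p > 0) h -= p * std::log2(p);
    }
    return h;
}

// Weighted conditional entropy H(Y|X) = sum_x p(x) H(Y|X=x).
double weightedConditionalEntropy(const std::vector<int>& x, const std::vector<int>& y, const std::vector<double>& w)
{
    std::unordered_map<int, double> xCounts;
    std::unordered_map<int, std::unordered_map<int, double>> joint;
    double total = 0;
    for (size_t i = 0; i < x.size(); ++i) {
        xCounts[x[i]] += w[i];
        joint[x[i]][y[i]] += w[i];
        total += w[i];
    }
    double h = 0;
    for (const auto& [xv, xw] : xCounts) {
        double hx = 0;
        for (const auto& [yv, jw] : joint[xv]) {
            double p = jw / xw;
            if (p > 0) hx -= p * std::log2(p);
        }
        h += (xw / total) * hx;
    }
    return h;
}

int main()
{
    // Toy data: x perfectly determines y, so I(X;Y) = H(Y) = 1 bit.
    std::vector<int> x{ 0, 0, 1, 1 };
    std::vector<int> y{ 0, 0, 1, 1 };
    std::vector<double> w(4, 0.25); // uniform weights
    double mi = weightedEntropy(y, w) - weightedConditionalEntropy(x, y, w);
    std::cout << "I(X;Y) = " << mi << " bits\n"; // prints 1
}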
lib/FeatureSelect/FeatureSelect.h
@@ -1,38 +0,0 @@
-#ifndef SELECT_K_BEST_WEIGHTED_H
-#define SELECT_K_BEST_WEIGHTED_H
-#include <map>
-#include <vector>
-#include <string>
-using namespace std;
-namespace features {
-    typedef float precision_t;
-    typedef int value_t;
-    typedef vector<value_t> sample_t;
-    typedef vector<sample_t> samples_t;
-    typedef vector<value_t> labels_t;
-    typedef vector<precision_t> score_t, weights_t;
-
-    class SelectKBestWeighted {
-    private:
-        const samples_t samples;
-        const labels_t labels;
-        const weights_t weights;
-        const int k;
-        bool nat; // use natural log or log2
-        int numFeatures, numClasses, numSamples;
-        bool fitted;
-        score_t scores; // scores of the features
-        labels_t features; // indices of the selected features
-        precision_t entropyLabel();
-        precision_t entropy(const sample_t&);
-        precision_t conditionalEntropy(const int);
-        precision_t MutualInformation(const int);
-    public:
-        SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
-        SelectKBestWeighted& fit();
-        score_t getScores() const;
-        labels_t getFeatures() const; // Return the indices of the selected features
-        static inline string version() { return "0.1.0"; };
-    };
-}
-#endif
src/BayesNet/BayesMetrics.cc
@@ -21,6 +21,31 @@ namespace bayesnet {
         }
         samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
     }
+    vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k)
+    {
+        auto n = samples.size(1);
+        if (k == 0) {
+            k = n;
+        }
+        // compute scores
+        scoresKBest.reserve(n);
+        auto label = samples.index({ -1, "..." });
+        for (int i = 0; i < n; ++i) {
+            scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
+            featuresKBest.push_back(i);
+        }
+        // sort & reduce scores and features
+        sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
+            { return scoresKBest[i] > scoresKBest[j]; });
+        sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
+        featuresKBest.resize(k);
+        scoresKBest.resize(k);
+        return featuresKBest;
+    }
+    vector<double> Metrics::getScoresKBest() const
+    {
+        return scoresKBest;
+    }
     vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
     {
         vector<pair<string, string>> result;
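The added code keeps two parallel vectors consistent by sorting the index vector through the score vector, then sorting the scores themselves and truncating both to k. A minimal standalone illustration of that idiom (toy scores only, not repository data):

// example-only sketch of the rank-indices-then-truncate pattern
#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main()
{
    std::vector<double> scores = { 0.10, 0.72, 0.35, 0.50 }; // score of feature i
    std::vector<int> features = { 0, 1, 2, 3 };
    unsigned k = 2;
    // Compare indices through the (still unsorted) score vector.
    std::sort(features.begin(), features.end(),
        [&](int i, int j) { return scores[i] > scores[j]; });
    // Sort the scores too, so scores[j] matches features[j] afterwards.
    std::sort(scores.begin(), scores.end(), std::greater<double>());
    features.resize(k);
    scores.resize(k);
    for (unsigned i = 0; i < k; ++i)
        std::cout << "feature " << features[i] << " score " << scores[i] << "\n";
    // prints: feature 1 score 0.72, then feature 3 score 0.5
}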
src/BayesNet/BayesMetrics.h
@@ -12,6 +12,8 @@ namespace bayesnet {
         vector<string> features;
         string className;
         int classNumStates = 0;
+        vector<double> scoresKBest;
+        vector<int> featuresKBest; // sorted indices of the features
         double entropy(const Tensor& feature, const Tensor& weights);
         double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<pair<string, string>> doCombinations(const vector<string>&);
@@ -19,6 +21,8 @@ namespace bayesnet {
         Metrics() = default;
         Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
         Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, unsigned k = 0);
+        vector<double> getScoresKBest() const;
         double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
         Tensor conditionalEdge(const torch::Tensor& weights);
src/BayesNet/BoostAODE.cc
@@ -1,36 +1,35 @@
 #include "BoostAODE.h"
-#include "FeatureSelect.h"
+#include "BayesMetrics.h"

 namespace bayesnet {
     BoostAODE::BoostAODE() : Ensemble() {}
     void BoostAODE::buildModel(const torch::Tensor& weights)
     {
         models.clear();
-        int n_samples = dataset.size(1);
-        int n_features = dataset.size(0);
-        features::samples_t vsamples;
-        for (auto i = 0; i < n_samples; ++i) {
-            auto row = dataset.index({ "...", i });
-            // convert row to std::vector<int>
-            auto vrow = vector<int>(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
-            vsamples.push_back(vrow);
-        }
-        auto vweights = features::weights_t(n_samples, 1.0 / n_samples);
-        auto row = dataset.index({ -1, "..." });
-        auto yv = features::labels_t(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
-        auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true);
-        auto features = featureSelection.fit().getFeatures();
-        // features = (
-        //     CSelectKBestWeighted(
-        //         self.X_, self.y_, weights, k = self.n_features_in_
-        //     )
-        //     .fit()
-        //     .get_features()
-        auto scores = features::score_t(n_features, 0.0);
         for (int i = 0; i < features.size(); ++i) {
             models.push_back(std::make_unique<SPODE>(i));
         }
     }
+    void BoostAODE::trainModel(const torch::Tensor& weights)
+    {
+        // End building vectors
+        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kDouble);
+        auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
+        auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted
+        for (int i = 0; i < features.size(); ++i) {
+            models[i].fit(dataset, features, className, states, weights_);
+            auto ypred = models[i].predict(X_);
+            // em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
+            // am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
+            // # Step 3.2: Update weights for next classifier
+            // weights = [
+            //     wm * np.exp(am * (ym != yp))
+            //     for wm, ym, yp in zip(weights, self.y_, y_pred)
+            // ]
+            // # Step 4: Add the new model
+            // self.estimators_.append(estimator)
+        }
+    }
     vector<string> BoostAODE::graph(const string& title) const
     {
         return Ensemble::graph(title);
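The Python comments left in trainModel describe the SAMME (multi-class AdaBoost) weight update still to be ported. A hedged libtorch sketch of that step (the free-standing function, its name, and the n_classes parameter are assumptions for illustration, not code from this commit):

// example-only sketch of the SAMME update, assuming integer labels in y
// and per-sample weights as torch::kDouble tensors
#include <torch/torch.h>

torch::Tensor sammeUpdate(const torch::Tensor& weights, const torch::Tensor& y,
                          const torch::Tensor& ypred, int n_classes)
{
    auto miss = (ypred != y).to(torch::kDouble);          // 1 where the model erred
    auto em = (weights * miss).sum() / weights.sum();     // weighted error rate
    auto am = torch::log((1 - em) / em)                   // model weight alpha_m ...
        + std::log(static_cast<double>(n_classes - 1));   // ... with the SAMME multi-class term
    auto updated = weights * torch::exp(am * miss);       // boost misclassified samples
    return updated / updated.sum();                       // renormalize to sum to 1
}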
src/BayesNet/BoostAODE.h
@@ -6,6 +6,7 @@ namespace bayesnet {
     class BoostAODE : public Ensemble {
     protected:
         void buildModel(const torch::Tensor& weights) override;
+        void trainModel(const torch::Tensor& weights) override;
     public:
         BoostAODE();
         virtual ~BoostAODE() {};
src/BayesNet/CMakeLists.txt
@@ -1,9 +1,8 @@
 include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
 include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
-include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect)
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
     Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
-target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}")
+target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
src/BayesNet/SPODELd.cc
@@ -21,7 +21,6 @@ namespace bayesnet {
     SPODELd& SPODELd::fit(torch::Tensor& dataset, vector<string>& features_, string className_, map<string, vector<int>>& states_)
     {
         Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
-        cout << "Xf " << Xf.sizes() << " dtype: " << Xf.dtype() << endl;
         y = dataset.index({ -1, "..." }).clone();
         // This first part should go in a Classifier method called fit_local_discretization or fit_float...
         features = features_;