From 704dc937be0658f8b87e86f46ac4c8f6bfd53d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 16 Aug 2023 19:05:18 +0200 Subject: [PATCH] Remove FeatureSel, add SelectKBest to BayesMetrics --- CMakeLists.txt | 1 - lib/featureselect/CMakeLists.txt | 1 - lib/featureselect/FeatureSelect.cpp | 119 ---------------------------- lib/featureselect/FeatureSelect.h | 38 --------- src/BayesNet/BayesMetrics.cc | 25 ++++++ src/BayesNet/BayesMetrics.h | 4 + src/BayesNet/BoostAODE.cc | 43 +++++----- src/BayesNet/BoostAODE.h | 1 + src/BayesNet/CMakeLists.txt | 3 +- src/BayesNet/SPODELd.cc | 1 - 10 files changed, 52 insertions(+), 184 deletions(-) delete mode 100644 lib/featureselect/CMakeLists.txt delete mode 100644 lib/featureselect/FeatureSelect.cpp delete mode 100644 lib/featureselect/FeatureSelect.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 186a175..c53a3a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,6 @@ add_git_submodule("lib/json") # -------------- add_subdirectory(config) add_subdirectory(lib/Files) -add_subdirectory(lib/FeatureSelect) add_subdirectory(src/BayesNet) add_subdirectory(src/Platform) add_subdirectory(sample) diff --git a/lib/featureselect/CMakeLists.txt b/lib/featureselect/CMakeLists.txt deleted file mode 100644 index 06da1b7..0000000 --- a/lib/featureselect/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(FeatureSelect FeatureSelect.cpp) \ No newline at end of file diff --git a/lib/featureselect/FeatureSelect.cpp b/lib/featureselect/FeatureSelect.cpp deleted file mode 100644 index 6659063..0000000 --- a/lib/featureselect/FeatureSelect.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "FeatureSelect.h" -namespace features { - SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat) - : samples(samples), labels(labels), weights(weights), k(k), nat(nat) - { - if (samples.size() == 0 || samples[0].size() == 0) - throw invalid_argument("features must be a non-empty matrix"); - if (samples.size() != labels.size()) - throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal"); - if (samples.size() != weights.size()) - throw invalid_argument("number of samples and weights must be equal"); - if (k < 1 || k > static_cast(samples[0].size())) - throw invalid_argument("k must be between 1 and number of features"); - numFeatures = 0; - numClasses = 0; - numSamples = 0; - fitted = false; - } - SelectKBestWeighted& SelectKBestWeighted::fit() - { - auto labelsCopy = labels; - numFeatures = samples[0].size(); - numSamples = samples.size(); - // compute number of classes - sort(labelsCopy.begin(), labelsCopy.end()); - auto last = unique(labelsCopy.begin(), labelsCopy.end()); - labelsCopy.erase(last, labelsCopy.end()); - numClasses = labelsCopy.size(); - // compute scores - scores.reserve(numFeatures); - for (int i = 0; i < numFeatures; ++i) { - scores.push_back(MutualInformation(i)); - features.push_back(i); - } - // sort & reduce scores and features - sort(features.begin(), features.end(), [&](int i, int j) - { return scores[i] > scores[j]; }); - sort(scores.begin(), scores.end(), greater()); - features.resize(k); - scores.resize(k); - fitted = true; - return *this; - } - precision_t SelectKBestWeighted::entropyLabel() - { - return entropy(labels); - } - precision_t SelectKBestWeighted::entropy(const sample_t& data) - { - precision_t ventropy = 0, totalWeight = 0; - score_t counts(numClasses + 1, 0); - for (auto i = 0; i < static_cast(data.size()); ++i) { - counts[data[i]] += weights[i]; - totalWeight += weights[i]; - } - for (auto count : counts) { - precision_t p = count / totalWeight; - if (p > 0) { - if (nat) { - ventropy -= p * log(p); - } else { - ventropy -= p * log2(p); - } - } - } - return ventropy; - } - // H(Y|X) = sum_{x in X} p(x) H(Y|X=x) - precision_t SelectKBestWeighted::conditionalEntropy(const int feature) - { - unordered_map featureCounts; - unordered_map> jointCounts; - featureCounts.clear(); - jointCounts.clear(); - precision_t totalWeight = 0; - for (auto i = 0; i < numSamples; i++) { - featureCounts[samples[i][feature]] += weights[i]; - jointCounts[samples[i][feature]][labels[i]] += weights[i]; - totalWeight += weights[i]; - } - if (totalWeight == 0) - throw invalid_argument("Total weight should not be zero"); - precision_t entropy = 0; - for (auto& [feat, count] : featureCounts) { - auto p_f = count / totalWeight; - precision_t entropy_f = 0; - for (auto& [label, jointCount] : jointCounts[feat]) { - auto p_l_f = jointCount / count; - if (p_l_f > 0) { - if (nat) { - entropy_f -= p_l_f * log(p_l_f); - } else { - entropy_f -= p_l_f * log2(p_l_f); - } - } - } - entropy += p_f * entropy_f; - } - return entropy; - } - // I(X;Y) = H(Y) - H(Y|X) - precision_t SelectKBestWeighted::MutualInformation(const int i) - { - return entropyLabel() - conditionalEntropy(i); - } - score_t SelectKBestWeighted::getScores() const - { - if (!fitted) - throw logic_error("score not fitted"); - return scores; - } - //Return the indices of the selected features - labels_t SelectKBestWeighted::getFeatures() const - { - if (!fitted) - throw logic_error("score not fitted"); - return features; - } -} diff --git a/lib/featureselect/FeatureSelect.h b/lib/featureselect/FeatureSelect.h deleted file mode 100644 index 18ddd99..0000000 --- a/lib/featureselect/FeatureSelect.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef SELECT_K_BEST_WEIGHTED_H -#define SELECT_K_BEST_WEIGHTED_H -#include -#include -#include -using namespace std; -namespace features { - typedef float precision_t; - typedef int value_t; - typedef vector sample_t; - typedef vector samples_t; - typedef vector labels_t; - typedef vector score_t, weights_t; - - class SelectKBestWeighted { - private: - const samples_t samples; - const labels_t labels; - const weights_t weights; - const int k; - bool nat; // use natural log or log2 - int numFeatures, numClasses, numSamples; - bool fitted; - score_t scores; // scores of the features - labels_t features; // indices of the selected features - precision_t entropyLabel(); - precision_t entropy(const sample_t&); - precision_t conditionalEntropy(const int); - precision_t MutualInformation(const int); - public: - SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool); - SelectKBestWeighted& fit(); - score_t getScores() const; - labels_t getFeatures() const; //Return the indices of the selected features - static inline string version() { return "0.1.0"; }; - }; -} -#endif \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index a0b46f1..88f0306 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -21,6 +21,31 @@ namespace bayesnet { } samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); } + vector Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k) + { + auto n = samples.size(1); + if (k == 0) { + k = n; + } + // compute scores + scoresKBest.reserve(n); + auto label = samples.index({ -1, "..." }); + for (int i = 0; i < n; ++i) { + scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights)); + featuresKBest.push_back(i); + } + // sort & reduce scores and features + sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j) + { return scoresKBest[i] > scoresKBest[j]; }); + sort(scoresKBest.begin(), scoresKBest.end(), std::greater()); + featuresKBest.resize(k); + scoresKBest.resize(k); + return featuresKBest; + } + vector Metrics::getScoresKBest() const + { + return scoresKBest; + } vector> Metrics::doCombinations(const vector& source) { vector> result; diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 5bd25b6..70d33e9 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -12,6 +12,8 @@ namespace bayesnet { vector features; string className; int classNumStates = 0; + vector scoresKBest; + vector featuresKBest; // sorted indices of the features double entropy(const Tensor& feature, const Tensor& weights); double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector> doCombinations(const vector&); @@ -19,6 +21,8 @@ namespace bayesnet { Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); Metrics(const vector>& vsamples, const vector& labels, const vector& features, const string& className, const int classNumStates); + vector SelectKBestWeighted(const torch::Tensor& weights, unsigned k = 0); + vector getScoresKBest() const; double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector conditionalEdgeWeights(vector& weights); // To use in Python Tensor conditionalEdge(const torch::Tensor& weights); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index d68ac2a..e9b5e62 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,36 +1,35 @@ #include "BoostAODE.h" -#include "FeatureSelect.h" +#include "BayesMetrics.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { models.clear(); - int n_samples = dataset.size(1); - int n_features = dataset.size(0); - features::samples_t vsamples; - for (auto i = 0; i < n_samples; ++i) { - auto row = dataset.index({ "...", i }); - // convert row to std::vector - auto vrow = vector(row.data_ptr(), row.data_ptr() + row.numel()); - vsamples.push_back(vrow); - } - auto vweights = features::weights_t(n_samples, 1.0 / n_samples); - auto row = dataset.index({ -1, "..." }); - auto yv = features::labels_t(row.data_ptr(), row.data_ptr() + row.numel()); - auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true); - auto features = featureSelection.fit().getFeatures(); - // features = ( - // CSelectKBestWeighted( - // self.X_, self.y_, weights, k = self.n_features_in_ - // ) - // .fit() - // .get_features() - auto scores = features::score_t(n_features, 0.0); for (int i = 0; i < features.size(); ++i) { models.push_back(std::make_unique(i)); } } + void BoostAODE::trainModel(const torch::Tensor& weights) + { + // End building vectors + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kDouble); + auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); + auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted + for (int i = 0; i < features.size(); ++i) { + models[i].fit(dataset, features, className, states, weights_); + auto ypred = models[i].predict(X_); + // em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights) + // am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) + // # Step 3.2: Update weights for next classifier + // weights = [ + // wm * np.exp(am * (ym != yp)) + // for wm, ym, yp in zip(weights, self.y_, y_pred) + // ] + // # Step 4: Add the new model + // self.estimators_.append(estimator) + } + } vector BoostAODE::graph(const string& title) const { return Ensemble::graph(title); diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 66a871f..b14c7c6 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -6,6 +6,7 @@ namespace bayesnet { class BoostAODE : public Ensemble { protected: void buildModel(const torch::Tensor& weights) override; + void trainModel(const torch::Tensor& weights) override; public: BoostAODE(); virtual ~BoostAODE() {}; diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 2f2f631..435511c 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -1,9 +1,8 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) -include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/SPODELd.cc b/src/BayesNet/SPODELd.cc index 8a38160..2711c86 100644 --- a/src/BayesNet/SPODELd.cc +++ b/src/BayesNet/SPODELd.cc @@ -21,7 +21,6 @@ namespace bayesnet { SPODELd& SPODELd::fit(torch::Tensor& dataset, vector& features_, string className_, map>& states_) { Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone(); - cout << "Xf " << Xf.sizes() << " dtype: " << Xf.dtype() << endl; y = dataset.index({ -1, "..." }).clone(); // This first part should go in a Classifier method called fit_local_discretization o fit_float... features = features_;