From 2e3e0e0fc285f2363d8c6f2ed5c2a10cb4589d8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?=
Date: Thu, 16 May 2024 11:17:21 +0200
Subject: [PATCH] Add SelectKPairs method

---
 bayesnet/ensembles/BoostA2DE.cc | 142 ++++++++++++++++++++++++++++++++
 bayesnet/ensembles/BoostA2DE.h  |   2 +
 bayesnet/ensembles/BoostAODE.cc |  22 +++--
 bayesnet/ensembles/BoostAODE.h  |   1 +
 bayesnet/utils/BayesMetrics.cc  |  38 +++++++++
 bayesnet/utils/BayesMetrics.h   |   5 +-
 tests/TestBoostA2DE.cc          |  34 +++++---
 tests/TestBoostAODE.cc          |   6 +-
 8 files changed, 224 insertions(+), 26 deletions(-)
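With n features surviving the initial feature selection, the new BoostA2DE::initializeModels below builds one SPnDE per unordered feature pair, i.e. n*(n-1)/2 models. A minimal standalone sketch of that enumeration (plain STL, illustrative names):

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    // Enumerate unordered feature pairs exactly like the i/j loops in
    // BoostA2DE::initializeModels: (0,1), (0,2), ..., (n-2,n-1).
    std::vector<std::pair<int, int>> enumeratePairs(const std::vector<int>& selected)
    {
        std::vector<std::pair<int, int>> pairs;
        for (std::size_t i = 0; i + 1 < selected.size(); ++i) {
            for (std::size_t j = i + 1; j < selected.size(); ++j) {
                pairs.emplace_back(selected[i], selected[j]);
            }
        }
        return pairs;
    }

    int main()
    {
        for (auto& [a, b] : enumeratePairs({ 0, 2, 5, 7 })) { // 4 features -> 6 pairs
            std::cout << "(" << a << ", " << b << ")\n";
        }
    }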
diff --git a/bayesnet/ensembles/BoostA2DE.cc b/bayesnet/ensembles/BoostA2DE.cc
index 77f2971..7948923 100644
--- a/bayesnet/ensembles/BoostA2DE.cc
+++ b/bayesnet/ensembles/BoostA2DE.cc
@@ -19,9 +19,151 @@ namespace bayesnet {
     BoostA2DE::BoostA2DE(bool predict_voting) : Boost(predict_voting)
     {
     }
+    std::vector<int> BoostA2DE::initializeModels()
+    {
+        torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
+        std::vector<int> featuresSelected = featureSelection(weights_);
+        if (featuresSelected.size() < 2) {
+            notes.push_back("No features selected in initialization");
+            status = ERROR;
+            return std::vector<int>();
+        }
+        for (int i = 0; i < featuresSelected.size() - 1; i++) {
+            for (int j = i + 1; j < featuresSelected.size(); j++) {
+                auto parents = { featuresSelected[i], featuresSelected[j] };
+                std::unique_ptr<Classifier> model = std::make_unique<SPnDE>(parents);
+                model->fit(dataset, features, className, states, weights_);
+                models.push_back(std::move(model));
+                significanceModels.push_back(1.0); // They will be updated later in trainModel
+                n_models++;
+            }
+        }
+        notes.push_back("Used features in initialization: " + std::to_string(featuresSelected.size()) + " of " + std::to_string(features.size()) + " with " + select_features_algorithm);
+        return featuresSelected;
+    }
     void BoostA2DE::trainModel(const torch::Tensor& weights)
     {
+        //
+        // Logging setup
+        //
+        // loguru::set_thread_name("BoostA2DE");
+        // loguru::g_stderr_verbosity = loguru::Verbosity_OFF;
+        // loguru::add_file("boostA2DE.log", loguru::Truncate, loguru::Verbosity_MAX);
+        // // Algorithm based on the adaboost algorithm for classification
+        // // as explained in Ensemble methods (Zhi-Hua Zhou, 2012)
+        // fitted = true;
+        // double alpha_t = 0;
+        // torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
+        // bool finished = false;
+        // std::vector<int> featuresUsed;
+        // if (selectFeatures) {
+        //     featuresUsed = initializeModels();
+        //     auto ypred = predict(X_train);
+        //     std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred, weights_);
+        //     // Update significance of the models
+        //     for (int i = 0; i < n_models; ++i) {
+        //         significanceModels[i] = alpha_t;
+        //     }
+        //     if (finished) {
+        //         return;
+        //     }
+        // }
+        // int numItemsPack = 0; // The counter of the models inserted in the current pack
+        // // Variables to control the accuracy finish condition
+        // double priorAccuracy = 0.0;
+        // double improvement = 1.0;
+        // double convergence_threshold = 1e-4;
+        // int tolerance = 0; // number of times the accuracy is lower than the convergence_threshold
+        // // Step 0: Set the finish condition
+        // // epsilon sub t > 0.5 => inverse the weights policy
+        // // validation error is not decreasing
+        // // run out of features
+        // bool ascending = order_algorithm == Orders.ASC;
+        // std::mt19937 g{ 173 };
+        // while (!finished) {
+        //     // Step 1: Build ranking with mutual information
+        //     auto pairSelection = metrics.SelectKBestWeighted(weights_, ascending, n); // Get all the features sorted
+        //     if (order_algorithm == Orders.RAND) {
+        //         std::shuffle(featureSelection.begin(), featureSelection.end(), g);
+        //     }
+        //     // Remove used features
+        //     featureSelection.erase(remove_if(begin(featureSelection), end(featureSelection), [&](auto x)
+        //         { return std::find(begin(featuresUsed), end(featuresUsed), x) != end(featuresUsed);}),
+        //         end(featureSelection)
+        //     );
+        //     int k = bisection ? pow(2, tolerance) : 1;
+        //     int counter = 0; // The model counter of the current pack
+        //     VLOG_SCOPE_F(1, "counter=%d k=%d featureSelection.size: %zu", counter, k, featureSelection.size());
+        //     while (counter++ < k && featureSelection.size() > 0) {
+        //         auto feature = featureSelection[0];
+        //         featureSelection.erase(featureSelection.begin());
+        //         std::unique_ptr<Classifier> model;
+        //         model = std::make_unique<SPODE>(feature);
+        //         model->fit(dataset, features, className, states, weights_);
+        //         alpha_t = 0.0;
+        //         if (!block_update) {
+        //             auto ypred = model->predict(X_train);
+        //             // Step 3.1: Compute the classifier amount of say
+        //             std::tie(weights_, alpha_t, finished) = update_weights(y_train, ypred, weights_);
+        //         }
+        //         // Step 3.4: Store classifier and its accuracy to weigh its future vote
+        //         numItemsPack++;
+        //         featuresUsed.push_back(feature);
+        //         models.push_back(std::move(model));
+        //         significanceModels.push_back(alpha_t);
+        //         n_models++;
+        //         VLOG_SCOPE_F(2, "numItemsPack: %d n_models: %d featuresUsed: %zu", numItemsPack, n_models, featuresUsed.size());
+        //     }
+        //     if (block_update) {
+        //         std::tie(weights_, alpha_t, finished) = update_weights_block(k, y_train, weights_);
+        //     }
+        //     if (convergence && !finished) {
+        //         auto y_val_predict = predict(X_test);
+        //         double accuracy = (y_val_predict == y_test).sum().item<double>() / (double)y_test.size(0);
+        //         if (priorAccuracy == 0) {
+        //             priorAccuracy = accuracy;
+        //         } else {
+        //             improvement = accuracy - priorAccuracy;
+        //         }
+        //         if (improvement < convergence_threshold) {
+        //             VLOG_SCOPE_F(3, "  (improvement<threshold) tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
+        //             tolerance++;
+        //         } else {
+        //             VLOG_SCOPE_F(3, "* (improvement>=threshold) Reset. tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
+        //             tolerance = 0; // Reset the counter if the model performs better
+        //             numItemsPack = 0;
+        //         }
+        //         if (convergence_best) {
+        //             // Keep the best accuracy until now as the prior accuracy
+        //             priorAccuracy = std::max(accuracy, priorAccuracy);
+        //         } else {
+        //             // Keep the last accuracy obtained as the prior accuracy
+        //             priorAccuracy = accuracy;
+        //         }
+        //     }
+        //     VLOG_SCOPE_F(1, "tolerance: %d featuresUsed.size: %zu features.size: %zu", tolerance, featuresUsed.size(), features.size());
+        //     finished = finished || tolerance > maxTolerance || featuresUsed.size() == features.size();
+        // }
+        // if (tolerance > maxTolerance) {
+        //     if (numItemsPack < n_models) {
+        //         notes.push_back("Convergence threshold reached & " + std::to_string(numItemsPack) + " models eliminated");
+        //         VLOG_SCOPE_F(4, "Convergence threshold reached & %d models eliminated of %d", numItemsPack, n_models);
+        //         for (int i = 0; i < numItemsPack; ++i) {
+        //             significanceModels.pop_back();
+        //             models.pop_back();
+        //             n_models--;
+        //         }
+        //     } else {
+        //         notes.push_back("Convergence threshold reached & 0 models eliminated");
+        //         VLOG_SCOPE_F(4, "Convergence threshold reached & 0 models eliminated n_models=%d numItemsPack=%d", n_models, numItemsPack);
+        //     }
+        // }
+        // if (featuresUsed.size() != features.size()) {
+        //     notes.push_back("Used features in train: " + std::to_string(featuresUsed.size()) + " of " + std::to_string(features.size()));
+        //     status = WARNING;
+        // }
+        // notes.push_back("Number of models: " + std::to_string(n_models));
     }
     std::vector<std::string> BoostA2DE::graph(const std::string& title) const
     {
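The commented-out trainModel body above mirrors the BoostAODE loop and the multi-class AdaBoost scheme from Ensemble Methods (Zhi-Hua Zhou, 2012). The update_weights routine it calls lives in Boost.cc and is not part of this patch; the sketch below shows a SAMME-style update of the kind it follows — the exact finish condition and constants here are illustrative assumptions, not the library code:

    #include <cmath>
    #include <cstddef>
    #include <tuple>
    #include <vector>

    // SAMME-style weight update sketch: returns (new weights, alpha_t, finished).
    std::tuple<std::vector<double>, double, bool>
    update_weights_sketch(const std::vector<int>& y, const std::vector<int>& ypred,
                          std::vector<double> w, int n_classes)
    {
        double eps = 0.0; // weighted training error of the new model
        for (std::size_t i = 0; i < y.size(); ++i) {
            if (y[i] != ypred[i]) eps += w[i];
        }
        // A model no better than chance (or a perfect one) ends the boosting loop.
        if (eps <= 0.0 || eps >= 1.0 - 1.0 / n_classes) {
            return { w, 0.0, true };
        }
        double alpha_t = std::log((1.0 - eps) / eps) + std::log(n_classes - 1.0);
        double total = 0.0;
        for (std::size_t i = 0; i < y.size(); ++i) {
            if (y[i] != ypred[i]) w[i] *= std::exp(alpha_t); // up-weight mistakes
            total += w[i];
        }
        for (auto& wi : w) wi /= total; // renormalize to sum to 1
        return { w, alpha_t, false };
    }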
diff --git a/bayesnet/ensembles/BoostA2DE.h b/bayesnet/ensembles/BoostA2DE.h
index 34cfe24..ff56b79 100644
--- a/bayesnet/ensembles/BoostA2DE.h
+++ b/bayesnet/ensembles/BoostA2DE.h
@@ -18,6 +18,8 @@ namespace bayesnet {
         std::vector<std::string> graph(const std::string& title = "BoostA2DE") const override;
     protected:
         void trainModel(const torch::Tensor& weights) override;
+    private:
+        std::vector<int> initializeModels();
     };
 }
 #endif
\ No newline at end of file
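For context, a hypothetical minimal driver for the classifier once trainModel is filled in. The std::vector-based fit overload and the states map are inferred from the calls in the tests below (raw.Xv, raw.yv, raw.features, raw.className, raw.states); the names and data here are made up:

    #include <map>
    #include <string>
    #include <vector>
    #include "bayesnet/ensembles/BoostA2DE.h"

    int main()
    {
        // One row per feature, discretized values; toy data.
        std::vector<std::vector<int>> X = { { 0, 1, 1, 0 }, { 1, 1, 0, 0 }, { 0, 0, 1, 1 } };
        std::vector<int> y = { 0, 1, 1, 0 };
        std::vector<std::string> features = { "f0", "f1", "f2" };
        std::string className = "class";
        std::map<std::string, std::vector<int>> states = {
            { "f0", { 0, 1 } }, { "f1", { 0, 1 } }, { "f2", { 0, 1 } }, { "class", { 0, 1 } }
        };
        auto clf = bayesnet::BoostA2DE();
        clf.fit(X, y, features, className, states);
        auto yhat = clf.predict(X); // one prediction per column of X
    }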
diff --git a/bayesnet/ensembles/BoostAODE.cc b/bayesnet/ensembles/BoostAODE.cc
index e4ea907..30137df 100644
--- a/bayesnet/ensembles/BoostAODE.cc
+++ b/bayesnet/ensembles/BoostAODE.cc
@@ -10,14 +10,12 @@
 #include 
 #include 
 #include "BoostAODE.h"
-#include "lib/log/loguru.cpp"
 
 namespace bayesnet {
 
     BoostAODE::BoostAODE(bool predict_voting) : Boost(predict_voting)
     {
     }
-
     std::vector<int> BoostAODE::initializeModels()
     {
         torch::Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
@@ -37,9 +35,9 @@ namespace bayesnet {
         //
         // Logging setup
         //
-        loguru::set_thread_name("BoostAODE");
-        loguru::g_stderr_verbosity = loguru::Verbosity_OFF;
-        loguru::add_file("boostAODE.log", loguru::Truncate, loguru::Verbosity_MAX);
+        // loguru::set_thread_name("BoostAODE");
+        // loguru::g_stderr_verbosity = loguru::Verbosity_OFF;
+        // loguru::add_file("boostAODE.log", loguru::Truncate, loguru::Verbosity_MAX);
         // Algorithm based on the adaboost algorithm for classification
         // as explained in Ensemble methods (Zhi-Hua Zhou, 2012)
@@ -85,7 +83,7 @@ namespace bayesnet {
             );
             int k = bisection ? pow(2, tolerance) : 1;
             int counter = 0; // The model counter of the current pack
-            VLOG_SCOPE_F(1, "counter=%d k=%d featureSelection.size: %zu", counter, k, featureSelection.size());
+            // VLOG_SCOPE_F(1, "counter=%d k=%d featureSelection.size: %zu", counter, k, featureSelection.size());
             while (counter++ < k && featureSelection.size() > 0) {
                 auto feature = featureSelection[0];
                 featureSelection.erase(featureSelection.begin());
@@ -104,7 +102,7 @@ namespace bayesnet {
                 models.push_back(std::move(model));
                 significanceModels.push_back(alpha_t);
                 n_models++;
-                VLOG_SCOPE_F(2, "numItemsPack: %d n_models: %d featuresUsed: %zu", numItemsPack, n_models, featuresUsed.size());
+                // VLOG_SCOPE_F(2, "numItemsPack: %d n_models: %d featuresUsed: %zu", numItemsPack, n_models, featuresUsed.size());
             }
             if (block_update) {
                 std::tie(weights_, alpha_t, finished) = update_weights_block(k, y_train, weights_);
@@ -118,10 +116,10 @@ namespace bayesnet {
                     improvement = accuracy - priorAccuracy;
                 }
                 if (improvement < convergence_threshold) {
-                    VLOG_SCOPE_F(3, "  (improvement<threshold) tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
+                    // VLOG_SCOPE_F(3, "  (improvement<threshold) tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
                     tolerance++;
                 } else {
-                    VLOG_SCOPE_F(3, "* (improvement>=threshold) Reset. tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
+                    // VLOG_SCOPE_F(3, "* (improvement>=threshold) Reset. tolerance: %d numItemsPack: %d improvement: %f prior: %f current: %f", tolerance, numItemsPack, improvement, priorAccuracy, accuracy);
                     tolerance = 0; // Reset the counter if the model performs better
                     numItemsPack = 0;
                 }
@@ -133,13 +131,13 @@ namespace bayesnet {
                     priorAccuracy = accuracy;
                 }
             }
-            VLOG_SCOPE_F(1, "tolerance: %d featuresUsed.size: %zu features.size: %zu", tolerance, featuresUsed.size(), features.size());
+            // VLOG_SCOPE_F(1, "tolerance: %d featuresUsed.size: %zu features.size: %zu", tolerance, featuresUsed.size(), features.size());
             finished = finished || tolerance > maxTolerance || featuresUsed.size() == features.size();
         }
         if (tolerance > maxTolerance) {
             if (numItemsPack < n_models) {
                 notes.push_back("Convergence threshold reached & " + std::to_string(numItemsPack) + " models eliminated");
-                VLOG_SCOPE_F(4, "Convergence threshold reached & %d models eliminated of %d", numItemsPack, n_models);
+                // VLOG_SCOPE_F(4, "Convergence threshold reached & %d models eliminated of %d", numItemsPack, n_models);
                 for (int i = 0; i < numItemsPack; ++i) {
                     significanceModels.pop_back();
                     models.pop_back();
@@ -147,7 +145,7 @@ namespace bayesnet {
             }
         } else {
             notes.push_back("Convergence threshold reached & 0 models eliminated");
-            VLOG_SCOPE_F(4, "Convergence threshold reached & 0 models eliminated n_models=%d numItemsPack=%d", n_models, numItemsPack);
+            // VLOG_SCOPE_F(4, "Convergence threshold reached & 0 models eliminated n_models=%d numItemsPack=%d", n_models, numItemsPack);
         }
     }
     if (featuresUsed.size() != features.size()) {
diff --git a/bayesnet/ensembles/BoostAODE.h b/bayesnet/ensembles/BoostAODE.h
index 8c613a9..e4eb250 100644
--- a/bayesnet/ensembles/BoostAODE.h
+++ b/bayesnet/ensembles/BoostAODE.h
@@ -10,6 +10,7 @@
 #include 
 #include "bayesnet/classifiers/SPODE.h"
 #include "Boost.h"
+
 namespace bayesnet {
     class BoostAODE : public Boost {
     public:
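One detail worth noting in the loop that stays active here: with bisection enabled the pack size is k = 2^tolerance, so every consecutive round without enough improvement doubles how many models are added before the next convergence check. A tiny illustration:

    #include <cmath>
    #include <iostream>

    int main()
    {
        for (int tolerance = 0; tolerance <= 4; ++tolerance) {
            int k = static_cast<int>(std::pow(2, tolerance)); // as in: bisection ? pow(2, tolerance) : 1
            std::cout << "tolerance=" << tolerance << " -> pack of " << k << " models\n";
        }
    }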
diff --git a/bayesnet/utils/BayesMetrics.cc b/bayesnet/utils/BayesMetrics.cc
index c863704..43b0dec 100644
--- a/bayesnet/utils/BayesMetrics.cc
+++ b/bayesnet/utils/BayesMetrics.cc
@@ -30,6 +30,44 @@ namespace bayesnet {
         }
         samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
     }
+    std::vector<std::pair<int, int>> Metrics::SelectKPairs(const torch::Tensor& weights, bool ascending, unsigned k)
+    {
+        // Return the K best feature pairs
+        auto n = features.size();
+        if (k == 0) {
+            k = n;
+        }
+        // compute scores
+        scoresKPairs.clear();
+        pairsKBest.clear();
+        auto label = samples.index({ -1, "..." });
+        // for (int i = 0; i < n; ++i) {
+        //     for (int j = i + 1; j < n; ++j) {
+        //         scoresKBest.push_back(mutualInformation(samples.index({ i, "..." }), samples.index({ j, "..." }), weights));
+        //         featuresKBest.push_back(i);
+        //         featuresKBest.push_back(j);
+        //     }
+        // }
+        // // sort & reduce scores and features
+        // if (ascending) {
+        //     sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
+        //         { return scoresKBest[i] < scoresKBest[j]; });
+        //     sort(scoresKBest.begin(), scoresKBest.end(), std::less<double>());
+        //     if (k < n) {
+        //         for (int i = 0; i < n - k; ++i) {
+        //             featuresKBest.erase(featuresKBest.begin());
+        //             scoresKBest.erase(scoresKBest.begin());
+        //         }
+        //     }
+        // } else {
+        //     sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
+        //         { return scoresKBest[i] > scoresKBest[j]; });
+        //     sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
+        //     featuresKBest.resize(k);
+        //     scoresKBest.resize(k);
+        // }
+        return pairsKBest;
+    }
     std::vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, bool ascending, unsigned k)
     {
         // Return the K Best features
diff --git a/bayesnet/utils/BayesMetrics.h b/bayesnet/utils/BayesMetrics.h
index 6c20852..f24a496 100644
--- a/bayesnet/utils/BayesMetrics.h
+++ b/bayesnet/utils/BayesMetrics.h
@@ -16,6 +16,7 @@ namespace bayesnet {
         Metrics(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int classNumStates);
         Metrics(const std::vector<std::vector<int>>& vsamples, const std::vector<int>& labels, const std::vector<std::string>& features, const std::string& className, const int classNumStates);
         std::vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
+        std::vector<std::pair<int, int>> SelectKPairs(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
         std::vector<double> getScoresKBest() const;
         double mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights);
         double conditionalMutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& labels, const torch::Tensor& weights);
@@ -41,7 +42,7 @@ namespace bayesnet {
             }
             return result;
         }
-        template<class T>
+        template <class T>
         T pop_first(std::vector<T>& v)
         {
             T temp = v[0];
@@ -52,6 +53,8 @@ namespace bayesnet {
         int classNumStates = 0;
         std::vector<double> scoresKBest;
        std::vector<int> featuresKBest; // sorted indices of the features
+        std::vector<std::pair<int, int>> pairsKBest; // sorted indices of the pairs
+        std::map<std::pair<int, int>, double> scoresKPairs;
         double conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights);
     };
 }
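SelectKPairs is declared and wired into the Metrics state (pairsKBest, scoresKPairs), but its body is still commented out and returns an empty vector. One plausible completion — score every pair, sort, keep the best k — is sketched below with a scorer callback standing in for mutualInformation(samples[i], samples[j], weights); this is an assumption about where the method is heading, not its final form:

    #include <algorithm>
    #include <functional>
    #include <utility>
    #include <vector>

    std::vector<std::pair<int, int>>
    selectKPairsSketch(int n, unsigned k, bool ascending,
                       const std::function<double(int, int)>& score)
    {
        // Score every unordered pair (i, j) with i < j.
        std::vector<std::pair<std::pair<int, int>, double>> scored;
        for (int i = 0; i + 1 < n; ++i) {
            for (int j = i + 1; j < n; ++j) {
                scored.push_back({ { i, j }, score(i, j) });
            }
        }
        // Sort by score in the requested direction.
        std::sort(scored.begin(), scored.end(), [&](const auto& a, const auto& b) {
            return ascending ? a.second < b.second : a.second > b.second;
        });
        if (k == 0 || k > scored.size()) {
            k = static_cast<unsigned>(scored.size());
        }
        std::vector<std::pair<int, int>> best;
        for (unsigned idx = 0; idx < k; ++idx) {
            best.push_back(scored[idx].first);
        }
        return best;
    }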
diff --git a/tests/TestBoostA2DE.cc b/tests/TestBoostA2DE.cc
index 1be717d..4b870f7 100644
--- a/tests/TestBoostA2DE.cc
+++ b/tests/TestBoostA2DE.cc
@@ -8,21 +8,35 @@
 #include 
 #include 
 #include 
+#include "bayesnet/utils/BayesMetrics.h"
 #include "bayesnet/ensembles/BoostA2DE.h"
 #include "TestUtils.h"
 
-TEST_CASE("Feature_select CFS", "[BoostA2DE]")
+TEST_CASE("Build basic model", "[BoostA2DE]")
 {
-    auto raw = RawDatasets("iris", true);
-    auto clf = bayesnet::BoostA2DE();
-    clf.setHyperparameters({ {"select_features", "CFS"} });
-    clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states);
-    REQUIRE(clf.getNumberOfNodes() == 0);
-    REQUIRE(clf.getNumberOfEdges() == 0);
-    // REQUIRE(clf.getNotes().size() == 2);
-    // REQUIRE(clf.getNotes()[0] == "Used features in initialization: 6 of 9 with CFS");
-    // REQUIRE(clf.getNotes()[1] == "Number of models: 9");
+    auto raw = RawDatasets("diabetes", true);
+    bayesnet::Metrics metrics(raw.dataset, raw.features, raw.className, raw.classNumStates);
+    auto expected = std::map<std::pair<int, int>, double>{
+        { { 0, 1 }, 0.0 },
+        { { 0, 2 }, 0.287696 },
+        { { 0, 3 }, 0.403749 },
+        { { 1, 2 }, 1.17112 },
+        { { 1, 3 }, 1.31852 },
+        { { 2, 3 }, 0.210068 },
+    };
+    for (int i = 0; i < raw.features.size() - 1; ++i) {
+        for (int j = i + 1; j < raw.features.size(); ++j) {
+            double result = metrics.conditionalMutualInformation(raw.dataset.index({ i, "..." }), raw.dataset.index({ j, "..." }), raw.yt, raw.weights);
+            // REQUIRE(result == Catch::Approx(expected.at({ i, j })).epsilon(raw.epsilon));
+            auto clf = bayesnet::SPnDE({ i, j });
+            clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states);
+            auto score = clf.score(raw.Xt, raw.yt);
+            std::cout << " i " << i << " j " << j << " cmi "
+                << std::setw(8) << std::setprecision(6) << std::fixed << result
+                << " score = " << score << std::endl;
+        }
+    }
 }
 // TEST_CASE("Feature_select IWSS", "[BoostAODE]")
 // {
diff --git a/tests/TestBoostAODE.cc b/tests/TestBoostAODE.cc
index 4f92110..b434055 100644
--- a/tests/TestBoostAODE.cc
+++ b/tests/TestBoostAODE.cc
@@ -104,7 +104,7 @@ TEST_CASE("Order asc, desc & random", "[BoostAODE]")
         clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states);
         auto score = clf.score(raw.Xv, raw.yv);
         auto scoret = clf.score(raw.Xt, raw.yt);
-        INFO("BoostAODE order: " + order);
+        INFO("BoostAODE order: " << order);
         REQUIRE(score == Catch::Approx(scores[order]).epsilon(raw.epsilon));
         REQUIRE(scoret == Catch::Approx(scores[order]).epsilon(raw.epsilon));
     }
@@ -120,7 +120,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
         { { "maxTolerance", 5 } },
     };
     for (const auto& hyper : bad_hyper.items()) {
-        INFO("BoostAODE hyper: " + hyper.value().dump());
+        INFO("BoostAODE hyper: " << hyper.value().dump());
         REQUIRE_THROWS_AS(clf.setHyperparameters(hyper.value()), std::invalid_argument);
     }
     REQUIRE_THROWS_AS(clf.setHyperparameters({ {"maxTolerance", 0 } }), std::invalid_argument);
@@ -131,7 +131,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
         { { "select_features","FCBF" }, { "threshold", 1.01 } },
     };
     for (const auto& hyper : bad_hyper_fit.items()) {
-        INFO("BoostAODE hyper: " + hyper.value().dump());
+        INFO("BoostAODE hyper: " << hyper.value().dump());
         clf.setHyperparameters(hyper.value());
         REQUIRE_THROWS_AS(clf.fit(raw.Xv, raw.yv, raw.features, raw.className, raw.states), std::invalid_argument);
     }
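A hypothetical follow-up test for when SelectKPairs is completed, assuming the contract that k == 0 means "return all pairs", i.e. n*(n-1)/2 of them (Catch2 v3 test-macro header assumed; RawDatasets comes from TestUtils.h as in the tests above):

    #include <catch2/catch_test_macros.hpp>
    #include "bayesnet/utils/BayesMetrics.h"
    #include "TestUtils.h"

    TEST_CASE("SelectKPairs returns every pair when k is 0", "[BayesMetrics]")
    {
        auto raw = RawDatasets("iris", true);
        bayesnet::Metrics metrics(raw.dataset, raw.features, raw.className, raw.classNumStates);
        auto pairs = metrics.SelectKPairs(raw.weights, false, 0);
        auto n = raw.features.size();
        REQUIRE(pairs.size() == n * (n - 1) / 2);
    }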