diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 341951e..66016a6 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -29,6 +29,14 @@ namespace bayesnet { } return result; } + // Remove and return the first element of v; at() throws std::out_of_range when v is empty + template + T pop_first(vector& v) + { + T temp = v.at(0); + v.erase(v.begin()); + return temp; + } public: Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index cee8a51..fb38a7c 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -6,6 +6,8 @@ #include "Folding.h" #include "Paths.h" #include "CFS.h" +#include "FCBF.h" +#include "IWSS.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} @@ -44,7 +46,7 @@ namespace bayesnet { void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters) { // Check if hyperparameters are valid - const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "cfs" }; + const vector validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" }; checkHyperparameters(validKeys, hyperparameters); if (hyperparameters.contains("repeatSparent")) { repeatSparent = hyperparameters["repeatSparent"]; @@ -58,29 +60,39 @@ namespace bayesnet { if (hyperparameters.contains("convergence")) { convergence = hyperparameters["convergence"]; } - if (hyperparameters.contains("cfs")) { - cfs = hyperparameters["cfs"]; + if (hyperparameters.contains("threshold")) { + threshold = hyperparameters["threshold"]; + } + if (hyperparameters.contains("select_features")) { + auto selectedAlgorithm = hyperparameters["select_features"]; + vector algos = { "IWSS", "FCBF", "CFS" }; + if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) { + throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]"); + } + selectFeatures = true; // commit the selection only after the value has been validated + algorithm = selectedAlgorithm; } } 
unordered_set BoostAODE::initializeModels() { unordered_set featuresUsed; - // Read the CFS features - string output = "[", prefix = ""; - bool first = true; - for (const auto& feature : features) { - output += prefix + "'" + feature + "'"; - if (first) { - prefix = ", "; - first = false; - } - } - output += "]"; Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); int maxFeatures = 0; - auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); - cfs.fit(); - auto cfsFeatures = cfs.getFeatures(); + if (algorithm == "CFS") { + featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_); + } else if (algorithm == "IWSS") { + if (threshold < 0 || threshold >0.5) { + throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]"); + } + featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } else if (algorithm == "FCBF") { + if (threshold < 1e-7 || threshold > 1) { + throw invalid_argument("Invalid threshold value [1e-7, 1]"); + } + featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold); + } + featureSelector->fit(); // NOTE(review): featureSelector is not assigned above when algorithm matches none of the three branches - confirm it is always validated before this call + auto cfsFeatures = featureSelector->getFeatures(); for (const int& feature : cfsFeatures) { // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl; featuresUsed.insert(feature); @@ -90,12 +102,13 @@ namespace bayesnet { significanceModels.push_back(1.0); n_models++; } + delete featureSelector; /* frees the selector created above; NOTE(review): not reached if fit() or getFeatures() throws, and the member keeps the stale address afterwards */ return featuresUsed; } void BoostAODE::trainModel(const torch::Tensor& weights) { unordered_set featuresUsed; - if (cfs) { + if (selectFeatures) { /* set by setHyperparameters when "select_features" is supplied */ featuresUsed = initializeModels(); } if (maxModels == 0) diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index fb87fce..dd1cf75 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -3,6 +3,7 @@ #include 
"Ensemble.h" #include #include "SPODE.h" +#include "FeatureSelect.h" namespace bayesnet { class BoostAODE : public Ensemble { public: @@ -22,7 +23,10 @@ namespace bayesnet { int maxModels = 0; bool ascending = false; //Process KBest features ascending or descending order bool convergence = false; //if true, stop when the model does not improve - bool cfs = false; // if true use CFS to select features stored in cfs folder with sha256(features) file_name + bool selectFeatures = false; // if true, trainModel calls initializeModels() to run feature selection first + string algorithm = ""; // Selected feature selection algorithm: "IWSS", "FCBF" or "CFS" + FeatureSelect* featureSelector = nullptr; // selector instance created and deleted inside initializeModels() + double threshold = -1; /* passed to IWSS/FCBF; initializeModels() accepts [0, 0.5] for IWSS and [1e-7, 1] for FCBF, so the default -1 lies outside both ranges */ }; } #endif \ No newline at end of file diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc index 50c0ea8..f2ffc1e 100644 --- a/src/BayesNet/CFS.cc +++ b/src/BayesNet/CFS.cc @@ -2,13 +2,9 @@ #include #include "bayesnetUtils.h" namespace bayesnet { - - - - void CFS::fit() { - selectedFeatures.clear(); + initialize(); /* clears selectedFeatures and selectedScores */ computeSuLabels(); auto featureOrder = argsort(suLabels); // sort descending order auto continueCondition = true; @@ -21,7 +17,8 @@ namespace bayesnet { int bestFeature = -1; for (auto feature : featureOrder) { selectedFeatures.push_back(feature); - auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures + // Compute merit with selectedFeatures (the candidate feature was pushed just above) + auto meritNew = computeMeritCFS(); if (meritNew > merit) { merit = meritNew; bestFeature = feature; diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index c9543ea..cc0f5a5 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc - Mst.cc Proposal.cc CFS.cc FeatureSelect.cc 
${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/FCBF.cc b/src/BayesNet/FCBF.cc new file mode 100644 index 0000000..db935af --- /dev/null +++ b/src/BayesNet/FCBF.cc @@ -0,0 +1,47 @@ +#include "bayesnetUtils.h" +#include "FCBF.h" +namespace bayesnet { + + FCBF::FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 1e-7) { + throw std::invalid_argument("Threshold cannot be less than 1e-7"); + } + } + // Walk the features in argsort(suLabels) order: keep each one whose suLabels score is non-zero and >= threshold, and zero the score of every later feature whose computeSuFeatures value with it is >= that feature's own score + void FCBF::fit() + { + initialize(); + computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + for (auto current = featureOrder.begin(); current != featureOrder.end(); ++current) { + const auto feature = *current; + if (suLabels.at(feature) == 0.0) { + // The feature has been removed from the list + continue; + } + if (suLabels.at(feature) < threshold) { + break; + } + // Remove redundant features among those ranked after the current one (never self compare) + for (auto other = current + 1; other != featureOrder.end(); ++other) { + const auto featureCopy = *other; + if (suLabels.at(featureCopy) == 0.0) { + // Already removed: zeroing it again would change nothing, so skip the SU computation + continue; + } + if (computeSuFeatures(feature, featureCopy) >= suLabels.at(featureCopy)) { + // Remove feature from list + suLabels[featureCopy] = 0.0; + } + } + selectedFeatures.push_back(feature); + selectedScores.push_back(suLabels[feature]); + if (selectedFeatures.size() == maxFeatures) { + break; + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/FCBF.h b/src/BayesNet/FCBF.h new file mode 100644 index 0000000..aa7ff47 --- /dev/null +++ b/src/BayesNet/FCBF.h @@ -0,0 +1,18 @@ +#ifndef FCBF_H +#define FCBF_H +#include +#include +#include 
"FeatureSelect.h" +using namespace std; +namespace bayesnet { + class FCBF : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + FCBF(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~FCBF() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/FeatureSelect.cc b/src/BayesNet/FeatureSelect.cc index 4eb45fe..11d929b 100644 --- a/src/BayesNet/FeatureSelect.cc +++ b/src/BayesNet/FeatureSelect.cc @@ -7,6 +7,11 @@ namespace bayesnet { { } + void FeatureSelect::initialize() + { + selectedFeatures.clear(); + selectedScores.clear(); + } double FeatureSelect::symmetricalUncertainty(int a, int b) { /* diff --git a/src/BayesNet/FeatureSelect.h b/src/BayesNet/FeatureSelect.h index c342468..46923c9 100644 --- a/src/BayesNet/FeatureSelect.h +++ b/src/BayesNet/FeatureSelect.h @@ -14,6 +14,7 @@ namespace bayesnet { vector getFeatures() const; vector getScores() const; protected: + void initialize(); void computeSuLabels(); double computeSuFeatures(const int a, const int b); double symmetricalUncertainty(int a, int b); diff --git a/src/BayesNet/IWSS.cc b/src/BayesNet/IWSS.cc new file mode 100644 index 0000000..f39f137 --- /dev/null +++ b/src/BayesNet/IWSS.cc @@ -0,0 +1,54 @@ +#include "IWSS.h" +#include +#include <cmath> +#include "bayesnetUtils.h" +namespace bayesnet { + IWSS::IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) : + FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold) + { + if (threshold < 0 || threshold > .5) { + throw std::invalid_argument("Threshold has to be in [0, 0.5]"); + } + } + // Rank features with argsort(suLabels), seed the selection with the first two, then keep adding candidates while the CFS merit improves or its relative change stays below threshold + // Copes with fewer than two ranked features and stops as soon as selectedFeatures.size() equals maxFeatures + void IWSS::fit() + { + initialize(); 
+ computeSuLabels(); + auto featureOrder = argsort(suLabels); // sort descending order + auto featureOrderCopy = featureOrder; + if (!featureOrderCopy.empty()) { + // First with its own score + auto first_feature = pop_first(featureOrderCopy); + selectedFeatures.push_back(first_feature); + selectedScores.push_back(suLabels.at(first_feature)); + } + // Go on only if candidates remain and the maxFeatures limit has not been reached yet + if (!featureOrderCopy.empty() && selectedFeatures.size() != maxFeatures) { + // Second with the score of the candidates + selectedFeatures.push_back(pop_first(featureOrderCopy)); + auto merit = computeMeritCFS(); + selectedScores.push_back(merit); + for (const auto feature : featureOrderCopy) { + if (selectedFeatures.size() == maxFeatures) { + break; + } + selectedFeatures.push_back(feature); + // Compute merit with selectedFeatures + auto meritNew = computeMeritCFS(); + double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0; + if (meritNew > merit || delta < threshold) { + if (meritNew > merit) { + merit = meritNew; + } + selectedScores.push_back(meritNew); + } else { + selectedFeatures.pop_back(); + break; + } + } + } + fitted = true; + } +} \ No newline at end of file diff --git a/src/BayesNet/IWSS.h b/src/BayesNet/IWSS.h new file mode 100644 index 0000000..88a1034 --- /dev/null +++ b/src/BayesNet/IWSS.h @@ -0,0 +1,18 @@ +#ifndef IWSS_H +#define IWSS_H +#include +#include +#include "FeatureSelect.h" +using namespace std; +namespace bayesnet { + class IWSS : public FeatureSelect { + public: + // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector + IWSS(const torch::Tensor& samples, const vector& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold); + virtual ~IWSS() {}; + void fit() override; + private: + double threshold = -1; + }; +} +#endif \ No newline at end of file