Implement IWSS and FCBF too for BoostAODE

2023-10-14 13:12:04 +02:00 · 2023-10-14 13:12:04 +02:00 · b35532dd9e
commit b35532dd9e
parent 6ef49385ea
11 changed files with 180 additions and 26 deletions
--- a/src/BayesNet/BayesMetrics.h
+++ b/src/BayesNet/BayesMetrics.h
@ -29,6 +29,13 @@ namespace bayesnet {
            }
            return result;
        }
+        template <class T> // Removes the first element of v and returns a copy of it
+        T pop_first(vector<T>& v)
+        {
+            T temp = v.at(0); // at() throws std::out_of_range on an empty vector; v[0] would be undefined behavior
+            v.erase(v.begin());
+            return temp;
+        }
    public:
        Metrics() = default;
        Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
--- a/src/BayesNet/BoostAODE.cc
+++ b/src/BayesNet/BoostAODE.cc
@ -6,6 +6,8 @@
 #include "Folding.h"
 #include "Paths.h"
 #include "CFS.h"
+#include "FCBF.h"
+#include "IWSS.h"

 namespace bayesnet {
    BoostAODE::BoostAODE() : Ensemble() {}
@ -44,7 +46,7 @@ namespace bayesnet {
    void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
    {
        // Check if hyperparameters are valid
-        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "cfs" };
+        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" };
        checkHyperparameters(validKeys, hyperparameters);
        if (hyperparameters.contains("repeatSparent")) {
            repeatSparent = hyperparameters["repeatSparent"];
@ -58,29 +60,39 @@ namespace bayesnet {
        if (hyperparameters.contains("convergence")) {
            convergence = hyperparameters["convergence"];
        }
-        if (hyperparameters.contains("cfs")) {
-            cfs = hyperparameters["cfs"];
+        if (hyperparameters.contains("threshold")) {
+            threshold = hyperparameters["threshold"];
+        }
+        if (hyperparameters.contains("select_features")) {
+            auto selectedAlgorithm = hyperparameters["select_features"];
+            vector<string> algos = { "IWSS", "FCBF", "CFS" };
+            selectFeatures = true;
+            algorithm = selectedAlgorithm;
+            if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
+                throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
+            }
        }
    }
    unordered_set<int> BoostAODE::initializeModels()
    {
        unordered_set<int> featuresUsed;
-        // Read the CFS features
-        string output = "[", prefix = "";
-        bool first = true;
-        for (const auto& feature : features) {
-            output += prefix + "'" + feature + "'";
-            if (first) {
-                prefix = ", ";
-                first = false;
-            }
-        }
-        output += "]";
        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
        int maxFeatures = 0;
-        auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
-        cfs.fit();
-        auto cfsFeatures = cfs.getFeatures();
+        if (algorithm == "CFS") {
+            featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
+        } else if (algorithm == "IWSS") {
+            if (threshold < 0 || threshold >0.5) {
+                throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
+            }
+            featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
+        } else if (algorithm == "FCBF") {
+            if (threshold < 1e-7 || threshold > 1) {
+                throw invalid_argument("Invalid threshold value [1e-7, 1]");
+            }
+            featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
+        }
+        featureSelector->fit();
+        auto cfsFeatures = featureSelector->getFeatures();
        for (const int& feature : cfsFeatures) {
            // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
            featuresUsed.insert(feature);
@ -90,12 +102,13 @@ namespace bayesnet {
            significanceModels.push_back(1.0);
            n_models++;
        }
+        delete featureSelector;
        return featuresUsed;
    }
    void BoostAODE::trainModel(const torch::Tensor& weights)
    {
        unordered_set<int> featuresUsed;
-        if (cfs) {
+        if (selectFeatures) {
            featuresUsed = initializeModels();
        }
        if (maxModels == 0)
--- a/src/BayesNet/BoostAODE.h
+++ b/src/BayesNet/BoostAODE.h
@ -3,6 +3,7 @@
 #include "Ensemble.h"
 #include <map>
 #include "SPODE.h"
+#include "FeatureSelect.h"
 namespace bayesnet {
    class BoostAODE : public Ensemble {
    public:
@ -22,7 +23,10 @@ namespace bayesnet {
        int maxModels = 0;
        bool ascending = false; //Process KBest features ascending or descending order
        bool convergence = false; //if true, stop when the model does not improve
-        bool cfs = false; // if true use CFS to select features stored in cfs folder with sha256(features) file_name
+        bool selectFeatures = false; // if true, use feature selection
+        string algorithm = ""; // Selected feature selection algorithm
+        FeatureSelect* featureSelector = nullptr;
+        double threshold = -1;
    };
 }
 #endif
--- a/src/BayesNet/CFS.cc
+++ b/src/BayesNet/CFS.cc
@ -2,13 +2,9 @@
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
-
-
-
-
    void CFS::fit()
    {
-        selectedFeatures.clear();
+        initialize();
        computeSuLabels();
        auto featureOrder = argsort(suLabels); // sort descending order
        auto continueCondition = true;
@ -21,7 +17,8 @@ namespace bayesnet {
            int bestFeature = -1;
            for (auto feature : featureOrder) {
                selectedFeatures.push_back(feature);
-                auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures
+                // Compute merit with selectedFeatures
+                auto meritNew = computeMeritCFS();
                if (meritNew > merit) {
                    merit = meritNew;
                    bestFeature = feature;
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc 
    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
-    Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
--- a/src/BayesNet/FCBF.cc
+++ b/src/BayesNet/FCBF.cc
@ -0,0 +1,52 @@
+#include "bayesnetUtils.h"
+#include "FCBF.h"
+namespace bayesnet {
+
+    // Creates an FCBF feature selector. Every argument except threshold is forwarded to FeatureSelect;
+    // threshold is the label score below which fit() stops selecting features.
+    // Throws std::invalid_argument when threshold is below 1e-7.
+    FCBF::FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        if (threshold < 1e-7) {
+            throw std::invalid_argument("Threshold cannot be less than 1e-7");
+        }
+    }
+    // Walks the features in the order given by argsort(suLabels). A feature is selected when its label
+    // score reaches threshold; every later-ranked feature whose computeSuFeatures() value against the
+    // selected one is at least its own label score is discarded by zeroing its entry in suLabels.
+    void FCBF::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        for (size_t i = 0; i < featureOrder.size(); ++i) {
+            const auto feature = featureOrder[i];
+            if (suLabels.at(feature) == 0.0) {
+                // The feature has been removed from the list
+                continue;
+            }
+            if (suLabels.at(feature) < threshold) {
+                break;
+            }
+            // Remove redundant features among the ones ranked after this one (never self compare)
+            for (size_t j = i + 1; j < featureOrder.size(); ++j) {
+                const auto candidate = featureOrder[j];
+                if (suLabels.at(candidate) == 0.0) {
+                    // Already removed: comparing again could not change its score
+                    continue;
+                }
+                if (computeSuFeatures(feature, candidate) >= suLabels.at(candidate)) {
+                    // Remove feature from list (a 0.0 score marks it as discarded)
+                    suLabels[candidate] = 0.0;
+                }
+            }
+            selectedFeatures.push_back(feature);
+            selectedScores.push_back(suLabels[feature]);
+            if (selectedFeatures.size() == maxFeatures) {
+                break;
+            }
+        }
+        fitted = true;
+    }
+}
--- a/src/BayesNet/FCBF.h
+++ b/src/BayesNet/FCBF.h
@ -0,0 +1,18 @@
+#ifndef FCBF_H
+#define FCBF_H
+#include <torch/torch.h>
+#include <vector>
+#include "FeatureSelect.h"
+using namespace std;
+namespace bayesnet {
+    class FCBF : public FeatureSelect { // Feature selector whose fit() is implemented in FCBF.cc
+    public:
+        // samples: described by the author as an (n+1) x m tensor of integers where dataset[-1] is the y vector (TODO confirm against FeatureSelect); the constructor throws std::invalid_argument when threshold < 1e-7
+        FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~FCBF() {};
+        void fit() override;
+    private:
+        double threshold = -1; // placeholder default; the constructor always replaces it with its threshold argument
+    };
+}
+#endif
--- a/src/BayesNet/FeatureSelect.cc
+++ b/src/BayesNet/FeatureSelect.cc
@ -7,6 +7,11 @@ namespace bayesnet {

    {
    }
+    void FeatureSelect::initialize()
+    {   // Empties both result vectors so a new fit() starts from a clean selection
+        selectedScores.clear();
+        selectedFeatures.clear();
+    }
    double FeatureSelect::symmetricalUncertainty(int a, int b)
    {
        /*
--- a/src/BayesNet/FeatureSelect.h
+++ b/src/BayesNet/FeatureSelect.h
@ -14,6 +14,7 @@ namespace bayesnet {
        vector<int> getFeatures() const;
        vector<double> getScores() const;
    protected:
+        void initialize();
        void computeSuLabels();
        double computeSuFeatures(const int a, const int b);
        double symmetricalUncertainty(int a, int b);
--- a/src/BayesNet/IWSS.cc
+++ b/src/BayesNet/IWSS.cc
@ -0,0 +1,61 @@
+#include "IWSS.h"
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    // Creates an IWSS feature selector. Every argument except threshold is forwarded to FeatureSelect;
+    // threshold is the relative merit change tolerated by fit(). Throws std::invalid_argument outside [0, 0.5].
+    IWSS::IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        if (threshold < 0 || threshold > .5) {
+            throw std::invalid_argument("Threshold has to be in [0, 0.5]");
+        }
+    }
+    // Seeds the selection with the two best-ranked features, then keeps adding features in ranking order
+    // while the merit improves or changes by a relative amount smaller than threshold.
+    // Handles fewer than two available features and honours maxFeatures from the first feature on.
+    void IWSS::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto candidates = argsort(suLabels); // sort descending order
+        // True once maxFeatures features are selected; a non-positive limit never matches
+        const auto limitReached = [&]() {
+            return maxFeatures > 0 && selectedFeatures.size() >= static_cast<size_t>(maxFeatures);
+        };
+        if (!candidates.empty()) {
+            // First feature enters with its own score
+            const auto first_feature = pop_first(candidates);
+            selectedFeatures.push_back(first_feature);
+            selectedScores.push_back(suLabels.at(first_feature));
+        }
+        if (!candidates.empty() && !limitReached()) {
+            // Second feature enters with the merit of the selected pair
+            selectedFeatures.push_back(pop_first(candidates));
+            auto merit = computeMeritCFS();
+            selectedScores.push_back(merit);
+            for (const auto feature : candidates) {
+                if (limitReached()) {
+                    break;
+                }
+                selectedFeatures.push_back(feature);
+                // Compute merit with selectedFeatures
+                const auto meritNew = computeMeritCFS();
+                // std::abs guarantees the floating-point overload is the one used
+                const double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0;
+                if (meritNew > merit || delta < threshold) {
+                    if (meritNew > merit) {
+                        merit = meritNew;
+                    }
+                    selectedScores.push_back(meritNew);
+                } else {
+                    selectedFeatures.pop_back();
+                    break;
+                }
+            }
+        }
+        fitted = true;
+    }
+}
--- a/src/BayesNet/IWSS.h
+++ b/src/BayesNet/IWSS.h
@ -0,0 +1,18 @@
+#ifndef IWSS_H
+#define IWSS_H
+#include <torch/torch.h>
+#include <vector>
+#include "FeatureSelect.h"
+using namespace std;
+namespace bayesnet {
+    class IWSS : public FeatureSelect { // Feature selector whose fit() is implemented in IWSS.cc
+    public:
+        // samples: described by the author as an (n+1) x m tensor of integers where dataset[-1] is the y vector (TODO confirm against FeatureSelect); the constructor throws std::invalid_argument unless threshold is in [0, 0.5]
+        IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~IWSS() {};
+        void fit() override;
+    private:
+        double threshold = -1; // placeholder default; the constructor always replaces it with its threshold argument
+    };
+}
+#endif