Implement IWSS and FCBF too for BoostAODE

2023-10-14 13:12:04 +02:00 · 2023-10-14 13:12:04 +02:00 · b35532dd9e
commit b35532dd9e
parent 6ef49385ea
11 changed files with 180 additions and 26 deletions
--- a/src/BayesNet/BayesMetrics.h
+++ b/src/BayesNet/BayesMetrics.h
@ -29,6 +29,13 @@ namespace bayesnet {
            }
            return result;
        }
        // Remove the first element of `v` and return a copy of it.
        // The remaining elements keep their relative order.
        // Throws std::out_of_range when `v` is empty (the previous unchecked
        // access was undefined behaviour in that case).
        template <class T>
        T pop_first(std::vector<T>& v)
        {
            // at() is bounds-checked, so an empty vector is reported instead of read out of bounds
            T temp = v.at(0);
            v.erase(v.begin());
            return temp;
        }
    public:
        Metrics() = default;
        Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
--- a/src/BayesNet/BoostAODE.cc
+++ b/src/BayesNet/BoostAODE.cc
@ -6,6 +6,8 @@
 #include "Folding.h"
 #include "Paths.h"
 #include "CFS.h"
 #include "FCBF.h"
 #include "IWSS.h"
 namespace bayesnet {
    // Default constructor: only delegates to the Ensemble base class default constructor.
    BoostAODE::BoostAODE() : Ensemble() {}
@ -44,7 +46,7 @@ namespace bayesnet {
    void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
    {
        // Check if hyperparameters are valid
-        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "cfs" };
+        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" };
        checkHyperparameters(validKeys, hyperparameters);
        if (hyperparameters.contains("repeatSparent")) {
            repeatSparent = hyperparameters["repeatSparent"];
@ -58,29 +60,39 @@ namespace bayesnet {
        if (hyperparameters.contains("convergence")) {
            convergence = hyperparameters["convergence"];
        }
-        if (hyperparameters.contains("cfs")) {
+        if (hyperparameters.contains("threshold")) {
-            cfs = hyperparameters["cfs"];
+            threshold = hyperparameters["threshold"];
        }
        if (hyperparameters.contains("select_features")) {
            auto selectedAlgorithm = hyperparameters["select_features"];
            vector<string> algos = { "IWSS", "FCBF", "CFS" };
            selectFeatures = true;
            algorithm = selectedAlgorithm;
            if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
                throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
            }
        }
    }
    unordered_set<int> BoostAODE::initializeModels()
    {
        unordered_set<int> featuresUsed;
        // Read the CFS features
        string output = "[", prefix = "";
        bool first = true;
        for (const auto& feature : features) {
            output += prefix + "'" + feature + "'";
            if (first) {
                prefix = ", ";
                first = false;
            }
        }
        output += "]";
        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
        int maxFeatures = 0;
-        auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
+        if (algorithm == "CFS") {
-        cfs.fit();
+            featureSelector = new CFS(dataset, features, className, maxFeatures, states.at(className).size(), weights_);
-        auto cfsFeatures = cfs.getFeatures();
+        } else if (algorithm == "IWSS") {
            if (threshold < 0 || threshold >0.5) {
                throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
            }
            featureSelector = new IWSS(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
        } else if (algorithm == "FCBF") {
            if (threshold < 1e-7 || threshold > 1) {
                throw invalid_argument("Invalid threshold value [1e-7, 1]");
            }
            featureSelector = new FCBF(dataset, features, className, maxFeatures, states.at(className).size(), weights_, threshold);
        }
        featureSelector->fit();
        auto cfsFeatures = featureSelector->getFeatures();
        for (const int& feature : cfsFeatures) {
            // cout << "Feature: [" << feature << "] " << feature << " " << features.at(feature) << endl;
            featuresUsed.insert(feature);
@ -90,12 +102,13 @@ namespace bayesnet {
            significanceModels.push_back(1.0);
            n_models++;
        }
        delete featureSelector;
        return featuresUsed;
    }
    void BoostAODE::trainModel(const torch::Tensor& weights)
    {
        unordered_set<int> featuresUsed;
-        if (cfs) {
+        if (selectFeatures) {
            featuresUsed = initializeModels();
        }
        if (maxModels == 0)
--- a/src/BayesNet/BoostAODE.h
+++ b/src/BayesNet/BoostAODE.h
@ -3,6 +3,7 @@
 #include "Ensemble.h"
 #include <map>
 #include "SPODE.h"
 #include "FeatureSelect.h"
 namespace bayesnet {
    class BoostAODE : public Ensemble {
    public:
@ -22,7 +23,10 @@ namespace bayesnet {
        int maxModels = 0;
        bool ascending = false; //Process KBest features ascending or descending order
        bool convergence = false; //if true, stop when the model does not improve
-        bool cfs = false; // if true use CFS to select features stored in cfs folder with sha256(features) file_name
+        bool selectFeatures = false; // if true, use feature selection
        string algorithm = ""; // Selected feature selection algorithm
        FeatureSelect* featureSelector = nullptr;
        double threshold = -1;
    };
 }
 #endif
--- a/src/BayesNet/CFS.cc
+++ b/src/BayesNet/CFS.cc
@ -2,13 +2,9 @@
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
    void CFS::fit()
    {
-        selectedFeatures.clear();
+        initialize();
        computeSuLabels();
        auto featureOrder = argsort(suLabels); // sort descending order
        auto continueCondition = true;
@ -21,7 +17,8 @@ namespace bayesnet {
            int bestFeature = -1;
            for (auto feature : featureOrder) {
                selectedFeatures.push_back(feature);
-                auto meritNew = computeMeritCFS(); // Compute merit with cfsFeatures
+                // Compute merit with selectedFeatures
                auto meritNew = computeMeritCFS();
                if (meritNew > merit) {
                    merit = meritNew;
                    bestFeature = feature;
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc 
    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
-    Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
--- a/src/BayesNet/FCBF.cc
+++ b/src/BayesNet/FCBF.cc
@ -0,0 +1,44 @@
 #include "bayesnetUtils.h"
 #include "FCBF.h"
 namespace bayesnet {
    // Build an FCBF selector.
    // Every argument except `threshold` is forwarded unchanged to the FeatureSelect
    // base constructor; `threshold` is stored in the selector.
    // Throws std::invalid_argument when `threshold` is lower than 1e-7.
    FCBF::FCBF(const torch::Tensor& samples,
        const vector<string>& features,
        const string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights,
        const double threshold)
        : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
        , threshold(threshold)
    {
        constexpr double minimumThreshold = 1e-7;
        const bool belowMinimum = threshold < minimumThreshold;
        if (belowMinimum) {
            throw std::invalid_argument("Threshold cannot be less than 1e-7");
        }
    }
    // Run the FCBF selection and store the result in selectedFeatures / selectedScores.
    //
    // Features are visited in the order produced by argsort(suLabels). A visited feature
    // is kept when it has not been discarded and its score is not below `threshold`.
    // Every later feature whose score does not exceed its computeSuFeatures() value with
    // the kept feature is discarded by resetting its entry in suLabels to 0.0.
    // The scan stops at the first non-discarded feature below `threshold`, or as soon as
    // the number of selected features equals maxFeatures.
    //
    // Side effects: entries of suLabels are overwritten for discarded features and
    // `fitted` is set to true.
    void FCBF::fit()
    {
        initialize();
        computeSuLabels();
        const auto featureOrder = argsort(suLabels); // sort descending order
        for (auto current = featureOrder.begin(); current != featureOrder.end(); ++current) {
            const auto feature = *current;
            if (suLabels.at(feature) == 0.0) {
                // The feature has been removed from the list
                continue;
            }
            if (suLabels.at(feature) < threshold) {
                break;
            }
            // Remove redundant features: only the features placed after the current one
            // are candidates, so walk the tail directly instead of erasing from a copy
            auto candidate = current;
            for (++candidate; candidate != featureOrder.end(); ++candidate) {
                if (suLabels.at(*candidate) == 0.0) {
                    // Already discarded: comparing again could only reset it to 0.0 once more
                    continue;
                }
                const double value = computeSuFeatures(feature, *candidate);
                if (value >= suLabels.at(*candidate)) {
                    // Remove feature from list
                    suLabels[*candidate] = 0.0;
                }
            }
            selectedFeatures.push_back(feature);
            selectedScores.push_back(suLabels[feature]);
            if (selectedFeatures.size() == maxFeatures) {
                break;
            }
        }
        fitted = true;
    }
 }
--- a/src/BayesNet/FCBF.h
+++ b/src/BayesNet/FCBF.h
@ -0,0 +1,18 @@
 #ifndef FCBF_H
 #define FCBF_H
 #include <torch/torch.h>
 #include <vector>
 #include "FeatureSelect.h"
 using namespace std;
 namespace bayesnet {
    // Feature selector that plugs the FCBF strategy into the FeatureSelect interface.
    class FCBF : public FeatureSelect {
    public:
        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
        FCBF(const torch::Tensor& samples,
            const vector<string>& features,
            const string& className,
            const int maxFeatures,
            const int classNumStates,
            const torch::Tensor& weights,
            const double threshold);
        virtual ~FCBF() = default;
        // Runs the selection; see FCBF.cc for the implementation
        void fit() override;
    private:
        // Cut-off used by fit(); the constructor always replaces this placeholder value
        double threshold{ -1.0 };
    };
 }
 #endif
--- a/src/BayesNet/FeatureSelect.cc
+++ b/src/BayesNet/FeatureSelect.cc
@ -7,6 +7,11 @@ namespace bayesnet {
    {
    }
    // Discard the outcome of any previous selection so that a fit starts
    // with empty selectedFeatures and selectedScores containers.
    void FeatureSelect::initialize()
    {
        selectedScores.clear();
        selectedFeatures.clear();
    }
    double FeatureSelect::symmetricalUncertainty(int a, int b)
    {
        /*
--- a/src/BayesNet/FeatureSelect.h
+++ b/src/BayesNet/FeatureSelect.h
@ -14,6 +14,7 @@ namespace bayesnet {
        vector<int> getFeatures() const;
        vector<double> getScores() const;
    protected:
        void initialize();
        void computeSuLabels();
        double computeSuFeatures(const int a, const int b);
        double symmetricalUncertainty(int a, int b);
--- a/src/BayesNet/IWSS.cc
+++ b/src/BayesNet/IWSS.cc
@ -0,0 +1,47 @@
 #include "IWSS.h"
 #include <cmath>
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
    // Build an IWSS selector.
    // Every argument except `threshold` is forwarded unchanged to the FeatureSelect
    // base constructor; `threshold` is stored in the selector.
    // Throws std::invalid_argument when `threshold` is below 0 or above 0.5.
    IWSS::IWSS(const torch::Tensor& samples,
        const vector<string>& features,
        const string& className,
        const int maxFeatures,
        const int classNumStates,
        const torch::Tensor& weights,
        const double threshold)
        : FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
        , threshold(threshold)
    {
        constexpr double lowerBound = 0;
        constexpr double upperBound = .5;
        const bool outOfRange = threshold < lowerBound || threshold > upperBound;
        if (outOfRange) {
            throw std::invalid_argument("Threshold has to be in [0, 0.5]");
        }
    }
    // Run the IWSS selection and store the result in selectedFeatures / selectedScores.
    //
    // The first two features in argsort(suLabels) order are always selected: the first
    // with its own suLabels score, the second with the merit returned by computeMeritCFS()
    // for the pair. Each following feature is added tentatively and kept when the new
    // merit is higher than the best merit so far, or when its relative change with respect
    // to that best merit is below `threshold`. The first rejected feature ends the scan,
    // as does reaching maxFeatures selected features.
    //
    // With no features nothing is selected; with a single feature only that feature is
    // selected (popping from an empty list used to be undefined behaviour).
    // `fitted` is set to true in every case.
    void IWSS::fit()
    {
        initialize();
        computeSuLabels();
        auto candidates = argsort(suLabels); // sort descending order
        if (candidates.empty()) {
            fitted = true;
            return;
        }
        // Add first and second features to result
        //     First with its own score
        const auto first_feature = pop_first(candidates);
        selectedFeatures.push_back(first_feature);
        selectedScores.push_back(suLabels.at(first_feature));
        if (candidates.empty()) {
            // Only one feature available: there is no pair to compute a merit for
            fitted = true;
            return;
        }
        //     Second with the score of the candidates
        selectedFeatures.push_back(pop_first(candidates));
        auto merit = computeMeritCFS();
        selectedScores.push_back(merit);
        for (const auto feature : candidates) {
            selectedFeatures.push_back(feature);
            // Compute merit with selectedFeatures
            const auto meritNew = computeMeritCFS();
            // std::abs guarantees the floating point overload; a zero merit yields no relative change
            const double delta = merit != 0.0 ? std::abs(merit - meritNew) / merit : 0.0;
            if (meritNew > merit || delta < threshold) {
                if (meritNew > merit) {
                    merit = meritNew;
                }
                selectedScores.push_back(meritNew);
            } else {
                // Rejected: undo the tentative insertion and stop
                selectedFeatures.pop_back();
                break;
            }
            if (selectedFeatures.size() == maxFeatures) {
                break;
            }
        }
        fitted = true;
    }
 }
--- a/src/BayesNet/IWSS.h
+++ b/src/BayesNet/IWSS.h
@ -0,0 +1,18 @@
 #ifndef IWSS_H
 #define IWSS_H
 #include <torch/torch.h>
 #include <vector>
 #include "FeatureSelect.h"
 using namespace std;
 namespace bayesnet {
    // Feature selector that plugs the IWSS strategy into the FeatureSelect interface.
    class IWSS : public FeatureSelect {
    public:
        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
        IWSS(const torch::Tensor& samples,
            const vector<string>& features,
            const string& className,
            const int maxFeatures,
            const int classNumStates,
            const torch::Tensor& weights,
            const double threshold);
        virtual ~IWSS() = default;
        // Runs the selection; see IWSS.cc for the implementation
        void fit() override;
    private:
        // Cut-off used by fit(); the constructor always replaces this placeholder value
        double threshold{ -1.0 };
    };
 }
 #endif