Refactor CFS class by extracting an abstract FeatureSelect base class

2023-10-14 11:27:46 +02:00 · 2023-10-14 11:27:46 +02:00 · 6d5a25cdc8
commit 6d5a25cdc8
parent d00b08cbe8
5 changed files with 127 additions and 100 deletions
--- a/src/BayesNet/CFS.cc
+++ b/src/BayesNet/CFS.cc
@ -2,90 +2,38 @@
 #include <limits>
 #include "bayesnetUtils.h"
 namespace bayesnet {
-    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
-        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)

-    {
-    }
-    double CFS::symmetricalUncertainty(int a, int b)
-    {
-        /*
-        Compute symmetrical uncertainty. Normalize* information gain (mutual
-        information) with the entropies of the features in order to compensate
-        the bias due to high cardinality features. *Range [0, 1]
-        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
-        */
-        auto x = samples.index({ a, "..." });
-        auto y = samples.index({ b, "..." });
-        auto mu = mutualInformation(x, y, weights);
-        auto hx = entropy(x, weights);
-        auto hy = entropy(y, weights);
-        return 2.0 * mu / (hx + hy);
-    }
-    void CFS::computeSuLabels()
-    {
-        // Compute Simmetrical Uncertainty between features and labels
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        for (int i = 0; i < features.size(); ++i) {
-            suLabels.push_back(symmetricalUncertainty(i, -1));
-        }

-    }
-    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
-    {
-        // Compute Simmetrical Uncertainty between features
-        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
-        try {
-            return suFeatures.at({ firstFeature, secondFeature });
-        }
-        catch (const out_of_range& e) {
-            auto result = symmetricalUncertainty(firstFeature, secondFeature);
-            suFeatures[{firstFeature, secondFeature}] = result;
-            return result;
-        }
-    }
-    double CFS::computeMerit()
-    {
-        double result;
-        double rcf = 0;
-        for (auto feature : cfsFeatures) {
-            rcf += suLabels[feature];
-        }
-        double rff = 0;
-        int n = cfsFeatures.size();
-        for (const auto& item : doCombinations(cfsFeatures)) {
-            rff += computeSuFeatures(item.first, item.second);
-        }
-        return rcf / sqrt(n + (n * n - n) * rff);
-    }
+
+
    void CFS::fit()
    {
-        cfsFeatures.clear();
+        selectedFeatures.clear();
        computeSuLabels();
        auto featureOrder = argsort(suLabels); // sort descending order
        auto continueCondition = true;
        auto feature = featureOrder[0];
-        cfsFeatures.push_back(feature);
-        cfsScores.push_back(suLabels[feature]);
-        cfsFeatures.erase(cfsFeatures.begin());
+        selectedFeatures.push_back(feature);
+        selectedScores.push_back(suLabels[feature]);
+        selectedFeatures.erase(selectedFeatures.begin());
        while (continueCondition) {
            double merit = numeric_limits<double>::lowest();
            int bestFeature = -1;
            for (auto feature : featureOrder) {
-                cfsFeatures.push_back(feature);
-                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
+                selectedFeatures.push_back(feature);
+                auto meritNew = computeMeritCFS(); // Compute merit with selectedFeatures
                if (meritNew > merit) {
                    merit = meritNew;
                    bestFeature = feature;
                }
-                cfsFeatures.pop_back();
+                selectedFeatures.pop_back();
            }
            if (bestFeature == -1) {
                // meritNew has to be nan due to constant features
                break;
            }
-            cfsFeatures.push_back(bestFeature);
-            cfsScores.push_back(merit);
+            selectedFeatures.push_back(bestFeature);
+            selectedScores.push_back(merit);
            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
            continueCondition = computeContinueCondition(featureOrder);
        }
@ -93,10 +41,10 @@ namespace bayesnet {
    }
    bool CFS::computeContinueCondition(const vector<int>& featureOrder)
    {
-        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+        if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
            return false;
        }
-        if (cfsScores.size() >= 5) {
+        if (selectedScores.size() >= 5) {
            /*
            "To prevent the best first search from exploring the entire
            feature subset search space, a stopping criterion is imposed.
@ -106,7 +54,7 @@ namespace bayesnet {
            */
            double item_ant = numeric_limits<double>::lowest();
            int num = 0;
-            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
+            vector<double> lastFive(selectedScores.end() - 5, selectedScores.end());
            for (auto item : lastFive) {
                if (item_ant == numeric_limits<double>::lowest()) {
                    item_ant = item;
@ -124,18 +72,4 @@ namespace bayesnet {
        }
        return true;
    }
-    vector<int> CFS::getFeatures() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsFeatures;
-    }
-    vector<double> CFS::getScores() const
-    {
-        if (!fitted) {
-            throw runtime_error("CFS not fitted");
-        }
-        return cfsScores;
-    }
 }
--- a/src/BayesNet/CFS.h
+++ b/src/BayesNet/CFS.h
@ -2,32 +2,20 @@
 #define CFS_H
 #include <torch/torch.h>
 #include <vector>
-#include "BayesMetrics.h"
+#include "FeatureSelect.h"
 using namespace std;
 namespace bayesnet {
-    class CFS : public Metrics {
+    class CFS : public FeatureSelect {
    public:
        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
-        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+            FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
+        {
+        }
        virtual ~CFS() {};
-        void fit();
-        void test();
-        vector<int> getFeatures() const;
-        vector<double> getScores() const;
+        void fit() override;
    private:
-        void computeSuLabels();
-        double computeSuFeatures(const int a, const int b);
-        double symmetricalUncertainty(int a, int b);
-        double computeMerit();
        bool computeContinueCondition(const vector<int>& featureOrder);
-        vector<pair<int, int>> combinations(const vector<int>& features);
-        const torch::Tensor& weights;
-        int maxFeatures;
-        vector<int> cfsFeatures;
-        vector<double> cfsScores;
-        vector<double> suLabels;
-        map<pair<int, int>, double> suFeatures;
-        bool fitted = false;
    };
 }
 #endif
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc 
    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
-    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
--- a/src/BayesNet/FeatureSelect.cc
+++ b/src/BayesNet/FeatureSelect.cc
@ -0,0 +1,74 @@
+#include "FeatureSelect.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    /*
+    Set up the state shared by the feature selection algorithms.
+    samples, features, className and classNumStates are forwarded to Metrics.
+    maxFeatures == 0 means that no explicit limit was requested; the limit
+    then becomes samples.size(0) - 1.
+    weights is kept as a reference (it is not copied), so the tensor passed
+    in has to outlive this object.
+    */
+    FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates),
+        // members are listed in the order in which the class declares them
+        weights(weights),
+        maxFeatures(maxFeatures == 0 ? static_cast<int>(samples.size(0) - 1) : maxFeatures)
+    {
+    }
+    double FeatureSelect::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Symmetrical uncertainty between rows a and b of samples:
+            SU = 2 * MI(a, b) / (H(a) + H(b))
+        i.e. the mutual information (information gain) normalized by the
+        entropies of both variables, which compensates the bias towards
+        high cardinality features. The normalized value lies in [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        The denominator is not checked here: when both entropies are zero
+        the division does not produce a finite value.
+        */
+        auto rowA = samples.index({ a, "..." });
+        auto rowB = samples.index({ b, "..." });
+        auto information = mutualInformation(rowA, rowB, weights);
+        auto entropyA = entropy(rowA, weights);
+        auto entropyB = entropy(rowB, weights);
+        auto entropySum = entropyA + entropyB;
+        return 2.0 * information / entropySum;
+    }
+    void FeatureSelect::computeSuLabels()
+    {
+        // Compute Symmetrical Uncertainty between each feature and the labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        // Always start from an empty vector: without this a second call would
+        // append another set of scores behind the first one, and positions in
+        // suLabels would no longer correspond to feature indices.
+        suLabels.clear();
+        const int numFeatures = static_cast<int>(features.size());
+        suLabels.reserve(numFeatures);
+        for (int i = 0; i < numFeatures; ++i) {
+            // row -1 is the last row of samples, which holds the labels
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+    }
+    double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Symmetrical Uncertainty between two features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        // Results are cached in suFeatures under the pair exactly as given,
+        // so (a, b) and (b, a) are stored as separate entries.
+        const pair<int, int> key{ firstFeature, secondFeature };
+        const auto cached = suFeatures.find(key);
+        if (cached != suFeatures.end()) {
+            return cached->second;
+        }
+        // Cache miss: compute once and remember the value. A plain lookup is
+        // used instead of at() + catch so that a miss, which happens for every
+        // new pair, does not pay for throwing an exception.
+        const double result = symmetricalUncertainty(firstFeature, secondFeature);
+        suFeatures.emplace(key, result);
+        return result;
+    }
+    double FeatureSelect::computeMeritCFS()
+    {
+        /*
+        Merit of the subset currently stored in selectedFeatures:
+            rcf / sqrt(n + (n * n - n) * rff)
+        n   = number of selected features
+        rcf = sum of suLabels over the selected features
+        rff = sum of computeSuFeatures over the pairs returned by
+              doCombinations(selectedFeatures)
+        suLabels has to be filled (computeSuLabels) before calling this.
+        NOTE(review): Hall's CFS merit is normally written with the *mean*
+        feature-feature correlation, while rff here is a sum - confirm that
+        this is the intended definition.
+        */
+        double rcf = 0;
+        for (const auto feature : selectedFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        for (const auto& item : doCombinations(selectedFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        const int n = selectedFeatures.size();
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    vector<int> FeatureSelect::getFeatures() const
+    {
+        // Returns a copy of selectedFeatures; throws runtime_error while the
+        // fitted flag is still false.
+        if (fitted) {
+            return selectedFeatures;
+        }
+        throw runtime_error("FeatureSelect not fitted");
+    }
+    vector<double> FeatureSelect::getScores() const
+    {
+        // Returns a copy of selectedScores; throws runtime_error while the
+        // fitted flag is still false.
+        if (fitted) {
+            return selectedScores;
+        }
+        throw runtime_error("FeatureSelect not fitted");
+    }
+}
--- a/src/BayesNet/FeatureSelect.h
+++ b/src/BayesNet/FeatureSelect.h
@ -0,0 +1,31 @@
+#ifndef FEATURE_SELECT_H
+#define FEATURE_SELECT_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    /*
+    Abstract base class for feature selection algorithms built on Metrics.
+    It holds the state and the symmetrical uncertainty helpers shared by the
+    algorithms; each concrete algorithm provides fit().
+    */
+    class FeatureSelect : public Metrics {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        // maxFeatures == 0 means no explicit limit (samples.size(0) - 1 is used)
+        FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~FeatureSelect() = default;
+        // Runs the selection; implemented by every concrete algorithm
+        virtual void fit() = 0;
+        // Copy of the selected feature indices; throws runtime_error while fitted is false
+        vector<int> getFeatures() const;
+        // Copy of the recorded scores; throws runtime_error while fitted is false
+        vector<double> getScores() const;
+    protected:
+        // Fills suLabels with the symmetrical uncertainty of each feature against the labels
+        void computeSuLabels();
+        // Symmetrical uncertainty between features a and b, cached in suFeatures
+        double computeSuFeatures(const int a, const int b);
+        // Symmetrical uncertainty between rows a and b of samples
+        double symmetricalUncertainty(int a, int b);
+        // CFS merit of the subset currently held in selectedFeatures
+        double computeMeritCFS();
+        // NOTE(review): no definition of this method is visible in FeatureSelect.cc
+        // (computeMeritCFS calls doCombinations instead) - confirm it is still needed
+        vector<pair<int, int>> combinations(const vector<int>& features);
+        // Reference to the tensor passed to the constructor: it is not copied,
+        // so that tensor has to outlive this object
+        const torch::Tensor& weights;
+        // Upper bound on the number of selected features
+        int maxFeatures;
+        // Indices of the selected features
+        vector<int> selectedFeatures;
+        // Scores recorded while selecting features
+        vector<double> selectedScores;
+        // Symmetrical uncertainty of each feature against the labels, indexed by feature
+        vector<double> suLabels;
+        // Cache of feature-feature symmetrical uncertainty, keyed by the pair of indices
+        map<pair<int, int>, double> suFeatures;
+        // getFeatures() and getScores() throw while this is false
+        bool fitted = false;
+    };
+}
+#endif