Refactor library structure

2024-03-08 22:20:54 +01:00
parent 1231f4522a
commit 635ef22520
56 changed files with 64 additions and 68 deletions
--- a/bayesnet/feature_selection/CFS.cc
+++ b/bayesnet/feature_selection/CFS.cc
@@ -0,0 +1,72 @@
+#include <limits>
+#include "bayesnet/utils/bayesnetUtils.h"
+#include "CFS.h"
+namespace bayesnet {
+    /**
+     * Greedy forward selection driven by the CFS merit.
+     *
+     * The first entry of argsort(suLabels) seeds the selection with its own
+     * suLabels score. Every following round tries each remaining candidate,
+     * keeps the one for which computeMeritCFS() is highest and stops when
+     * computeContinueCondition() returns false or no candidate has a usable merit.
+     * selectedFeatures[i] and selectedScores[i] always describe the same step.
+     */
+    void CFS::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        if (featureOrder.empty()) {
+            // No features to rank: finish with an empty selection rather than read featureOrder[0]
+            fitted = true;
+            return;
+        }
+        const auto firstFeature = featureOrder[0];
+        selectedFeatures.push_back(firstFeature);
+        selectedScores.push_back(suLabels[firstFeature]);
+        // The seed is already selected: remove it from the candidates, not from the selection
+        featureOrder.erase(featureOrder.begin());
+        auto continueCondition = true;
+        while (continueCondition) {
+            double merit = std::numeric_limits<double>::lowest();
+            int bestFeature = -1;
+            for (auto candidate : featureOrder) {
+                // Evaluate the merit of the current selection extended with this candidate
+                selectedFeatures.push_back(candidate);
+                auto meritNew = computeMeritCFS();
+                if (meritNew > merit) {
+                    merit = meritNew;
+                    bestFeature = candidate;
+                }
+                selectedFeatures.pop_back();
+            }
+            if (bestFeature == -1) {
+                // Nothing beat the sentinel: no candidates left, or every merit was nan (constant features)
+                break;
+            }
+            selectedFeatures.push_back(bestFeature);
+            selectedScores.push_back(merit);
+            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
+            continueCondition = computeContinueCondition(featureOrder);
+        }
+        fitted = true;
+    }
+    /**
+     * Decides whether the CFS search should run another round.
+     *
+     * Returns false when the selection already holds maxFeatures entries, when
+     * featureOrder has no candidates left, or when none of the last five scores
+     * is greater than the score before it. Returns true otherwise.
+     *
+     * "To prevent the best first search from exploring the entire
+     * feature subset search space, a stopping criterion is imposed.
+     * The search will terminate if five consecutive fully expanded
+     * subsets show no improvement over the current best subset."
+     * as stated in Mark A.Hall Thesis
+     */
+    bool CFS::computeContinueCondition(const std::vector<int>& featureOrder)
+    {
+        if (selectedFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+            return false;
+        }
+        if (selectedScores.size() < 5) {
+            // Not enough history yet to apply the stopping criterion
+            return true;
+        }
+        const double unset = std::numeric_limits<double>::lowest();
+        double previous = unset;
+        int nonImproving = 0;
+        // Walk the last five scores in place, counting entries until the first increase
+        for (auto score = selectedScores.end() - 5; score != selectedScores.end(); ++score) {
+            const double current = *score;
+            if (previous == unset) {
+                previous = current;
+            }
+            if (current > previous) {
+                break;
+            }
+            ++nonImproving;
+            previous = current;
+        }
+        return nonImproving != 5;
+    }
+}
--- a/bayesnet/feature_selection/CFS.h
+++ b/bayesnet/feature_selection/CFS.h
@@ -0,0 +1,20 @@
+#ifndef CFS_H
+#define CFS_H
+#include <torch/torch.h>
+#include <vector>
+#include "bayesnet/feature_selection/FeatureSelect.h"
+namespace bayesnet {
+    class CFS : public FeatureSelect { // feature selector whose fit() greedily maximizes computeMeritCFS()
+    public:
+        // samples is an (n+1) x m tensor of integers whose last row, samples[-1], is the class vector y
+        CFS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+            FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
+        {
+        }
+        virtual ~CFS() {};
+        void fit() override; // defined in CFS.cc; fills the selection read through getFeatures()/getScores()
+    private:
+        bool computeContinueCondition(const std::vector<int>& featureOrder); // false once the search has to stop
+    };
+}
+#endif
--- a/bayesnet/feature_selection/FCBF.cc
+++ b/bayesnet/feature_selection/FCBF.cc
@@ -0,0 +1,44 @@
+#include "bayesnet/utils/bayesnetUtils.h"
+#include "FCBF.h"
+namespace bayesnet {
+
+    // Builds an FCBF selector. Every argument except threshold is forwarded to FeatureSelect.
+    // Throws std::invalid_argument when threshold is below 1e-7.
+    FCBF::FCBF(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        constexpr double minimumThreshold = 1e-7;
+        if (threshold < minimumThreshold) {
+            throw std::invalid_argument("Threshold cannot be less than 1e-7");
+        }
+    }
+    /**
+     * Walks the features in the order given by argsort(suLabels). A feature whose
+     * score is 0.0 is skipped, and the walk ends at the first score below threshold
+     * or once maxFeatures features are selected. Each accepted feature zeroes the
+     * suLabels entry of every later ranked feature whose SU with it is at least
+     * that feature's own score.
+     */
+    void FCBF::fit()
+    {
+        initialize();
+        computeSuLabels();
+        const auto ranking = argsort(suLabels); // sort descending order
+        for (std::size_t position = 0; position < ranking.size(); ++position) {
+            const auto& feature = ranking[position];
+            const auto relevance = suLabels.at(feature);
+            if (relevance == 0.0) {
+                // A zero score marks a feature that has been removed from the list
+                continue;
+            }
+            if (relevance < threshold) {
+                break;
+            }
+            // Only features ranked after this one are compared, so it is never compared with itself
+            for (std::size_t other = position + 1; other < ranking.size(); ++other) {
+                const auto& candidate = ranking[other];
+                const double redundancy = computeSuFeatures(feature, candidate);
+                if (redundancy >= suLabels.at(candidate)) {
+                    // Redundant with the accepted feature: mark it as removed
+                    suLabels[candidate] = 0.0;
+                }
+            }
+            selectedFeatures.push_back(feature);
+            selectedScores.push_back(suLabels[feature]);
+            if (selectedFeatures.size() == maxFeatures) {
+                break;
+            }
+        }
+        fitted = true;
+    }
+}
--- a/bayesnet/feature_selection/FCBF.h
+++ b/bayesnet/feature_selection/FCBF.h
@@ -0,0 +1,17 @@
+#ifndef FCBF_H
+#define FCBF_H
+#include <torch/torch.h>
+#include <vector>
+#include "bayesnet/feature_selection/FeatureSelect.h"
+namespace bayesnet {
+    class FCBF : public FeatureSelect { // feature selector that drops features redundant with better ranked ones
+    public:
+        // samples is an (n+1) x m tensor of integers whose last row, samples[-1], is the class vector y
+        FCBF(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~FCBF() {};
+        void fit() override; // defined in FCBF.cc
+    private:
+        double threshold = -1; // minimum suLabels score to keep selecting; the constructor rejects values below 1e-7
+    };
+}
+#endif
--- a/bayesnet/feature_selection/FeatureSelect.cc
+++ b/bayesnet/feature_selection/FeatureSelect.cc
@@ -0,0 +1,78 @@
+#include <limits>
+#include "bayesnet/utils/bayesnetUtils.h"
+#include "FeatureSelect.h"
+namespace bayesnet {
+    // Forwards the dataset description to Metrics and stores the weights reference and the
+    // feature limit. A maxFeatures of 0 is replaced by samples.size(0) - 1.
+    FeatureSelect::FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates),
+        weights(weights),
+        maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures)
+    {
+    }
+    /**
+     * Resets everything a previous fit() produced so that fit() can be called again.
+     *
+     * suLabels has to be emptied because computeSuLabels() appends to it; without
+     * this a second fit() would rank stale entries (including values zeroed by
+     * FCBF::fit) next to the new ones. The suFeatures cache is dropped as well,
+     * because weights is held by reference and may have changed between fits.
+     */
+    void FeatureSelect::initialize()
+    {
+        selectedFeatures.clear();
+        selectedScores.clear();
+        suLabels.clear();
+        suFeatures.clear();
+    }
+    /**
+     * Symmetrical uncertainty between rows a and b of samples:
+     * 2 * mutualInformation / (entropy(a) + entropy(b)), all computed with weights.
+     *
+     * Normalizing the information gain by the entropies compensates the bias due
+     * to high cardinality features; the normalized range is [0, 1]
+     * (https://www.sciencedirect.com/science/article/pii/S0020025519303603).
+     * The division is not guarded, so a zero entropy sum yields a non-finite value.
+     */
+    double FeatureSelect::symmetricalUncertainty(int a, int b)
+    {
+        auto rowA = samples.index({ a, "..." });
+        auto rowB = samples.index({ b, "..." });
+        auto information = mutualInformation(rowA, rowB, weights);
+        auto entropyA = entropy(rowA, weights);
+        auto entropyB = entropy(rowB, weights);
+        auto entropySum = entropyA + entropyB;
+        return 2.0 * information / entropySum;
+    }
+    // Appends to suLabels the symmetrical uncertainty between every feature row and
+    // the label row (index -1 of samples), in feature order.
+    // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+    void FeatureSelect::computeSuLabels()
+    {
+        const int numFeatures = static_cast<int>(features.size());
+        for (int feature = 0; feature < numFeatures; ++feature) {
+            const double su = symmetricalUncertainty(feature, -1);
+            suLabels.push_back(su);
+        }
+    }
+    /**
+     * Symmetrical uncertainty between two features, memoized in suFeatures.
+     * https://en.wikipedia.org/wiki/Symmetric_uncertainty
+     *
+     * The cache key is the ordered pair (firstFeature, secondFeature), so (a, b)
+     * and (b, a) are stored separately. A miss is detected with find() rather
+     * than by catching std::out_of_range, because misses happen on every first
+     * lookup and throwing is expensive.
+     *
+     * @param firstFeature  row index of the first feature in samples
+     * @param secondFeature row index of the second feature in samples
+     * @return the cached or freshly computed symmetrical uncertainty
+     */
+    double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        const std::pair<int, int> key{ firstFeature, secondFeature };
+        const auto cached = suFeatures.find(key);
+        if (cached != suFeatures.end()) {
+            return cached->second;
+        }
+        const double result = symmetricalUncertainty(firstFeature, secondFeature);
+        suFeatures.emplace(key, result);
+        return result;
+    }
+    /**
+     * Merit of the current selectedFeatures:
+     *   sum of suLabels over the selection
+     *   / sqrt(n + (n * n - n) * sum of computeSuFeatures over doCombinations(selection))
+     * where n is the number of selected features.
+     */
+    double FeatureSelect::computeMeritCFS()
+    {
+        double classCorrelation = 0;
+        for (auto selected : selectedFeatures) {
+            classCorrelation += suLabels[selected];
+        }
+        double interCorrelation = 0;
+        int n = selectedFeatures.size();
+        for (const auto& combination : doCombinations(selectedFeatures)) {
+            interCorrelation += computeSuFeatures(combination.first, combination.second);
+        }
+        const double denominator = sqrt(n + (n * n - n) * interCorrelation);
+        return classCorrelation / denominator;
+    }
+    // Returns a copy of the selected feature indices.
+    // Throws std::runtime_error when fit() has not completed yet.
+    std::vector<int> FeatureSelect::getFeatures() const
+    {
+        if (fitted) {
+            return selectedFeatures;
+        }
+        throw std::runtime_error("FeatureSelect not fitted");
+    }
+    // Returns a copy of the scores recorded for the selected features.
+    // Throws std::runtime_error when fit() has not completed yet.
+    std::vector<double> FeatureSelect::getScores() const
+    {
+        if (fitted) {
+            return selectedScores;
+        }
+        throw std::runtime_error("FeatureSelect not fitted");
+    }
+}
--- a/bayesnet/feature_selection/FeatureSelect.h
+++ b/bayesnet/feature_selection/FeatureSelect.h
@@ -0,0 +1,30 @@
+#ifndef FEATURE_SELECT_H
+#define FEATURE_SELECT_H
+#include <torch/torch.h>
+#include <vector>
+#include "bayesnet/utils/BayesMetrics.h"
+namespace bayesnet {
+    class FeatureSelect : public Metrics { // abstract base for the selectors: shared SU helpers plus the fit()/getters contract
+    public:
+        // samples is an (n+1) x m tensor of integers whose last row, samples[-1], is the class vector y
+        FeatureSelect(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~FeatureSelect() {};
+        virtual void fit() = 0; // implemented by each selector
+        std::vector<int> getFeatures() const; // throws std::runtime_error until fitted is true
+        std::vector<double> getScores() const; // throws std::runtime_error until fitted is true
+    protected:
+        void initialize(); // clears selectedFeatures and selectedScores
+        void computeSuLabels(); // appends one SU(feature, label) value per feature to suLabels
+        double computeSuFeatures(const int a, const int b); // SU between two features, cached in suFeatures
+        double symmetricalUncertainty(int a, int b); // SU between rows a and b of samples
+        double computeMeritCFS(); // merit of the current selectedFeatures
+        const torch::Tensor& weights; // non-owning reference: the tensor given to the constructor must outlive this object
+        int maxFeatures; // a constructor argument of 0 is stored as samples.size(0) - 1
+        std::vector<int> selectedFeatures; // result of fit(), returned by getFeatures()
+        std::vector<double> selectedScores; // result of fit(), returned by getScores()
+        std::vector<double> suLabels; // indexed by feature; filled by computeSuLabels()
+        std::map<std::pair<int, int>, double> suFeatures; // keyed by the ordered pair (first, second) of feature indices
+        bool fitted = false; // set to true at the end of each fit() implementation
+    };
+}
+#endif
--- a/bayesnet/feature_selection/IWSS.cc
+++ b/bayesnet/feature_selection/IWSS.cc
@@ -0,0 +1,47 @@
+#include <limits>
+#include "bayesnet/utils/bayesnetUtils.h"
+#include "IWSS.h"
+namespace bayesnet {
+    // Builds an IWSS selector. Every argument except threshold is forwarded to FeatureSelect.
+    // Throws std::invalid_argument when threshold is below 0 or above 0.5.
+    IWSS::IWSS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        const bool belowRange = threshold < 0;
+        const bool aboveRange = threshold > .5;
+        if (belowRange || aboveRange) {
+            throw std::invalid_argument("Threshold has to be in [0, 0.5]");
+        }
+    }
+    /**
+     * Incremental selection over the ranking returned by argsort(suLabels).
+     *
+     * The two first ranked features are always taken: the first is scored with its
+     * suLabels value, the second with the merit of the pair. After that a feature
+     * is kept when it raises the merit or when the relative merit change stays
+     * below threshold; the first rejected feature ends the search, as does
+     * reaching maxFeatures.
+     */
+    void IWSS::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto candidates = argsort(suLabels); // sort descending order
+        // First feature enters with its own score
+        const auto first_feature = pop_first(candidates);
+        selectedFeatures.push_back(first_feature);
+        selectedScores.push_back(suLabels.at(first_feature));
+        // Second feature enters with the merit of the two selected features
+        selectedFeatures.push_back(pop_first(candidates));
+        auto merit = computeMeritCFS();
+        selectedScores.push_back(merit);
+        for (const auto candidate : candidates) {
+            selectedFeatures.push_back(candidate);
+            // Merit of the selection including the candidate
+            const auto candidateMerit = computeMeritCFS();
+            const double relativeChange = merit != 0.0 ? std::abs(merit - candidateMerit) / merit : 0.0;
+            const bool improves = candidateMerit > merit;
+            if (!improves && !(relativeChange < threshold)) {
+                // Neither better nor close enough: discard it and stop searching
+                selectedFeatures.pop_back();
+                break;
+            }
+            if (improves) {
+                merit = candidateMerit;
+            }
+            selectedScores.push_back(candidateMerit);
+            if (selectedFeatures.size() == maxFeatures) {
+                break;
+            }
+        }
+        fitted = true;
+    }
+}
--- a/bayesnet/feature_selection/IWSS.h
+++ b/bayesnet/feature_selection/IWSS.h
@@ -0,0 +1,17 @@
+#ifndef IWSS_H
+#define IWSS_H
+#include <vector>
+#include <torch/torch.h>
+#include "FeatureSelect.h"
+namespace bayesnet {
+    class IWSS : public FeatureSelect { // feature selector that adds ranked features while the merit holds up
+    public:
+        // samples is an (n+1) x m tensor of integers whose last row, samples[-1], is the class vector y
+        IWSS(const torch::Tensor& samples, const std::vector<std::string>& features, const std::string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~IWSS() {};
+        void fit() override; // defined in IWSS.cc
+    private:
+        double threshold = -1; // allowed relative merit change; the constructor requires a value in [0, 0.5]
+    };
+}
+#endif