From 40d1dad5d827f3729e38c7b3fe448e1b511bb880 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Wed, 11 Oct 2023 21:17:26 +0200
Subject: [PATCH] Begin CFS implementation

---
 src/BayesNet/BayesMetrics.cc |   9 +--
 src/BayesNet/BayesMetrics.h  |  14 ++--
 src/BayesNet/CFS.cc          | 127 +++++++++++++++++++++++++++++++++++
 src/BayesNet/CFS.h           |  31 +++++++++
 src/BayesNet/CMakeLists.txt  |   2 +-
 src/BayesNet/Node.h          |   2 +-
 6 files changed, 173 insertions(+), 12 deletions(-)
 create mode 100644 src/BayesNet/CFS.cc
 create mode 100644 src/BayesNet/CFS.h

diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc
index 623656e..86de9ea 100644
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@@ -60,11 +60,12 @@ namespace bayesnet {
     {
         return scoresKBest;
     }
-    vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
+    template <class T>
+    vector<pair<T, T>> Metrics::doCombinations(const vector<T>& source)
     {
-        vector<pair<string, string>> result;
+        vector<pair<T, T>> result;
         for (int i = 0; i < source.size(); ++i) {
-            string temp = source[i];
+            T temp = source[i];
             for (int j = i + 1; j < source.size(); ++j) {
                 result.push_back({ temp, source[j] });
             }
         }
@@ -76,7 +77,7 @@
         auto result = vector<double>();
         auto source = vector<string>(features);
         source.push_back(className);
-        auto combinations = doCombinations(source);
+        auto combinations = doCombinations<string>(source);
         // Compute class prior
         auto margin = torch::zeros({ classNumStates }, torch::kFloat);
         for (int value = 0; value < classNumStates; ++value) {
diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h
index 01841a7..30606c0 100644
--- a/src/BayesNet/BayesMetrics.h
+++ b/src/BayesNet/BayesMetrics.h
@@ -8,20 +8,22 @@ namespace bayesnet {
     using namespace torch;
     class Metrics {
     private:
-        Tensor samples; // nxm tensor used to fit the model
-        vector<string> features;
-        string className;
         int classNumStates = 0;
         vector<double> scoresKBest;
         vector<int> featuresKBest; // sorted indices of the features
-        double entropy(const Tensor& feature, const Tensor& weights);
         double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
-        vector<pair<string, string>> doCombinations(const vector<string>&);
+    protected:
+        Tensor samples; // (n+1) x m tensor used to fit the model where samples[-1] is the y vector
+        string className;
+        double entropy(const Tensor& feature, const Tensor& weights);
+        vector<string> features;
+        template <class T>
+        vector<pair<T, T>> doCombinations(const vector<T>& source);
     public:
         Metrics() = default;
         Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
         Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
-        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
         vector<double> getScoresKBest() const;
         double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
diff --git a/src/BayesNet/CFS.cc b/src/BayesNet/CFS.cc
new file mode 100644
index 0000000..b3473cd
--- /dev/null
+++ b/src/BayesNet/CFS.cc
@@ -0,0 +1,127 @@
+#include "CFS.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    CFS::CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
+
+    {
+    }
+    double CFS::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Compute symmetrical uncertainty. Normalizes the information gain (mutual
+        information) with the entropies of the two features in order to compensate
+        for the bias caused by high-cardinality features. Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        */
+        auto x = samples.index({ a, "..." });
+        auto y = samples.index({ b, "..." });
+        return 2.0 * mutualInformation(y, x, weights) / (entropy(x, weights) + entropy(y, weights));
+    }
+    void CFS::computeSuLabels()
+    {
+        // Compute Symmetrical Uncertainty between each feature and the labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        for (int i = 0; i < features.size(); ++i) {
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+
+    }
+    double CFS::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Symmetrical Uncertainty between a pair of features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        // TODO: Implement a cache in this function
+        return symmetricalUncertainty(firstFeature, secondFeature);
+    }
+    double CFS::computeMerit()
+    {
+        double rcf = 0;
+        for (auto feature : cfsFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        int n = cfsFeatures.size();
+        for (const auto& item : doCombinations(cfsFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    void CFS::fit()
+    {
+        cfsFeatures.clear();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        auto continueCondition = true;
+        auto feature = featureOrder[0];
+        cfsFeatures.push_back(feature);
+        cfsScores.push_back(suLabels[feature]);
+        featureOrder.erase(featureOrder.begin()); // the selected feature is no longer a candidate
+        while (continueCondition) {
+            double merit = numeric_limits<double>::lowest();
+            int bestFeature = -1;
+            for (auto feature : featureOrder) {
+                cfsFeatures.push_back(feature);
+                auto meritNew = computeMerit(); // Compute merit with cfsFeatures
+                if (meritNew > merit) {
+                    merit = meritNew;
+                    bestFeature = feature;
+                }
+                cfsFeatures.pop_back();
+            }
+            cfsFeatures.push_back(bestFeature);
+            cfsScores.push_back(merit);
+            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
+            continueCondition = computeContinueCondition(featureOrder);
+        }
+        fitted = true;
+    }
+    bool CFS::computeContinueCondition(const vector<int>& featureOrder)
+    {
+        if (cfsFeatures.size() == maxFeatures || featureOrder.size() == 0) {
+            return false;
+        }
+        if (cfsScores.size() >= 5) {
+            /*
+            "To prevent the best first search from exploring the entire
+            feature subset search space, a stopping criterion is imposed.
+            The search will terminate if five consecutive fully expanded
+            subsets show no improvement over the current best subset."
+            as stated in Mark A. Hall's thesis
+            */
+            double item_ant = numeric_limits<double>::lowest();
+            int num = 0;
+            vector<double> lastFive(cfsScores.end() - 5, cfsScores.end());
+            for (auto item : lastFive) {
+                if (item_ant == numeric_limits<double>::lowest()) {
+                    item_ant = item;
+                }
+                if (item > item_ant) {
+                    break;
+                } else {
+                    num++;
+                    item_ant = item;
+                }
+            }
+            if (num == 5) {
+                return false;
+            }
+        }
+        return true;
+    }
+    vector<int> CFS::getFeatures() const
+    {
+        if (!fitted) {
+            throw runtime_error("CFS not fitted");
+        }
+        return cfsFeatures;
+    }
+    vector<double> CFS::getScores() const
+    {
+        if (!fitted) {
+            throw runtime_error("CFS not fitted");
+        }
+        return cfsScores;
+    }
+}
\ No newline at end of file
diff --git a/src/BayesNet/CFS.h b/src/BayesNet/CFS.h
new file mode 100644
index 0000000..1cf621d
--- /dev/null
+++ b/src/BayesNet/CFS.h
@@ -0,0 +1,31 @@
+#ifndef CFS_H
+#define CFS_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    class CFS : public Metrics {
+    public:
+        // dataset is an (n+1) x m tensor of integers where dataset[-1] is the y vector
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~CFS() {};
+        void fit();
+        vector<int> getFeatures() const;
+        vector<double> getScores() const;
+    private:
+        void computeSuLabels();
+        double computeSuFeatures(const int a, const int b);
+        double symmetricalUncertainty(int a, int b);
+        double computeMerit();
+        bool computeContinueCondition(const vector<int>& featureOrder);
+        vector<pair<int, int>> combinations(const vector<int>& features);
+        const torch::Tensor& weights;
+        int maxFeatures;
+        vector<int> cfsFeatures;
+        vector<double> cfsScores;
+        vector<double> suLabels;
+        bool fitted = false;
+    };
+}
+#endif
\ No newline at end of file
diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt
index 6ca1238..e22827e 100644
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
-    Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}" OpenSSL::Crypto)
\ No newline at end of file
diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h
index 6758c5c..4979007 100644
--- a/src/BayesNet/Node.h
+++ b/src/BayesNet/Node.h
@@ -14,8 +14,8 @@ namespace bayesnet {
         int numStates; // number of states of the variable
         torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
         vector<int64_t> dimensions; // dimensions of the cpTable
-    public:
         vector<pair<string, string>> combinations(const vector<string>&);
+    public:
         explicit Node(const string&);
         void clear();
         void addParent(Node*);
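
A minimal usage sketch (outside the diff, not part of the patch) of the new CFS class, showing the intended call sequence. It assumes a discretized dataset already laid out as the (n+1) x m integer tensor described in CFS.h (one row per feature, the class vector in the last row); the tensor contents, feature names, class name and uniform weights below are illustrative only, while the CFS calls themselves come from this patch.

    // Usage sketch for bayesnet::CFS (illustrative data, hypothetical names)
    #include <iostream>
    #include <torch/torch.h>
    #include "CFS.h"

    int main()
    {
        // 3 discretized features plus the class row, 6 samples
        torch::Tensor dataset = torch::randint(0, 3, { 4, 6 }, torch::kInt32);
        std::vector<std::string> features = { "f0", "f1", "f2" };
        int classNumStates = 3;
        // Uniform sample weights; they must outlive the selector because
        // CFS keeps a const reference to them
        torch::Tensor weights = torch::full({ 6 }, 1.0 / 6.0, torch::kDouble);
        // maxFeatures == 0 lets the selector use up to samples.size(0) - 1 features
        bayesnet::CFS cfs(dataset, features, "class", 0, classNumStates, weights);
        cfs.fit();
        for (auto f : cfs.getFeatures()) {
            std::cout << "selected feature index: " << f << std::endl;
        }
        return 0;
    }

One detail worth double-checking before this lands: doCombinations<T> is declared in BayesMetrics.h but defined in BayesMetrics.cc, so the int instantiation that CFS.cc needs may require an explicit template instantiation in BayesMetrics.cc (or moving the definition into the header) to link.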