Merge pull request 'Create Boost_CFS' (#11) from Boost_CFS into main

Add the `select_features` hyperparameter to BoostAODE. When it is set to CFS, FCBF or IWSS, the chosen algorithm selects features before the boosting loop starts building models, and a SPODE is built for each selected feature.
2023-10-15 09:22:14 +00:00 · 2023-10-15 09:22:14 +00:00 · f72aa5b9a6
commit f72aa5b9a6
parent 7d8aca4f59 fa7fe081ad
30 changed files with 591 additions and 140 deletions
--- a/.vscode/c_cpp_properties.json
+++ b/.vscode/c_cpp_properties.json
@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Mac",
+            "includePath": [
+                "${workspaceFolder}/**"
+            ],
+            "defines": [],
+            "macFrameworkPath": [
+                "/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks"
+            ],
+            "cStandard": "c17",
+            "cppStandard": "c++17",
+            "compileCommands": "${workspaceFolder}/cmake-build-release/compile_commands.json"
+        }
+    ],
+    "version": 4
+}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -45,7 +45,6 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
 # CMakes modules
 # --------------
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH})
-
 include(AddGitSubmodule)
 if (CODE_COVERAGE)
    enable_testing()
@ -65,7 +64,11 @@ endif (ENABLE_CLANG_TIDY)
 add_git_submodule("lib/mdlp")
 add_git_submodule("lib/argparse")
 add_git_submodule("lib/json")
-find_library(XLSXWRITER_LIB libxlsxwriter.dylib PATHS /usr/local/lib)
+
+
+find_library(XLSXWRITER_LIB NAMES libxlsxwriter.dylib libxlsxwriter.so PATHS ${BayesNet_SOURCE_DIR}/lib/libxlsxwriter/lib)
+message("XLSXWRITER_LIB=${XLSXWRITER_LIB}")
+

 # Subdirectories
 # --------------
--- a/README.md
+++ b/README.md
@ -27,11 +27,9 @@ export BOOST_ROOT=/path/to/library/
 ```bash
 cd lib/libxlsxwriter
 make
-sudo make install
+make install DESTDIR=/home/rmontanana/Code PREFIX=
 ```

-It has to be installed in /usr/local/lib otherwise CMakeLists.txt has to be modified accordingly
-
 Environment variable has to be set:

 ```bash
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@ -60,17 +60,7 @@ namespace bayesnet {
    {
        return scoresKBest;
    }
-    vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
-    {
-        vector<pair<string, string>> result;
-        for (int i = 0; i < source.size(); ++i) {
-            string temp = source[i];
-            for (int j = i + 1; j < source.size(); ++j) {
-                result.push_back({ temp, source[j] });
-            }
-        }
-        return result;
-    }
+
    torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights)
    {
        auto result = vector<double>();
--- a/src/BayesNet/BayesMetrics.h
+++ b/src/BayesNet/BayesMetrics.h
@ -8,20 +8,39 @@ namespace bayesnet {
    using namespace torch;
    class Metrics {
    private:
-        Tensor samples; // nxm tensor used to fit the model
-        vector<string> features;
-        string className;
        int classNumStates = 0;
        vector<double> scoresKBest;
        vector<int> featuresKBest; // sorted indices of the features
-        double entropy(const Tensor& feature, const Tensor& weights);
        double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
-        vector<pair<string, string>> doCombinations(const vector<string>&);
+    protected:
+        Tensor samples; // n+1xm tensor used to fit the model where samples[-1] is the y vector
+        string className;
+        double entropy(const Tensor& feature, const Tensor& weights);
+        vector<string> features;
+        // Builds every pair (source[i], source[j]) with i < j, keeping the order
+        // in which the elements appear in source.
+        template <class T>
+        vector<pair<T, T>> doCombinations(const vector<T>& source)
+        {
+            vector<pair<T, T>> result;
+            const size_t total = source.size();
+            if (total > 1) {
+                // The number of pairs is known up front: avoid repeated reallocations
+                result.reserve(total * (total - 1) / 2);
+            }
+            for (size_t i = 0; i < total; ++i) {
+                for (size_t j = i + 1; j < total; ++j) {
+                    result.emplace_back(source[i], source[j]);
+                }
+            }
+            return result;
+        }
+        // Removes the first element of v and returns a copy of it.
+        // v must not be empty: no size check is performed here.
+        template <class T>
+        T pop_first(vector<T>& v)
+        {
+            const auto head = v.begin();
+            T first = *head;
+            v.erase(head);
+            return first;
+        }
    public:
        Metrics() = default;
        Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
        Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
-        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending=false, unsigned k = 0);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, bool ascending = false, unsigned k = 0);
        vector<double> getScoresKBest() const;
        double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
        vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
--- a/src/BayesNet/BoostAODE.cc
+++ b/src/BayesNet/BoostAODE.cc
@ -1,36 +1,22 @@
-#include "BoostAODE.h"
 #include <set>
-#include "BayesMetrics.h"
+#include <functional>
+#include <limits.h>
+#include "BoostAODE.h"
 #include "Colors.h"
 #include "Folding.h"
-#include <limits.h>
+#include "Paths.h"
+#include "CFS.h"
+#include "FCBF.h"
+#include "IWSS.h"

 namespace bayesnet {
    BoostAODE::BoostAODE() : Ensemble() {}
    void BoostAODE::buildModel(const torch::Tensor& weights)
    {
        // Models shall be built in trainModel
-    }
-    void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
-    {
-        // Check if hyperparameters are valid
-        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence" };
-        checkHyperparameters(validKeys, hyperparameters);
-        if (hyperparameters.contains("repeatSparent")) {
-            repeatSparent = hyperparameters["repeatSparent"];
-        }
-        if (hyperparameters.contains("maxModels")) {
-            maxModels = hyperparameters["maxModels"];
-        }
-        if (hyperparameters.contains("ascending")) {
-            ascending = hyperparameters["ascending"];
-        }
-        if (hyperparameters.contains("convergence")) {
-            convergence = hyperparameters["convergence"];
-        }
-    }
-    void BoostAODE::validationInit()
-    {
+        models.clear();
+        n_models = 0;
+        // Prepare the validation dataset
        auto y_ = dataset.index({ -1, "..." });
        if (convergence) {
            // Prepare train & validation sets from train data
@ -56,18 +42,79 @@ namespace bayesnet {
            X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
            y_train = y_;
        }
-
+    }
+    // Validates the received hyperparameters and stores every one that is present.
+    // Throws invalid_argument when select_features is not one of IWSS, FCBF or CFS;
+    // that check runs before this function assigns any hyperparameter member.
+    void BoostAODE::setHyperparameters(nlohmann::json& hyperparameters)
+    {
+        // Check if hyperparameters are valid
+        const vector<string> validKeys = { "repeatSparent", "maxModels", "ascending", "convergence", "threshold", "select_features" };
+        checkHyperparameters(validKeys, hyperparameters);
+        // Validate select_features first so that a rejected value leaves the object untouched
+        const bool hasSelectFeatures = hyperparameters.contains("select_features");
+        string selectedAlgorithm;
+        if (hasSelectFeatures) {
+            selectedAlgorithm = hyperparameters["select_features"].get<string>();
+            const vector<string> algos = { "IWSS", "FCBF", "CFS" };
+            if (find(algos.begin(), algos.end(), selectedAlgorithm) == algos.end()) {
+                throw invalid_argument("Invalid selectFeatures value [IWSS, FCBF, CFS]");
+            }
+        }
+        if (hyperparameters.contains("repeatSparent")) {
+            repeatSparent = hyperparameters["repeatSparent"];
+        }
+        if (hyperparameters.contains("maxModels")) {
+            maxModels = hyperparameters["maxModels"];
+        }
+        if (hyperparameters.contains("ascending")) {
+            ascending = hyperparameters["ascending"];
+        }
+        if (hyperparameters.contains("convergence")) {
+            convergence = hyperparameters["convergence"];
+        }
+        if (hyperparameters.contains("threshold")) {
+            threshold = hyperparameters["threshold"];
+        }
+        if (hasSelectFeatures) {
+            selectFeatures = true;
+            algorithm = selectedAlgorithm;
+        }
+    }
+    // Runs the feature selection algorithm named by `algorithm` on the dataset and
+    // adds one fitted SPODE (significance 1.0) per selected feature to the ensemble.
+    // Returns the indices of the features used to build those models.
+    // Throws invalid_argument on an unknown algorithm or an out-of-range threshold.
+    unordered_set<int> BoostAODE::initializeModels()
+    {
+        unordered_set<int> featuresUsed;
+        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
+        const int maxFeatures = 0;
+        const int classNumStates = static_cast<int>(states.at(className).size());
+        // Owned locally: the selector is released even if fit() throws, and the
+        // featureSelector member is never left pointing to a deleted object.
+        unique_ptr<FeatureSelect> selector;
+        if (algorithm == "CFS") {
+            selector = std::make_unique<CFS>(dataset, features, className, maxFeatures, classNumStates, weights_);
+        } else if (algorithm == "IWSS") {
+            if (threshold < 0 || threshold >0.5) {
+                throw invalid_argument("Invalid threshold value for IWSS [0, 0.5]");
+            }
+            selector = std::make_unique<IWSS>(dataset, features, className, maxFeatures, classNumStates, weights_, threshold);
+        } else if (algorithm == "FCBF") {
+            if (threshold < 1e-7 || threshold > 1) {
+                throw invalid_argument("Invalid threshold value [1e-7, 1]");
+            }
+            selector = std::make_unique<FCBF>(dataset, features, className, maxFeatures, classNumStates, weights_, threshold);
+        } else {
+            // Without this branch an unrecognised name would dereference a null selector
+            throw invalid_argument("Unknown feature selection algorithm: " + algorithm);
+        }
+        selector->fit();
+        const auto selected = selector->getFeatures();
+        for (const int feature : selected) {
+            featuresUsed.insert(feature);
+            unique_ptr<Classifier> model = std::make_unique<SPODE>(feature);
+            model->fit(dataset, features, className, states, weights_);
+            models.push_back(std::move(model));
+            significanceModels.push_back(1.0);
+            n_models++;
+        }
+        return featuresUsed;
    }
    void BoostAODE::trainModel(const torch::Tensor& weights)
    {
-        models.clear();
-        n_models = 0;
+        unordered_set<int> featuresUsed;
+        if (selectFeatures) {
+            featuresUsed = initializeModels();
+        }
        if (maxModels == 0)
            maxModels = .1 * n > 10 ? .1 * n : n;
-        validationInit();
        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64);
        bool exitCondition = false;
-        unordered_set<int> featuresUsed;
        // Variables to control the accuracy finish condition
        double priorAccuracy = 0.0;
        double delta = 1.0;
@ -86,16 +133,16 @@ namespace bayesnet {
            unique_ptr<Classifier> model;
            auto feature = featureSelection[0];
            if (!repeatSparent || featuresUsed.size() < featureSelection.size()) {
-                bool found = false;
-                for (auto feat : featureSelection) {
+                bool used = true;
+                for (const auto& feat : featureSelection) {
                    if (find(featuresUsed.begin(), featuresUsed.end(), feat) != featuresUsed.end()) {
                        continue;
                    }
-                    found = true;
+                    used = false;
                    feature = feat;
                    break;
                }
-                if (!found) {
+                if (used) {
                    exitCondition = true;
                    continue;
                }
@ -135,7 +182,7 @@ namespace bayesnet {
                    count++;
                }
            }
-            exitCondition = n_models == maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
+            exitCondition = n_models >= maxModels && repeatSparent || epsilon_t > 0.5 || count > tolerance;
        }
        if (featuresUsed.size() != features.size()) {
            status = WARNING;
--- a/src/BayesNet/BoostAODE.h
+++ b/src/BayesNet/BoostAODE.h
@ -1,7 +1,9 @@
 #ifndef BOOSTAODE_H
 #define BOOSTAODE_H
 #include "Ensemble.h"
+#include <map>
 #include "SPODE.h"
+#include "FeatureSelect.h"
 namespace bayesnet {
    class BoostAODE : public Ensemble {
    public:
@ -15,11 +17,16 @@ namespace bayesnet {
    private:
        torch::Tensor dataset_;
        torch::Tensor X_train, y_train, X_test, y_test;
-        void validationInit();
-        bool repeatSparent = false;
+        unordered_set<int> initializeModels();
+        // Hyperparameters
+        bool repeatSparent = false; // if true, a feature can be selected more than once
        int maxModels = 0;
        bool ascending = false; //Process KBest features ascending or descending order
        bool convergence = false; //if true, stop when the model does not improve
+        bool selectFeatures = false; // if true, use feature selection
+        string algorithm = ""; // Selected feature selection algorithm
+        FeatureSelect* featureSelector = nullptr;
+        double threshold = -1;
    };
 }
 #endif
--- a/src/BayesNet/CFS.cc
+++ b/src/BayesNet/CFS.cc
@ -0,0 +1,72 @@
+#include "CFS.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    // Greedy forward search: seeds the subset with the first feature returned by
+    // argsort(suLabels) and then repeatedly adds the candidate that yields the
+    // highest merit until computeContinueCondition() says stop.
+    void CFS::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        if (featureOrder.empty()) {
+            // No candidates at all: nothing can be selected
+            fitted = true;
+            return;
+        }
+        // Seed the subset and take the seed out of the candidate list, so it is
+        // neither evaluated again nor scored twice.
+        const auto firstFeature = featureOrder[0];
+        selectedFeatures.push_back(firstFeature);
+        selectedScores.push_back(suLabels[firstFeature]);
+        featureOrder.erase(featureOrder.begin());
+        // Honour maxFeatures / empty candidate list before the first expansion
+        auto continueCondition = computeContinueCondition(featureOrder);
+        while (continueCondition) {
+            double merit = numeric_limits<double>::lowest();
+            int bestFeature = -1;
+            for (auto candidate : featureOrder) {
+                selectedFeatures.push_back(candidate);
+                // Compute merit with selectedFeatures
+                auto meritNew = computeMeritCFS();
+                if (meritNew > merit) {
+                    merit = meritNew;
+                    bestFeature = candidate;
+                }
+                selectedFeatures.pop_back();
+            }
+            if (bestFeature == -1) {
+                // meritNew has to be nan due to constant features
+                break;
+            }
+            selectedFeatures.push_back(bestFeature);
+            selectedScores.push_back(merit);
+            featureOrder.erase(remove(featureOrder.begin(), featureOrder.end(), bestFeature), featureOrder.end());
+            continueCondition = computeContinueCondition(featureOrder);
+        }
+        fitted = true;
+    }
+    // Returns false when the search must stop: the feature limit was reached, no
+    // candidates remain, or the last five recorded scores never increased.
+    bool CFS::computeContinueCondition(const vector<int>& featureOrder)
+    {
+        const bool limitReached = selectedFeatures.size() == maxFeatures;
+        if (limitReached || featureOrder.size() == 0) {
+            return false;
+        }
+        if (selectedScores.size() < 5) {
+            return true;
+        }
+        /*
+        "To prevent the best first search from exploring the entire
+        feature subset search space, a stopping criterion is imposed.
+        The search will terminate if five consecutive fully expanded
+        subsets show no improvement over the current best subset."
+        as stated in Mark A.Hall Thesis
+        */
+        const double sentinel = numeric_limits<double>::lowest();
+        double previous = sentinel;
+        int nonImproving = 0;
+        for (auto it = selectedScores.end() - 5; it != selectedScores.end(); ++it) {
+            const double current = *it;
+            if (previous == sentinel) {
+                // No earlier score to compare with yet
+                previous = current;
+            }
+            if (current > previous) {
+                break;
+            }
+            ++nonImproving;
+            previous = current;
+        }
+        return nonImproving != 5;
+    }
+}
--- a/src/BayesNet/CFS.h
+++ b/src/BayesNet/CFS.h
@ -0,0 +1,21 @@
+#ifndef CFS_H
+#define CFS_H
+#include <torch/torch.h>
+#include <vector>
+#include "FeatureSelect.h"
+using namespace std;
+namespace bayesnet {
+    class CFS : public FeatureSelect {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        CFS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+            FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights)
+        {
+        }
+        virtual ~CFS() {};
+        void fit() override;
+    private:
+        bool computeContinueCondition(const vector<int>& featureOrder);
+    };
+}
+#endif
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@ -5,5 +5,5 @@ include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc 
    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
-    Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+    Mst.cc Proposal.cc CFS.cc FCBF.cc IWSS.cc FeatureSelect.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
 target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
--- a/src/BayesNet/FCBF.cc
+++ b/src/BayesNet/FCBF.cc
@ -0,0 +1,44 @@
+#include "bayesnetUtils.h"
+#include "FCBF.h"
+namespace bayesnet {
+
+    FCBF::FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        if (threshold < 1e-7) {
+            throw std::invalid_argument("Threshold cannot be less than 1e-7");
+        }
+    }
+    // Walks the features in the order given by argsort(suLabels); each accepted
+    // feature zeroes the score of every later feature whose symmetrical uncertainty
+    // with it is at least as large as that later feature's own score.
+    void FCBF::fit()
+    {
+        initialize();
+        computeSuLabels();
+        const auto featureOrder = argsort(suLabels); // sort descending order
+        for (size_t position = 0; position < featureOrder.size(); ++position) {
+            const auto feature = featureOrder[position];
+            if (suLabels.at(feature) == 0.0) {
+                // The feature has been removed from the list
+                continue;
+            }
+            if (suLabels.at(feature) < threshold) {
+                break;
+            }
+            // Remove redundant features; only later positions are compared, never the feature itself
+            for (size_t next = position + 1; next < featureOrder.size(); ++next) {
+                const auto candidate = featureOrder[next];
+                const double value = computeSuFeatures(feature, candidate);
+                if (value >= suLabels.at(candidate)) {
+                    // Remove feature from list
+                    suLabels[candidate] = 0.0;
+                }
+            }
+            selectedFeatures.push_back(feature);
+            selectedScores.push_back(suLabels[feature]);
+            if (selectedFeatures.size() == maxFeatures) {
+                break;
+            }
+        }
+        fitted = true;
+    }
+}
--- a/src/BayesNet/FCBF.h
+++ b/src/BayesNet/FCBF.h
@ -0,0 +1,18 @@
+#ifndef FCBF_H
+#define FCBF_H
+#include <torch/torch.h>
+#include <vector>
+#include "FeatureSelect.h"
+using namespace std;
+namespace bayesnet {
+    class FCBF : public FeatureSelect {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        FCBF(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~FCBF() {};
+        void fit() override;
+    private:
+        double threshold = -1;
+    };
+}
+#endif
--- a/src/BayesNet/FeatureSelect.cc
+++ b/src/BayesNet/FeatureSelect.cc
@ -0,0 +1,79 @@
+#include "FeatureSelect.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    FeatureSelect::FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights) :
+        Metrics(samples, features, className, classNumStates), maxFeatures(maxFeatures == 0 ? samples.size(0) - 1 : maxFeatures), weights(weights)
+
+    {
+    }
+    void FeatureSelect::initialize()
+    {
+        selectedFeatures.clear();
+        selectedScores.clear();
+    }
+    // Symmetrical uncertainty between rows a and b of samples, using the stored weights.
+    double FeatureSelect::symmetricalUncertainty(int a, int b)
+    {
+        /*
+        Symmetrical uncertainty normalizes the information gain (mutual
+        information) by the entropies of both variables, to compensate the
+        bias due to high cardinality features. Range [0, 1]
+        (https://www.sciencedirect.com/science/article/pii/S0020025519303603)
+        */
+        const auto first = samples.index({ a, "..." });
+        const auto second = samples.index({ b, "..." });
+        const auto information = mutualInformation(first, second, weights);
+        const auto entropyFirst = entropy(first, weights);
+        const auto entropySecond = entropy(second, weights);
+        const auto entropySum = entropyFirst + entropySecond;
+        return 2.0 * information / entropySum;
+    }
+    // Fills suLabels with the symmetrical uncertainty between each feature row and
+    // the last row of samples (index -1).
+    void FeatureSelect::computeSuLabels()
+    {
+        // Compute Symmetrical Uncertainty between features and labels
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        // Start from scratch: appending on a second fit() would leave stale values
+        // and make the vector longer than the number of features.
+        suLabels.clear();
+        const int numFeatures = static_cast<int>(features.size());
+        suLabels.reserve(numFeatures);
+        for (int i = 0; i < numFeatures; ++i) {
+            suLabels.push_back(symmetricalUncertainty(i, -1));
+        }
+    }
+    // Returns the symmetrical uncertainty between two features, computing it on the
+    // first request for the ordered pair (firstFeature, secondFeature) and serving
+    // it from the suFeatures cache afterwards.
+    double FeatureSelect::computeSuFeatures(const int firstFeature, const int secondFeature)
+    {
+        // Compute Symmetrical Uncertainty between features
+        // https://en.wikipedia.org/wiki/Symmetric_uncertainty
+        const pair<int, int> key{ firstFeature, secondFeature };
+        const auto cached = suFeatures.find(key);
+        if (cached != suFeatures.end()) {
+            return cached->second;
+        }
+        // Cache miss is the common case on first use: look it up instead of throwing
+        const double result = symmetricalUncertainty(firstFeature, secondFeature);
+        suFeatures[key] = result;
+        return result;
+    }
+    // Merit of the current selectedFeatures subset:
+    //   rcf / sqrt(n + (n * n - n) * rff)
+    // where n is the subset size, rcf the sum of the feature-label scores in
+    // suLabels and rff the sum of computeSuFeatures over every pair of the subset.
+    double FeatureSelect::computeMeritCFS()
+    {
+        double rcf = 0;
+        for (auto feature : selectedFeatures) {
+            rcf += suLabels[feature];
+        }
+        double rff = 0;
+        const int n = selectedFeatures.size();
+        for (const auto& item : doCombinations(selectedFeatures)) {
+            rff += computeSuFeatures(item.first, item.second);
+        }
+        return rcf / sqrt(n + (n * n - n) * rff);
+    }
+    vector<int> FeatureSelect::getFeatures() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedFeatures;
+    }
+    vector<double> FeatureSelect::getScores() const
+    {
+        if (!fitted) {
+            throw runtime_error("FeatureSelect not fitted");
+        }
+        return selectedScores;
+    }
+}
--- a/src/BayesNet/FeatureSelect.h
+++ b/src/BayesNet/FeatureSelect.h
@ -0,0 +1,31 @@
+#ifndef FEATURE_SELECT_H
+#define FEATURE_SELECT_H
+#include <torch/torch.h>
+#include <vector>
+#include "BayesMetrics.h"
+using namespace std;
+namespace bayesnet {
+    class FeatureSelect : public Metrics {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        FeatureSelect(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights);
+        virtual ~FeatureSelect() {};
+        virtual void fit() = 0;
+        vector<int> getFeatures() const;
+        vector<double> getScores() const;
+    protected:
+        void initialize();
+        void computeSuLabels();
+        double computeSuFeatures(const int a, const int b);
+        double symmetricalUncertainty(int a, int b);
+        double computeMeritCFS();
+        const torch::Tensor& weights;
+        int maxFeatures;
+        vector<int> selectedFeatures;
+        vector<double> selectedScores;
+        vector<double> suLabels;
+        map<pair<int, int>, double> suFeatures;
+        bool fitted = false;
+    };
+}
+#endif
--- a/src/BayesNet/IWSS.cc
+++ b/src/BayesNet/IWSS.cc
@ -0,0 +1,47 @@
+#include "IWSS.h"
+#include <limits>
+#include "bayesnetUtils.h"
+namespace bayesnet {
+    IWSS::IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold) :
+        FeatureSelect(samples, features, className, maxFeatures, classNumStates, weights), threshold(threshold)
+    {
+        if (threshold < 0 || threshold > .5) {
+            throw std::invalid_argument("Threshold has to be in [0, 0.5]");
+        }
+    }
+    // Incremental selection: starts with the first two features returned by
+    // argsort(suLabels) and keeps adding the following ones while the merit improves
+    // or its relative change stays below threshold; stops at the first rejection
+    // or when maxFeatures is reached.
+    void IWSS::fit()
+    {
+        initialize();
+        computeSuLabels();
+        auto featureOrder = argsort(suLabels); // sort descending order
+        auto featureOrderCopy = featureOrder;
+        if (featureOrderCopy.size() < 2) {
+            // Not enough candidates to build the initial pair (pop_first on an empty
+            // vector is undefined): keep whatever is available with its own score.
+            for (const auto feature : featureOrderCopy) {
+                selectedFeatures.push_back(feature);
+                selectedScores.push_back(suLabels.at(feature));
+            }
+            fitted = true;
+            return;
+        }
+        // Add first and second features to result
+        //     First with its own score
+        auto first_feature = pop_first(featureOrderCopy);
+        selectedFeatures.push_back(first_feature);
+        selectedScores.push_back(suLabels.at(first_feature));
+        //     Second with the score of the candidates
+        selectedFeatures.push_back(pop_first(featureOrderCopy));
+        auto merit = computeMeritCFS();
+        selectedScores.push_back(merit);
+        for (const auto feature : featureOrderCopy) {
+            selectedFeatures.push_back(feature);
+            // Compute merit with selectedFeatures
+            auto meritNew = computeMeritCFS();
+            // Relative change with respect to the best merit so far (0 when that merit is 0)
+            double delta = merit != 0.0 ? abs(merit - meritNew) / merit : 0.0;
+            if (meritNew > merit || delta < threshold) {
+                if (meritNew > merit) {
+                    merit = meritNew;
+                }
+                selectedScores.push_back(meritNew);
+            } else {
+                selectedFeatures.pop_back();
+                break;
+            }
+            if (selectedFeatures.size() == maxFeatures) {
+                break;
+            }
+        }
+        fitted = true;
+    }
+}
--- a/src/BayesNet/IWSS.h
+++ b/src/BayesNet/IWSS.h
@ -0,0 +1,18 @@
+#ifndef IWSS_H
+#define IWSS_H
+#include <torch/torch.h>
+#include <vector>
+#include "FeatureSelect.h"
+using namespace std;
+namespace bayesnet {
+    class IWSS : public FeatureSelect {
+    public:
+        // dataset is a n+1xm tensor of integers where dataset[-1] is the y vector
+        IWSS(const torch::Tensor& samples, const vector<string>& features, const string& className, const int maxFeatures, const int classNumStates, const torch::Tensor& weights, const double threshold);
+        virtual ~IWSS() {};
+        void fit() override;
+    private:
+        double threshold = -1;
+    };
+}
+#endif
--- a/src/BayesNet/Node.h
+++ b/src/BayesNet/Node.h
@ -14,8 +14,8 @@ namespace bayesnet {
        int numStates; // number of states of the variable
        torch::Tensor cpTable; // Order of indices is 0-> node variable, 1-> 1st parent, 2-> 2nd parent, ...
        vector<int64_t> dimensions; // dimensions of the cpTable
-    public:
        vector<pair<string, string>> combinations(const vector<string>&);
+    public:
        explicit Node(const string&);
        void clear();
        void addParent(Node*);
--- a/src/Platform/CMakeLists.txt
+++ b/src/Platform/CMakeLists.txt
@ -9,14 +9,9 @@ add_executable(b_main main.cc Folding.cc Experiment.cc Datasets.cc Dataset.cc Mo
 add_executable(b_manage manage.cc Results.cc Result.cc ReportConsole.cc ReportExcel.cc ReportBase.cc Datasets.cc Dataset.cc ExcelFile.cc)
 add_executable(b_list list.cc Datasets.cc Dataset.cc)
 add_executable(b_best best.cc BestResults.cc Result.cc Statistics.cc BestResultsExcel.cc ExcelFile.cc)
-add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc)
+add_executable(testx testx.cpp Datasets.cc Dataset.cc Folding.cc )
 target_link_libraries(b_main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}")
-if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux")
-    target_link_libraries(b_manage "${TORCH_LIBRARIES}" libxlsxwriter.so ArffFiles mdlp stdc++fs)
-    target_link_libraries(b_best Boost::boost libxlsxwriter.so stdc++fs)
-else()
-    target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
-    target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}")
-endif()
+target_link_libraries(b_manage "${TORCH_LIBRARIES}" "${XLSXWRITER_LIB}" ArffFiles mdlp)
+target_link_libraries(b_best Boost::boost "${XLSXWRITER_LIB}")
 target_link_libraries(b_list ArffFiles mdlp "${TORCH_LIBRARIES}")
-target_link_libraries(testx ArffFiles mdlp BayesNet "${TORCH_LIBRARIES}")
+target_link_libraries(testx ArffFiles BayesNet "${TORCH_LIBRARIES}")
--- a/src/Platform/Dataset.cc
+++ b/src/Platform/Dataset.cc
@ -212,14 +212,4 @@ namespace platform {
        }
        return Xd;
    }
-    vector<string> Dataset::split(const string& text, char delimiter)
-    {
-        vector<string> result;
-        stringstream ss(text);
-        string token;
-        while (getline(ss, token, delimiter)) {
-            result.push_back(token);
-        }
-        return result;
-    }
 }
--- a/src/Platform/Dataset.h
+++ b/src/Platform/Dataset.h
@ -5,6 +5,7 @@
 #include <vector>
 #include <string>
 #include "CPPFImdlp.h"
+#include "Utils.h"
 namespace platform {
    using namespace std;

@ -62,7 +63,6 @@ namespace platform {
    public:
        Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
        explicit Dataset(const Dataset&);
-        static vector<string> split(const string& text, char delimiter);
        string getName() const;
        string getClassName() const;
        vector<string> getFeatures() const;
--- a/src/Platform/Datasets.cc
+++ b/src/Platform/Datasets.cc
@ -13,7 +13,7 @@ namespace platform {
                if (line.empty() || line[0] == '#') {
                    continue;
                }
-                vector<string> tokens = Dataset::split(line, ',');
+                vector<string> tokens = split(line, ',');
                string name = tokens[0];
                string className;
                if (tokens.size() == 1) {
--- a/src/Platform/DotEnv.h
+++ b/src/Platform/DotEnv.h
@ -4,7 +4,11 @@
 #include <map>
 #include <fstream>
 #include <sstream>
-#include "Dataset.h"
+#include <algorithm>
+#include <iostream>
+#include "Utils.h"
+
+//#include "Dataset.h"
 namespace platform {
    class DotEnv {
    private:
@ -51,7 +55,7 @@ namespace platform {
            auto seeds_str = env["seeds"];
            seeds_str = trim(seeds_str);
            seeds_str = seeds_str.substr(1, seeds_str.size() - 2);
-            auto seeds_str_split = Dataset::split(seeds_str, ',');
+            auto seeds_str_split = split(seeds_str, ',');
            transform(seeds_str_split.begin(), seeds_str_split.end(), back_inserter(seeds), [](const std::string& str) {
                return stoi(str);
                });
--- a/src/Platform/Experiment.cc
+++ b/src/Platform/Experiment.cc
@ -3,7 +3,7 @@
 #include "Datasets.h"
 #include "Models.h"
 #include "ReportConsole.h"
-#include "DotEnv.h"
+#include "Paths.h"
 namespace platform {
    using json = nlohmann::json;
    string get_date()
@ -134,8 +134,7 @@ namespace platform {
    }
    void Experiment::cross_validation(const string& fileName)
    {
-        auto env = platform::DotEnv();
-        auto datasets = platform::Datasets(discretized, env.get("source_data"));
+        auto datasets = platform::Datasets(discretized, Paths::datasets());
        // Get dataset
        auto [X, y] = datasets.getTensors(fileName);
        auto states = datasets.getStates(fileName);
--- a/src/Platform/Paths.h
+++ b/src/Platform/Paths.h
@ -1,11 +1,18 @@
 #ifndef PATHS_H
 #define PATHS_H
 #include <string>
+#include "DotEnv.h"
 namespace platform {
    class Paths {
    public:
        static std::string results() { return "results/"; }
        static std::string excel() { return "excel/"; }
+        static std::string cfs() { return "cfs/"; }
+        static std::string datasets()
+        {
+            // Builds a fresh DotEnv on each call and returns its "source_data" entry.
+            return platform::DotEnv().get("source_data");
+        }
    };
 }
 #endif
--- a/src/Platform/ReportBase.cc
+++ b/src/Platform/ReportBase.cc
@ -58,8 +58,7 @@ namespace platform {
            }
        } else {
            if (data["score_name"].get<string>() == "accuracy") {
-                auto env = platform::DotEnv();
-                auto dt = Datasets(false, env.get("source_data"));
+                auto dt = Datasets(false, Paths::datasets());
                dt.loadDataset(dataset);
                auto numClasses = dt.getNClasses(dataset);
                if (numClasses == 2) {
--- a/src/Platform/ReportConsole.cc
+++ b/src/Platform/ReportConsole.cc
@ -53,13 +53,9 @@ namespace platform {
            const string status = compareResult(r["dataset"].get<string>(), r["score"].get<double>());
            cout << status;
            cout << setw(12) << right << setprecision(6) << fixed << r["time"].get<double>() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get<double>() << " ";
-            try {
-                cout << r["hyperparameters"].get<string>();
-            }
-            catch (const exception& err) {
-                cout << r["hyperparameters"];
-            }
+            cout << r["hyperparameters"].dump();
            cout << endl;
+            cout << flush;
            lastResult = r;
            totalScore += r["score"].get<double>();
            odd = !odd;
--- a/src/Platform/Utils.h
+++ b/src/Platform/Utils.h
@ -0,0 +1,19 @@
+#ifndef UTILS_H
+#define UTILS_H
+#include <sstream>
+#include <string>
+#include <vector>
+namespace platform {
+    // Split text at each delimiter (a trailing empty field is dropped); inline so all includers share one definition.
+    inline std::vector<std::string> split(const std::string& text, char delimiter)
+    {
+        std::vector<std::string> result;
+        std::stringstream ss(text);
+        std::string token;
+        while (std::getline(ss, token, delimiter)) {
+            result.push_back(token);
+        }
+        return result;
+    }
+}
+#endif
--- a/src/Platform/list.cc
+++ b/src/Platform/list.cc
@ -3,7 +3,6 @@
 #include "Paths.h"
 #include "Colors.h"
 #include "Datasets.h"
-#include "DotEnv.h"

 using namespace std;
 const int BALANCE_LENGTH = 75;
@ -28,8 +27,7 @@ void outputBalance(const string& balance)

 int main(int argc, char** argv)
 {
-    auto env = platform::DotEnv();
-    auto data = platform::Datasets(false, env.get("source_data"));
+    auto data = platform::Datasets(false, platform::Paths::datasets());
    locale mylocale(cout.getloc(), new separated);
    locale::global(mylocale);
    cout.imbue(mylocale);
--- a/src/Platform/main.cc
+++ b/src/Platform/main.cc
@ -12,7 +12,7 @@
 using namespace std;
 using json = nlohmann::json;

-argparse::ArgumentParser manageArguments(int argc, char** argv)
+argparse::ArgumentParser manageArguments()
 {
    auto env = platform::DotEnv();
    argparse::ArgumentParser program("main");
@ -48,44 +48,40 @@ argparse::ArgumentParser manageArguments(int argc, char** argv)
        }});
    auto seed_values = env.getSeeds();
    program.add_argument("-s", "--seeds").nargs(1, 10).help("Random seeds. Set to -1 to have pseudo random").scan<'i', int>().default_value(seed_values);
+    return program;
+}
+
+int main(int argc, char** argv)
+{
+    string file_name, model_name, title;
+    json hyperparameters_json;
+    bool discretize_dataset, stratified, saveResults;
+    vector<int> seeds;
+    vector<string> filesToTest;
+    int n_folds;
+    auto program = manageArguments();
    try {
        program.parse_args(argc, argv);
-        auto file_name = program.get<string>("dataset");
-        auto model_name = program.get<string>("model");
-        auto discretize_dataset = program.get<bool>("discretize");
-        auto stratified = program.get<bool>("stratified");
-        auto n_folds = program.get<int>("folds");
-        auto seeds = program.get<vector<int>>("seeds");
-        auto title = program.get<string>("title");
+        file_name = program.get<string>("dataset");
+        model_name = program.get<string>("model");
+        discretize_dataset = program.get<bool>("discretize");
+        stratified = program.get<bool>("stratified");
+        n_folds = program.get<int>("folds");
+        seeds = program.get<vector<int>>("seeds");
        auto hyperparameters = program.get<string>("hyperparameters");
-        auto saveResults = program.get<bool>("save");
+        hyperparameters_json = json::parse(hyperparameters);
+        title = program.get<string>("title");
        if (title == "" && file_name == "") {
            throw runtime_error("title is mandatory if dataset is not provided");
        }
+        saveResults = program.get<bool>("save");
    }
    catch (const exception& err) {
        cerr << err.what() << endl;
        cerr << program;
        exit(1);
    }
-    return program;
-}
-
-int main(int argc, char** argv)
-{
-    auto program = manageArguments(argc, argv);
-    auto file_name = program.get<string>("dataset");
-    auto model_name = program.get<string>("model");
-    auto discretize_dataset = program.get<bool>("discretize");
-    auto stratified = program.get<bool>("stratified");
-    auto n_folds = program.get<int>("folds");
-    auto seeds = program.get<vector<int>>("seeds");
-    auto hyperparameters = program.get<string>("hyperparameters");
-    vector<string> filesToTest;
-    auto env = platform::DotEnv();
-    auto datasets = platform::Datasets(discretize_dataset, env.get("source_data"));
-    auto title = program.get<string>("title");
-    auto saveResults = program.get<bool>("save");
+    auto datasets = platform::Datasets(discretize_dataset, platform::Paths::datasets());
    if (file_name != "") {
        if (!datasets.isDataset(file_name)) {
            cerr << "Dataset " << file_name << " not found" << endl;
@ -102,12 +98,12 @@ int main(int argc, char** argv)
    /*
    * Begin Processing
    */
-
+    auto env = platform::DotEnv();
    auto experiment = platform::Experiment();
    experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3");
    experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform"));
    experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy");
-    experiment.setHyperparameters(json::parse(hyperparameters));
+    experiment.setHyperparameters(hyperparameters_json);
    for (auto seed : seeds) {
        experiment.addRandomSeed(seed);
    }
--- a/src/Platform/testx.cpp
+++ b/src/Platform/testx.cpp
@ -1,5 +1,6 @@
 #include "Folding.h"
 #include <torch/torch.h>
+#include "nlohmann/json.hpp"
 #include "map"
 #include <iostream>
 #include <sstream>
@ -7,6 +8,9 @@
 #include "Network.h"
 #include "ArffFiles.h"
 #include "CPPFImdlp.h"
+#include "CFS.h"
+#include "IWSS.h"
+#include "FCBF.h"

 using namespace std;
 using namespace platform;
@ -191,22 +195,54 @@ int main()
    //     }
    //     cout << "***********************************************************************************************" << endl;
    // }
-    const string file_name = "iris";
-    auto net = bayesnet::Network();
+    // const string file_name = "iris";
+    // auto net = bayesnet::Network();
+    // auto dt = Datasets(true, "Arff");
+    // auto raw = RawDatasets("iris", true);
+    // auto [X, y] = dt.getVectors(file_name);
+    // cout << "Dataset dims " << raw.dataset.sizes() << endl;
+    // cout << "weights dims " << raw.weights.sizes() << endl;
+    // cout << "States dims " << raw.statest.size() << endl;
+    // cout << "features: ";
+    // for (const auto& feature : raw.featurest) {
+    //     cout << feature << ", ";
+    //     net.addNode(feature);
+    // }
+    // net.addNode(raw.classNamet);
+    // cout << endl;
+    // net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
    auto dt = Datasets(true, "Arff");
-    auto raw = RawDatasets("iris", true);
-    auto [X, y] = dt.getVectors(file_name);
-    cout << "Dataset dims " << raw.dataset.sizes() << endl;
-    cout << "weights dims " << raw.weights.sizes() << endl;
-    cout << "States dims " << raw.statest.size() << endl;
-    cout << "features: ";
-    for (const auto& feature : raw.featurest) {
-        cout << feature << ", ";
-        net.addNode(feature);
+    nlohmann::json output;
+    for (const auto& name : dt.getNames()) {
+        // for (const auto& name : { "iris" }) {
+        auto [X, y] = dt.getTensors(name);
+        auto features = dt.getFeatures(name);
+        auto states = dt.getStates(name);
+        auto className = dt.getClassName(name);
+        int maxFeatures = 0;
+        auto classNumStates = states.at(className).size();
+        torch::Tensor weights = torch::full({ X.size(1) }, 1.0 / X.size(1), torch::kDouble);
+        auto dataset = X;
+        auto yresized = torch::transpose(y.view({ y.size(0), 1 }), 0, 1);
+        dataset = torch::cat({ dataset, yresized }, 0);
+        auto cfs = bayesnet::CFS(dataset, features, className, maxFeatures, classNumStates, weights);
+        auto fcbf = bayesnet::FCBF(dataset, features, className, maxFeatures, classNumStates, weights, 1e-7);
+        auto iwss = bayesnet::IWSS(dataset, features, className, maxFeatures, classNumStates, weights, 0.5);
+        cout << "Dataset: " << setw(20) << name << flush;
+        cfs.fit();
+        cout << " CFS: " << setw(4) << cfs.getFeatures().size() << flush;
+        fcbf.fit();
+        cout << " FCBF: " << setw(4) << fcbf.getFeatures().size() << flush;
+        iwss.fit();
+        cout << " IWSS: " << setw(4) << iwss.getFeatures().size() << flush;
+        cout << endl;
+        output[name]["CFS"] = cfs.getFeatures();
+        output[name]["FCBF"] = fcbf.getFeatures();
+        output[name]["IWSS"] = iwss.getFeatures();
    }
-    net.addNode(raw.classNamet);
-    cout << endl;
-    net.fit(raw.dataset, raw.weights, raw.featurest, raw.classNamet, raw.statest);
+    ofstream file("features_cpp.json");
+    file << output;
+    file.close();

 }