Remove FeatureSelect, add SelectKBest to BayesMetrics

parent a3e665eed6
commit 704dc937be
CMakeLists.txt
@@ -60,7 +60,6 @@ add_git_submodule("lib/json")
 # --------------
 add_subdirectory(config)
 add_subdirectory(lib/Files)
-add_subdirectory(lib/FeatureSelect)
 add_subdirectory(src/BayesNet)
 add_subdirectory(src/Platform)
 add_subdirectory(sample)
lib/FeatureSelect/CMakeLists.txt
@@ -1 +0,0 @@
-add_library(FeatureSelect FeatureSelect.cpp)
lib/FeatureSelect/FeatureSelect.cpp
@@ -1,119 +0,0 @@
-#include "FeatureSelect.h"
-namespace features {
-    SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat)
-        : samples(samples), labels(labels), weights(weights), k(k), nat(nat)
-    {
-        if (samples.size() == 0 || samples[0].size() == 0)
-            throw invalid_argument("features must be a non-empty matrix");
-        if (samples.size() != labels.size())
-            throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal");
-        if (samples.size() != weights.size())
-            throw invalid_argument("number of samples and weights must be equal");
-        if (k < 1 || k > static_cast<int>(samples[0].size()))
-            throw invalid_argument("k must be between 1 and number of features");
-        numFeatures = 0;
-        numClasses = 0;
-        numSamples = 0;
-        fitted = false;
-    }
-    SelectKBestWeighted& SelectKBestWeighted::fit()
-    {
-        auto labelsCopy = labels;
-        numFeatures = samples[0].size();
-        numSamples = samples.size();
-        // compute number of classes
-        sort(labelsCopy.begin(), labelsCopy.end());
-        auto last = unique(labelsCopy.begin(), labelsCopy.end());
-        labelsCopy.erase(last, labelsCopy.end());
-        numClasses = labelsCopy.size();
-        // compute scores
-        scores.reserve(numFeatures);
-        for (int i = 0; i < numFeatures; ++i) {
-            scores.push_back(MutualInformation(i));
-            features.push_back(i);
-        }
-        // sort & reduce scores and features
-        sort(features.begin(), features.end(), [&](int i, int j)
-            { return scores[i] > scores[j]; });
-        sort(scores.begin(), scores.end(), greater<precision_t>());
-        features.resize(k);
-        scores.resize(k);
-        fitted = true;
-        return *this;
-    }
-    precision_t SelectKBestWeighted::entropyLabel()
-    {
-        return entropy(labels);
-    }
-    precision_t SelectKBestWeighted::entropy(const sample_t& data)
-    {
-        precision_t ventropy = 0, totalWeight = 0;
-        score_t counts(numClasses + 1, 0);
-        for (auto i = 0; i < static_cast<int>(data.size()); ++i) {
-            counts[data[i]] += weights[i];
-            totalWeight += weights[i];
-        }
-        for (auto count : counts) {
-            precision_t p = count / totalWeight;
-            if (p > 0) {
-                if (nat) {
-                    ventropy -= p * log(p);
-                } else {
-                    ventropy -= p * log2(p);
-                }
-            }
-        }
-        return ventropy;
-    }
-    // H(Y|X) = sum_{x in X} p(x) H(Y|X=x)
-    precision_t SelectKBestWeighted::conditionalEntropy(const int feature)
-    {
-        unordered_map<value_t, precision_t> featureCounts;
-        unordered_map<value_t, unordered_map<value_t, precision_t>> jointCounts;
-        featureCounts.clear();
-        jointCounts.clear();
-        precision_t totalWeight = 0;
-        for (auto i = 0; i < numSamples; i++) {
-            featureCounts[samples[i][feature]] += weights[i];
-            jointCounts[samples[i][feature]][labels[i]] += weights[i];
-            totalWeight += weights[i];
-        }
-        if (totalWeight == 0)
-            throw invalid_argument("Total weight should not be zero");
-        precision_t entropy = 0;
-        for (auto& [feat, count] : featureCounts) {
-            auto p_f = count / totalWeight;
-            precision_t entropy_f = 0;
-            for (auto& [label, jointCount] : jointCounts[feat]) {
-                auto p_l_f = jointCount / count;
-                if (p_l_f > 0) {
-                    if (nat) {
-                        entropy_f -= p_l_f * log(p_l_f);
-                    } else {
-                        entropy_f -= p_l_f * log2(p_l_f);
-                    }
-                }
-            }
-            entropy += p_f * entropy_f;
-        }
-        return entropy;
-    }
-    // I(X;Y) = H(Y) - H(Y|X)
-    precision_t SelectKBestWeighted::MutualInformation(const int i)
-    {
-        return entropyLabel() - conditionalEntropy(i);
-    }
-    score_t SelectKBestWeighted::getScores() const
-    {
-        if (!fitted)
-            throw logic_error("score not fitted");
-        return scores;
-    }
-    // Return the indices of the selected features
-    labels_t SelectKBestWeighted::getFeatures() const
-    {
-        if (!fitted)
-            throw logic_error("score not fitted");
-        return features;
-    }
-}
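Note: the selector deleted above ranks each feature by weighted mutual information, I(X;Y) = H(Y) - H(Y|X), with summed sample weights standing in for raw counts. A minimal standalone sketch of that scoring, kept here for reference (toy data and all names are illustrative, not repository code):

// example-only sketch of weighted mutual information on discrete data
#include <cmath>
#include <iostream>
#include <unordered_map>
#include <vector>

// Weighted entropy H(Y): probabilities come from summed weights, not counts.
double weightedEntropy(const std::vector<int>& y, const std::vector<double>& w)
{
    std::unordered_map<int, double> counts;
    double total = 0;
    for (size_t i = 0; i < y.size(); ++i) {
        counts[y[i]] += w[i];
        total += w[i];
    }
    double h = 0;
    for (const auto& [label, weight] : counts) {
        double p = weight / total;
        if (p > 0) h -= p * std::log2(p);
    }
    return h;
}

// Weighted conditional entropy H(Y|X) = sum_x p(x) H(Y|X=x).
double weightedConditionalEntropy(const std::vector<int>& x, const std::vector<int>& y, const std::vector<double>& w)
{
    std::unordered_map<int, double> xCounts;
    std::unordered_map<int, std::unordered_map<int, double>> joint;
    double total = 0;
    for (size_t i = 0; i < x.size(); ++i) {
        xCounts[x[i]] += w[i];
        joint[x[i]][y[i]] += w[i];
        total += w[i];
    }
    double h = 0;
    for (const auto& [xv, xw] : xCounts) {
        double hx = 0;
        for (const auto& [yv, jw] : joint[xv]) {
            double p = jw / xw;
            if (p > 0) hx -= p * std::log2(p);
        }
        h += (xw / total) * hx;
    }
    return h;
}

int main()
{
    // Toy data: x perfectly determines y, so I(X;Y) = H(Y) = 1 bit.
    std::vector<int> x{ 0, 0, 1, 1 };
    std::vector<int> y{ 0, 0, 1, 1 };
    std::vector<double> w(4, 0.25); // uniform weights
    double mi = weightedEntropy(y, w) - weightedConditionalEntropy(x, y, w);
    std::cout << "I(X;Y) = " << mi << " bits\n"; // prints 1
}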
lib/FeatureSelect/FeatureSelect.h
@@ -1,38 +0,0 @@
-#ifndef SELECT_K_BEST_WEIGHTED_H
-#define SELECT_K_BEST_WEIGHTED_H
-#include <map>
-#include <vector>
-#include <string>
-using namespace std;
-namespace features {
-    typedef float precision_t;
-    typedef int value_t;
-    typedef vector<value_t> sample_t;
-    typedef vector<sample_t> samples_t;
-    typedef vector<value_t> labels_t;
-    typedef vector<precision_t> score_t, weights_t;
-
-    class SelectKBestWeighted {
-    private:
-        const samples_t samples;
-        const labels_t labels;
-        const weights_t weights;
-        const int k;
-        bool nat; // use natural log or log2
-        int numFeatures, numClasses, numSamples;
-        bool fitted;
-        score_t scores; // scores of the features
-        labels_t features; // indices of the selected features
-        precision_t entropyLabel();
-        precision_t entropy(const sample_t&);
-        precision_t conditionalEntropy(const int);
-        precision_t MutualInformation(const int);
-    public:
-        SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool);
-        SelectKBestWeighted& fit();
-        score_t getScores() const;
-        labels_t getFeatures() const; // Return the indices of the selected features
-        static inline string version() { return "0.1.0"; };
-    };
-}
-#endif
src/BayesNet/BayesMetrics.cc
@@ -21,6 +21,31 @@ namespace bayesnet {
         }
         samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32));
     }
+    vector<int> Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k)
+    {
+        auto n = samples.size(1);
+        if (k == 0) {
+            k = n;
+        }
+        // compute scores
+        scoresKBest.reserve(n);
+        auto label = samples.index({ -1, "..." });
+        for (int i = 0; i < n; ++i) {
+            scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights));
+            featuresKBest.push_back(i);
+        }
+        // sort & reduce scores and features
+        sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j)
+            { return scoresKBest[i] > scoresKBest[j]; });
+        sort(scoresKBest.begin(), scoresKBest.end(), std::greater<double>());
+        featuresKBest.resize(k);
+        scoresKBest.resize(k);
+        return featuresKBest;
+    }
+    vector<double> Metrics::getScoresKBest() const
+    {
+        return scoresKBest;
+    }
     vector<pair<string, string>> Metrics::doCombinations(const vector<string>& source)
     {
         vector<pair<string, string>> result;
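The added code keeps two parallel vectors consistent by sorting the index vector through the score vector, then sorting the scores themselves and truncating both to k. A minimal standalone illustration of that idiom (toy scores only, not repository data):

// example-only sketch of the rank-indices-then-truncate pattern
#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

int main()
{
    std::vector<double> scores = { 0.10, 0.72, 0.35, 0.50 }; // score of feature i
    std::vector<int> features = { 0, 1, 2, 3 };
    unsigned k = 2;
    // Compare indices through the (still unsorted) score vector.
    std::sort(features.begin(), features.end(),
        [&](int i, int j) { return scores[i] > scores[j]; });
    // Sort the scores too, so scores[j] matches features[j] afterwards.
    std::sort(scores.begin(), scores.end(), std::greater<double>());
    features.resize(k);
    scores.resize(k);
    for (unsigned i = 0; i < k; ++i)
        std::cout << "feature " << features[i] << " score " << scores[i] << "\n";
    // prints: feature 1 score 0.72, then feature 3 score 0.5
}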
src/BayesNet/BayesMetrics.h
@@ -12,6 +12,8 @@ namespace bayesnet {
         vector<string> features;
         string className;
         int classNumStates = 0;
+        vector<double> scoresKBest;
+        vector<int> featuresKBest; // sorted indices of the features
         double entropy(const Tensor& feature, const Tensor& weights);
         double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<pair<string, string>> doCombinations(const vector<string>&);
@@ -19,6 +21,8 @@ namespace bayesnet {
         Metrics() = default;
         Metrics(const torch::Tensor& samples, const vector<string>& features, const string& className, const int classNumStates);
         Metrics(const vector<vector<int>>& vsamples, const vector<int>& labels, const vector<string>& features, const string& className, const int classNumStates);
+        vector<int> SelectKBestWeighted(const torch::Tensor& weights, unsigned k = 0);
+        vector<double> getScoresKBest() const;
         double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights);
         vector<float> conditionalEdgeWeights(vector<float>& weights); // To use in Python
         Tensor conditionalEdge(const torch::Tensor& weights);
src/BayesNet/BoostAODE.cc
@@ -1,36 +1,35 @@
 #include "BoostAODE.h"
-#include "FeatureSelect.h"
+#include "BayesMetrics.h"

 namespace bayesnet {
     BoostAODE::BoostAODE() : Ensemble() {}
     void BoostAODE::buildModel(const torch::Tensor& weights)
     {
         models.clear();
-        int n_samples = dataset.size(1);
-        int n_features = dataset.size(0);
-        features::samples_t vsamples;
-        for (auto i = 0; i < n_samples; ++i) {
-            auto row = dataset.index({ "...", i });
-            // convert row to std::vector<int>
-            auto vrow = vector<int>(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
-            vsamples.push_back(vrow);
-        }
-        auto vweights = features::weights_t(n_samples, 1.0 / n_samples);
-        auto row = dataset.index({ -1, "..." });
-        auto yv = features::labels_t(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
-        auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true);
-        auto features = featureSelection.fit().getFeatures();
-        // features = (
-        //     CSelectKBestWeighted(
-        //         self.X_, self.y_, weights, k = self.n_features_in_
-        //     )
-        //     .fit()
-        //     .get_features()
-        auto scores = features::score_t(n_features, 0.0);
         for (int i = 0; i < features.size(); ++i) {
             models.push_back(std::make_unique<SPODE>(i));
         }
     }
+    void BoostAODE::trainModel(const torch::Tensor& weights)
+    {
+        // End building vectors
+        Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kDouble);
+        auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." });
+        auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted
+        for (int i = 0; i < features.size(); ++i) {
+            models[i].fit(dataset, features, className, states, weights_);
+            auto ypred = models[i].predict(X_);
+            // em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
+            // am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
+            // # Step 3.2: Update weights for next classifier
+            // weights = [
+            //     wm * np.exp(am * (ym != yp))
+            //     for wm, ym, yp in zip(weights, self.y_, y_pred)
+            // ]
+            // # Step 4: Add the new model
+            // self.estimators_.append(estimator)
+        }
+    }
     vector<string> BoostAODE::graph(const string& title) const
     {
         return Ensemble::graph(title);
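The Python comments left in trainModel describe the SAMME (multi-class AdaBoost) weight update still to be ported. A hedged libtorch sketch of that step (the free-standing function, its name, and the n_classes parameter are assumptions for illustration, not code from this commit):

// example-only sketch of the SAMME update, assuming integer labels in y
// and per-sample weights as torch::kDouble tensors
#include <torch/torch.h>

torch::Tensor sammeUpdate(const torch::Tensor& weights, const torch::Tensor& y,
                          const torch::Tensor& ypred, int n_classes)
{
    auto miss = (ypred != y).to(torch::kDouble);          // 1 where the model erred
    auto em = (weights * miss).sum() / weights.sum();     // weighted error rate
    auto am = torch::log((1 - em) / em)                   // model weight alpha_m ...
        + std::log(static_cast<double>(n_classes - 1));   // ... with the SAMME multi-class term
    auto updated = weights * torch::exp(am * miss);       // boost misclassified samples
    return updated / updated.sum();                       // renormalize to sum to 1
}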
src/BayesNet/BoostAODE.h
@@ -6,6 +6,7 @@ namespace bayesnet {
     class BoostAODE : public Ensemble {
     protected:
         void buildModel(const torch::Tensor& weights) override;
+        void trainModel(const torch::Tensor& weights) override;
     public:
         BoostAODE();
         virtual ~BoostAODE() {};
src/BayesNet/CMakeLists.txt
@@ -1,9 +1,8 @@
 include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
 include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
-include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect)
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc
     KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc
     Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
-target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}")
+target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}")
src/BayesNet/SPODELd.cc
@@ -21,7 +21,6 @@ namespace bayesnet {
     SPODELd& SPODELd::fit(torch::Tensor& dataset, vector<string>& features_, string className_, map<string, vector<int>>& states_)
     {
         Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone();
-        cout << "Xf " << Xf.sizes() << " dtype: " << Xf.dtype() << endl;
         y = dataset.index({ -1, "..." }).clone();
         // This first part should go in a Classifier method called fit_local_discretization or fit_float...
         features = features_;