Fix weight-handling mistakes in computations

2023-08-16 12:32:51 +02:00
parent 4d4780c1d5
commit 80b20f35b4
16 changed files with 262 additions and 75 deletions
--- a/src/BayesNet/BayesMetrics.cc
+++ b/src/BayesNet/BayesMetrics.cc
@@ -38,12 +38,14 @@ namespace bayesnet {
        auto source = vector<string>(features);
        source.push_back(className);
        auto combinations = doCombinations(source);
+        double totalWeight = weights.sum().item<double>();
        // Compute class prior
-        auto margin = torch::zeros({ classNumStates });
+        auto margin = torch::zeros({ classNumStates }, torch::kFloat);
        for (int value = 0; value < classNumStates; ++value) {
            auto mask = samples.index({ -1,  "..." }) == value;
-            margin[value] = mask.sum().item<float>() / samples.size(1);
+            margin[value] = mask.sum().item<double>() / samples.size(1);
        }
+        cout << "Margin: " << margin;
        for (auto [first, second] : combinations) {
            int index_first = find(features.begin(), features.end(), first) - features.begin();
            int index_second = find(features.begin(), features.end(), second) - features.begin();
@@ -54,7 +56,7 @@ namespace bayesnet {
                auto second_dataset = samples.index({ index_second, mask });
                auto weights_dataset = weights.index({ mask });
                auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset);
-                auto pb = margin[value].item<float>();
+                auto pb = margin[value].item<double>();
                accumulated += pb * mi;
            }
            result.push_back(accumulated);
@@ -81,7 +83,7 @@ namespace bayesnet {
    double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights)
    {
        torch::Tensor counts = feature.bincount(weights);
-        int totalWeight = counts.sum().item<int>();
+        double totalWeight = counts.sum().item<double>();
        torch::Tensor probs = counts.to(torch::kFloat) / totalWeight;
        torch::Tensor logProbs = torch::log(probs);
        torch::Tensor entropy = -probs * logProbs;
@@ -95,7 +97,7 @@ namespace bayesnet {
        unordered_map<int, unordered_map<int, double>> jointCounts;
        double totalWeight = 0;
        for (auto i = 0; i < numSamples; i++) {
-            jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += 1;
+            jointCounts[secondFeature[i].item<int>()][firstFeature[i].item<int>()] += weights[i].item<double>();
            totalWeight += weights[i].item<float>();
        }
        if (totalWeight == 0)
--- a/src/BayesNet/BoostAODE.cc
+++ b/src/BayesNet/BoostAODE.cc
@@ -1,10 +1,32 @@
 #include "BoostAODE.h"
+#include "FeatureSelect.h"

 namespace bayesnet {
    BoostAODE::BoostAODE() : Ensemble() {}
    void BoostAODE::buildModel(const torch::Tensor& weights)
    {
        models.clear();
+        int n_samples = dataset.size(1);
+        int n_features = dataset.size(0);
+        features::samples_t vsamples;
+        for (auto i = 0; i < n_samples; ++i) {
+            auto row = dataset.index({ "...", i });
+            // convert row to std::vector<int>
+            auto vrow = vector<int>(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
+            vsamples.push_back(vrow);
+        }
+        auto vweights = features::weights_t(n_samples, 1.0 / n_samples);
+        auto row = dataset.index({ -1, "..." });
+        auto yv = features::labels_t(row.data_ptr<int>(), row.data_ptr<int>() + row.numel());
+        auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true);
+        auto features = featureSelection.fit().getFeatures();
+        // features = (
+        //     CSelectKBestWeighted(
+        //         self.X_, self.y_, weights, k = self.n_features_in_
+        //     )
+        //     .fit()
+        //     .get_features()
+        auto scores = features::score_t(n_features, 0.0);
        for (int i = 0; i < features.size(); ++i) {
            models.push_back(std::make_unique<SPODE>(i));
        }
--- a/src/BayesNet/CMakeLists.txt
+++ b/src/BayesNet/CMakeLists.txt
@@ -1,7 +1,9 @@
 include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp)
 include_directories(${BayesNet_SOURCE_DIR}/lib/Files)
+include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect)
 include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet)
 include_directories(${BayesNet_SOURCE_DIR}/src/Platform)
 add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc 
-    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
-target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}")
+    KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc 
+    Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc)
+target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}")
--- a/src/BayesNet/Classifier.cc
+++ b/src/BayesNet/Classifier.cc
@@ -43,7 +43,7 @@ namespace bayesnet {
    {
        dataset = X;
        buildDataset(y);
-        const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat);
+        const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat);
        return build(features, className, states, weights);
    }
    // X is nxm where n is the number of features and m the number of samples
@@ -55,13 +55,13 @@ namespace bayesnet {
        }
        auto ytmp = torch::tensor(y, kInt32);
        buildDataset(ytmp);
-        const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat);
+        const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat);
        return build(features, className, states, weights);
    }
    Classifier& Classifier::fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states)
    {
        this->dataset = dataset;
-        const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat);
+        const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat);
        return build(features, className, states, weights);
    }
    Classifier& Classifier::fit(torch::Tensor& dataset, vector<string>& features, string className, map<string, vector<int>>& states, const torch::Tensor& weights)
--- a/src/BayesNet/Network.cc
+++ b/src/BayesNet/Network.cc
@@ -5,7 +5,6 @@
 namespace bayesnet {
    Network::Network() : features(vector<string>()), className(""), classNumStates(0), fitted(false) {}
    Network::Network(float maxT) : features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
-    Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector<string>()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {}
    Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other.
        getmaxThreads()), fitted(other.fitted)
    {
@@ -174,6 +173,7 @@ namespace bayesnet {
    void Network::completeFit(const map<string, vector<int>>& states, const torch::Tensor& weights)
    {
        setStates(states);
+        laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation
        int maxThreadsRunning = static_cast<int>(std::thread::hardware_concurrency() * maxThreads);
        if (maxThreadsRunning < 1) {
            maxThreadsRunning = 1;
@@ -347,7 +347,7 @@ namespace bayesnet {
        }
        // Normalize result
        double sum = accumulate(result.begin(), result.end(), 0.0);
-        transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; });
+        transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; });
        return result;
    }
    vector<string> Network::show() const
@@ -435,6 +435,7 @@ namespace bayesnet {
    {
        for (auto& node : nodes) {
            cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl;
+            cout << node.second->getCPT() << endl;
        }
    }
 }
--- a/src/BayesNet/Network.h
+++ b/src/BayesNet/Network.h
@@ -13,7 +13,7 @@ namespace bayesnet {
        int classNumStates;
        vector<string> features; // Including classname
        string className;
-        int laplaceSmoothing = 1;
+        double laplaceSmoothing;
        torch::Tensor samples; // nxm tensor used to fit the model
        bool isCyclic(const std::string&, std::unordered_set<std::string>&, std::unordered_set<std::string>&);
        vector<double> predict_sample(const vector<int>&);
@@ -25,7 +25,6 @@ namespace bayesnet {
        void setStates(const map<string, vector<int>>&);
    public:
        Network();
-        explicit Network(float, int);
        explicit Network(float);
        explicit Network(Network&);
        torch::Tensor& getSamples();
--- a/src/BayesNet/Node.cc
+++ b/src/BayesNet/Node.cc
@@ -84,7 +84,7 @@ namespace bayesnet {
        }
        return result;
    }
-    void Node::computeCPT(const torch::Tensor& dataset, const vector<string>& features, const int laplaceSmoothing, const torch::Tensor& weights)
+    void Node::computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights)
    {
        dimensions.clear();
        // Get dimensions of the CPT
@@ -111,7 +111,7 @@ namespace bayesnet {
                coordinates.push_back(dataset.index({ parent_index, n_sample }));
            }
            // Increment the count of the corresponding coordinate
-            cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item<float>());
+            cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item<double>());
        }
        // Normalize the counts
        cpTable = cpTable / cpTable.sum(0);
--- a/src/BayesNet/Node.h
+++ b/src/BayesNet/Node.h
@@ -26,7 +26,7 @@ namespace bayesnet {
        vector<Node*>& getParents();
        vector<Node*>& getChildren();
        torch::Tensor& getCPT();
-        void computeCPT(const torch::Tensor& dataset, const vector<string>& features, const int laplaceSmoothing, const torch::Tensor& weights);
+        void computeCPT(const torch::Tensor& dataset, const vector<string>& features, const double laplaceSmoothing, const torch::Tensor& weights);
        int getNumStates() const;
        void setNumStates(int);
        unsigned minFill();
--- a/src/BayesNet/TAN.cc
+++ b/src/BayesNet/TAN.cc
@@ -22,6 +22,8 @@ namespace bayesnet {
        auto root = mi[mi.size() - 1].first;
        // 2. Compute mutual information between each feature and the class
        auto weights_matrix = metrics.conditionalEdge(weights);
+        cout << "*** Weights matrix ***\n";
+        cout << weights_matrix << "\n";
        // 3. Compute the maximum spanning tree
        auto mst = metrics.maximumSpanningTree(features, weights_matrix, root);
        // 4. Add edges from the maximum spanning tree to the model