Remove duplicated code in BestResults

2023-09-28 00:59:34 +02:00 · 2023-09-28 00:59:34 +02:00 · 3b06534327
commit 3b06534327
parent ac89a451e3
4 changed files with 25 additions and 236 deletions
--- a/src/Platform/BestResults.cc
+++ b/src/Platform/BestResults.cc
@@ -7,8 +7,6 @@
 #include "Result.h"
 #include "Colors.h"
 #include "Statistics.h"
-#include <boost/math/distributions/chi_squared.hpp>
-#include <boost/math/distributions/normal.hpp>



@@ -27,12 +25,6 @@ std::string ftime_to_string(TP tp)
    buffer << std::put_time(gmt, "%Y-%m-%d %H:%M");
    return buffer.str();
 }
-struct WTL {
-    int win;
-    int tie;
-    int loss;
-};
-
 namespace platform {

    string BestResults::build()
@@ -114,9 +106,10 @@ namespace platform {
        }
        throw invalid_argument("Unable to open result file. [" + fileName + "]");
    }
-    set<string> BestResults::getModels()
+    vector<string> BestResults::getModels()
    {
        set<string> models;
+        vector<string> result;
        auto files = loadResultFiles();
        if (files.size() == 0) {
            cerr << Colors::MAGENTA() << "No result files were found!" << Colors::RESET() << endl;
@@ -129,7 +122,8 @@ namespace platform {
            // add the model to the vector of models
            models.insert(fileModel);
        }
-        return models;
+        result = vector<string>(models.begin(), models.end());
+        return result;
    }

    void BestResults::buildAll()
@@ -171,7 +165,7 @@ namespace platform {
            odd = !odd;
        }
    }
-    json BestResults::buildTableResults(set<string> models)
+    json BestResults::buildTableResults(vector<string> models)
    {
        int numberOfDatasets = 0;
        bool first = true;
@@ -208,168 +202,8 @@ namespace platform {
        table["dateTable"] = ftime_to_string(maxDate);
        return table;
    }
-    map<string, float> assignRanks(vector<pair<string, double>>& ranksOrder)
-    {
-        // sort the ranksOrder vector by value
-        sort(ranksOrder.begin(), ranksOrder.end(), [](const pair<string, double>& a, const pair<string, double>& b) {
-            return a.second > b.second;
-            });
-        //Assign ranks to  values and if they are the same they share the same averaged rank
-        map<string, float> ranks;
-        for (int i = 0; i < ranksOrder.size(); i++) {
-            ranks[ranksOrder[i].first] = i + 1.0;
-        }
-        int i = 0;
-        while (i < static_cast<int>(ranksOrder.size())) {
-            int j = i + 1;
-            int sumRanks = ranks[ranksOrder[i].first];
-            while (j < static_cast<int>(ranksOrder.size()) && ranksOrder[i].second == ranksOrder[j].second) {
-                sumRanks += ranks[ranksOrder[j++].first];
-            }
-            if (j > i + 1) {
-                float averageRank = (float)sumRanks / (j - i);
-                for (int k = i; k < j; k++) {
-                    ranks[ranksOrder[k].first] = averageRank;
-                }
-            }
-            i = j;
-        }
-        return ranks;
-    }

-    map<int, WTL> computeWTL(int controlIdx, vector<string> models, json table)
-    {
-        // Compute the WTL matrix
-        map<int, WTL> wtl;
-        int nModels = models.size();
-        for (int i = 0; i < nModels; ++i) {
-            wtl[i] = { 0, 0, 0 };
-        }
-        json origin = table.begin().value();
-        for (auto const& item : origin.items()) {
-            auto controlModel = models.at(controlIdx);
-            double controlValue = table[controlModel].at(item.key()).at(0).get<double>();
-            for (int i = 0; i < nModels; ++i) {
-                if (i == controlIdx) {
-                    continue;
-                }
-                double value = table[models[i]].at(item.key()).at(0).get<double>();
-                if (value < controlValue) {
-                    wtl[i].win++;
-                } else if (value == controlValue) {
-                    wtl[i].tie++;
-                } else {
-                    wtl[i].loss++;
-                }
-            }
-        }
-        return wtl;
-    }
-
-    void postHocHolm(int controlIdx, vector<string> models, int nDatasets, map<string, float> ranks, double significance, map<int, WTL> wtl)
-    {
-        // Reference https://link.springer.com/article/10.1007/s44196-022-00083-8
-        // Post-hoc Holm test
-        // Calculate the p-value for the models paired with the control model
-        int nModels = models.size();
-        map<int, double> stats; // p-value of each model paired with the control model
-        boost::math::normal dist(0.0, 1.0);
-        double diff = sqrt(nModels * (nModels + 1) / (6.0 * nDatasets));
-        for (int i = 0; i < nModels; i++) {
-            if (i == controlIdx) {
-                stats[i] = 0.0;
-                continue;
-            }
-            double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
-            double p_value = (long double)2 * (1 - cdf(dist, z));
-            stats[i] = p_value;
-        }
-        // Sort the models by p-value
-        vector<pair<int, double>> statsOrder;
-        for (const auto& stat : stats) {
-            statsOrder.push_back({ stat.first, stat.second });
-        }
-        sort(statsOrder.begin(), statsOrder.end(), [](const pair<int, double>& a, const pair<int, double>& b) {
-            return a.second < b.second;
-            });
-
-        // Holm adjustment
-        for (int i = 0; i < statsOrder.size(); ++i) {
-            auto item = statsOrder.at(i);
-            double before = i == 0 ? 0.0 : statsOrder.at(i - 1).second;
-            double p_value = min((double)1.0, item.second * (nModels - i));
-            p_value = max(before, p_value);
-            statsOrder[i] = { item.first, p_value };
-        }
-        cout << Colors::CYAN();
-        cout << "  *************************************************************************************************************" << endl;
-        cout << "  Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << endl;
-        cout << "  Control model: " << models[controlIdx] << endl;
-        cout << "  Model        p-value      rank      win tie loss" << endl;
-        cout << "  ============ ============ ========= === === ====" << endl;
-        // sort ranks from lowest to highest
-        vector<pair<string, float>> ranksOrder;
-        for (const auto& rank : ranks) {
-            ranksOrder.push_back({ rank.first, rank.second });
-        }
-        sort(ranksOrder.begin(), ranksOrder.end(), [](const pair<string, float>& a, const pair<string, float>& b) {
-            return a.second < b.second;
-            });
-        for (const auto& item : ranksOrder) {
-            if (item.first == models.at(controlIdx)) {
-                continue;
-            }
-            auto idx = distance(models.begin(), find(models.begin(), models.end(), item.first));
-            double pvalue = 0.0;
-            for (const auto& stat : statsOrder) {
-                if (stat.first == idx) {
-                    pvalue = stat.second;
-                }
-            }
-            cout << "  " << left << setw(12) << item.first << " " << setprecision(10) << fixed << pvalue << setprecision(7) << " " << item.second;
-            cout << " " << right << setw(3) << wtl.at(idx).win << " " << setw(3) << wtl.at(idx).tie << " " << setw(4) << wtl.at(idx).loss << endl;
-        }
-        cout << "  *************************************************************************************************************" << endl;
-        cout << Colors::RESET();
-    }
-    bool friedmanTest(vector<string> models, int nDatasets, map<string, float> ranks, double significance = 0.05)
-    {
-        // Friedman test
-        // Calculate the Friedman statistic
-        int nModels = models.size();
-        if (nModels < 3 || nDatasets < 3) {
-            throw runtime_error("Can't make the Friedman test with less than 3 models and/or less than 3 datasets.");
-        }
-        cout << Colors::BLUE() << endl;
-        cout << "***************************************************************************************************************" << endl;
-        cout << Colors::GREEN() << "Friedman test: H0: 'There is no significant differences between all the classifiers.'" << Colors::BLUE() << endl;
-        double degreesOfFreedom = nModels - 1.0;
-        double sumSquared = 0;
-        for (const auto& rank : ranks) {
-            sumSquared += pow(rank.second, 2);
-        }
-        // Compute the Friedman statistic as in https://link.springer.com/article/10.1007/s44196-022-00083-8
-        double friedmanQ = 12.0 * nDatasets / (nModels * (nModels + 1)) * (sumSquared - (nModels * pow(nModels + 1, 2)) / 4);
-        cout << "Friedman statistic: " << friedmanQ << endl;
-        // Calculate the critical value
-        boost::math::chi_squared chiSquared(degreesOfFreedom);
-        long double p_value = (long double)1.0 - cdf(chiSquared, friedmanQ);
-        double criticalValue = quantile(chiSquared, 1 - significance);
-        std::cout << "Critical Chi-Square Value for df=" << fixed << (int)degreesOfFreedom
-            << " and alpha=" << setprecision(2) << fixed << significance << ": " << setprecision(7) << scientific << criticalValue << std::endl;
-        cout << "p-value: " << scientific << p_value << " is " << (p_value < significance ? "less" : "greater") << " than " << setprecision(2) << fixed << significance << endl;
-        bool result;
-        if (p_value < significance) {
-            cout << Colors::GREEN() << "The null hypothesis H0 is rejected." << endl;
-            result = true;
-        } else {
-            cout << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << endl;
-            result = false;
-        }
-        cout << Colors::BLUE() << "***************************************************************************************************************" << endl;
-        return result;
-    }
-    void BestResults::printTableResults(set<string> models, json table)
+    void BestResults::printTableResults(vector<string> models, json table)
    {
        cout << Colors::GREEN() << "Best results for " << score << " as of " << table.at("dateTable").get<string>() << endl;
        cout << "------------------------------------------------" << endl;
@@ -386,8 +220,6 @@ namespace platform {
        auto i = 0;
        bool odd = true;
        map<string, double> totals;
-        map<string, float> ranks;
-        map<string, float> ranksTotal;
        int nDatasets = table.begin().value().size();
        for (const auto& model : models) {
            totals[model] = 0.0;
@@ -398,23 +230,12 @@ namespace platform {
            cout << color << setw(3) << fixed << right << i++ << " ";
            cout << setw(25) << left << item.key() << " ";
            double maxValue = 0;
-            vector<pair<string, double>> ranksOrder;
            // Find out the max value for this dataset
            for (const auto& model : models) {
                double value = table[model].at(item.key()).at(0).get<double>();
                if (value > maxValue) {
                    maxValue = value;
                }
-                ranksOrder.push_back({ model, value });
-            }
-            // Assign the ranks
-            ranks = assignRanks(ranksOrder);
-            if (ranksTotal.size() == 0) {
-                ranksTotal = ranks;
-            } else {
-                for (const auto& rank : ranks) {
-                    ranksTotal[rank.first] += rank.second;
-                }
            }
            // Print the row with red colors on max values
            for (const auto& model : models) {
@@ -425,7 +246,6 @@ namespace platform {
                }
                totals[model] += value;
                cout << efectiveColor << setw(12) << setprecision(10) << fixed << value << " ";
-                // cout << efectiveColor << setw(12) << setprecision(10) << fixed << ranks[model] << " ";
            }
            cout << endl;
            odd = !odd;
@@ -449,50 +269,7 @@ namespace platform {
            }
            cout << efectiveColor << setw(12) << setprecision(9) << fixed << totals[model] << " ";
        }
-        // Output the averaged ranks
        cout << endl;
-        int min = 1;
-        for (auto& rank : ranksTotal) {
-            if (rank.second < min) {
-                min = rank.second;
-            }
-            rank.second /= nDatasets;
-        }
-        cout << Colors::BLUE() << setw(30) << "    Ranks....................";
-        for (const auto& model : models) {
-            string efectiveColor = Colors::BLUE();
-            if (ranksTotal[model] == min) {
-                efectiveColor = Colors::RED();
-            }
-            cout << efectiveColor << setw(12) << setprecision(4) << fixed << (double)ranksTotal[model] << " ";
-        }
-        cout << endl;
-        cout << Colors::GREEN() << setw(30) << "    Averaged ranks...........";
-        for (const auto& model : models) {
-            string efectiveColor = Colors::GREEN();
-            if (ranksTotal[model] == min) {
-                efectiveColor = Colors::RED();
-            }
-            cout << efectiveColor << setw(12) << setprecision(9) << fixed << (double)ranksTotal[model] << " ";
-        }
-        cout << endl;
-        vector<string> vModels(models.begin(), models.end());
-        vector<string> datasets;
-        for (const auto& dataset : table.begin().value().items()) {
-            datasets.push_back(dataset.key());
-        }
-        double significance = 0.05;
-        if (friedman) {
-            friedmanTest(vModels, nDatasets, ranksTotal, significance);
-            // Stablish the control model as the one with the lowest averaged rank
-            int controlIdx = distance(ranks.begin(), min_element(ranks.begin(), ranks.end(), [](const auto& l, const auto& r) { return l.second < r.second; }));
-            auto wtl = computeWTL(controlIdx, vModels, table);
-            postHocHolm(controlIdx, vModels, nDatasets, ranksTotal, significance, wtl);
-        }
-
-        Statistics stats(vModels, datasets, table, significance);
-        stats.friedmanTest();
-        stats.postHocHolmTest();
    }
    void BestResults::reportAll()
    {
@@ -501,5 +278,16 @@ namespace platform {
        json table = buildTableResults(models);
        // Print the table of results
        printTableResults(models, table);
+        // Compute the Friedman test
+        if (friedman) {
+            vector<string> datasets;
+            for (const auto& dataset : table.begin().value().items()) {
+                datasets.push_back(dataset.key());
+            }
+            double significance = 0.05;
+            Statistics stats(models, datasets, table, significance);
+            auto result = stats.friedmanTest();
+            stats.postHocHolmTest(result);
+        }
    }
 }
--- a/src/Platform/BestResults.h
+++ b/src/Platform/BestResults.h
@@ -14,10 +14,10 @@ namespace platform {
        void reportAll();
        void buildAll();
    private:
-        set<string> getModels();
+        vector<string> getModels();
        vector<string> loadResultFiles();
-        json buildTableResults(set<string> models);
-        void printTableResults(set<string> models, json table);
+        json buildTableResults(vector<string> models);
+        void printTableResults(vector<string> models, json table);
        string bestResultFile();
        json loadFile(const string& fileName);
        string path;
--- a/src/Platform/Statistics.cc
+++ b/src/Platform/Statistics.cc
@@ -102,7 +102,7 @@ namespace platform {
        }
    }

-    void Statistics::postHocHolmTest()
+    void Statistics::postHocHolmTest(bool friedmanResult)
    {
        if (!fitted) {
            fit();
@@ -139,7 +139,8 @@ namespace platform {
            p_value = max(before, p_value);
            statsOrder[i] = { item.first, p_value };
        }
-        cout << Colors::MAGENTA();
+        auto color = friedmanResult ? Colors::GREEN() : Colors::YELLOW();
+        cout << color;
        cout << "  *************************************************************************************************************" << endl;
        cout << "  Post-hoc Holm test: H0: 'There is no significant differences between the control model and the other models.'" << endl;
        cout << "  Control model: " << models[controlIdx] << endl;
@@ -203,7 +204,7 @@ namespace platform {
            cout << Colors::YELLOW() << "The null hypothesis H0 is accepted. Computed p-values will not be significant." << endl;
            result = false;
        }
-        cout << Colors::BLUE() << "***************************************************************************************************************" << endl;
+        cout << Colors::BLUE() << "***************************************************************************************************************" << Colors::RESET() << endl;
        return result;
    }
 } // namespace platform
--- a/src/Platform/Statistics.h
+++ b/src/Platform/Statistics.h
@@ -17,7 +17,7 @@ namespace platform {
    public:
        Statistics(vector<string>& models, vector<string>& datasets, json data, double significance = 0.05);
        bool friedmanTest();
-        void postHocHolmTest();
+        void postHocHolmTest(bool friedmanResult);
    private:
        void fit();
        void computeRanks();