Begin b_list excel

2024-02-29 12:53:11 +01:00
parent 9a26baec47
commit c69dc08134
58 changed files with 148 additions and 39 deletions
--- a/src/common/CLocale.h
+++ b/src/common/CLocale.h
@@ -0,0 +1,22 @@
+#ifndef LOCALE_H
+#define LOCALE_H
+#include <locale>
+#include <iostream>
+#include <string>
+namespace platform {
+    /// Number-punctuation facet: ',' as the decimal mark and '.' between digit groups.
+    struct separation : std::numpunct<char> {
+        /// Character written between the integer and the fractional part.
+        char do_decimal_point() const override
+        {
+            return ',';
+        }
+        /// Character written between digit groups.
+        char do_thousands_sep() const override
+        {
+            return '.';
+        }
+        /// Grouping rule: one byte with value 3, i.e. digits are grouped in threes.
+        std::string do_grouping() const override
+        {
+            return "\03";
+        }
+    };
+    /// Constructing a ConfigLocale installs the `separation` facet for the whole process.
+    class ConfigLocale {
+    public:
+        /// Builds a locale from the one std::cout currently uses plus a new
+        /// `separation` facet, then makes it the global locale and imbues
+        /// std::cout with it.
+        explicit ConfigLocale()
+        {
+            const std::locale withSeparators(std::cout.getloc(), new separation);
+            std::locale::global(withSeparators);
+            std::cout.imbue(withSeparators);
+        }
+    };
+}
+#endif 
--- a/src/common/Colors.h
+++ b/src/common/Colors.h
@@ -0,0 +1,15 @@
+#ifndef COLORS_H
+#define COLORS_H
+// <string> is required here: every accessor returns std::string, and this
+// header must compile without relying on what its includer pulled in first.
+#include <string>
+/// ANSI terminal escape sequences used to colour console output.
+/// Each accessor returns the sequence as a std::string; RESET() restores the
+/// terminal's default attributes.
+class Colors {
+public:
+    static std::string MAGENTA() { return "\033[1;35m"; }
+    static std::string BLUE() { return "\033[1;34m"; }
+    static std::string CYAN() { return "\033[1;36m"; }
+    static std::string GREEN() { return "\033[1;32m"; }
+    static std::string YELLOW() { return "\033[1;33m"; }
+    static std::string RED() { return "\033[1;31m"; }
+    static std::string WHITE() { return "\033[1;37m"; }
+    /// Unlike the others this sequence uses attribute 0 and colour code 94.
+    static std::string IBLUE() { return "\033[0;94m"; }
+    static std::string RESET() { return "\033[0m"; }
+};
+#endif // COLORS_H
--- a/src/common/Dataset.cc
+++ b/src/common/Dataset.cc
@@ -0,0 +1,215 @@
+#include "Dataset.h"
+#include "ArffFiles.h"
+#include <fstream>
+namespace platform {
+    /// Copy constructor: copy-constructs every data member from `dataset`.
+    /// Initialisers are listed in the order the members are declared.
+    Dataset::Dataset(const Dataset& dataset)
+        : path(dataset.path),
+        name(dataset.name),
+        fileType(dataset.fileType),
+        className(dataset.className),
+        n_samples(dataset.n_samples),
+        n_features(dataset.n_features),
+        features(dataset.features),
+        states(dataset.states),
+        loaded(dataset.loaded),
+        discretize(dataset.discretize),
+        X(dataset.X),
+        y(dataset.y),
+        Xv(dataset.Xv),
+        Xd(dataset.Xd),
+        yv(dataset.yv)
+    {
+    }
+    /// Returns a copy of the dataset name.
+    std::string Dataset::getName() const
+    {
+        return this->name;
+    }
+    /// Returns a copy of the class (target) column name currently stored.
+    std::string Dataset::getClassName() const
+    {
+        return this->className;
+    }
+    /// Returns a copy of the feature names.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::vector<std::string> Dataset::getFeatures() const
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return features;
+    }
+    /// Returns the number of feature columns recorded by load().
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    int Dataset::getNFeatures() const
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return n_features;
+    }
+    /// Returns the number of samples recorded by load().
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    int Dataset::getNSamples() const
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return n_samples;
+    }
+    /// Returns a copy of the per-variable state lists (keyed by feature/class name).
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::map<std::string, std::vector<int>> Dataset::getStates() const
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return states;
+    }
+    /// Returns references to the raw feature columns (Xv) and the labels (yv).
+    /// The references point at members of this object.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return { Xv, yv };
+    }
+    /// Returns references to the discretized feature columns (Xd) and the labels (yv).
+    /// The references point at members of this object.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return { Xd, yv };
+    }
+    /// Rebuilds the X and y tensors from the stored vectors and returns
+    /// references to them. The tensors are rebuilt on every call.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
+    {
+        if (!loaded) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        buildTensors();
+        return { X, y };
+    }
+    /// Reads `path + "/" + name + ".csv"` into Xv (one vector per feature) and yv.
+    /// The first line is the header: every column but the last is a feature and
+    /// the last is the class, whose name replaces className when it is "-1".
+    /// Blank lines are skipped.
+    /// @throws std::invalid_argument if the file cannot be opened, the header is
+    ///         empty, or a row has too few columns to supply every feature
+    ///         plus the class.
+    void Dataset::load_csv()
+    {
+        std::ifstream file(path + "/" + name + ".csv");
+        if (!file.is_open()) {
+            throw std::invalid_argument("Unable to open dataset file.");
+        }
+        std::string line;
+        std::getline(file, line);
+        std::vector<std::string> tokens = split(line, ',');
+        if (tokens.empty()) {
+            // `tokens.end() - 1` and `tokens.back()` are undefined on an empty header.
+            throw std::invalid_argument("Dataset file has no header line.");
+        }
+        features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
+        if (className == "-1") {
+            className = tokens.back();
+        }
+        for (std::size_t i = 0; i < features.size(); ++i) {
+            Xv.push_back(std::vector<float>());
+        }
+        while (std::getline(file, line)) {
+            if (trim(line).empty()) {
+                // Tolerate blank lines, e.g. a trailing newline at end of file.
+                continue;
+            }
+            tokens = split(line, ',');
+            if (tokens.size() <= features.size()) {
+                // Without this check tokens[i] would be read out of range.
+                throw std::invalid_argument("Malformed row in dataset file.");
+            }
+            for (std::size_t i = 0; i < features.size(); ++i) {
+                Xv[i].push_back(std::stof(tokens[i]));
+            }
+            yv.push_back(std::stoi(tokens.back()));
+        }
+        // The stream closes itself when `file` goes out of scope.
+    }
+    /// Fills `states` with the list 0..max for every discretized feature
+    /// (from Xd) and for the class (from yv).
+    /// The lists are written in place; the previous code filled a copy of each
+    /// feature's vector, leaving the stored states all zero.
+    void Dataset::computeStates()
+    {
+        // Writes 0..max(column) into `values`; an empty column yields no states
+        // rather than dereferencing max_element's end iterator.
+        const auto fillStates = [](std::vector<int>& values, const std::vector<int>& column) {
+            const int highest = column.empty() ? -1 : *std::max_element(column.begin(), column.end());
+            const std::size_t count = highest < 0 ? 0 : static_cast<std::size_t>(highest) + 1;
+            values.assign(count, 0);
+            std::iota(values.begin(), values.end(), 0);
+        };
+        for (std::size_t i = 0; i < features.size(); ++i) {
+            fillStates(states[features[i]], Xd[i]);
+        }
+        fillStates(states[className], yv);
+    }
+    /// Reads `path + "/" + name + ".arff"` through ArffFiles and copies its
+    /// samples, labels, class name and attribute names into this object.
+    void Dataset::load_arff()
+    {
+        ArffFiles arff{};
+        arff.load(path + "/" + name + ".arff", className);
+        // Samples and labels as returned by the reader
+        Xv = arff.getX();
+        yv = arff.getY();
+        // Class name as reported by the reader
+        className = arff.getClassName();
+        // The first element of each attribute entry is appended as a feature name
+        for (const auto& attribute : arff.getAttributes()) {
+            features.push_back(attribute.first);
+        }
+    }
+    /// Splits `line` into tokens separated by runs of spaces, tabs or newlines.
+    /// Leading and trailing separators are ignored, so no empty token is ever
+    /// produced (the previous version emitted one for leading whitespace).
+    /// @param line text to split; taken by value to keep the original signature.
+    /// @return the tokens in order of appearance; empty if there are none.
+    std::vector<std::string> tokenize(std::string line)
+    {
+        static const char* const separators = " \t\n";
+        std::vector<std::string> tokens;
+        // Single left-to-right scan instead of repeatedly erasing the front.
+        std::size_t start = line.find_first_not_of(separators);
+        while (start != std::string::npos) {
+            const std::size_t stop = line.find_first_of(separators, start);
+            // When stop is npos, substr clamps the length to the end of the line.
+            tokens.push_back(line.substr(start, stop - start));
+            start = line.find_first_not_of(separators, stop);
+        }
+        return tokens;
+    }
+    /// Reads `path + "/" + name + "_R.dat"` into Xv (one vector per feature) and yv.
+    /// The header line names the features followed by the class; the class name
+    /// replaces className when it is "-1". In each data row the first token is
+    /// skipped as the instance number, the next features.size() tokens are the
+    /// feature values and the last token is the class. Blank lines are skipped.
+    /// @throws std::invalid_argument if the file cannot be opened, the header is
+    ///         empty, or a row has too few tokens.
+    void Dataset::load_rdata()
+    {
+        std::ifstream file(path + "/" + name + "_R.dat");
+        if (!file.is_open()) {
+            throw std::invalid_argument("Unable to open dataset file.");
+        }
+        std::string line;
+        std::getline(file, line);
+        line = ArffFiles::trim(line);
+        std::vector<std::string> tokens = tokenize(line);
+        if (tokens.empty()) {
+            // `tokens.end() - 1` and `tokens.back()` are undefined on an empty header.
+            throw std::invalid_argument("Dataset file has no header line.");
+        }
+        std::transform(tokens.begin(), tokens.end() - 1, std::back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
+        if (className == "-1") {
+            className = ArffFiles::trim(tokens.back());
+        }
+        for (std::size_t i = 0; i < features.size(); ++i) {
+            Xv.push_back(std::vector<float>());
+        }
+        while (std::getline(file, line)) {
+            if (ArffFiles::trim(line).empty()) {
+                // Tolerate blank lines, e.g. a trailing newline at end of file.
+                continue;
+            }
+            tokens = tokenize(line);
+            if (tokens.size() <= features.size()) {
+                // Without this check tokens[i + 1] would be read out of range.
+                throw std::invalid_argument("Malformed row in dataset file.");
+            }
+            // We have to skip the first token, which is the instance number.
+            for (std::size_t i = 0; i < features.size(); ++i) {
+                Xv[i].push_back(std::stof(tokens[i + 1]));
+            }
+            yv.push_back(std::stoi(tokens.back()));
+        }
+        // The stream closes itself when `file` goes out of scope.
+    }
+    /// Loads the dataset from disk once; later calls return immediately.
+    /// Dispatches on fileType to the matching reader, optionally discretizes
+    /// the features and computes their states, then records the sample and
+    /// feature counts and marks the dataset as loaded.
+    void Dataset::load()
+    {
+        if (loaded) {
+            return;
+        }
+        switch (fileType) {
+            case CSV:
+                load_csv();
+                break;
+            case ARFF:
+                load_arff();
+                break;
+            case RDATA:
+                load_rdata();
+                break;
+        }
+        if (discretize) {
+            Xd = discretizeDataset(Xv, yv);
+            computeStates();
+        }
+        // Xv[0] does not exist when no feature column was read; report 0 samples
+        // instead of indexing an empty vector.
+        n_samples = Xv.empty() ? 0 : static_cast<int>(Xv[0].size());
+        n_features = static_cast<int>(Xv.size());
+        loaded = true;
+    }
+    /// Builds tensor X with n_features rows and n_samples columns, and tensor y
+    /// from yv. X is Int32 filled from Xd when discretize is set, otherwise
+    /// Float32 filled from Xv; y is always Int32.
+    void Dataset::buildTensors()
+    {
+        const int rows = static_cast<int>(n_features);
+        const int cols = static_cast<int>(n_samples);
+        const auto dtype = discretize ? torch::kInt32 : torch::kFloat32;
+        X = torch::zeros({ rows, cols }, dtype);
+        const int featureCount = static_cast<int>(features.size());
+        for (int i = 0; i < featureCount; ++i) {
+            // Row i of X receives the i-th feature column.
+            const torch::Tensor row = discretize ? torch::tensor(Xd[i], torch::kInt32) : torch::tensor(Xv[i], torch::kFloat32);
+            X.index_put_({ i,  "..." }, row);
+        }
+        y = torch::tensor(yv, torch::kInt32);
+    }
+    /// Fits one `mdlp::CPPFImdlp` instance on each feature vector of `X`
+    /// together with `y`, and collects a copy of each transformed vector.
+    /// @return one discretized vector per entry of `X`, in the same order.
+    std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+    {
+        std::vector<mdlp::labels_t> discretized;
+        mdlp::CPPFImdlp fimdlp{};
+        for (auto& column : X) {
+            fimdlp.fit(column, y);
+            // transform() hands back a reference; push_back stores a copy of it.
+            discretized.push_back(fimdlp.transform(column));
+        }
+        return discretized;
+    }
+}
--- a/src/common/Dataset.h
+++ b/src/common/Dataset.h
@@ -0,0 +1,78 @@
+#ifndef DATASET_H
+#define DATASET_H
+#include <torch/torch.h>
+#include <map>
+#include <vector>
+#include <string>
+#include "CPPFImdlp.h"
+#include "Utils.h"
+namespace platform {
+    /// File formats a dataset can be read from.
+    enum fileType_t {
+        CSV,
+        ARFF,
+        RDATA
+    };
+    /// Maps a data-source name to the directory its files live in and the
+    /// file format they use.
+    class SourceData {
+    public:
+        /// @param source one of "Surcov", "Arff" or "Tanveer".
+        /// @throws std::invalid_argument for any other name.
+        SourceData(const std::string& source)
+        {
+            if (source == "Surcov") {
+                path = "datasets/";
+                fileType = CSV;
+            } else if (source == "Arff") {
+                path = "datasets/";
+                fileType = ARFF;
+            } else if (source == "Tanveer") {
+                path = "data/";
+                fileType = RDATA;
+            } else {
+                throw std::invalid_argument("Unknown source.");
+            }
+        }
+        /// Directory of the source, including the trailing '/'.
+        /// Const-qualified so it can be called on a const SourceData.
+        std::string getPath() const
+        {
+            return path;
+        }
+        /// File format of the source.
+        /// Const-qualified so it can be called on a const SourceData.
+        fileType_t getFileType() const
+        {
+            return fileType;
+        }
+    private:
+        std::string path;
+        fileType_t fileType;
+    };
+    /// One dataset: its location and format, and — after load() — its feature
+    /// names, raw and discretized columns, labels, states and tensors.
+    /// Standard-library names are fully qualified so this header does not
+    /// depend on a `using namespace std` leaking in from another include.
+    class Dataset {
+    private:
+        std::string path;
+        std::string name;
+        fileType_t fileType;
+        std::string className;
+        int n_samples{ 0 }, n_features{ 0 };
+        std::vector<std::string> features;
+        std::map<std::string, std::vector<int>> states;
+        bool loaded;
+        bool discretize;
+        torch::Tensor X, y;
+        std::vector<std::vector<float>> Xv; // raw values, one vector per feature
+        std::vector<std::vector<int>> Xd;   // discretized values, one vector per feature
+        std::vector<int> yv;                // class labels
+        void buildTensors();
+        void load_csv();
+        void load_arff();
+        void load_rdata();
+        void computeStates();
+        std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
+    public:
+        /// Stores the description of the dataset; nothing is read until load().
+        /// Initialisers follow the member declaration order, which is the order
+        /// in which they actually run.
+        Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), fileType(fileType), className(className), loaded(false), discretize(discretize) {}
+        explicit Dataset(const Dataset&);
+        std::string getName() const;
+        std::string getClassName() const;
+        std::vector<std::string> getFeatures() const;
+        std::map<std::string, std::vector<int>> getStates() const;
+        std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors();
+        std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
+        std::pair<torch::Tensor&, torch::Tensor&> getTensors();
+        int getNFeatures() const;
+        int getNSamples() const;
+        void load();
+        /// True once load() has completed.
+        bool isLoaded() const { return loaded; }
+    };
+};
+
+#endif
--- a/src/common/Datasets.cc
+++ b/src/common/Datasets.cc
@@ -0,0 +1,129 @@
+#include "Datasets.h"
+#include <fstream>
+namespace platform {
+    /// Reads the catalog `<path>all.txt` of the configured source and registers
+    /// one (not yet loaded) Dataset per entry.
+    /// Each entry is `name[,className]`; a missing class name is stored as "-1".
+    /// Lines that are empty or start with '#' are ignored. Lines and fields are
+    /// trimmed so stray spaces or a trailing carriage return do not become part
+    /// of the dataset or class name.
+    /// @throws std::invalid_argument if the catalog cannot be opened.
+    void Datasets::load()
+    {
+        auto sd = SourceData(sfileType);
+        fileType = sd.getFileType();
+        path = sd.getPath();
+        std::ifstream catalog(path + "all.txt");
+        if (!catalog.is_open()) {
+            throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
+        }
+        std::string line;
+        while (std::getline(catalog, line)) {
+            line = trim(line);
+            if (line.empty() || line[0] == '#') {
+                continue;
+            }
+            // A non-empty line always yields at least one token.
+            const std::vector<std::string> tokens = split(line, ',');
+            const std::string name = trim(tokens[0]);
+            if (name.empty()) {
+                continue;
+            }
+            const std::string className = tokens.size() == 1 ? std::string("-1") : trim(tokens[1]);
+            datasets[name] = std::make_unique<Dataset>(path, name, className, discretize, fileType);
+        }
+        // The stream closes itself when `catalog` goes out of scope.
+    }
+    /// Returns the names of all catalogued datasets, in the map's key order.
+    std::vector<std::string> Datasets::getNames()
+    {
+        std::vector<std::string> names;
+        for (const auto& entry : datasets) {
+            names.push_back(entry.first);
+        }
+        return names;
+    }
+    /// Returns the feature names of dataset `name`.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::vector<std::string> Datasets::getFeatures(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return dataset->getFeatures();
+    }
+    /// Returns the state lists of dataset `name`.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return dataset->getStates();
+    }
+    /// Loads dataset `name` from disk unless it is already loaded.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    void Datasets::loadDataset(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            dataset->load();
+        }
+    }
+    /// Returns the class name of dataset `name`.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    std::string Datasets::getClassName(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return dataset->getClassName();
+    }
+    /// Returns the number of samples of dataset `name`.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    int Datasets::getNSamples(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        return dataset->getNSamples();
+    }
+    /// Returns the number of classes of dataset `name`.
+    /// When discretize is set this is the size of the class state list;
+    /// otherwise it is the largest label plus one, or 0 when there are no
+    /// labels (max_element on an empty range must not be dereferenced).
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded.
+    int Datasets::getNClasses(const std::string& name)
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        if (discretize) {
+            const auto states = dataset->getStates();
+            return static_cast<int>(states.at(dataset->getClassName()).size());
+        }
+        // `.second` is a reference to the dataset's own label vector.
+        const std::vector<int>& labels = dataset->getVectors().second;
+        if (labels.empty()) {
+            return 0;
+        }
+        return *std::max_element(labels.begin(), labels.end()) + 1;
+    }
+    /// Counts how many samples of dataset `name` carry each class label.
+    /// @return a vector where element k is the number of samples labelled k;
+    ///         its size is the largest label plus one, or 0 without samples.
+    /// @throws std::out_of_range (from map::at) if `name` is not catalogued.
+    /// @throws std::invalid_argument if the dataset has not been loaded or a
+    ///         label is negative (it cannot be used as an index).
+    std::vector<int> Datasets::getClassesCounts(const std::string& name) const
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            throw std::invalid_argument("Dataset not loaded.");
+        }
+        // `.second` is a reference to the dataset's own label vector.
+        const std::vector<int>& labels = dataset->getVectors().second;
+        std::vector<int> counts;
+        for (const int label : labels) {
+            if (label < 0) {
+                throw std::invalid_argument("Class labels must be non-negative.");
+            }
+            // Grow on demand so the result ends up sized to the largest label + 1.
+            if (static_cast<std::size_t>(label) >= counts.size()) {
+                counts.resize(static_cast<std::size_t>(label) + 1);
+            }
+            ++counts[label];
+        }
+        return counts;
+    }
+    /// Returns references to the raw feature columns and labels of dataset
+    /// `name`, loading it first if necessary.
+    /// Uses map::at so an unknown name throws instead of inserting a null
+    /// entry into the catalog and dereferencing it.
+    /// @throws std::out_of_range if `name` is not catalogued.
+    std::pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            dataset->load();
+        }
+        return dataset->getVectors();
+    }
+    /// Returns references to the discretized feature columns and labels of
+    /// dataset `name`, loading it first if necessary.
+    /// Uses map::at so an unknown name throws instead of inserting a null
+    /// entry into the catalog and dereferencing it.
+    /// @throws std::out_of_range if `name` is not catalogued.
+    std::pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            dataset->load();
+        }
+        return dataset->getVectorsDiscretized();
+    }
+    /// Returns references to the X and y tensors of dataset `name`, loading it
+    /// first if necessary.
+    /// Uses map::at so an unknown name throws instead of inserting a null
+    /// entry into the catalog and dereferencing it.
+    /// @throws std::out_of_range if `name` is not catalogued.
+    std::pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
+    {
+        const auto& dataset = datasets.at(name);
+        if (!dataset->isLoaded()) {
+            dataset->load();
+        }
+        return dataset->getTensors();
+    }
+    /// True when `name` is a key of the catalog.
+    bool Datasets::isDataset(const std::string& name) const
+    {
+        return datasets.count(name) != 0;
+    }
+}
--- a/src/common/Datasets.h
+++ b/src/common/Datasets.h
@@ -0,0 +1,30 @@
+#ifndef DATASETS_H
+#define DATASETS_H
+#include "Dataset.h"
+namespace platform {
+    /// Catalog of the datasets available from one data source.
+    /// The constructor reads the catalog; individual datasets are read from
+    /// disk later, either through loadDataset() or on first use by the
+    /// getVectors*/getTensors accessors.
+    /// Standard-library names are fully qualified so this header does not
+    /// depend on a `using namespace std` leaking in from another include.
+    class Datasets {
+    private:
+        std::string path;
+        fileType_t fileType;
+        std::string sfileType;
+        std::map<std::string, std::unique_ptr<Dataset>> datasets;
+        bool discretize;
+        void load(); // Loads the list of datasets
+    public:
+        /// @param discretize forwarded to every Dataset created by load().
+        /// @param sfileType  source name handed to SourceData.
+        /// Initialisers follow the member declaration order, which is the order
+        /// in which they actually run.
+        explicit Datasets(bool discretize, std::string sfileType) : sfileType(sfileType), discretize(discretize) { load(); }
+        std::vector<std::string> getNames();
+        std::vector<std::string> getFeatures(const std::string& name) const;
+        int getNSamples(const std::string& name) const;
+        std::string getClassName(const std::string& name) const;
+        int getNClasses(const std::string& name);
+        std::vector<int> getClassesCounts(const std::string& name) const;
+        std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
+        std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
+        std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
+        std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
+        bool isDataset(const std::string& name) const;
+        void loadDataset(const std::string& name) const;
+    };
+};
+
+#endif
--- a/src/common/DotEnv.h
+++ b/src/common/DotEnv.h
@@ -0,0 +1,55 @@
+#ifndef DOTENV_H
+#define DOTENV_H
+#include <string>
+#include <map>
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include <iostream>
+#include "Utils.h"
+
+//#include "Dataset.h"
+namespace platform {
+    /// Key/value settings read from the `.env` file in the working directory.
+    class DotEnv {
+    private:
+        std::map<std::string, std::string> env;
+    public:
+        /// Parses `.env`: one `key=value` per line, split at the first '='.
+        /// Empty lines and lines starting with '#' are ignored, as are lines
+        /// without '=' or with an empty key or value. Keys and values are
+        /// trimmed, so `key = value` is stored as "key" -> "value".
+        /// If the file cannot be opened, a message is written to std::cerr and
+        /// the process exits with status 1.
+        DotEnv()
+        {
+            std::ifstream file(".env");
+            if (!file.is_open()) {
+                std::cerr << "File .env not found" << std::endl;
+                exit(1);
+            }
+            std::string line;
+            while (std::getline(file, line)) {
+                line = trim(line);
+                if (line.empty() || line[0] == '#') {
+                    continue;
+                }
+                const auto separator = line.find('=');
+                if (separator == std::string::npos) {
+                    continue;
+                }
+                const std::string key = trim(line.substr(0, separator));
+                const std::string value = trim(line.substr(separator + 1));
+                if (!key.empty() && !value.empty()) {
+                    env[key] = value;
+                }
+            }
+        }
+        /// Returns the value stored for `key`.
+        /// @throws std::out_of_range (from map::at) if the key is absent.
+        std::string get(const std::string& key) const
+        {
+            return env.at(key);
+        }
+        /// Parses the `seeds` entry, a comma-separated list of integers that
+        /// may be wrapped in square brackets, e.g. `[271, 314, 171]`.
+        /// Only a leading '[' and a trailing ']' are removed; the map is not
+        /// modified by the lookup.
+        /// @throws std::out_of_range (from map::at) if `seeds` is absent.
+        /// @throws std::invalid_argument (from std::stoi) if an item is not a number.
+        std::vector<int> getSeeds() const
+        {
+            std::string seeds_str = trim(env.at("seeds"));
+            if (!seeds_str.empty() && seeds_str.front() == '[') {
+                seeds_str.erase(0, 1);
+            }
+            if (!seeds_str.empty() && seeds_str.back() == ']') {
+                seeds_str.pop_back();
+            }
+            std::vector<int> seeds;
+            for (const auto& item : split(seeds_str, ',')) {
+                seeds.push_back(std::stoi(item));
+            }
+            return seeds;
+        }
+    };
+}
+#endif
--- a/src/common/Paths.h
+++ b/src/common/Paths.h
@@ -0,0 +1,39 @@
+#ifndef PATHS_H
+#define PATHS_H
+#include <string>
+#include <filesystem>
+#include "DotEnv.h"
+namespace platform {
+    /// Central place for the directories and file names the platform uses.
+    /// All directory names end with '/'.
+    class Paths {
+    public:
+        static std::string results() { return "results/"; }
+        static std::string hiddenResults() { return "hidden_results/"; }
+        static std::string excel() { return "excel/"; }
+        static std::string grid() { return "grid/"; }
+        /// Value of `source_data` in the `.env` file; the file is re-read on
+        /// every call because a fresh DotEnv is constructed each time.
+        static std::string datasets()
+        {
+            auto env = platform::DotEnv();
+            return env.get("source_data");
+        }
+        /// Creates `path` if it does not exist, including any missing parent
+        /// directories. An already existing directory is not an error.
+        /// @throws std::runtime_error if the directory cannot be created.
+        static void createPath(const std::string& path)
+        {
+            try {
+                std::filesystem::create_directories(path);
+            }
+            catch (const std::exception&) {
+                throw std::runtime_error("Could not create directory " + path);
+            }
+        }
+        static std::string excelResults() { return "some_results.xlsx"; }
+        /// `grid/grid_<model>_input.json`
+        static std::string grid_input(const std::string& model)
+        {
+            return grid() + "grid_" + model + "_input.json";
+        }
+        /// `grid/grid_<model>_output.json`
+        static std::string grid_output(const std::string& model)
+        {
+            return grid() + "grid_" + model + "_output.json";
+        }
+    };
+}
+#endif
--- a/src/common/Symbols.h
+++ b/src/common/Symbols.h
@@ -0,0 +1,18 @@
+#ifndef SYMBOLS_H
+#define SYMBOLS_H
+#include <string>
+namespace platform {
+    /// Unicode glyphs used in textual reports, stored as std::string constants.
+    class Symbols {
+    public:
+        inline static const std::string check_mark = "\u2714";
+        inline static const std::string exclamation = "\u2757";
+        inline static const std::string black_star = "\u2605";
+        inline static const std::string cross = "\u2717";
+        inline static const std::string upward_arrow = "\u27B6";
+        inline static const std::string down_arrow = "\u27B4";
+        // Aliases: copies of the glyphs defined above
+        inline static const std::string equal_best = check_mark;
+        inline static const std::string better_best = black_star;
+        inline static const std::string notebook = "\U0001F5C8";
+    };
+}
+#endif // !SYMBOLS_H
--- a/src/common/Timer.h
+++ b/src/common/Timer.h
@@ -0,0 +1,43 @@
+#ifndef TIMER_H
+#define TIMER_H
+#include <chrono>
+#include <string>
+#include <sstream>
+
+namespace platform {
+    /// Stopwatch built on std::chrono::high_resolution_clock.
+    class Timer {
+    private:
+        std::chrono::high_resolution_clock::time_point begin;
+        std::chrono::high_resolution_clock::time_point end;
+    public:
+        Timer() = default;
+        ~Timer() = default;
+        /// Records the current time as the start point.
+        void start() { begin = std::chrono::high_resolution_clock::now(); }
+        /// Records the current time as the end point.
+        void stop() { end = std::chrono::high_resolution_clock::now(); }
+        /// Seconds from start() until now. Calls stop() first, so any end
+        /// point recorded earlier is overwritten.
+        double getDuration()
+        {
+            stop();
+            return std::chrono::duration<double>(end - begin).count();
+        }
+        /// Seconds from start() until now, without touching the end point.
+        double getLapse() const
+        {
+            return std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - begin).count();
+        }
+        /// Formatted getLapse() when `lapse` is true, otherwise formatted getDuration().
+        std::string getDurationString(bool lapse = false)
+        {
+            const double duration = lapse ? getLapse() : getDuration();
+            return translate2String(duration);
+        }
+        /// Formats `duration` (seconds) with two decimals and a unit suffix:
+        /// hours ("h") above 3600, minutes ("m") above 60, otherwise seconds ("s").
+        std::string translate2String(double duration) const
+        {
+            double durationShow = duration;
+            std::string durationUnit = "s";
+            if (duration > 3600) {
+                durationShow = duration / 3600;
+                durationUnit = "h";
+            } else if (duration > 60) {
+                durationShow = duration / 60;
+                durationUnit = "m";
+            }
+            std::stringstream ss;
+            // precision() is a stream member, so <iomanip> (never included by
+            // this header) is not needed as it was for std::setprecision.
+            ss.precision(2);
+            ss << std::fixed << durationShow << " " << durationUnit;
+            return ss.str();
+        }
+    };
+} /* namespace platform */
+#endif /* TIMER_H */
--- a/src/common/Utils.h
+++ b/src/common/Utils.h
@@ -0,0 +1,30 @@
+#ifndef UTILS_H
+#define UTILS_H
+#include <sstream>
+#include <string>
+#include <vector>
+namespace platform {
+    //static std::vector<std::string> split(const std::string& text, char delimiter);
+    /// Splits `text` at every `delimiter`.
+    /// Empty fields between delimiters are kept; an empty field after the last
+    /// delimiter is not, and an empty `text` yields no tokens.
+    static std::vector<std::string> split(const std::string& text, char delimiter)
+    {
+        std::vector<std::string> pieces;
+        std::string::size_type start = 0;
+        while (start < text.size()) {
+            const std::string::size_type cut = text.find(delimiter, start);
+            if (cut == std::string::npos) {
+                // Last field: everything up to the end of the text.
+                pieces.push_back(text.substr(start));
+                break;
+            }
+            pieces.push_back(text.substr(start, cut - start));
+            start = cut + 1;
+        }
+        return pieces;
+    }
+    /// Returns `str` without leading and trailing whitespace, where whitespace
+    /// is space, tab, newline, vertical tab, form feed and carriage return.
+    /// Implemented with std::string searches only: no byte is ever passed to
+    /// std::isspace (undefined for negative char values), and no header beyond
+    /// <string> is required.
+    static std::string trim(const std::string& str)
+    {
+        static const char* const whitespace = " \t\n\v\f\r";
+        const std::string::size_type first = str.find_first_not_of(whitespace);
+        if (first == std::string::npos) {
+            // Empty or all-whitespace input.
+            return std::string();
+        }
+        const std::string::size_type last = str.find_last_not_of(whitespace);
+        return str.substr(first, last - first + 1);
+    }
+}
+#endif