Begin b_list excel

This commit is contained in:
2024-02-29 12:53:11 +01:00
parent 9a26baec47
commit c69dc08134
58 changed files with 148 additions and 39 deletions

22
src/common/CLocale.h Normal file
View File

@@ -0,0 +1,22 @@
#ifndef LOCALE_H
#define LOCALE_H
#include <locale>
#include <iostream>
#include <string>
namespace platform {
// Numeric punctuation facet: ',' as decimal point, '.' as thousands
// separator, digits grouped three at a time.
struct separation : std::numpunct<char> {
    char do_decimal_point() const override
    {
        return ',';
    }
    char do_thousands_sep() const override
    {
        return '.';
    }
    std::string do_grouping() const override
    {
        return "\03";
    }
};
// Constructing a ConfigLocale installs the `separation` numeric punctuation
// both as the global C++ locale and on std::cout.
class ConfigLocale {
public:
    explicit ConfigLocale()
    {
        // Start from std::cout's current locale and replace only the numpunct facet.
        const std::locale withSeparators(std::cout.getloc(), new separation);
        std::locale::global(withSeparators);
        std::cout.imbue(withSeparators);
    }
};
}
#endif

15
src/common/Colors.h Normal file
View File

@@ -0,0 +1,15 @@
#ifndef COLORS_H
#define COLORS_H
// ANSI escape sequences for coloring terminal output.
class Colors {
public:
    // Bold foreground colors.
    static std::string MAGENTA()
    {
        return std::string("\033[1;35m");
    }
    static std::string BLUE()
    {
        return std::string("\033[1;34m");
    }
    static std::string CYAN()
    {
        return std::string("\033[1;36m");
    }
    static std::string GREEN()
    {
        return std::string("\033[1;32m");
    }
    static std::string YELLOW()
    {
        return std::string("\033[1;33m");
    }
    static std::string RED()
    {
        return std::string("\033[1;31m");
    }
    static std::string WHITE()
    {
        return std::string("\033[1;37m");
    }
    // Regular-weight, high-intensity blue.
    static std::string IBLUE()
    {
        return std::string("\033[0;94m");
    }
    // Resets all attributes.
    static std::string RESET()
    {
        return std::string("\033[0m");
    }
};
#endif // COLORS_H

215
src/common/Dataset.cc Normal file
View File

@@ -0,0 +1,215 @@
#include "Dataset.h"
#include "ArffFiles.h"
#include <fstream>
namespace platform {
// Copy constructor: member-wise copy of every field of `dataset`.
Dataset::Dataset(const Dataset& dataset)
    : path(dataset.path),
      name(dataset.name),
      fileType(dataset.fileType),
      className(dataset.className),
      n_samples(dataset.n_samples),
      n_features(dataset.n_features),
      features(dataset.features),
      states(dataset.states),
      loaded(dataset.loaded),
      discretize(dataset.discretize),
      X(dataset.X),
      y(dataset.y),
      Xv(dataset.Xv),
      Xd(dataset.Xd),
      yv(dataset.yv)
{
}
std::string Dataset::getName() const
{
return name;
}
std::string Dataset::getClassName() const
{
return className;
}
// Returns a copy of the feature names.
// Throws std::invalid_argument if the dataset has not been loaded.
std::vector<std::string> Dataset::getFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return features;
}
// Returns the number of features.
// Throws std::invalid_argument if the dataset has not been loaded.
int Dataset::getNFeatures() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_features;
}
// Returns the number of samples.
// Throws std::invalid_argument if the dataset has not been loaded.
int Dataset::getNSamples() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return n_samples;
}
// Returns a copy of the per-variable state lists.
// Throws std::invalid_argument if the dataset has not been loaded.
std::map<std::string, std::vector<int>> Dataset::getStates() const
{
    if (!loaded) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return states;
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Dataset::getVectors()
{
if (loaded) {
return { Xv, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
if (loaded) {
return { Xd, yv };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
buildTensors();
return { X, y };
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
void Dataset::load_csv()
{
ifstream file(path + "/" + name + ".csv");
if (file.is_open()) {
std::string line;
getline(file, line);
std::vector<std::string> tokens = split(line, ',');
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
if (className == "-1") {
className = tokens.back();
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = split(line, ',');
for (auto i = 0; i < features.size(); ++i) {
Xv[i].push_back(stof(tokens[i]));
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
// Fills `states` with the list {0, 1, ..., max} for every discretized feature
// column (Xd[i]) and for the class labels (yv), keyed by feature/class name.
void Dataset::computeStates()
{
    // Builds {0, ..., max(values)}; an empty column yields an empty list
    // instead of dereferencing the end iterator returned by max_element.
    const auto buildRange = [](const std::vector<int>& values) {
        std::vector<int> range;
        if (!values.empty()) {
            range.resize(*std::max_element(values.begin(), values.end()) + 1);
            std::iota(range.begin(), range.end(), 0);
        }
        return range;
    };
    for (std::size_t i = 0; i < features.size(); ++i) {
        // Assign the filled list directly into the map. The previous code
        // filled a copy of the stored vector, leaving the stored states all zero.
        states[features[i]] = buildRange(Xd[i]);
    }
    states[className] = buildRange(yv);
}
// Loads "<path>/<name>.arff" through ArffFiles and copies the samples, labels,
// class name and attribute names into this dataset.
void Dataset::load_arff()
{
    auto arff = ArffFiles();
    arff.load(path + "/" + name + ".arff", className);
    // Samples and labels
    Xv = arff.getX();
    yv = arff.getY();
    // Class name as reported by the reader
    className = arff.getClassName();
    // Feature names are the first element of each attribute entry
    auto attributes = arff.getAttributes();
    for (const auto& attribute : attributes) {
        features.push_back(attribute.first);
    }
}
// Splits `line` into tokens separated by runs of ' ', '\t' or '\n'.
// Contract (unchanged from the previous implementation):
//  - consecutive separators count as a single separator;
//  - trailing separators produce no token;
//  - a line that starts with a separator yields an empty first token.
// Runs in a single pass instead of repeatedly erasing from the front of the string.
std::vector<std::string> tokenize(std::string line)
{
    const auto isSeparator = [](char c) { return c == ' ' || c == '\t' || c == '\n'; };
    std::vector<std::string> tokens;
    const std::size_t length = line.size();
    if (length > 0 && isSeparator(line[0])) {
        // Preserve the historical empty leading token.
        tokens.emplace_back();
    }
    std::size_t pos = 0;
    while (pos < length) {
        while (pos < length && isSeparator(line[pos])) {
            ++pos;
        }
        if (pos == length) {
            break;
        }
        const std::size_t start = pos;
        while (pos < length && !isSeparator(line[pos])) {
            ++pos;
        }
        tokens.push_back(line.substr(start, pos - start));
    }
    return tokens;
}
void Dataset::load_rdata()
{
ifstream file(path + "/" + name + "_R.dat");
if (file.is_open()) {
std::string line;
getline(file, line);
line = ArffFiles::trim(line);
std::vector<std::string> tokens = tokenize(line);
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
if (className == "-1") {
className = ArffFiles::trim(tokens.back());
}
for (auto i = 0; i < features.size(); ++i) {
Xv.push_back(std::vector<float>());
}
while (getline(file, line)) {
tokens = tokenize(line);
// We have to skip the first token, which is the instance number.
for (auto i = 1; i < features.size() + 1; ++i) {
const float value = stof(tokens[i]);
Xv[i - 1].push_back(value);
}
yv.push_back(stoi(tokens.back()));
}
file.close();
} else {
throw std::invalid_argument("Unable to open dataset file.");
}
}
// Loads the dataset from disk according to fileType, discretizes it when
// requested, and records its dimensions. Does nothing if already loaded.
void Dataset::load()
{
    if (loaded) {
        return;
    }
    switch (fileType) {
        case CSV:
            load_csv();
            break;
        case ARFF:
            load_arff();
            break;
        case RDATA:
            load_rdata();
            break;
    }
    if (discretize) {
        Xd = discretizeDataset(Xv, yv);
        computeStates();
    }
    // Xv is stored column-wise; a dataset without feature columns has no
    // Xv[0] to read the sample count from.
    n_samples = Xv.empty() ? 0 : static_cast<int>(Xv[0].size());
    n_features = static_cast<int>(Xv.size());
    loaded = true;
}
// Builds tensor X with shape (n_features, n_samples) — Int32 from Xd when the
// dataset is discretized, Float32 from Xv otherwise — and tensor y (Int32) from yv.
void Dataset::buildTensors()
{
    const int rows = static_cast<int>(n_features);
    const int cols = static_cast<int>(n_samples);
    if (discretize) {
        X = torch::zeros({ rows, cols }, torch::kInt32);
        // Copy each discretized feature column into its row of X.
        for (int row = 0; row < features.size(); ++row) {
            X.index_put_({ row, "..." }, torch::tensor(Xd[row], torch::kInt32));
        }
    } else {
        X = torch::zeros({ rows, cols }, torch::kFloat32);
        // Copy each raw feature column into its row of X.
        for (int row = 0; row < features.size(); ++row) {
            X.index_put_({ row, "..." }, torch::tensor(Xv[row], torch::kFloat32));
        }
    }
    y = torch::tensor(yv, torch::kInt32);
}
// Fits an mdlp::CPPFImdlp discretizer on each feature column of X against the
// labels y and returns the transformed columns, in the same order.
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    std::vector<mdlp::labels_t> discretized;
    auto discretizer = mdlp::CPPFImdlp();
    for (auto& column : X) {
        discretizer.fit(column, y);
        // transform() hands back a reference; push_back stores a copy of it.
        mdlp::labels_t& labels = discretizer.transform(column);
        discretized.push_back(labels);
    }
    return discretized;
}
}

78
src/common/Dataset.h Normal file
View File

@@ -0,0 +1,78 @@
#ifndef DATASET_H
#define DATASET_H
#include <torch/torch.h>
#include <map>
#include <vector>
#include <string>
#include "CPPFImdlp.h"
#include "Utils.h"
namespace platform {
// On-disk dataset formats understood by the loaders.
enum fileType_t { CSV, ARFF, RDATA };
// Maps a data-source name to the directory that holds its files and to the
// file format used there:
//   "Surcov"  -> "datasets/", CSV
//   "Arff"    -> "datasets/", ARFF
//   "Tanveer" -> "data/",     RDATA
class SourceData {
public:
    // Throws std::invalid_argument for any other source name.
    SourceData(const std::string& source)
    {
        if (source == "Surcov") {
            path = "datasets/";
            fileType = CSV;
        } else if (source == "Arff") {
            path = "datasets/";
            fileType = ARFF;
        } else if (source == "Tanveer") {
            path = "data/";
            fileType = RDATA;
        } else {
            throw std::invalid_argument("Unknown source.");
        }
    }
    // Directory (with trailing '/') where the source keeps its files.
    std::string getPath() const
    {
        return path;
    }
    // File format used by the source.
    fileType_t getFileType() const
    {
        return fileType;
    }
private:
    std::string path;
    fileType_t fileType;
};
class Dataset {
private:
std::string path;
std::string name;
fileType_t fileType;
std::string className;
int n_samples{ 0 }, n_features{ 0 };
std::vector<std::string> features;
std::map<std::string, std::vector<int>> states;
bool loaded;
bool discretize;
torch::Tensor X, y;
std::vector<std::vector<float>> Xv;
std::vector<std::vector<int>> Xd;
std::vector<int> yv;
void buildTensors();
void load_csv();
void load_arff();
void load_rdata();
void computeStates();
std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
public:
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
explicit Dataset(const Dataset&);
std::string getName() const;
std::string getClassName() const;
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
int getNFeatures() const;
int getNSamples() const;
void load();
const bool inline isLoaded() const { return loaded; };
};
};
#endif

129
src/common/Datasets.cc Normal file
View File

@@ -0,0 +1,129 @@
#include "Datasets.h"
#include <fstream>
namespace platform {
// Reads the catalog "<path>all.txt" of the configured source and registers one
// (not yet loaded) Dataset per entry. Each catalog line is "name[,className]";
// empty lines and lines starting with '#' are ignored. A missing className is
// stored as "-1".
// Throws std::invalid_argument if the catalog cannot be opened.
void Datasets::load()
{
    auto source = SourceData(sfileType);
    fileType = source.getFileType();
    path = source.getPath();
    std::ifstream catalog(path + "all.txt");
    if (!catalog.is_open()) {
        throw std::invalid_argument("Unable to open catalog file. [" + path + "all.txt" + "]");
    }
    std::string entry;
    while (std::getline(catalog, entry)) {
        const bool skip = entry.empty() || entry[0] == '#';
        if (skip) {
            continue;
        }
        const std::vector<std::string> fields = split(entry, ',');
        const std::string name = fields[0];
        const std::string className = fields.size() == 1 ? std::string("-1") : fields[1];
        datasets[name] = std::make_unique<Dataset>(path, name, className, discretize, fileType);
    }
    catalog.close();
}
// Returns the names of all registered datasets, in map (sorted key) order.
std::vector<std::string> Datasets::getNames()
{
    std::vector<std::string> names;
    for (const auto& entry : datasets) {
        names.push_back(entry.first);
    }
    return names;
}
// Returns the feature names of dataset `name`.
// Throws std::invalid_argument if that dataset has not been loaded;
// map::at throws std::out_of_range for an unknown name.
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getFeatures();
}
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
{
if (datasets.at(name)->isLoaded()) {
return datasets.at(name)->getStates();
} else {
throw std::invalid_argument("Dataset not loaded.");
}
}
// Loads dataset `name` from disk unless it is already loaded.
// map::at throws std::out_of_range for an unknown name.
void Datasets::loadDataset(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        datasets.at(name)->load();
    }
}
// Returns the class column name of dataset `name`.
// Throws std::invalid_argument if that dataset has not been loaded;
// map::at throws std::out_of_range for an unknown name.
std::string Datasets::getClassName(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getClassName();
}
// Returns the number of samples of dataset `name`.
// Throws std::invalid_argument if that dataset has not been loaded;
// map::at throws std::out_of_range for an unknown name.
int Datasets::getNSamples(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    return datasets.at(name)->getNSamples();
}
// Returns the number of classes of dataset `name`: the number of states of the
// class variable when discretizing, otherwise the largest label plus one.
// Throws std::invalid_argument if that dataset has not been loaded.
int Datasets::getNClasses(const std::string& name)
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    auto className = datasets.at(name)->getClassName();
    if (discretize) {
        auto classStates = getStates(name);
        return classStates.at(className).size();
    }
    auto [samples, labels] = getVectors(name);
    return *std::max_element(labels.begin(), labels.end()) + 1;
}
// Returns how many samples of dataset `name` carry each class label; the
// result is indexed by label and has (largest label + 1) entries.
// Throws std::invalid_argument if that dataset has not been loaded.
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
{
    if (!datasets.at(name)->isLoaded()) {
        throw std::invalid_argument("Dataset not loaded.");
    }
    auto [samples, labels] = datasets.at(name)->getVectors();
    const auto largest = *std::max_element(labels.begin(), labels.end());
    std::vector<int> histogram(largest + 1);
    for (auto label : labels) {
        ++histogram[label];
    }
    return histogram;
}
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectors();
}
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getVectorsDiscretized();
}
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
{
if (!datasets[name]->isLoaded()) {
datasets[name]->load();
}
return datasets[name]->getTensors();
}
// Tells whether `name` is one of the registered datasets.
bool Datasets::isDataset(const std::string& name) const
{
    return datasets.count(name) > 0;
}
}

30
src/common/Datasets.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef DATASETS_H
#define DATASETS_H
#include "Dataset.h"
namespace platform {
// Catalog of the datasets available for one data source. The list is read at
// construction; the data of each dataset is loaded on demand.
class Datasets {
private:
    std::string path;
    fileType_t fileType;
    std::string sfileType;
    std::map<std::string, std::unique_ptr<Dataset>> datasets;
    bool discretize;
    void load(); // Loads the list of datasets
public:
    // sfileType is the source name handed to SourceData; discretize is
    // forwarded to every Dataset created from the catalog.
    // Initializers follow the member declaration order.
    explicit Datasets(bool discretize, std::string sfileType) : sfileType(sfileType), discretize(discretize) { load(); }
    std::vector<std::string> getNames();
    std::vector<std::string> getFeatures(const std::string& name) const;
    int getNSamples(const std::string& name) const;
    std::string getClassName(const std::string& name) const;
    int getNClasses(const std::string& name);
    std::vector<int> getClassesCounts(const std::string& name) const;
    std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
    std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
    std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
    std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
    bool isDataset(const std::string& name) const;
    void loadDataset(const std::string& name) const;
};
};
#endif

55
src/common/DotEnv.h Normal file
View File

@@ -0,0 +1,55 @@
#ifndef DOTENV_H
#define DOTENV_H
#include <string>
#include <map>
#include <fstream>
#include <sstream>
#include <algorithm>
#include <iostream>
#include "Utils.h"
//#include "Dataset.h"
namespace platform {
// Reads KEY=VALUE pairs from the file ".env" in the current directory.
class DotEnv {
private:
    std::map<std::string, std::string> env;
public:
    // Parses ".env". Lines are trimmed; empty lines and lines starting with
    // '#' are skipped. The key is the text before the first '=' and the value
    // is the rest of the line; lines without '=' or with an empty value are ignored.
    // If the file cannot be opened, prints a message and terminates the program.
    DotEnv()
    {
        std::ifstream file(".env");
        if (!file.is_open()) {
            std::cerr << "File .env not found" << std::endl;
            exit(1);
        }
        std::string line;
        while (std::getline(file, line)) {
            line = trim(line);
            if (line.empty() || line[0] == '#') {
                continue;
            }
            std::istringstream iss(line);
            std::string key, value;
            if (std::getline(iss, key, '=') && std::getline(iss, value)) {
                env[key] = value;
            }
        }
    }
    // Returns the value stored for `key`; throws std::out_of_range if absent.
    std::string get(const std::string& key) const
    {
        return env.at(key);
    }
    // Parses the "seeds" entry, expected to look like "[1,2,3]": the first and
    // last characters are dropped and the rest is split on ','.
    // Throws std::out_of_range if there is no "seeds" entry. The lookup uses
    // at() so a missing key no longer inserts an empty entry into the map.
    std::vector<int> getSeeds() const
    {
        const std::string raw = trim(env.at("seeds"));
        const std::string inner = raw.substr(1, raw.size() - 2);
        std::vector<int> seeds;
        for (const auto& item : split(inner, ',')) {
            seeds.push_back(std::stoi(item));
        }
        return seeds;
    }
};
}
#endif

39
src/common/Paths.h Normal file
View File

@@ -0,0 +1,39 @@
#ifndef PATHS_H
#define PATHS_H
#include <string>
#include <filesystem>
#include "DotEnv.h"
namespace platform {
// Directory and file names used by the platform.
class Paths {
public:
    // Fixed directory names (each ends with '/').
    static std::string results()
    {
        return "results/";
    }
    static std::string hiddenResults()
    {
        return "hidden_results/";
    }
    static std::string excel()
    {
        return "excel/";
    }
    static std::string grid()
    {
        return "grid/";
    }
    // Value of the "source_data" entry of the .env file.
    static std::string datasets()
    {
        auto dotenv = platform::DotEnv();
        return dotenv.get("source_data");
    }
    // Creates directory `path` if it does not exist.
    // Any std::exception raised while doing so is reported as std::runtime_error.
    static void createPath(const std::string& path)
    {
        try {
            std::filesystem::create_directory(path);
        }
        catch (const std::exception&) {
            throw std::runtime_error("Could not create directory " + path);
        }
    }
    static std::string excelResults()
    {
        return "some_results.xlsx";
    }
    // grid/grid_<model>_input.json
    static std::string grid_input(const std::string& model)
    {
        const std::string prefix = grid() + "grid_";
        return prefix + model + "_input.json";
    }
    // grid/grid_<model>_output.json
    static std::string grid_output(const std::string& model)
    {
        const std::string prefix = grid() + "grid_";
        return prefix + model + "_output.json";
    }
};
}
#endif

18
src/common/Symbols.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef SYMBOLS_H
#define SYMBOLS_H
#include <string>
namespace platform {
// Unicode symbols used in reports.
class Symbols {
public:
    inline static const std::string check_mark = "\u2714";
    inline static const std::string exclamation = "\u2757";
    inline static const std::string black_star = "\u2605";
    inline static const std::string cross = "\u2717";
    inline static const std::string upward_arrow = "\u27B6";
    inline static const std::string down_arrow = "\u27B4";
    // Aliases: copies of the symbols defined above.
    inline static const std::string equal_best = check_mark;
    inline static const std::string better_best = black_star;
    inline static const std::string notebook = "\U0001F5C8";
};
}
#endif // !SYMBOLS_H

43
src/common/Timer.h Normal file
View File

@@ -0,0 +1,43 @@
#ifndef TIMER_H
#define TIMER_H
#include <chrono>
#include <string>
#include <sstream>
namespace platform {
// Stopwatch built on std::chrono::high_resolution_clock.
class Timer {
private:
    std::chrono::high_resolution_clock::time_point begin;
    std::chrono::high_resolution_clock::time_point end;
public:
    Timer() = default;
    ~Timer() = default;
    // Records the starting instant.
    void start() { begin = std::chrono::high_resolution_clock::now(); }
    // Records the ending instant.
    void stop() { end = std::chrono::high_resolution_clock::now(); }
    // Stops the timer and returns the seconds elapsed between start and stop.
    double getDuration()
    {
        stop();
        const std::chrono::duration<double> elapsed = end - begin;
        return elapsed.count();
    }
    // Returns the seconds elapsed since start() without touching the stop instant.
    double getLapse() const
    {
        const std::chrono::duration<double> elapsed = std::chrono::high_resolution_clock::now() - begin;
        return elapsed.count();
    }
    // Formats getLapse() (lapse == true) or getDuration() (lapse == false).
    std::string getDurationString(bool lapse = false)
    {
        const double duration = lapse ? getLapse() : getDuration();
        return translate2String(duration);
    }
    // Formats `duration` (seconds) with two decimals, expressed in hours when
    // above 3600, in minutes when above 60, and in seconds otherwise.
    std::string translate2String(double duration) const
    {
        double durationShow = duration;
        std::string durationUnit = "s";
        if (duration > 3600) {
            durationShow = duration / 3600;
            durationUnit = "h";
        } else if (duration > 60) {
            durationShow = duration / 60;
            durationUnit = "m";
        }
        std::stringstream ss;
        // precision() is a stream member, so <iomanip> (which this header
        // does not include) is not needed as it was for std::setprecision.
        ss.precision(2);
        ss << std::fixed << durationShow << " " << durationUnit;
        return ss.str();
    }
};
} /* namespace platform */
#endif /* TIMER_H */

30
src/common/Utils.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef UTILS_H
#define UTILS_H
#include <algorithm>
#include <cctype>
#include <sstream>
#include <string>
#include <vector>
namespace platform {
//static std::vector<std::string> split(const std::string& text, char delimiter);
// Splits `text` at every occurrence of `delimiter`.
// Empty fields between delimiters are kept, but a delimiter at the very end of
// the text does not produce a trailing empty field; an empty text yields no fields.
static std::vector<std::string> split(const std::string& text, char delimiter)
{
    std::vector<std::string> fields;
    std::string::size_type start = 0;
    while (start < text.size()) {
        const std::string::size_type stop = text.find(delimiter, start);
        if (stop == std::string::npos) {
            // Last field: everything up to the end of the text.
            fields.push_back(text.substr(start));
            break;
        }
        fields.push_back(text.substr(start, stop - start));
        start = stop + 1;
    }
    return fields;
}
// Returns a copy of `str` without leading and trailing whitespace, where
// whitespace is whatever std::isspace reports.
static std::string trim(const std::string& str)
{
    // std::isspace requires a value representable as unsigned char (or EOF);
    // taking the character as unsigned char keeps bytes >= 0x80 (negative when
    // char is signed) from reaching it as negative ints.
    const auto notSpace = [](unsigned char ch) { return !std::isspace(ch); };
    const auto first = std::find_if(str.begin(), str.end(), notSpace);
    // Search backwards, but never past `first`, for the last non-space character.
    const auto last = std::find_if(str.rbegin(), std::string::const_reverse_iterator(first), notSpace).base();
    return std::string(first, last);
}
}
#endif