Add traintest split in gridsearch
This commit is contained in:
@@ -15,10 +15,6 @@ namespace platform {
|
||||
{
|
||||
return name;
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
std::vector<std::string> Dataset::getFeatures() const
|
||||
{
|
||||
if (loaded) {
|
||||
@@ -43,6 +39,42 @@ namespace platform {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::string Dataset::getClassName() const
|
||||
{
|
||||
return className;
|
||||
}
|
||||
int Dataset::getNClasses() const
|
||||
{
|
||||
if (loaded) {
|
||||
if (discretize) {
|
||||
return states.at(className).size();
|
||||
}
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Dataset::getLabels() const
|
||||
{
|
||||
// Return the labels factorization result
|
||||
if (loaded) {
|
||||
return labels;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<int> Dataset::getClassesCounts() const
|
||||
{
|
||||
if (loaded) {
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::map<std::string, std::vector<int>> Dataset::getStates() const
|
||||
{
|
||||
if (loaded) {
|
||||
@@ -70,7 +102,6 @@ namespace platform {
|
||||
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
|
||||
{
|
||||
if (loaded) {
|
||||
buildTensors();
|
||||
return { X, y };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
@@ -79,29 +110,32 @@ namespace platform {
|
||||
void Dataset::load_csv()
|
||||
{
|
||||
ifstream file(path + "/" + name + ".csv");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
labels.clear();
|
||||
file.close();
|
||||
} else {
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
labels.clear();
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
std::vector<std::string> tokens = split(line, ',');
|
||||
features = std::vector<std::string>(tokens.begin(), tokens.end() - 1);
|
||||
if (className == "-1") {
|
||||
className = tokens.back();
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = split(line, ',');
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv[i].push_back(stof(tokens[i]));
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
@@ -147,32 +181,35 @@ namespace platform {
|
||||
void Dataset::load_rdata()
|
||||
{
|
||||
ifstream file(path + "/" + name + "_R.dat");
|
||||
if (file.is_open()) {
|
||||
std::string line;
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
yv.push_back(stoi(tokens.back()));
|
||||
}
|
||||
labels.clear();
|
||||
file.close();
|
||||
} else {
|
||||
if (!file.is_open()) {
|
||||
throw std::invalid_argument("Unable to open dataset file.");
|
||||
}
|
||||
std::string line;
|
||||
labels.clear();
|
||||
getline(file, line);
|
||||
line = ArffFiles::trim(line);
|
||||
std::vector<std::string> tokens = tokenize(line);
|
||||
transform(tokens.begin(), tokens.end() - 1, back_inserter(features), [](const auto& attribute) { return ArffFiles::trim(attribute); });
|
||||
if (className == "-1") {
|
||||
className = ArffFiles::trim(tokens.back());
|
||||
}
|
||||
for (auto i = 0; i < features.size(); ++i) {
|
||||
Xv.push_back(std::vector<float>());
|
||||
}
|
||||
while (getline(file, line)) {
|
||||
tokens = tokenize(line);
|
||||
// We have to skip the first token, which is the instance number.
|
||||
for (auto i = 1; i < features.size() + 1; ++i) {
|
||||
const float value = stof(tokens[i]);
|
||||
Xv[i - 1].push_back(value);
|
||||
}
|
||||
auto label = trim(tokens.back());
|
||||
if (find(labels.begin(), labels.end(), label) == labels.end()) {
|
||||
labels.push_back(label);
|
||||
}
|
||||
yv.push_back(stoi(label));
|
||||
}
|
||||
file.close();
|
||||
}
|
||||
void Dataset::load()
|
||||
{
|
||||
@@ -200,27 +237,13 @@ namespace platform {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (discretize) {
|
||||
Xd = discretizeDataset(Xv, yv);
|
||||
computeStates();
|
||||
}
|
||||
loaded = true;
|
||||
}
|
||||
void Dataset::buildTensors()
|
||||
{
|
||||
if (discretize) {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kInt32);
|
||||
} else {
|
||||
X = torch::zeros({ static_cast<int>(n_features), static_cast<int>(n_samples) }, torch::kFloat32);
|
||||
}
|
||||
// Build Tensors
|
||||
X = torch::zeros({ n_features, n_samples }, torch::kFloat32);
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
if (discretize) {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32));
|
||||
} else {
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
X.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kFloat32));
|
||||
}
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
loaded = true;
|
||||
}
|
||||
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
|
||||
{
|
||||
@@ -233,9 +256,40 @@ namespace platform {
|
||||
}
|
||||
return Xd;
|
||||
}
|
||||
std::pair <torch::Tensor&, torch::Tensor&> Dataset::getDiscretizedTrainTestTensors()
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
|
||||
{
|
||||
auto discretizer = Discretization::instance()->create("mdlp");
|
||||
return { X_train, X_test };
|
||||
if (!loaded) {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
auto train_t = torch::tensor(train);
|
||||
int samples_train = train.size();
|
||||
int samples_test = test.size();
|
||||
auto test_t = torch::tensor(test);
|
||||
X_train = X.index({ "...", train_t });
|
||||
y_train = y.index({ train_t });
|
||||
X_test = X.index({ "...", test_t });
|
||||
y_test = y.index({ test_t });
|
||||
if (discretize) {
|
||||
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
|
||||
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
|
||||
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
|
||||
for (int feature = 0; feature < n_features; ++feature) {
|
||||
if (numericFeatures[feature]) {
|
||||
auto X_train_feature = X_train.index({ feature, "..." }).to(torch::kFloat32);
|
||||
auto X_test_feature = X_test.index({ feature, "..." }).to(torch::kFloat32);
|
||||
discretizer->fit(X_train_feature, y_train);
|
||||
auto X_train_feature_d = discretizer->transform(X_train_feature);
|
||||
auto X_test_feature_d = discretizer->transform(X_test_feature);
|
||||
X_train_d.index_put_({ feature, "..." }, X_train_feature_d.to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test_feature_d.to(torch::kInt32));
|
||||
} else {
|
||||
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
|
||||
}
|
||||
}
|
||||
X_train = X_train_d;
|
||||
X_test = X_test_d;
|
||||
}
|
||||
return { X_train, X_test, y_train, y_test };
|
||||
}
|
||||
}
|
@@ -4,27 +4,30 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <common/DiscretizationRegister.h>
|
||||
#include "Utils.h"
|
||||
#include "SourceData.h"
|
||||
namespace platform {
|
||||
class Dataset {
|
||||
public:
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx) :
|
||||
Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx, std::string discretizer_algo = "none") :
|
||||
path(path), name(name), className(className), discretize(discretize),
|
||||
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx)
|
||||
loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx), discretizer_algorithm(discretizer_algo)
|
||||
{
|
||||
};
|
||||
explicit Dataset(const Dataset&);
|
||||
std::string getName() const;
|
||||
std::string getClassName() const;
|
||||
std::vector<std::string> getLabels() const { return labels; }
|
||||
int getNClasses() const;
|
||||
std::vector<std::string> getLabels() const; // return the labels factorization result
|
||||
std::vector<int> getClassesCounts() const;
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getDiscretizedTrainTestTensors();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
|
||||
int getNFeatures() const;
|
||||
int getNSamples() const;
|
||||
std::vector<bool>& getNumericFeatures() { return numericFeatures; }
|
||||
@@ -37,6 +40,7 @@ namespace platform {
|
||||
std::string className;
|
||||
int n_samples{ 0 }, n_features{ 0 };
|
||||
std::vector<int> numericFeaturesIdx;
|
||||
std::string discretizer_algorithm;
|
||||
std::vector<bool> numericFeatures; // true if feature is numeric
|
||||
std::vector<std::string> features;
|
||||
std::vector<std::string> labels;
|
||||
@@ -44,11 +48,10 @@ namespace platform {
|
||||
bool loaded;
|
||||
bool discretize;
|
||||
torch::Tensor X, y;
|
||||
torch::Tensor X_train, X_test;
|
||||
torch::Tensor X_train, X_test, y_train, y_test;
|
||||
std::vector<std::vector<float>> Xv;
|
||||
std::vector<std::vector<int>> Xd;
|
||||
std::vector<int> yv;
|
||||
void buildTensors();
|
||||
void load_csv();
|
||||
void load_arff();
|
||||
void load_rdata();
|
||||
|
@@ -54,7 +54,7 @@ namespace platform {
|
||||
throw std::invalid_argument("Invalid catalog file format.");
|
||||
|
||||
}
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx);
|
||||
datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx, discretizer_algorithm);
|
||||
}
|
||||
catalog.close();
|
||||
}
|
||||
@@ -64,110 +64,6 @@ namespace platform {
|
||||
transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; });
|
||||
return result;
|
||||
}
|
||||
std::vector<std::string> Datasets::getFeatures(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getFeatures();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<std::string> Datasets::getLabels(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getLabels();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getStates();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
void Datasets::loadDataset(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return;
|
||||
} else {
|
||||
datasets.at(name)->load();
|
||||
}
|
||||
}
|
||||
std::string Datasets::getClassName(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getClassName();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Datasets::getNSamples(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getNSamples();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
int Datasets::getNClasses(const std::string& name)
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto className = datasets.at(name)->getClassName();
|
||||
if (discretize) {
|
||||
auto states = getStates(name);
|
||||
return states.at(className).size();
|
||||
}
|
||||
auto [Xv, yv] = getVectors(name);
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<bool>& Datasets::getNumericFeatures(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
return datasets.at(name)->getNumericFeatures();
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
std::vector<int> Datasets::getClassesCounts(const std::string& name) const
|
||||
{
|
||||
if (datasets.at(name)->isLoaded()) {
|
||||
auto [Xv, yv] = datasets.at(name)->getVectors();
|
||||
std::vector<int> counts(*std::max_element(yv.begin(), yv.end()) + 1);
|
||||
for (auto y : yv) {
|
||||
counts[y]++;
|
||||
}
|
||||
return counts;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getVectors();
|
||||
}
|
||||
pair<std::vector<std::vector<int>>&, std::vector<int>&> Datasets::getVectorsDiscretized(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getVectorsDiscretized();
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Datasets::getTensors(const std::string& name)
|
||||
{
|
||||
if (!datasets[name]->isLoaded()) {
|
||||
datasets[name]->load();
|
||||
}
|
||||
return datasets[name]->getTensors();
|
||||
}
|
||||
bool Datasets::isDataset(const std::string& name) const
|
||||
{
|
||||
return datasets.find(name) != datasets.end();
|
||||
|
@@ -4,34 +4,23 @@
|
||||
namespace platform {
|
||||
class Datasets {
|
||||
public:
|
||||
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algo = "none") : discretize(discretize), sfileType(sfileType), discretizer_algo(discretizer_algo)
|
||||
explicit Datasets(bool discretize, std::string sfileType, std::string discretizer_algorithm = "none") :
|
||||
discretize(discretize), sfileType(sfileType), discretizer_algorithm(discretizer_algorithm)
|
||||
{
|
||||
if (discretizer_algo == "none" && discretize) {
|
||||
if (discretizer_algorithm == "none" && discretize) {
|
||||
throw std::runtime_error("Can't discretize without discretization algorithm");
|
||||
}
|
||||
load();
|
||||
};
|
||||
std::vector<std::string> getNames();
|
||||
std::vector<std::string> getFeatures(const std::string& name) const;
|
||||
int getNSamples(const std::string& name) const;
|
||||
std::vector<std::string> getLabels(const std::string& name) const;
|
||||
std::string getClassName(const std::string& name) const;
|
||||
int getNClasses(const std::string& name);
|
||||
std::vector<bool>& getNumericFeatures(const std::string& name) const;
|
||||
std::vector<int> getClassesCounts(const std::string& name) const;
|
||||
std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
|
||||
std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
|
||||
std::pair<std::vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized(const std::string& name);
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors(const std::string& name);
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(const std::vector<int>& train_idx, const std::vector<int>& test_idx);
|
||||
bool isDataset(const std::string& name) const;
|
||||
void loadDataset(const std::string& name) const;
|
||||
Dataset& getDataset(const std::string& name) const { return *datasets.at(name); }
|
||||
std::string toString() const;
|
||||
private:
|
||||
std::string path;
|
||||
fileType_t fileType;
|
||||
std::string sfileType;
|
||||
std::string discretizer_algo;
|
||||
std::string discretizer_algorithm;
|
||||
std::map<std::string, std::unique_ptr<Dataset>> datasets;
|
||||
bool discretize;
|
||||
void load(); // Loads the list of datasets
|
||||
|
Reference in New Issue
Block a user