Add numeric features management to Dataset
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.20)
 
 project(Platform
-    VERSION 1.0.4
+    VERSION 1.1.0
     DESCRIPTION "Platform to run Experiments with classifiers."
     HOMEPAGE_URL "https://github.com/rmontanana/platform"
     LANGUAGES CXX
Submodule lib/libxlsxwriter updated: c89c551221...f483e65f2e
Submodule lib/mdlp updated: 236d1b2f8b...633aa52849
@@ -21,7 +21,7 @@ include_directories(
 add_executable(
     b_best commands/b_best.cpp best/Statistics.cpp
     best/BestResultsExcel.cpp best/BestResults.cpp
-    common/Datasets.cpp common/Dataset.cpp
+    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
     main/Models.cpp main/Scores.cpp
     reports/ReportExcel.cpp reports/ReportBase.cpp reports/ExcelFile.cpp
     results/Result.cpp
@@ -32,14 +32,14 @@ target_link_libraries(b_best Boost::boost "${PyClassifiers}" "${BayesNet}" mdlp
 set(grid_sources GridSearch.cpp GridData.cpp)
 list(TRANSFORM grid_sources PREPEND grid/)
 add_executable(b_grid commands/b_grid.cpp ${grid_sources}
-    common/Datasets.cpp common/Dataset.cpp
+    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
     main/HyperParameters.cpp main/Models.cpp
 )
 target_link_libraries(b_grid ${MPI_CXX_LIBRARIES} "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::python Boost::numpy)
 
 # b_list
 add_executable(b_list commands/b_list.cpp
-    common/Datasets.cpp common/Dataset.cpp
+    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
     main/Models.cpp main/Scores.cpp
     reports/ReportExcel.cpp reports/ExcelFile.cpp reports/ReportBase.cpp reports/DatasetsExcel.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
     results/Result.cpp results/ResultsDatasetExcel.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
@@ -50,7 +50,7 @@ target_link_libraries(b_list "${PyClassifiers}" "${BayesNet}" mdlp ${Python3_LIB
 set(main_sources Experiment.cpp Models.cpp HyperParameters.cpp Scores.cpp)
 list(TRANSFORM main_sources PREPEND main/)
 add_executable(b_main commands/b_main.cpp ${main_sources}
-    common/Datasets.cpp common/Dataset.cpp
+    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
     reports/ReportConsole.cpp reports/ReportBase.cpp
     results/Result.cpp
 )
@@ -61,7 +61,7 @@ set(manage_sources ManageScreen.cpp CommandParser.cpp ResultsManager.cpp)
 list(TRANSFORM manage_sources PREPEND manage/)
 add_executable(
     b_manage commands/b_manage.cpp ${manage_sources}
-    common/Datasets.cpp common/Dataset.cpp
+    common/Datasets.cpp common/Dataset.cpp common/Discretization.cpp
     reports/ReportConsole.cpp reports/ReportExcel.cpp reports/ReportExcelCompared.cpp reports/ReportBase.cpp reports/ExcelFile.cpp reports/DatasetsConsole.cpp reports/ReportsPaged.cpp
     results/Result.cpp results/ResultsDataset.cpp results/ResultsDatasetConsole.cpp
     main/Scores.cpp
@@ -2,7 +2,12 @@
 #include <fstream>
 #include "Dataset.h"
 namespace platform {
-    Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType)
+    Dataset::Dataset(const Dataset& dataset) :
+        path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples),
+        n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features),
+        states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y),
+        X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv),
+        fileType(dataset.fileType)
     {
     }
     std::string Dataset::getName() const
@@ -180,12 +185,20 @@ namespace platform {
         } else if (fileType == RDATA) {
             load_rdata();
         }
+        n_samples = Xv[0].size();
+        n_features = Xv.size();
+        if (numericFeaturesIdx.at(0) == -1) {
+            numericFeatures = std::vector<bool>(n_features, true);
+        } else {
+            numericFeatures = std::vector<bool>(n_features, false);
+            for (auto i : numericFeaturesIdx) {
+                numericFeatures[i] = true;
+            }
+        }
         if (discretize) {
             Xd = discretizeDataset(Xv, yv);
             computeStates();
         }
-        n_samples = Xv[0].size();
-        n_features = Xv.size();
         loaded = true;
     }
     void Dataset::buildTensors()
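In the load() hunk above, n_samples and n_features are now computed before discretization instead of after it, so the size of the numericFeatures mask is known when it is built. The index list uses -1 as an "every feature is numeric" sentinel: with n_features = 4, numericFeaturesIdx = { -1 } yields numericFeatures = { true, true, true, true }, while numericFeaturesIdx = { 0, 2 } yields { true, false, true, false }.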
@@ -215,4 +228,9 @@ namespace platform {
         }
         return Xd;
     }
+    std::pair <torch::Tensor&, torch::Tensor&> Dataset::getDiscretizedTrainTestTensors()
+    {
+        auto discretizer = Discretization::instance()->create("mdlp");
+        return { X_train, X_test };
+    }
 }
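As committed, getDiscretizedTrainTestTensors() obtains an "mdlp" discretizer from the new factory but never applies it: X_train and X_test are returned as declared, and the actual per-fold fit/transform is still commented out in Experiment::cross_validation (see below). A caller would presumably use it along these lines (hypothetical variable names):

    auto [X_tr, X_te] = dataset.getDiscretizedTrainTestTensors();  // references to X_train / X_test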
@@ -4,14 +4,17 @@
 #include <map>
 #include <vector>
 #include <string>
-#include <CPPFImdlp.h>
+#include <common/DiscretizationRegister.h>
 #include "Utils.h"
 #include "SourceData.h"
 namespace platform {
 
     class Dataset {
     public:
-        Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {};
+        Dataset(const std::string& path, const std::string& name, const std::string& className, bool discretize, fileType_t fileType, std::vector<int> numericFeaturesIdx) :
+            path(path), name(name), className(className), discretize(discretize),
+            loaded(false), fileType(fileType), numericFeaturesIdx(numericFeaturesIdx)
+        {
+        };
         explicit Dataset(const Dataset&);
         std::string getName() const;
         std::string getClassName() const;
@@ -20,9 +23,11 @@ namespace platform {
         std::map<std::string, std::vector<int>> getStates() const;
         std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
         std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
+        std::pair<torch::Tensor&, torch::Tensor&> getDiscretizedTrainTestTensors();
         std::pair<torch::Tensor&, torch::Tensor&> getTensors();
         int getNFeatures() const;
         int getNSamples() const;
+        std::vector<bool>& getNumericFeatures() { return numericFeatures; }
         void load();
         const bool inline isLoaded() const { return loaded; };
     private:
@@ -31,12 +36,15 @@ namespace platform {
         fileType_t fileType;
         std::string className;
         int n_samples{ 0 }, n_features{ 0 };
+        std::vector<int> numericFeaturesIdx;
+        std::vector<bool> numericFeatures; // true if feature is numeric
         std::vector<std::string> features;
         std::vector<std::string> labels;
         std::map<std::string, std::vector<int>> states;
         bool loaded;
         bool discretize;
         torch::Tensor X, y;
+        torch::Tensor X_train, X_test;
         std::vector<std::vector<float>> Xv;
         std::vector<std::vector<int>> Xd;
         std::vector<int> yv;
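For reference, constructing a Dataset by hand with the new signature would look roughly like this sketch (path, name and class are made-up values; RDATA is one of the fileType_t values already used in Dataset::load(); { -1 } is the "every feature is numeric" sentinel):

    // hypothetical example, not code from the repository
    auto ds = platform::Dataset("datasets/", "iris", "class", true, platform::RDATA, { -1 });
    ds.load();
    auto& numeric = ds.getNumericFeatures();  // all entries true in this case

In the platform itself only Datasets::load() builds these objects, driven by the all.txt catalog (next hunk).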
@@ -1,27 +1,47 @@
 #include <fstream>
 #include "Datasets.h"
+#include <nlohmann/json.hpp>
+
 namespace platform {
+    using json = nlohmann::ordered_json;
+    const std::string message_dataset_not_loaded = "dataset not loaded.";
     void Datasets::load()
     {
         auto sd = SourceData(sfileType);
         fileType = sd.getFileType();
         path = sd.getPath();
         ifstream catalog(path + "all.txt");
+        std::vector<int> numericFeaturesIdx;
         if (catalog.is_open()) {
             std::string line;
             while (getline(catalog, line)) {
                 if (line.empty() || line[0] == '#') {
                     continue;
                 }
-                std::vector<std::string> tokens = split(line, ',');
+                std::vector<std::string> tokens = split(line, ';');
                 std::string name = tokens[0];
                 std::string className;
+                numericFeaturesIdx.clear();
                 if (tokens.size() == 1) {
                     className = "-1";
+                    numericFeaturesIdx.push_back(-1);
                 } else {
                     className = tokens[1];
+                    if (tokens.size() > 2) {
+                        auto numericFeatures = tokens[2];
+                        if (numericFeatures == "all") {
+                            numericFeaturesIdx.push_back(-1);
+                        } else {
+                            auto features = json::parse(numericFeatures);
+                            for (auto& f : features) {
+                                numericFeaturesIdx.push_back(f);
+                            }
+                        }
+                    } else {
+                        numericFeaturesIdx.push_back(-1);
+                    }
                 }
-                datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType);
+                datasets[name] = make_unique<Dataset>(path, name, className, discretize, fileType, numericFeaturesIdx);
             }
             catalog.close();
         } else {
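From the parsing above, each non-comment line of all.txt is now split on ';' instead of ',': field 1 is the dataset name, field 2 the class attribute, and the optional field 3 is either the word all or a JSON array with the indices of the numeric features. Illustrative catalog lines (dataset names are hypothetical):

    # name;class;numeric feature indices
    iris;class;all
    glass;Type;[0, 1, 2, 3, 4, 5, 6, 7, 8]
    mushroom;class

A line with no third field, or with only the name, is treated as "all features numeric" (the { -1 } sentinel).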
@@ -39,7 +59,7 @@ namespace platform {
         if (datasets.at(name)->isLoaded()) {
             return datasets.at(name)->getFeatures();
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     std::vector<std::string> Datasets::getLabels(const std::string& name) const
@@ -47,7 +67,7 @@ namespace platform {
         if (datasets.at(name)->isLoaded()) {
             return datasets.at(name)->getLabels();
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     map<std::string, std::vector<int>> Datasets::getStates(const std::string& name) const
@@ -55,7 +75,7 @@ namespace platform {
         if (datasets.at(name)->isLoaded()) {
             return datasets.at(name)->getStates();
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     void Datasets::loadDataset(const std::string& name) const
@@ -71,7 +91,7 @@ namespace platform {
         if (datasets.at(name)->isLoaded()) {
             return datasets.at(name)->getClassName();
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     int Datasets::getNSamples(const std::string& name) const
@@ -79,7 +99,7 @@ namespace platform {
         if (datasets.at(name)->isLoaded()) {
             return datasets.at(name)->getNSamples();
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     int Datasets::getNClasses(const std::string& name)
@@ -93,7 +113,15 @@ namespace platform {
             auto [Xv, yv] = getVectors(name);
             return *std::max_element(yv.begin(), yv.end()) + 1;
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
+        }
+    }
+    std::vector<bool>& Datasets::getNumericFeatures(const std::string& name) const
+    {
+        if (datasets.at(name)->isLoaded()) {
+            return datasets.at(name)->getNumericFeatures();
+        } else {
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     std::vector<int> Datasets::getClassesCounts(const std::string& name) const
@@ -106,7 +134,7 @@ namespace platform {
             }
             return counts;
         } else {
-            throw std::invalid_argument("Dataset not loaded.");
+            throw std::invalid_argument(message_dataset_not_loaded);
         }
     }
     pair<std::vector<std::vector<float>>&, std::vector<int>&> Datasets::getVectors(const std::string& name)
@@ -11,6 +11,7 @@ namespace platform {
         std::vector<std::string> getLabels(const std::string& name) const;
         std::string getClassName(const std::string& name) const;
         int getNClasses(const std::string& name);
+        std::vector<bool>& getNumericFeatures(const std::string& name) const;
         std::vector<int> getClassesCounts(const std::string& name) const;
         std::map<std::string, std::vector<int>> getStates(const std::string& name) const;
         std::pair<std::vector<std::vector<float>>&, std::vector<int>&> getVectors(const std::string& name);
src/common/Discretization.cpp (new file, 55 lines)
@@ -0,0 +1,55 @@
+#include "Discretization.h"
+
+namespace platform {
+    // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
+    Discretization* Discretization::factory = nullptr;
+    Discretization* Discretization::instance()
+    {
+        //manages singleton
+        if (factory == nullptr)
+            factory = new Discretization();
+        return factory;
+    }
+    void Discretization::registerFactoryFunction(const std::string& name,
+        function<mdlp::Discretizer* (void)> classFactoryFunction)
+    {
+        // register the class factory function
+        functionRegistry[name] = classFactoryFunction;
+    }
+    std::shared_ptr<mdlp::Discretizer> Discretization::create(const std::string& name)
+    {
+        mdlp::Discretizer* instance = nullptr;
+
+        // find name in the registry and call factory method.
+        auto it = functionRegistry.find(name);
+        if (it != functionRegistry.end())
+            instance = it->second();
+        // wrap instance in a shared ptr and return
+        if (instance != nullptr)
+            return std::unique_ptr<mdlp::Discretizer>(instance);
+        else
+            throw std::runtime_error("Discretizer not found: " + name);
+    }
+    std::vector<std::string> Discretization::getNames()
+    {
+        std::vector<std::string> names;
+        transform(functionRegistry.begin(), functionRegistry.end(), back_inserter(names),
+            [](const pair<std::string, function<mdlp::Discretizer* (void)>>& pair) { return pair.first; });
+        return names;
+    }
+    std::string Discretization::toString()
+    {
+        std::string result = "";
+        std::string sep = "";
+        for (const auto& pair : functionRegistry) {
+            result += sep + pair.first;
+            sep = ", ";
+        }
+        return "{" + result + "}";
+    }
+    RegistrarDiscretization::RegistrarDiscretization(const std::string& name, function<mdlp::Discretizer* (void)> classFactoryFunction)
+    {
+        // register the class factory function
+        Discretization::instance()->registerFactoryFunction(name, classFactoryFunction);
+    }
+}
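One detail worth noting in Discretization::create(): the method is declared to return std::shared_ptr<mdlp::Discretizer>, while the return statement constructs a std::unique_ptr. That compiles because a unique_ptr rvalue converts implicitly into a shared_ptr, so callers end up with a shared_ptr that owns the newly created discretizer.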
src/common/Discretization.h (new file, 33 lines)
@@ -0,0 +1,33 @@
+#ifndef DISCRETIZATION_H
+#define DISCRETIZATION_H
+#include <map>
+#include <memory>
+#include <string>
+#include <functional>
+#include <vector>
+#include <Discretizer.h>
+#include <BinDisc.h>
+#include <CPPFImdlp.h>
+namespace platform {
+    class Discretization {
+    public:
+        Discretization(Discretization&) = delete;
+        void operator=(const Discretization&) = delete;
+        // Idea from: https://www.codeproject.com/Articles/567242/AplusC-2b-2bplusObjectplusFactory
+        static Discretization* instance();
+        std::shared_ptr<mdlp::Discretizer> create(const std::string& name);
+        void registerFactoryFunction(const std::string& name,
+            function<mdlp::Discretizer* (void)> classFactoryFunction);
+        std::vector<string> getNames();
+        std::string toString();
+    private:
+        map<std::string, function<mdlp::Discretizer* (void)>> functionRegistry;
+        static Discretization* factory; //singleton
+        Discretization() {};
+    };
+    class RegistrarDiscretization {
+    public:
+        RegistrarDiscretization(const std::string& className, function<mdlp::Discretizer* (void)> classFactoryFunction);
+    };
+}
+#endif

src/common/DiscretizationRegister.h (new file, 10 lines)
@@ -0,0 +1,10 @@
+#ifndef DISCRETIZATIONREGISTER_H
+#define DISCRETIZATIONREGISTER_H
+#include <common/Discretization.h>
+static platform::RegistrarDiscretization registrarM("mdlp",
+    [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
+static platform::RegistrarDiscretization registrarBU("BinUniform",
+    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
+static platform::RegistrarDiscretization registrarBQ("BinQuantile",
+    [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
+#endif
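Every translation unit that includes DiscretizationRegister.h (Dataset.h now does) creates these three registrar objects during static initialization, so the registry is populated before the binaries reach their main logic; re-registering the same name from several translation units simply overwrites the map entry. A minimal usage sketch based only on the code in this commit:

    // "mdlp", "BinUniform" and "BinQuantile" are the names registered above
    auto discretizer = platform::Discretization::instance()->create("BinUniform");
    auto available   = platform::Discretization::instance()->getNames();   // {"BinQuantile", "BinUniform", "mdlp"} in map order

Dataset::getDiscretizedTrainTestTensors() currently hard-codes the "mdlp" name.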
@@ -3,17 +3,8 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include <algorithm>
 namespace platform {
-    static std::vector<std::string> split(const std::string& text, char delimiter)
-    {
-        std::vector<std::string> result;
-        std::stringstream ss(text);
-        std::string token;
-        while (std::getline(ss, token, delimiter)) {
-            result.push_back(token);
-        }
-        return result;
-    }
     static std::string trim(const std::string& str)
     {
         std::string result = str;
@@ -25,5 +16,15 @@ namespace platform {
         }).base(), result.end());
         return result;
     }
+    static std::vector<std::string> split(const std::string& text, char delimiter)
+    {
+        std::vector<std::string> result;
+        std::stringstream ss(text);
+        std::string token;
+        while (std::getline(ss, token, delimiter)) {
+            result.push_back(trim(token));
+        }
+        return result;
+    }
 }
 #endif
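Two things changed in this utility header: split() now trims every token, and it was moved below trim() so that the call compiles; <algorithm> is also included explicitly. For instance, split("iris ; class ; all", ';') now yields {"iris", "class", "all"} instead of {"iris ", " class ", " all"}, which makes the ';'-separated all.txt catalog tolerant of spaces around the separators.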
@@ -115,7 +115,7 @@ namespace platform {
     }
     void Experiment::cross_validation(const std::string& fileName, bool quiet, bool no_train_score, bool generate_fold_files)
     {
-        auto datasets = Datasets(discretized, Paths::datasets());
+        auto datasets = Datasets(false, Paths::datasets()); // Never discretize here
         // Get dataset
         auto [X, y] = datasets.getTensors(fileName);
         auto states = datasets.getStates(fileName);
@@ -176,6 +176,12 @@ namespace platform {
         auto y_train = y.index({ train_t });
         auto X_test = X.index({ "...", test_t });
         auto y_test = y.index({ test_t });
+        if (discretized) {
+            // compute states too
+            // discretizer->fit(X_train, y_train);
+            // X_train = discretizer->transform(X_train);
+            // X_test = discretizer->transform(X_test);
+        }
         if (generate_fold_files)
             generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test);
         if (!quiet)
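With this change cross_validation() always loads the raw, non-discretized data, but the per-fold discretization meant to replace the old behaviour is only scaffolded: the body of the if (discretized) block is still commented out. Presumably a follow-up commit will fit a discretizer on X_train/y_train and transform both splits, which is exactly what the commented-out calls sketch.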
@@ -1,3 +1,4 @@
+#include <algorithm>
 #include "common/Colors.h"
 #include "common/Datasets.h"
 #include "common/Paths.h"
@@ -12,7 +13,7 @@ namespace platform {
         auto part = temp.substr(0, DatasetsConsole::BALANCE_LENGTH);
         line += part + "\n";
         body.push_back(line);
-        line = string(name_len + 22, ' ');
+        line = string(name_len + 28, ' ');
         temp = temp.substr(DatasetsConsole::BALANCE_LENGTH);
     }
     line += temp + "\n";
@@ -26,8 +27,8 @@ namespace platform {
     std::stringstream sheader;
     auto datasets_names = datasets.getNames();
     int maxName = std::max(size_t(7), (*max_element(datasets_names.begin(), datasets_names.end(), [](const std::string& a, const std::string& b) { return a.size() < b.size(); })).size());
-    std::vector<std::string> header_labels = { " #", "Dataset", "Sampl.", "Feat.", "Cls", "Balance" };
-    std::vector<int> header_lengths = { 3, maxName, 6, 5, 3, DatasetsConsole::BALANCE_LENGTH };
+    std::vector<std::string> header_labels = { " #", "Dataset", "Sampl.", "Feat.", "#Num.", "Cls", "Balance" };
+    std::vector<int> header_lengths = { 3, maxName, 6, 5, 5, 3, DatasetsConsole::BALANCE_LENGTH };
     sheader << Colors::GREEN();
     for (int i = 0; i < header_labels.size(); i++) {
         sheader << setw(header_lengths[i]) << left << header_labels[i] << " ";
@@ -50,7 +51,11 @@ namespace platform {
     datasets.loadDataset(dataset);
     auto nSamples = datasets.getNSamples(dataset);
     line << setw(6) << right << nSamples << " ";
-    line << setw(5) << right << datasets.getFeatures(dataset).size() << " ";
+    auto nFeatures = datasets.getFeatures(dataset).size();
+    line << setw(5) << right << nFeatures << " ";
+    auto numericFeatures = datasets.getNumericFeatures(dataset);
+    auto num = std::count(numericFeatures.begin(), numericFeatures.end(), true);
+    line << setw(5) << right << num << " ";
     line << setw(3) << right << datasets.getNClasses(dataset) << " ";
     std::string sep = "";
     oss.str("");
@@ -63,6 +68,7 @@ namespace platform {
     data[dataset] = json::object();
     data[dataset]["samples"] = nSamples;
     data[dataset]["features"] = datasets.getFeatures(dataset).size();
+    data[dataset]["numericFeatures"] = num;
     data[dataset]["classes"] = datasets.getNClasses(dataset);
     data[dataset]["balance"] = oss.str();
 }
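The console report therefore gains a "#Num." column (the count of numeric features, computed with std::count over the boolean mask), and the JSON it emits gains a matching "numericFeatures" field per dataset, roughly of the form (illustrative values):

    "iris": { "samples": 150, "features": 4, "numericFeatures": 4, "classes": 3, "balance": "..." }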
@@ -17,11 +17,11 @@ namespace platform {
     int balanceSize = 75; // Min size of the column
     worksheet = workbook_add_worksheet(workbook, "Datasets");
     // Header
-    worksheet_merge_range(worksheet, 0, 0, 0, 5, "Datasets", styles["headerFirst"]);
+    worksheet_merge_range(worksheet, 0, 0, 0, 6, "Datasets", styles["headerFirst"]);
     // Body header
     row = 2;
     int col = 0;
-    for (const auto& name : { "Nº", "Dataset", "Samples", "Features", "Classes", "Balance" }) {
+    for (const auto& name : { "Nº", "Dataset", "Samples", "Features", "#Numer.", "Classes", "Balance" }) {
         writeString(row, col++, name, "bodyHeader");
     }
     // Body
@@ -34,12 +34,13 @@ namespace platform {
         writeString(row, 1, key.c_str(), "text");
         writeInt(row, 2, value["samples"], "ints");
         writeInt(row, 3, value["features"], "ints");
-        writeInt(row, 4, value["classes"], "ints");
-        writeString(row, 5, value["balance"].get<std::string>().c_str(), "text");
+        writeInt(row, 4, value["numericFeatures"], "ints");
+        writeInt(row, 5, value["classes"], "ints");
+        writeString(row, 6, value["balance"].get<std::string>().c_str(), "text");
     }
     // Format columns
     worksheet_freeze_panes(worksheet, 3, 2);
-    std::vector<int> columns_sizes = { 5, datasetNameSize, 10, 10, 10, balanceSize };
+    std::vector<int> columns_sizes = { 5, datasetNameSize, 10, 10, 10, 10, balanceSize };
     for (int i = 0; i < columns_sizes.size(); ++i) {
         worksheet_set_column(worksheet, i, i, columns_sizes.at(i), NULL);
     }