fit discretizer only with train data

This commit is contained in:
2024-06-09 00:50:55 +02:00
parent 361c51d864
commit 643633e6dd
9 changed files with 38 additions and 44 deletions

View File

@@ -130,7 +130,7 @@ namespace platform {
stats[i] = 0.0;
continue;
}
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
double p_value = (long double)2 * (1 - cdf(dist, z));
stats[i] = p_value;
}

View File

@@ -7,7 +7,7 @@ namespace platform {
path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples),
n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features),
states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y),
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv),
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), yv(dataset.yv),
fileType(dataset.fileType)
{
}
@@ -46,9 +46,6 @@ namespace platform {
int Dataset::getNClasses() const
{
if (loaded) {
if (discretize) {
return states.at(className).size();
}
return *std::max_element(yv.begin(), yv.end()) + 1;
} else {
throw std::invalid_argument(message_dataset_not_loaded);
@@ -91,14 +88,6 @@ namespace platform {
throw std::invalid_argument(message_dataset_not_loaded);
}
}
// Hands back references to the Xd and yv members as a pair.
// Throws std::invalid_argument (message_dataset_not_loaded) when the dataset
// has not been loaded yet.
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
{
    // Guard first: nothing can be returned from an unloaded dataset.
    if (!loaded) {
        throw std::invalid_argument(message_dataset_not_loaded);
    }
    return { Xd, yv };
}
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
{
if (loaded) {
@@ -140,11 +129,13 @@ namespace platform {
// Fills the `states` map: every name in `features`, plus `className`, is mapped
// to the vector {0, 1, ..., max}, where max is the largest value found for that
// feature / for the labels.
// NOTE(review): each `states` entry is assigned twice in this listing (once from
// Xd / yv, once from X_train / y_train). That looks like the previous and the
// current version of the same statement shown together -- confirm which one the
// file really contains. Both are kept here untouched.
void Dataset::computeStates()
{
    for (int i = 0; i < features.size(); ++i) {
        states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
        auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
        states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
        // Bind by reference. With a by-value `auto item` the iota below would fill a
        // temporary copy and the vector stored in `states` would stay all zeros.
        auto& item = states.at(features[i]);
        iota(begin(item), end(item), 0);
    }
    states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
    auto [max_value, idx] = torch::max(y_train, 0);
    states[className] = std::vector<int>(max_value.item<int>() + 1);
    // The class entry is filled in place, directly on the stored vector.
    iota(begin(states.at(className)), end(states.at(className)), 0);
}
void Dataset::load_arff()
@@ -245,17 +236,6 @@ namespace platform {
y = torch::tensor(yv, torch::kInt32);
loaded = true;
}
// Runs one mdlp::CPPFImdlp instance over every element of X in order: the
// discretizer is fitted on the element together with y, the same element is
// transformed, and a copy of the transformed labels is appended to the result.
// Returns the collected vectors, one per element of X.
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
{
    auto discretizer = mdlp::CPPFImdlp();
    std::vector<mdlp::labels_t> discretized;
    for (auto& samples : X) {
        discretizer.fit(samples, y);
        // transform() hands back a reference; push_back stores its own copy of it.
        mdlp::labels_t& labels = discretizer.transform(samples);
        discretized.push_back(labels);
    }
    return discretized;
}
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
{
if (!loaded) {
@@ -273,15 +253,14 @@ namespace platform {
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
for (int feature = 0; feature < n_features; ++feature) {
for (auto feature = 0; feature < n_features; ++feature) {
if (numericFeatures[feature]) {
auto X_train_feature = X_train.index({ feature, "..." }).to(torch::kFloat32);
auto X_test_feature = X_test.index({ feature, "..." }).to(torch::kFloat32);
discretizer->fit(X_train_feature, y_train);
auto X_train_feature_d = discretizer->transform(X_train_feature);
auto X_test_feature_d = discretizer->transform(X_test_feature);
X_train_d.index_put_({ feature, "..." }, X_train_feature_d.to(torch::kInt32));
X_test_d.index_put_({ feature, "..." }, X_test_feature_d.to(torch::kInt32));
auto feature_train = X_train.index({ feature, "..." });
auto feature_test = X_test.index({ feature, "..." });
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
auto feature_test_disc = discretizer->transform_t(feature_test);
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
} else {
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
@@ -289,7 +268,12 @@ namespace platform {
}
X_train = X_train_d;
X_test = X_test_d;
assert(X_train.dtype() == torch::kInt32);
assert(X_test.dtype() == torch::kInt32);
computeStates();
}
assert(y_train.dtype() == torch::kInt32);
assert(y_test.dtype() == torch::kInt32);
return { X_train, X_test, y_train, y_test };
}
}

View File

@@ -25,7 +25,6 @@ namespace platform {
std::vector<string> getFeatures() const;
std::map<std::string, std::vector<int>> getStates() const;
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
int getNFeatures() const;
@@ -50,7 +49,6 @@ namespace platform {
torch::Tensor X, y;
torch::Tensor X_train, X_test, y_train, y_test;
std::vector<std::vector<float>> Xv;
std::vector<std::vector<int>> Xd;
std::vector<int> yv;
void load_csv();
void load_arff();

View File

@@ -3,8 +3,12 @@
#include <common/Discretization.h>
// File-scope platform::RegistrarDiscretization objects. Each one is constructed
// with a name string and a factory lambda that returns a newly allocated
// mdlp::Discretizer:
//   "mdlp"  -> mdlp::CPPFImdlp()
//   "bin3u" -> mdlp::BinDisc(3, UNIFORM)     "bin3q" -> mdlp::BinDisc(3, QUANTILE)
//   "bin4u" -> mdlp::BinDisc(4, UNIFORM)     "bin4q" -> mdlp::BinDisc(4, QUANTILE)
// NOTE(review): presumably the constructor records the factory under that name so
// it can be looked up later -- confirm in common/Discretization.h.
// NOTE(review): the "BinUniform" / "BinQuantile" lines below are each followed
// directly by another declaration start; they look like superseded versions of
// the "bin3u" / "bin3q" lines -- confirm against the repository.
static platform::RegistrarDiscretization registrarM("mdlp",
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
static platform::RegistrarDiscretization registrarBU("BinUniform",
static platform::RegistrarDiscretization registrarBU3("bin3u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ("BinQuantile",
static platform::RegistrarDiscretization registrarBQ3("bin3q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
static platform::RegistrarDiscretization registrarBU4("bin4u",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);});
static platform::RegistrarDiscretization registrarBQ4("bin4q",
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);});
#endif

View File

@@ -29,7 +29,7 @@ namespace platform {
{"framework", {"bulma", "bootstrap"}},
{"margin", {"0.1", "0.2", "0.3"}},
{"n_folds", {"5", "10"}},
{"discretiz_algo", {"mdlp", "bin3u", "bin3q"}},
{"discretiz_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q"}},
{"platform", {"any"}},
{"model", {"any"}},
{"seeds", {"any"}},

View File

@@ -127,7 +127,6 @@ namespace platform {
auto& dataset = datasets.getDataset(dataset_name);
auto combinations = grid.getGrid(dataset_name);
auto [X, y] = dataset.getTensors();
auto states = dataset.getStates();
auto features = dataset.getFeatures();
auto className = dataset.getClassName();
//
@@ -140,6 +139,7 @@ namespace platform {
fold = new folding::KFold(config.n_folds, y.size(0), seed);
auto [train, test] = fold->getFold(n_fold);
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
auto states = dataset.getStates(); // Get the states of the features once they are discretized
double best_fold_score = 0.0;
int best_idx_combination = -1;
json best_fold_hyper;

View File

@@ -118,7 +118,7 @@ namespace platform {
//
// Load dataset and prepare data
//
auto datasets = Datasets(false, Paths::datasets()); // Never discretize here
auto datasets = Datasets(discretized, Paths::datasets(), discretization_algo);
auto& dataset = datasets.getDataset(fileName);
dataset.load();
auto [X, y] = dataset.getTensors(); // Only need y for folding
@@ -186,7 +186,7 @@ namespace platform {
train_timer.start();
auto [train, test] = fold->getFold(nfold);
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
auto states = dataset.getStates();
auto states = dataset.getStates(); // Get the states of the features once they are discretized
if (generate_fold_files)
generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test);
if (!quiet)
@@ -194,6 +194,14 @@ namespace platform {
//
// Train model
//
std::cout << "X_Train.dtype: " << X_train.dtype() << "\n";
std::cout << "y_Train.dtype: " << y_train.dtype() << "\n";
std::cout << "X_Test.dtype: " << X_test.dtype() << "\n";
std::cout << "y_Test.dtype: " << y_test.dtype() << "\n";
for (int i = 0; i < features.size(); i++) {
std::cout << "Feature: " << features[i] << " states: " << states[features[i]].size() << "\n";
}
std::cout << "className: " << className << " states: " << states[className].size() << "\n";
clf->fit(X_train, y_train, features, className, states);
if (!quiet)
showProgress(nfold + 1, getColor(clf->getStatus()), "b");