From 643633e6dd092a9f40ddc622cce4cdbf0b228ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sun, 9 Jun 2024 00:50:55 +0200 Subject: [PATCH] fit discretizer only with train data --- lib/argparse | 2 +- lib/mdlp | 2 +- src/best/Statistics.cpp | 2 +- src/common/Dataset.cpp | 50 ++++++++++------------------- src/common/Dataset.h | 2 -- src/common/DiscretizationRegister.h | 8 +++-- src/common/DotEnv.h | 2 +- src/grid/GridSearch.cpp | 2 +- src/main/Experiment.cpp | 12 +++++-- 9 files changed, 38 insertions(+), 44 deletions(-) diff --git a/lib/argparse b/lib/argparse index eab1d75..e462ab9 160000 --- a/lib/argparse +++ b/lib/argparse @@ -1 +1 @@ -Subproject commit eab1d75e49970857eba1fdef5afb68befa2fa16f +Subproject commit e462ab980c0852bd1df2ee1f4ec81826246b6f21 diff --git a/lib/mdlp b/lib/mdlp index 633aa52..c4e6c04 160000 --- a/lib/mdlp +++ b/lib/mdlp @@ -1 +1 @@ -Subproject commit 633aa52849a61a5da9f5d6ea9f2401fd0c48ad47 +Subproject commit c4e6c041fe7f769ec24c0a2bd66a5aff482fd630 diff --git a/src/best/Statistics.cpp b/src/best/Statistics.cpp index 0fbff1e..d5284e1 100644 --- a/src/best/Statistics.cpp +++ b/src/best/Statistics.cpp @@ -130,7 +130,7 @@ namespace platform { stats[i] = 0.0; continue; } - double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff; + double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff; double p_value = (long double)2 * (1 - cdf(dist, z)); stats[i] = p_value; } diff --git a/src/common/Dataset.cpp b/src/common/Dataset.cpp index 7265787..b59ead4 100644 --- a/src/common/Dataset.cpp +++ b/src/common/Dataset.cpp @@ -7,7 +7,7 @@ namespace platform { path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), - X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), + X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), yv(dataset.yv), fileType(dataset.fileType) { } @@ -46,9 +46,6 @@ namespace platform { int Dataset::getNClasses() const { if (loaded) { - if (discretize) { - return states.at(className).size(); - } return *std::max_element(yv.begin(), yv.end()) + 1; } else { throw std::invalid_argument(message_dataset_not_loaded); @@ -91,14 +88,6 @@ namespace platform { throw std::invalid_argument(message_dataset_not_loaded); } } - pair>&, std::vector&> Dataset::getVectorsDiscretized() - { - if (loaded) { - return { Xd, yv }; - } else { - throw std::invalid_argument(message_dataset_not_loaded); - } - } pair Dataset::getTensors() { if (loaded) { @@ -140,11 +129,13 @@ namespace platform { void Dataset::computeStates() { for (int i = 0; i < features.size(); ++i) { - states[features[i]] = std::vector(*max_element(Xd[i].begin(), Xd[i].end()) + 1); + auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0); + states[features[i]] = std::vector(max_value.item() + 1); auto item = states.at(features[i]); iota(begin(item), end(item), 0); } - states[className] = std::vector(*max_element(yv.begin(), yv.end()) + 1); + auto [max_value, idx] = torch::max(y_train, 0); + states[className] = std::vector(max_value.item() + 1); iota(begin(states.at(className)), end(states.at(className)), 0); } void Dataset::load_arff() @@ -245,17 +236,6 @@ namespace platform { y = torch::tensor(yv, torch::kInt32); loaded = true; } - std::vector Dataset::discretizeDataset(std::vector& X, mdlp::labels_t& y) - { - std::vector Xd; - auto fimdlp = mdlp::CPPFImdlp(); - for (int i = 0; i < X.size(); i++) { - fimdlp.fit(X[i], y); - mdlp::labels_t& xd = fimdlp.transform(X[i]); - Xd.push_back(xd); - } - return Xd; - } std::tuple Dataset::getTrainTestTensors(std::vector& train, std::vector& test) { if (!loaded) { @@ -273,15 +253,14 @@ namespace platform { auto discretizer = Discretization::instance()->create(discretizer_algorithm); auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32); auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32); - for (int feature = 0; feature < n_features; ++feature) { + for (auto feature = 0; feature < n_features; ++feature) { if (numericFeatures[feature]) { - auto X_train_feature = X_train.index({ feature, "..." }).to(torch::kFloat32); - auto X_test_feature = X_test.index({ feature, "..." }).to(torch::kFloat32); - discretizer->fit(X_train_feature, y_train); - auto X_train_feature_d = discretizer->transform(X_train_feature); - auto X_test_feature_d = discretizer->transform(X_test_feature); - X_train_d.index_put_({ feature, "..." }, X_train_feature_d.to(torch::kInt32)); - X_test_d.index_put_({ feature, "..." }, X_test_feature_d.to(torch::kInt32)); + auto feature_train = X_train.index({ feature, "..." }); + auto feature_test = X_test.index({ feature, "..." }); + auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train); + auto feature_test_disc = discretizer->transform_t(feature_test); + X_train_d.index_put_({ feature, "..." }, feature_train_disc); + X_test_d.index_put_({ feature, "..." }, feature_test_disc); } else { X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32)); X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32)); @@ -289,7 +268,12 @@ namespace platform { } X_train = X_train_d; X_test = X_test_d; + assert(X_train.dtype() == torch::kInt32); + assert(X_test.dtype() == torch::kInt32); + computeStates(); } + assert(y_train.dtype() == torch::kInt32); + assert(y_test.dtype() == torch::kInt32); return { X_train, X_test, y_train, y_test }; } } \ No newline at end of file diff --git a/src/common/Dataset.h b/src/common/Dataset.h index 3bf6fc0..07dd401 100644 --- a/src/common/Dataset.h +++ b/src/common/Dataset.h @@ -25,7 +25,6 @@ namespace platform { std::vector getFeatures() const; std::map> getStates() const; std::pair>&, std::vector&> getVectors(); - std::pair>&, std::vector&> getVectorsDiscretized(); std::pair getTensors(); std::tuple getTrainTestTensors(std::vector& train, std::vector& test); int getNFeatures() const; @@ -50,7 +49,6 @@ namespace platform { torch::Tensor X, y; torch::Tensor X_train, X_test, y_train, y_test; std::vector> Xv; - std::vector> Xd; std::vector yv; void load_csv(); void load_arff(); diff --git a/src/common/DiscretizationRegister.h b/src/common/DiscretizationRegister.h index 79cb912..9a6a65a 100644 --- a/src/common/DiscretizationRegister.h +++ b/src/common/DiscretizationRegister.h @@ -3,8 +3,12 @@ #include static platform::RegistrarDiscretization registrarM("mdlp", [](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();}); -static platform::RegistrarDiscretization registrarBU("BinUniform", +static platform::RegistrarDiscretization registrarBU3("bin3u", [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);}); -static platform::RegistrarDiscretization registrarBQ("BinQuantile", +static platform::RegistrarDiscretization registrarBQ3("bin3q", [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);}); +static platform::RegistrarDiscretization registrarBU4("bin4u", + [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);}); +static platform::RegistrarDiscretization registrarBQ4("bin4q", + [](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);}); #endif \ No newline at end of file diff --git a/src/common/DotEnv.h b/src/common/DotEnv.h index c2c70b7..d246f36 100644 --- a/src/common/DotEnv.h +++ b/src/common/DotEnv.h @@ -29,7 +29,7 @@ namespace platform { {"framework", {"bulma", "bootstrap"}}, {"margin", {"0.1", "0.2", "0.3"}}, {"n_folds", {"5", "10"}}, - {"discretiz_algo", {"mdlp", "bin3u", "bin3q"}}, + {"discretiz_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q"}}, {"platform", {"any"}}, {"model", {"any"}}, {"seeds", {"any"}}, diff --git a/src/grid/GridSearch.cpp b/src/grid/GridSearch.cpp index 556127f..6d8699c 100644 --- a/src/grid/GridSearch.cpp +++ b/src/grid/GridSearch.cpp @@ -127,7 +127,6 @@ namespace platform { auto& dataset = datasets.getDataset(dataset_name); auto combinations = grid.getGrid(dataset_name); auto [X, y] = dataset.getTensors(); - auto states = dataset.getStates(); auto features = dataset.getFeatures(); auto className = dataset.getClassName(); // @@ -140,6 +139,7 @@ namespace platform { fold = new folding::KFold(config.n_folds, y.size(0), seed); auto [train, test] = fold->getFold(n_fold); auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test); + auto states = dataset.getStates(); // Get the states of the features Once they are discretized double best_fold_score = 0.0; int best_idx_combination = -1; json best_fold_hyper; diff --git a/src/main/Experiment.cpp b/src/main/Experiment.cpp index c09eeae..08016e9 100644 --- a/src/main/Experiment.cpp +++ b/src/main/Experiment.cpp @@ -118,7 +118,7 @@ namespace platform { // // Load dataset and prepare data // - auto datasets = Datasets(false, Paths::datasets()); // Never discretize here + auto datasets = Datasets(discretized, Paths::datasets(), discretization_algo); auto& dataset = datasets.getDataset(fileName); dataset.load(); auto [X, y] = dataset.getTensors(); // Only need y for folding @@ -186,7 +186,7 @@ namespace platform { train_timer.start(); auto [train, test] = fold->getFold(nfold); auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test); - auto states = dataset.getStates(); + auto states = dataset.getStates(); // Get the states of the features Once they are discretized if (generate_fold_files) generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test); if (!quiet) @@ -194,6 +194,14 @@ namespace platform { // // Train model // + std::cout << "X_Train.dtype: " << X_train.dtype() << "\n"; + std::cout << "y_Train.dtype: " << y_train.dtype() << "\n"; + std::cout << "X_Test.dtype: " << X_test.dtype() << "\n"; + std::cout << "y_Test.dtype: " << y_test.dtype() << "\n"; + for (int i = 0; i < features.size(); i++) { + std::cout << "Feature: " << features[i] << " states: " << states[features[i]].size() << "\n"; + } + std::cout << "className: " << className << " states: " << states[className].size() << "\n"; clf->fit(X_train, y_train, features, className, states); if (!quiet) showProgress(nfold + 1, getColor(clf->getStatus()), "b");