fit discretizer only with train data
This commit is contained in:
Submodule lib/argparse updated: eab1d75e49...e462ab980c
2
lib/mdlp
2
lib/mdlp
Submodule lib/mdlp updated: 633aa52849...c4e6c041fe
@@ -130,7 +130,7 @@ namespace platform {
|
||||
stats[i] = 0.0;
|
||||
continue;
|
||||
}
|
||||
double z = abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
double z = std::abs(ranks.at(models[controlIdx]) - ranks.at(models[i])) / diff;
|
||||
double p_value = (long double)2 * (1 - cdf(dist, z));
|
||||
stats[i] = p_value;
|
||||
}
|
||||
|
@@ -7,7 +7,7 @@ namespace platform {
|
||||
path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples),
|
||||
n_features(dataset.n_features), numericFeatures(dataset.numericFeatures), features(dataset.features),
|
||||
states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y),
|
||||
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv),
|
||||
X_train(dataset.X_train), X_test(dataset.X_test), Xv(dataset.Xv), yv(dataset.yv),
|
||||
fileType(dataset.fileType)
|
||||
{
|
||||
}
|
||||
@@ -46,9 +46,6 @@ namespace platform {
|
||||
int Dataset::getNClasses() const
|
||||
{
|
||||
if (loaded) {
|
||||
if (discretize) {
|
||||
return states.at(className).size();
|
||||
}
|
||||
return *std::max_element(yv.begin(), yv.end()) + 1;
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
@@ -91,14 +88,6 @@ namespace platform {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<std::vector<std::vector<int>>&, std::vector<int>&> Dataset::getVectorsDiscretized()
|
||||
{
|
||||
if (loaded) {
|
||||
return { Xd, yv };
|
||||
} else {
|
||||
throw std::invalid_argument(message_dataset_not_loaded);
|
||||
}
|
||||
}
|
||||
pair<torch::Tensor&, torch::Tensor&> Dataset::getTensors()
|
||||
{
|
||||
if (loaded) {
|
||||
@@ -140,11 +129,13 @@ namespace platform {
|
||||
void Dataset::computeStates()
|
||||
{
|
||||
for (int i = 0; i < features.size(); ++i) {
|
||||
states[features[i]] = std::vector<int>(*max_element(Xd[i].begin(), Xd[i].end()) + 1);
|
||||
auto [max_value, idx] = torch::max(X_train.index({ i, "..." }), 0);
|
||||
states[features[i]] = std::vector<int>(max_value.item<int>() + 1);
|
||||
auto item = states.at(features[i]);
|
||||
iota(begin(item), end(item), 0);
|
||||
}
|
||||
states[className] = std::vector<int>(*max_element(yv.begin(), yv.end()) + 1);
|
||||
auto [max_value, idx] = torch::max(y_train, 0);
|
||||
states[className] = std::vector<int>(max_value.item<int>() + 1);
|
||||
iota(begin(states.at(className)), end(states.at(className)), 0);
|
||||
}
|
||||
void Dataset::load_arff()
|
||||
@@ -245,17 +236,6 @@ namespace platform {
|
||||
y = torch::tensor(yv, torch::kInt32);
|
||||
loaded = true;
|
||||
}
|
||||
std::vector<mdlp::labels_t> Dataset::discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
|
||||
{
|
||||
std::vector<mdlp::labels_t> Xd;
|
||||
auto fimdlp = mdlp::CPPFImdlp();
|
||||
for (int i = 0; i < X.size(); i++) {
|
||||
fimdlp.fit(X[i], y);
|
||||
mdlp::labels_t& xd = fimdlp.transform(X[i]);
|
||||
Xd.push_back(xd);
|
||||
}
|
||||
return Xd;
|
||||
}
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> Dataset::getTrainTestTensors(std::vector<int>& train, std::vector<int>& test)
|
||||
{
|
||||
if (!loaded) {
|
||||
@@ -273,15 +253,14 @@ namespace platform {
|
||||
auto discretizer = Discretization::instance()->create(discretizer_algorithm);
|
||||
auto X_train_d = torch::zeros({ n_features, samples_train }, torch::kInt32);
|
||||
auto X_test_d = torch::zeros({ n_features, samples_test }, torch::kInt32);
|
||||
for (int feature = 0; feature < n_features; ++feature) {
|
||||
for (auto feature = 0; feature < n_features; ++feature) {
|
||||
if (numericFeatures[feature]) {
|
||||
auto X_train_feature = X_train.index({ feature, "..." }).to(torch::kFloat32);
|
||||
auto X_test_feature = X_test.index({ feature, "..." }).to(torch::kFloat32);
|
||||
discretizer->fit(X_train_feature, y_train);
|
||||
auto X_train_feature_d = discretizer->transform(X_train_feature);
|
||||
auto X_test_feature_d = discretizer->transform(X_test_feature);
|
||||
X_train_d.index_put_({ feature, "..." }, X_train_feature_d.to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test_feature_d.to(torch::kInt32));
|
||||
auto feature_train = X_train.index({ feature, "..." });
|
||||
auto feature_test = X_test.index({ feature, "..." });
|
||||
auto feature_train_disc = discretizer->fit_transform_t(feature_train, y_train);
|
||||
auto feature_test_disc = discretizer->transform_t(feature_test);
|
||||
X_train_d.index_put_({ feature, "..." }, feature_train_disc);
|
||||
X_test_d.index_put_({ feature, "..." }, feature_test_disc);
|
||||
} else {
|
||||
X_train_d.index_put_({ feature, "..." }, X_train.index({ feature, "..." }).to(torch::kInt32));
|
||||
X_test_d.index_put_({ feature, "..." }, X_test.index({ feature, "..." }).to(torch::kInt32));
|
||||
@@ -289,7 +268,12 @@ namespace platform {
|
||||
}
|
||||
X_train = X_train_d;
|
||||
X_test = X_test_d;
|
||||
assert(X_train.dtype() == torch::kInt32);
|
||||
assert(X_test.dtype() == torch::kInt32);
|
||||
computeStates();
|
||||
}
|
||||
assert(y_train.dtype() == torch::kInt32);
|
||||
assert(y_test.dtype() == torch::kInt32);
|
||||
return { X_train, X_test, y_train, y_test };
|
||||
}
|
||||
}
|
@@ -25,7 +25,6 @@ namespace platform {
|
||||
std::vector<string> getFeatures() const;
|
||||
std::map<std::string, std::vector<int>> getStates() const;
|
||||
std::pair<vector<std::vector<float>>&, std::vector<int>&> getVectors();
|
||||
std::pair<vector<std::vector<int>>&, std::vector<int>&> getVectorsDiscretized();
|
||||
std::pair<torch::Tensor&, torch::Tensor&> getTensors();
|
||||
std::tuple<torch::Tensor&, torch::Tensor&, torch::Tensor&, torch::Tensor&> getTrainTestTensors(std::vector<int>& train, std::vector<int>& test);
|
||||
int getNFeatures() const;
|
||||
@@ -50,7 +49,6 @@ namespace platform {
|
||||
torch::Tensor X, y;
|
||||
torch::Tensor X_train, X_test, y_train, y_test;
|
||||
std::vector<std::vector<float>> Xv;
|
||||
std::vector<std::vector<int>> Xd;
|
||||
std::vector<int> yv;
|
||||
void load_csv();
|
||||
void load_arff();
|
||||
|
@@ -3,8 +3,12 @@
|
||||
#include <common/Discretization.h>
|
||||
static platform::RegistrarDiscretization registrarM("mdlp",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::CPPFImdlp();});
|
||||
static platform::RegistrarDiscretization registrarBU("BinUniform",
|
||||
static platform::RegistrarDiscretization registrarBU3("bin3u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ("BinQuantile",
|
||||
static platform::RegistrarDiscretization registrarBQ3("bin3q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(3, mdlp::strategy_t::QUANTILE);});
|
||||
static platform::RegistrarDiscretization registrarBU4("bin4u",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::UNIFORM);});
|
||||
static platform::RegistrarDiscretization registrarBQ4("bin4q",
|
||||
[](void) -> mdlp::Discretizer* { return new mdlp::BinDisc(4, mdlp::strategy_t::QUANTILE);});
|
||||
#endif
|
@@ -29,7 +29,7 @@ namespace platform {
|
||||
{"framework", {"bulma", "bootstrap"}},
|
||||
{"margin", {"0.1", "0.2", "0.3"}},
|
||||
{"n_folds", {"5", "10"}},
|
||||
{"discretiz_algo", {"mdlp", "bin3u", "bin3q"}},
|
||||
{"discretiz_algo", {"mdlp", "bin3u", "bin3q", "bin4u", "bin4q"}},
|
||||
{"platform", {"any"}},
|
||||
{"model", {"any"}},
|
||||
{"seeds", {"any"}},
|
||||
|
@@ -127,7 +127,6 @@ namespace platform {
|
||||
auto& dataset = datasets.getDataset(dataset_name);
|
||||
auto combinations = grid.getGrid(dataset_name);
|
||||
auto [X, y] = dataset.getTensors();
|
||||
auto states = dataset.getStates();
|
||||
auto features = dataset.getFeatures();
|
||||
auto className = dataset.getClassName();
|
||||
//
|
||||
@@ -140,6 +139,7 @@ namespace platform {
|
||||
fold = new folding::KFold(config.n_folds, y.size(0), seed);
|
||||
auto [train, test] = fold->getFold(n_fold);
|
||||
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
|
||||
auto states = dataset.getStates(); // Get the states of the features Once they are discretized
|
||||
double best_fold_score = 0.0;
|
||||
int best_idx_combination = -1;
|
||||
json best_fold_hyper;
|
||||
|
@@ -118,7 +118,7 @@ namespace platform {
|
||||
//
|
||||
// Load dataset and prepare data
|
||||
//
|
||||
auto datasets = Datasets(false, Paths::datasets()); // Never discretize here
|
||||
auto datasets = Datasets(discretized, Paths::datasets(), discretization_algo);
|
||||
auto& dataset = datasets.getDataset(fileName);
|
||||
dataset.load();
|
||||
auto [X, y] = dataset.getTensors(); // Only need y for folding
|
||||
@@ -186,7 +186,7 @@ namespace platform {
|
||||
train_timer.start();
|
||||
auto [train, test] = fold->getFold(nfold);
|
||||
auto [X_train, X_test, y_train, y_test] = dataset.getTrainTestTensors(train, test);
|
||||
auto states = dataset.getStates();
|
||||
auto states = dataset.getStates(); // Get the states of the features Once they are discretized
|
||||
if (generate_fold_files)
|
||||
generate_files(fileName, discretized, stratified, seed, nfold, X_train, y_train, X_test, y_test, train, test);
|
||||
if (!quiet)
|
||||
@@ -194,6 +194,14 @@ namespace platform {
|
||||
//
|
||||
// Train model
|
||||
//
|
||||
std::cout << "X_Train.dtype: " << X_train.dtype() << "\n";
|
||||
std::cout << "y_Train.dtype: " << y_train.dtype() << "\n";
|
||||
std::cout << "X_Test.dtype: " << X_test.dtype() << "\n";
|
||||
std::cout << "y_Test.dtype: " << y_test.dtype() << "\n";
|
||||
for (int i = 0; i < features.size(); i++) {
|
||||
std::cout << "Feature: " << features[i] << " states: " << states[features[i]].size() << "\n";
|
||||
}
|
||||
std::cout << "className: " << className << " states: " << states[className].size() << "\n";
|
||||
clf->fit(X_train, y_train, features, className, states);
|
||||
if (!quiet)
|
||||
showProgress(nfold + 1, getColor(clf->getStatus()), "b");
|
||||
|
Reference in New Issue
Block a user