Refactor TestUtils to allow partial and shuffle dataset load
parent ae469b8146
commit 793b2d3cd5

.vscode/launch.json (vendored)
@@ -16,7 +16,7 @@
             "name": "test",
             "program": "${workspaceFolder}/build_debug/tests/TestBayesNet",
             "args": [
-                "\"Test Cannot build dataset with wrong data vector\""
+                "\"Bisection Best\""
            ],
            "cwd": "${workspaceFolder}/build_debug/tests"
        },
tests/TestBoostAODE.cc

@@ -140,7 +140,7 @@ TEST_CASE("Oddities", "[BoostAODE]")
 TEST_CASE("Bisection Best", "[BoostAODE]")
 {
     auto clf = bayesnet::BoostAODE();
-    auto raw = RawDatasets("mfeat-factors", true, 500);
+    auto raw = RawDatasets("mfeat-factors", true, 300, true);
     clf.setHyperparameters({
         {"bisection", true},
         {"maxTolerance", 3},
@@ -149,8 +149,8 @@ TEST_CASE("Bisection Best", "[BoostAODE]")
         {"convergence_best", true},
     });
     clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);
-    REQUIRE(clf.getNumberOfNodes() == 217);
-    REQUIRE(clf.getNumberOfEdges() == 431);
+    REQUIRE(clf.getNumberOfNodes() == 434);
+    REQUIRE(clf.getNumberOfEdges() == 862);
     REQUIRE(clf.getNotes().size() == 3);
     REQUIRE(clf.getNotes()[0] == "Convergence threshold reached & 15 models eliminated");
     REQUIRE(clf.getNotes()[1] == "Used features in train: 16 of 216");
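Note: the updated assertions are internally consistent with the BoostAODE model structure. mfeat-factors has 216 features (cf. "Used features in train: 16 of 216"), and a single SPODE over 216 features has 216 + 1 = 217 nodes and 216 + 215 = 431 edges (one arc from the class to every feature, plus one from the superparent to every other feature). The new expectations of 434 nodes and 862 edges therefore correspond to an ensemble that kept exactly two SPODEs after convergence on the smaller shuffled sample.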
@@ -162,7 +162,7 @@ TEST_CASE("Bisection Best", "[BoostAODE]")
 }
 TEST_CASE("Bisection Best vs Last", "[BoostAODE]")
 {
-    auto raw = RawDatasets("mfeat-factors", true, 1500);
+    auto raw = RawDatasets("mfeat-factors", true, 500);
     auto clf = bayesnet::BoostAODE(true);
     auto hyperparameters = nlohmann::json{
         {"select_features", "IWSS"},
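Note: after the refactor, RawDatasets takes two optional trailing parameters: num_lines (keep only the first N samples, 0 = all) and shuffle (permute those samples with a fixed seed). A minimal usage sketch of the new signature, as exercised by the tests above:

    // Load mfeat-factors discretized; keep 300 samples, shuffled deterministically
    auto raw = RawDatasets("mfeat-factors", true, 300, true);
    auto clf = bayesnet::BoostAODE();
    // X_train/y_train come from fold 0 of the stratified 5-fold split
    // built inside the RawDatasets constructor
    clf.fit(raw.X_train, raw.y_train, raw.features, raw.className, raw.states);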
tests/TestUtils.cc

@@ -4,6 +4,7 @@
 // SPDX-License-Identifier: MIT
 // ***************************************************************
 
+#include <random>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
 
@@ -15,97 +16,109 @@ public:
     }
 };
 
-pair<std::vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<std::string> features)
+class ShuffleArffFiles : public ArffFiles {
+public:
+    ShuffleArffFiles(int num_lines = 0, bool shuffle = false) : ArffFiles(), num_lines(num_lines), shuffle(shuffle) {}
+    void load(const std::string& file_name, bool class_last = true)
+    {
+        ArffFiles::load(file_name, class_last);
+        if (num_lines > 0) {
+            if (num_lines > getY().size()) {
+                throw std::invalid_argument("num_lines must be less than the number of lines in the file");
+            }
+            auto indices = std::vector<int>(num_lines);
+            std::iota(indices.begin(), indices.end(), 0);
+            if (shuffle) {
+                std::mt19937 g{ 173 };
+                std::shuffle(indices.begin(), indices.end(), g);
+            }
+            auto XX = std::vector<std::vector<float>>(attributes.size(), std::vector<float>(num_lines));
+            auto yy = std::vector<int>(num_lines);
+            for (int i = 0; i < num_lines; i++) {
+                yy[i] = getY()[indices[i]];
+                for (int j = 0; j < attributes.size(); j++) {
+                    XX[j][i] = X[j][indices[i]];
+                }
+            }
+            X = XX;
+            y = yy;
+        }
+    }
+private:
+    int num_lines;
+    bool shuffle;
+};
+
+RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_lines_, bool shuffle_)
 {
-    std::vector<mdlp::labels_t> Xd;
+    num_lines = num_lines_;
+    shuffle = shuffle_;
+    discretize = discretize_;
+    // Xt can be either discretized or not
+    // Xv is always discretized
+    loadDataset(file_name);
+    auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
+    dataset = torch::cat({ Xt, yresized }, 0);
+    nSamples = dataset.size(1);
+    weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
+    weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
+    classNumStates = discretize ? states.at(className).size() : 0;
+    auto fold = folding::StratifiedKFold(5, yt, 271);
+    auto [train, test] = fold.getFold(0);
+    auto train_t = torch::tensor(train);
+    auto test_t = torch::tensor(test);
+    // Get train and validation sets
+    X_train = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), train_t });
+    y_train = dataset.index({ -1, train_t });
+    X_test = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), test_t });
+    y_test = dataset.index({ -1, test_t });
+    std::cout << to_string();
+}
+
+map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
+{
+
     map<std::string, int> maxes;
     auto fimdlp = mdlp::CPPFImdlp();
     for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
+        fimdlp.fit(X[i], yv);
         mdlp::labels_t& xd = fimdlp.transform(X[i]);
         maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
-        Xd.push_back(xd);
+        Xv.push_back(xd);
     }
-    return { Xd, maxes };
+    return maxes;
 }
 
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y)
+void RawDatasets::loadDataset(const std::string& name)
 {
-    std::vector<mdlp::labels_t> Xd;
-    auto fimdlp = mdlp::CPPFImdlp();
-    for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], y);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
-        Xd.push_back(xd);
-    }
-    return Xd;
-}
-
-bool file_exists(const std::string& name)
-{
-    if (FILE* file = fopen(name.c_str(), "r")) {
-        fclose(file);
-        return true;
-    } else {
-        return false;
-    }
-}
-
-tuple<torch::Tensor, torch::Tensor, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", class_last);
+    auto handler = ShuffleArffFiles(num_lines, shuffle);
+    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff", true);
     // Get Dataset X, y
     std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
+    yv = handler.getY();
     // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
+    className = handler.getClassName();
     auto attributes = handler.getAttributes();
     transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    torch::Tensor Xd;
-    auto states = map<std::string, std::vector<int>>();
-    if (discretize_dataset) {
-        auto Xr = discretizeDataset(X, y);
-        Xd = torch::zeros({ static_cast<int>(Xr.size()), static_cast<int>(Xr[0].size()) }, torch::kInt32);
-        for (int i = 0; i < features.size(); ++i) {
-            states[features[i]] = std::vector<int>(*max_element(Xr[i].begin(), Xr[i].end()) + 1);
-            auto item = states.at(features[i]);
-            iota(begin(item), end(item), 0);
-            Xd.index_put_({ i, "..." }, torch::tensor(Xr[i], torch::kInt32));
-        }
-        states[className] = std::vector<int>(*max_element(y.begin(), y.end()) + 1);
-        iota(begin(states.at(className)), end(states.at(className)), 0);
-    } else {
-        Xd = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
-        for (int i = 0; i < features.size(); ++i) {
-            Xd.index_put_({ i, "..." }, torch::tensor(X[i]));
-        }
-    }
-    return { Xd, torch::tensor(y, torch::kInt32), features, className, states };
-}
-
-tuple<std::vector<std::vector<int>>, std::vector<int>, std::vector<std::string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name)
-{
-    auto handler = ArffFiles();
-    handler.load(Paths::datasets() + static_cast<std::string>(name) + ".arff");
-    // Get Dataset X, y
-    std::vector<mdlp::samples_t>& X = handler.getX();
-    mdlp::labels_t& y = handler.getY();
-    // Get className & Features
-    auto className = handler.getClassName();
-    std::vector<std::string> features;
-    auto attributes = handler.getAttributes();
-    transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
-    // Discretize Dataset
-    std::vector<mdlp::labels_t> Xd;
-    map<std::string, int> maxes;
-    tie(Xd, maxes) = discretize(X, y, features);
-    maxes[className] = *max_element(y.begin(), y.end()) + 1;
-    map<std::string, std::vector<int>> states;
-    for (auto feature : features) {
-        states[feature] = std::vector<int>(maxes[feature]);
-    }
-    states[className] = std::vector<int>(maxes[className]);
-    return { Xd, y, features, className, states };
+    auto maxValues = discretizeDataset(X);
+    maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
+    if (discretize) {
+        // discretize the tensor as well
+        Xt = torch::zeros({ static_cast<int>(Xv.size()), static_cast<int>(Xv[0].size()) }, torch::kInt32);
+        for (int i = 0; i < features.size(); ++i) {
+            states[features[i]] = std::vector<int>(maxValues[features[i]]);
+            iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
+            Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
+        }
+        states[className] = std::vector<int>(maxValues[className]);
+        iota(begin(states.at(className)), end(states.at(className)), 0);
+    } else {
+        Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
+        for (int i = 0; i < features.size(); ++i) {
+            Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
+        }
+    }
+    yt = torch::tensor(yv, torch::kInt32);
 }
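The heart of the change is ShuffleArffFiles, which post-processes what ArffFiles::load parsed: it builds an index vector over the first num_lines rows, optionally permutes it with a fixed-seed std::mt19937 (so test expectations stay reproducible), and gathers X (stored per attribute) and y through those indices. A self-contained sketch of the same gather technique, independent of ArffFiles; take_head is a hypothetical stand-in for the logic inside ShuffleArffFiles::load:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    // Keep the first n rows of attribute-major data, optionally permuted
    // with a fixed seed so repeated runs produce the same subsample.
    std::vector<std::vector<float>> take_head(const std::vector<std::vector<float>>& X,
                                              int n, bool shuffle)
    {
        std::vector<int> indices(n);
        std::iota(indices.begin(), indices.end(), 0);   // 0, 1, ..., n-1
        if (shuffle) {
            std::mt19937 g{ 173 };                      // same seed the tests rely on
            std::shuffle(indices.begin(), indices.end(), g);
        }
        std::vector<std::vector<float>> out(X.size(), std::vector<float>(n));
        for (size_t j = 0; j < X.size(); j++)           // gather attribute by attribute
            for (int i = 0; i < n; i++)
                out[j][i] = X[j][indices[i]];
        return out;
    }

Note that the indices cover only the first num_lines rows, so shuffle permutes the order of the head of the file rather than sampling random rows from the whole dataset. That order still matters downstream, because the constructor feeds the result into a stratified k-fold split.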
tests/TestUtils.h
@@ -13,37 +13,60 @@
 #include <tuple>
 #include <ArffFiles.h>
 #include <CPPFImdlp.h>
+#include <folding.hpp>
 
-bool file_exists(const std::string& name);
-std::pair<vector<mdlp::labels_t>, map<std::string, int>> discretize(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y, std::vector<string> features);
-std::vector<mdlp::labels_t> discretizeDataset(std::vector<mdlp::samples_t>& X, mdlp::labels_t& y);
-std::tuple<vector<vector<int>>, std::vector<int>, std::vector<string>, std::string, map<std::string, std::vector<int>>> loadFile(const std::string& name);
-std::tuple<torch::Tensor, torch::Tensor, std::vector<string>, std::string, map<std::string, std::vector<int>>> loadDataset(const std::string& name, bool class_last, bool discretize_dataset);
 
 class RawDatasets {
 public:
-    RawDatasets(const std::string& file_name, bool discretize)
-    {
-        // Xt can be either discretized or not
-        tie(Xt, yt, featurest, classNamet, statest) = loadDataset(file_name, true, discretize);
-        // Xv is always discretized
-        tie(Xv, yv, featuresv, classNamev, statesv) = loadFile(file_name);
-        auto yresized = torch::transpose(yt.view({ yt.size(0), 1 }), 0, 1);
-        dataset = torch::cat({ Xt, yresized }, 0);
-        nSamples = dataset.size(1);
-        weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
-        weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-        classNumStates = discretize ? statest.at(classNamet).size() : 0;
-    }
+    RawDatasets(const std::string& file_name, bool discretize_, int num_lines_ = 0, bool shuffle_ = false);
     torch::Tensor Xt, yt, dataset, weights;
+    torch::Tensor X_train, y_train, X_test, y_test;
     std::vector<vector<int>> Xv;
-    std::vector<double> weightsv;
     std::vector<int> yv;
-    std::vector<string> featurest, featuresv;
-    map<std::string, std::vector<int>> statest, statesv;
-    std::string classNamet, classNamev;
+    std::vector<double> weightsv;
+    std::vector<string> features;
+    std::string className;
+    map<std::string, std::vector<int>> states;
     int nSamples, classNumStates;
     double epsilon = 1e-5;
+    bool discretize;
+    int num_lines = 0;
+    bool shuffle = false;
+private:
+    std::string to_string()
+    {
+        std::string features_ = "";
+        for (auto& f : features) {
+            features_ += f + " ";
+        }
+        std::string states_ = "";
+        for (auto& s : states) {
+            states_ += s.first + " ";
+            for (auto& v : s.second) {
+                states_ += std::to_string(v) + " ";
+            }
+            states_ += "\n";
+        }
+        return "Xt dimensions: " + std::to_string(Xt.size(0)) + " " + std::to_string(Xt.size(1)) + "\n"
+            "Xv dimensions: " + std::to_string(Xv.size()) + " " + std::to_string(Xv[0].size()) + "\n"
+            + "yt dimensions: " + std::to_string(yt.size(0)) + "\n"
+            + "yv dimensions: " + std::to_string(yv.size()) + "\n"
+            + "X_train dimensions: " + std::to_string(X_train.size(0)) + " " + std::to_string(X_train.size(1)) + "\n"
+            + "X_test dimensions: " + std::to_string(X_test.size(0)) + " " + std::to_string(X_test.size(1)) + "\n"
+            + "y_train dimensions: " + std::to_string(y_train.size(0)) + "\n"
+            + "y_test dimensions: " + std::to_string(y_test.size(0)) + "\n"
+            + "features: " + std::to_string(features.size()) + "\n"
+            + features_ + "\n"
+            + "className: " + className + "\n"
+            + "states: " + std::to_string(states.size()) + "\n"
+            + "nSamples: " + std::to_string(nSamples) + "\n"
+            + "classNumStates: " + std::to_string(classNumStates) + "\n"
+            + "states: " + states_ + "\n";
+    }
+    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
+    void loadDataset(const std::string& name);
 };
 
 #endif //TEST_UTILS_H
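For context on the states bookkeeping above: discretizeDataset returns, per feature, the number of discrete bins MDLP produced (max label + 1), and loadDataset expands each count into the explicit value list 0..count-1 that the classifiers' states argument expects. A minimal sketch of that expansion; makeStates is a hypothetical helper, since in the committed code the same loop runs inline in RawDatasets::loadDataset over features plus className:

    #include <map>
    #include <numeric>
    #include <string>
    #include <vector>

    // Expand { feature -> number_of_states } into { feature -> {0, 1, ..., n-1} }.
    std::map<std::string, std::vector<int>> makeStates(const std::map<std::string, int>& maxValues)
    {
        std::map<std::string, std::vector<int>> states;
        for (const auto& [name, count] : maxValues) {
            states[name] = std::vector<int>(count);                  // count zeros
            std::iota(states[name].begin(), states[name].end(), 0);  // -> 0..count-1
        }
        return states;
    }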