Fix discretize only numerics in tests

This commit is contained in:
2025-08-21 12:56:41 +02:00
parent 8578d68c57
commit f1a2349245
7 changed files with 49075 additions and 13 deletions

View File

@@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed ### Fixed
- Fixed an issue with local discretization that was discretizing all features whether they were numeric or categorical. - Fixed an issue with local discretization that was discretizing all features whether they were numeric or categorical.
- Fix testutils to return states for all features:
- An empty vector is now returned for numeric features.
- Categorical features now return their unique states.
## [1.2.1] - 2025-07-19 ## [1.2.1] - 2025-07-19

View File

@@ -34,7 +34,7 @@ NC = \033[0m # No Color
define ClearTests define ClearTests
@for t in $(test_targets); do \ @for t in $(test_targets); do \
if [ -f $(f_debug)/tests/$$t ]; then \ if [ -f $(f_debug)/tests/$$t ]; then \
echo ">>> Cleaning $$t..." ; \ echo ">>> Removing $$t..." ; \
rm -f $(f_debug)/tests/$$t ; \ rm -f $(f_debug)/tests/$$t ; \
fi ; \ fi ; \
done done
@@ -99,11 +99,12 @@ debug: ## Setup debug version using Conan
release: ## Setup release version using Conan release: ## Setup release version using Conan
@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF") @$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
buildd: ## Build the debug targets buildd: ## Build the debug && test targets
cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS) @cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
@cmake --build $(f_debug) -t $(test_targets) --parallel $(JOBS)
buildr: ## Build the release targets buildr: ## Build the release targets
cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS) @cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
# Install targets # Install targets

View File

@@ -20,7 +20,7 @@
#include "bayesnet/ensembles/AODELd.h" #include "bayesnet/ensembles/AODELd.h"
#include "bayesnet/ensembles/BoostAODE.h" #include "bayesnet/ensembles/BoostAODE.h"
const std::string ACTUAL_VERSION = "1.2.1"; const std::string ACTUAL_VERSION = "1.2.2";
TEST_CASE("Test Bayesian Classifiers score & version", "[Models]") TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
{ {
@@ -496,3 +496,47 @@ TEST_CASE("Local discretization hyperparameters", "[Models]")
REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing)); REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
REQUIRE(clft.getStatus() == bayesnet::NORMAL); REQUIRE(clft.getStatus() == bayesnet::NORMAL);
} }
TEST_CASE("Test Dataset Loading", "[Datasets]")
{
    const int max_sample = 4;
    // Dump the first `max_sample` samples of a dataset to stdout.
    // `discretized` selects the element type stored in Xt: integer codes for a
    // discretized dataset, raw float values otherwise. The feature-loop bound
    // is taken from the tensor itself instead of repeating the magic 4/14.
    auto show = [max_sample](const RawDatasets& dataset, const std::string& title, bool discretized) {
        const int n_features = static_cast<int>(dataset.Xt.size(0));
        std::cout << "Dataset " << title << " " << std::endl;
        for (int sample = 0; sample < max_sample; sample++) {
            for (int feature = 0; feature < n_features; feature++) {
                if (discretized) {
                    std::cout << dataset.Xt[feature][sample].item<int>() << " ";
                } else {
                    std::cout << dataset.Xt[feature][sample].item<float>() << " ";
                }
            }
            std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
        }
    };
    // iris: 4 features x 150 samples
    RawDatasets dataset("iris", true);
    REQUIRE(dataset.Xt.size(0) == 4);
    REQUIRE(dataset.Xt.size(1) == 150);
    REQUIRE(dataset.yt.size(0) == 150);
    show(dataset, "iris discretized", true);
    dataset = RawDatasets("iris", false);
    show(dataset, "iris raw", false);
    // adult: 14 features x 45222 samples
    dataset = RawDatasets("adult", true);
    REQUIRE(dataset.Xt.size(0) == 14);
    REQUIRE(dataset.Xt.size(1) == 45222);
    REQUIRE(dataset.yt.size(0) == 45222);
    show(dataset, "adult discretized", true);
    dataset = RawDatasets("adult", false);
    show(dataset, "adult raw", false);
}

View File

@@ -5,6 +5,7 @@
// *************************************************************** // ***************************************************************
#include <random> #include <random>
#include <nlohmann/json.hpp>
#include "TestUtils.h" #include "TestUtils.h"
#include "bayesnet/config.h" #include "bayesnet/config.h"
@@ -51,6 +52,7 @@ private:
RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug) RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
{ {
catalog = loadCatalog();
num_samples = num_samples_; num_samples = num_samples_;
shuffle = shuffle_; shuffle = shuffle_;
discretize = discretize_; discretize = discretize_;
@@ -62,7 +64,7 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
nSamples = dataset.size(1); nSamples = dataset.size(1);
weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble); weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
weightsv = std::vector<double>(nSamples, 1.0 / nSamples); weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
classNumStates = discretize ? states.at(className).size() : 0; classNumStates = states.at(className).size();
auto fold = folding::StratifiedKFold(5, yt, 271); auto fold = folding::StratifiedKFold(5, yt, 271);
auto [train, test] = fold.getFold(0); auto [train, test] = fold.getFold(0);
auto train_t = torch::tensor(train); auto train_t = torch::tensor(train);
@@ -76,20 +78,92 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num
std::cout << to_string(); std::cout << to_string();
} }
map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X) map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X, const std::vector<bool>& is_numeric)
{ {
map<std::string, int> maxes; map<std::string, int> maxes;
auto fimdlp = mdlp::CPPFImdlp(); auto fimdlp = mdlp::CPPFImdlp();
for (int i = 0; i < X.size(); i++) { for (int i = 0; i < X.size(); i++) {
fimdlp.fit(X[i], yv); mdlp::labels_t xd;
mdlp::labels_t& xd = fimdlp.transform(X[i]); if (is_numeric.at(i)) {
fimdlp.fit(X[i], yv);
xd = fimdlp.transform(X[i]);
} else {
std::transform(X[i].begin(), X[i].end(), back_inserter(xd), [](const auto& val) {
return static_cast<int>(val);
});
}
maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1; maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
Xv.push_back(xd); Xv.push_back(xd);
} }
return maxes; return maxes;
} }
map<std::string, std::vector<int>> RawDatasets::loadCatalog()
{
    // Reads the dataset catalog (<datasets>/all.txt). Each non-comment line is
    // "name;className;numericFeatures" where numericFeatures is "all", "none",
    // or a JSON list of feature indices. The returned map holds, per dataset,
    // the indices of its numeric features: a vector containing -1 means all
    // features are numeric; an empty vector means none are.
    // Throws std::invalid_argument if the file is missing, malformed or empty.
    map<std::string, std::vector<int>> catalogNames;
    // Named catalogFile so it does not shadow the `catalog` member this
    // result is later assigned to.
    ifstream catalogFile(Paths::datasets() + "all.txt");
    std::vector<int> numericFeaturesIdx;
    if (!catalogFile.is_open()) {
        // (fixed a stray `+ +` operator typo in this expression)
        throw std::invalid_argument("Unable to open catalog file. [" + Paths::datasets() + "all.txt" + "]");
    }
    std::string line;
    std::vector<std::string> sorted_lines;
    while (getline(catalogFile, line)) {
        // Skip blank lines and '#' comments
        if (line.empty() || line[0] == '#') {
            continue;
        }
        sorted_lines.push_back(line);
    }
    // Case-insensitive sort of the entries. Since the result is a std::map the
    // order is only observable if a dataset name appears twice (last one wins).
    sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
        const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& a, const auto& b) { return tolower(a) == tolower(b); });
        return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
    });
    for (const auto& entry : sorted_lines) {
        std::vector<std::string> tokens = split(entry, ';');
        std::string name = tokens[0];
        numericFeaturesIdx.clear();
        // NOTE: the class name token (tokens[1]) is not needed here; only the
        // numeric-feature specification is stored.
        switch (tokens.size()) {
            case 1:
            case 2:
                // Name only, or name;className: assume all features are numeric.
                numericFeaturesIdx.push_back(-1);
                break;
            case 3:
            {
                const auto& numericFeatures = tokens[2];
                if (numericFeatures == "all") {
                    numericFeaturesIdx.push_back(-1);
                } else if (numericFeatures != "none") {
                    // JSON list of numeric feature indices, e.g. [0,2,4]
                    auto features = nlohmann::json::parse(numericFeatures);
                    for (const auto& f : features) {
                        numericFeaturesIdx.push_back(f);
                    }
                }
                // "none": numericFeaturesIdx stays empty.
                break;
            }
            default:
                throw std::invalid_argument("Invalid catalog file format.");
        }
        catalogNames[name] = numericFeaturesIdx;
    }
    catalogFile.close();
    if (catalogNames.empty()) {
        throw std::invalid_argument("Catalog is empty. Please check the catalog file.");
    }
    return catalogNames;
}
void RawDatasets::loadDataset(const std::string& name, bool class_last) void RawDatasets::loadDataset(const std::string& name, bool class_last)
{ {
auto handler = ShuffleArffFiles(num_samples, shuffle); auto handler = ShuffleArffFiles(num_samples, shuffle);
@@ -101,8 +175,27 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
className = handler.getClassName(); className = handler.getClassName();
auto attributes = handler.getAttributes(); auto attributes = handler.getAttributes();
transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; }); transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
auto numericFeaturesIdx = catalog.at(name);
std::vector<bool> is_numeric;
if (numericFeaturesIdx.empty()) {
// no numeric features
is_numeric.assign(features.size(), false);
} else {
if (numericFeaturesIdx[0] == -1) {
// all features are numeric
is_numeric.assign(features.size(), true);
} else {
// some features are numeric
is_numeric.assign(features.size(), false);
for (const auto& idx : numericFeaturesIdx) {
if (idx >= 0 && idx < features.size()) {
is_numeric[idx] = true;
}
}
}
}
// Discretize Dataset // Discretize Dataset
auto maxValues = discretizeDataset(X); auto maxValues = discretizeDataset(X, is_numeric);
maxValues[className] = *max_element(yv.begin(), yv.end()) + 1; maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
if (discretize) { if (discretize) {
// discretize the tensor as well // discretize the tensor as well
@@ -113,13 +206,21 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32)); Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
} }
states[className] = std::vector<int>(maxValues[className]); states[className] = std::vector<int>(maxValues[className]);
iota(begin(states.at(className)), end(states.at(className)), 0);
} else { } else {
Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32); Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
for (int i = 0; i < features.size(); ++i) { for (int i = 0; i < features.size(); ++i) {
Xt.index_put_({ i, "..." }, torch::tensor(X[i])); Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
if (!is_numeric.at(i)) {
states[features[i]] = std::vector<int>(maxValues[features[i]]);
iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
}
} }
yt = torch::tensor(yv, torch::kInt32);
int maxy = *max_element(yv.begin(), yv.end()) + 1;
states[className] = std::vector<int>(maxy);
} }
iota(begin(states.at(className)), end(states.at(className)), 0);
yt = torch::tensor(yv, torch::kInt32); yt = torch::tensor(yv, torch::kInt32);
} }

View File

@@ -28,6 +28,9 @@ public:
std::vector<string> features; std::vector<string> features;
std::string className; std::string className;
map<std::string, std::vector<int>> states; map<std::string, std::vector<int>> states;
//catalog maps each dataset name to the indices of its numeric features;
//a vector containing -1 means all features are numeric, and an empty vector means none are numeric
map<std::string, std::vector<int>> catalog;
int nSamples, classNumStates; int nSamples, classNumStates;
double epsilon = 1e-5; double epsilon = 1e-5;
bool discretize; bool discretize;
@@ -65,8 +68,30 @@ private:
+ "classNumStates: " + std::to_string(classNumStates) + "\n" + "classNumStates: " + std::to_string(classNumStates) + "\n"
+ "states: " + states_ + "\n"; + "states: " + states_ + "\n";
} }
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X); std::string trim(const std::string& str)
{
std::string result = str;
result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
return !std::isspace(ch);
}));
result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
return !std::isspace(ch);
}).base(), result.end());
return result;
}
std::vector<std::string> split(const std::string& text, char delimiter)
{
    // Tokenize `text` on `delimiter`, trimming surrounding whitespace from
    // every token before storing it.
    std::vector<std::string> tokens;
    std::stringstream stream(text);
    for (std::string piece; std::getline(stream, piece, delimiter);) {
        tokens.push_back(trim(piece));
    }
    return tokens;
}
map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X, const std::vector<bool>& is_numeric);
void loadDataset(const std::string& name, bool class_last); void loadDataset(const std::string& name, bool class_last);
map<std::string, std::vector<int>> loadCatalog();
}; };
#endif //TEST_UTILS_H #endif //TEST_UTILS_H

48861
tests/data/adult.arff Normal file

File diff suppressed because it is too large Load Diff

27
tests/data/all.txt Normal file
View File

@@ -0,0 +1,27 @@
adult;class;[0,2,4,11,12,13]
balance-scale;class; all
breast-w;Class; all
diabetes;class; all
ecoli;class; all
glass;Type; all
hayes-roth;class; none
heart-statlog;class; [0,3,4,7,9,11]
ionosphere;class; all
iris;class; all
kdd_JapaneseVowels;speaker; all
letter;class; all
liver-disorders;selector; all
mfeat-factors;class; all
mfeat-fourier;class; all
mfeat-karhunen;class; all
mfeat-morphological;class; all
mfeat-zernike;class; all
optdigits;class; all
page-blocks;class; all
pendigits;class; all
segment;class; all
sonar;Class; all
spambase;class; all
vehicle;Class; all
waveform-5000;class; all
wine;class; all