Compare commits

3 Commits

| Author | SHA1 | Date |
|---|---|---|
| | 74b391907a | |
| | 1aa3b609e5 | |
| | f1a2349245 | |
@@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

### Fixed

- Fixed an issue with local discretization that was discretizing all features whether they were numeric or categorical.
- Fixed testutils to return states for all features:
  - An empty vector is now returned for numeric features.
  - Categorical features now return their unique states.

## [1.2.1] - 2025-07-19
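The states convention these fixes adopt can be shown with a minimal sketch (hypothetical feature names, not code from this changeset): numeric features map to an empty vector until they are discretized, while categorical features map to the list of their unique states.

```cpp
#include <map>
#include <string>
#include <vector>

int main()
{
    // Hypothetical states map following the convention described above.
    std::map<std::string, std::vector<int>> states;
    states["width"] = {};          // numeric feature: empty until discretized
    states["color"] = { 0, 1, 2 }; // categorical feature: its unique states
    // Consumers can branch on states[f].empty() to tell the two apart.
    return states.at("width").empty() ? 0 : 1;
}
```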
Makefile (9 lines changed)
@@ -34,7 +34,7 @@ NC = \033[0m # No Color
 define ClearTests
 	@for t in $(test_targets); do \
 		if [ -f $(f_debug)/tests/$$t ]; then \
-			echo ">>> Cleaning $$t..." ; \
+			echo ">>> Removing $$t..." ; \
 			rm -f $(f_debug)/tests/$$t ; \
 		fi ; \
 	done
@@ -99,11 +99,12 @@ debug: ## Setup debug version using Conan
 release: ## Setup release version using Conan
 	@$(call setup_target,"Release","$(f_release)","ENABLE_TESTING=OFF")
 
-buildd: ## Build the debug targets
-	cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
+buildd: ## Build the debug && test targets
+	@cmake --build $(f_debug) --config Debug -t $(app_targets) --parallel $(JOBS)
+	@cmake --build $(f_debug) -t $(test_targets) --parallel $(JOBS)
 
 buildr: ## Build the release targets
-	cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
+	@cmake --build $(f_release) --config Release -t $(app_targets) --parallel $(JOBS)
 
 # Install targets
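Net effect of the Makefile change: make buildd now builds both $(app_targets) and $(test_targets) in the debug tree (both invocations silenced with @), while make buildr still builds only the release application targets.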
@@ -118,37 +118,31 @@ namespace bayesnet {
         }
         return states;
     }
-    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
+    map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states_)
     {
         // Discretize the continuous input data and build pDataset (Classifier::dataset)
+        // We expect states to contain an empty vector for numeric features and a vector of states for already discretized features
         int m = Xf.size(1);
         int n = Xf.size(0);
+        map<std::string, std::vector<int>> states;
         pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
         auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
         // discretize input data by feature(row)
        std::unique_ptr<mdlp::Discretizer> discretizer;
        for (auto i = 0; i < pFeatures.size(); ++i) {
-            if (discretizationType == discretization_t::BINQ) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
-            } else if (discretizationType == discretization_t::BINU) {
-                discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
-            } else { // Default is MDLP
-                discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
-            }
             auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
             auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
+            if (states[pFeatures[i]].empty()) {
+                // If the feature is numeric, we discretize it
+                if (discretizationType == discretization_t::BINQ) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
+                } else if (discretizationType == discretization_t::BINU) {
+                    discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::UNIFORM);
+                } else { // Default is MDLP
+                    discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
+                }
+                pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
+                int n_states = discretizer->getCutPoints().size() + 1;
+                auto xStates = std::vector<int>(n_states);
+                iota(xStates.begin(), xStates.end(), 0);
+                states[pFeatures[i]] = xStates;
+            } else {
+                // If the feature is categorical, we just copy it
+                pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
+            }
-            discretizer->fit(Xt, yv);
-            pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
-            auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
-            iota(xStates.begin(), xStates.end(), 0);
-            states[pFeatures[i]] = xStates;
             discretizers[pFeatures[i]] = std::move(discretizer);
         }
         int n_classes = torch::max(y).item<int>() + 1;
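The numeric branch above derives its state list directly from the discretizer: k cut points produce k + 1 bins, labeled 0..k with iota. A standalone sketch of just that step (plain vectors in place of the mdlp discretizer):

```cpp
#include <numeric>
#include <vector>

// k cut points split a feature into k + 1 bins, labeled 0..k.
std::vector<int> statesFromCutPoints(const std::vector<float>& cutPoints)
{
    std::vector<int> states(cutPoints.size() + 1);
    std::iota(states.begin(), states.end(), 0);
    return states;
}

int main()
{
    auto s = statesFromCutPoints({ 1.5f, 2.7f, 4.0f }); // 3 cuts -> 4 states
    return s == std::vector<int>({ 0, 1, 2, 3 }) ? 0 : 1;
}
```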
@@ -20,7 +20,7 @@
 #include "bayesnet/ensembles/AODELd.h"
 #include "bayesnet/ensembles/BoostAODE.h"
 
-const std::string ACTUAL_VERSION = "1.2.1";
+const std::string ACTUAL_VERSION = "1.2.2";
 
 TEST_CASE("Test Bayesian Classifiers score & version", "[Models]")
 {
@@ -496,3 +496,58 @@ TEST_CASE("Local discretization hyperparameters", "[Models]")
     REQUIRE_NOTHROW(clft.fit(raw.Xt, raw.yt, raw.features, raw.className, raw.states, raw.smoothing));
     REQUIRE(clft.getStatus() == bayesnet::NORMAL);
 }
+TEST_CASE("Test Dataset Loading", "[Datasets]")
+{
+    int max_sample = 4;
+    // Test loading a dataset
+    RawDatasets dataset("iris", true);
+    REQUIRE(dataset.Xt.size(0) == 4);
+    REQUIRE(dataset.Xt.size(1) == 150);
+    REQUIRE(dataset.yt.size(0) == 150);
+    std::cout << "Dataset iris discretized " << std::endl;
+    for (int sample = 0; sample < max_sample; sample++) {
+        for (int feature = 0; feature < 4; feature++) {
+            std::cout << dataset.Xt[feature][sample].item<int>() << " ";
+        }
+        std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
+    }
+    dataset = RawDatasets("iris", false);
+    std::cout << "Dataset iris raw " << std::endl;
+    for (int sample = 0; sample < max_sample; sample++) {
+        for (int feature = 0; feature < 4; feature++) {
+            std::cout << dataset.Xt[feature][sample].item<float>() << " ";
+        }
+        std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
+    }
+    // Test loading a dataset
+    dataset = RawDatasets("adult", true);
+    REQUIRE(dataset.Xt.size(0) == 14);
+    REQUIRE(dataset.Xt.size(1) == 45222);
+    REQUIRE(dataset.yt.size(0) == 45222);
+    std::cout << "Dataset adult discretized " << std::endl;
+    for (int sample = 0; sample < max_sample; sample++) {
+        for (int feature = 0; feature < 14; feature++) {
+            std::cout << dataset.Xt[feature][sample].item<int>() << " ";
+        }
+        std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
+    }
+    auto features = dataset.features;
+    std::cout << "States:" << std::endl;
+    for (int i = 0; i < 14; i++) {
+        std::cout << i << " has " << dataset.states.at(features[i]).size() << " states." << std::endl;
+    }
+    dataset = RawDatasets("adult", false);
+    std::cout << "Dataset adult raw " << std::endl;
+    for (int sample = 0; sample < max_sample; sample++) {
+        for (int feature = 0; feature < 14; feature++) {
+            std::cout << dataset.Xt[feature][sample].item<float>() << " ";
+        }
+        std::cout << "| " << dataset.yt[sample].item<int>() << std::endl;
+    }
+    std::cout << "States:" << std::endl;
+    for (int i = 0; i < 14; i++) {
+        std::cout << i << " has " << dataset.states.at(features[i]).size() << " states." << std::endl;
+    }
+    auto clf = bayesnet::TANLd();
+    clf.fit(dataset.Xt, dataset.yt, dataset.features, dataset.className, dataset.states, dataset.smoothing);
+}
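The two States: loops make the new testutils convention visible: in the raw ("adult", false) load, features marked numeric in the catalog report 0 states (an empty vector), while categorical features still report their unique state counts.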
@@ -5,6 +5,7 @@
 // ***************************************************************
 
 #include <random>
+#include <nlohmann/json.hpp>
 #include "TestUtils.h"
 #include "bayesnet/config.h"
@@ -51,6 +52,7 @@ private:
 
 RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
 {
+    catalog = loadCatalog();
     num_samples = num_samples_;
     shuffle = shuffle_;
     discretize = discretize_;
@@ -62,7 +64,7 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
     nSamples = dataset.size(1);
     weights = torch::full({ nSamples }, 1.0 / nSamples, torch::kDouble);
     weightsv = std::vector<double>(nSamples, 1.0 / nSamples);
-    classNumStates = discretize ? states.at(className).size() : 0;
+    classNumStates = states.at(className).size();
     auto fold = folding::StratifiedKFold(5, yt, 271);
     auto [train, test] = fold.getFold(0);
     auto train_t = torch::tensor(train);
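The ternary is no longer needed because loadDataset (below) now populates states.at(className) in both the discretized and the raw paths.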
@@ -76,20 +78,92 @@ RawDatasets::RawDatasets(const std::string& file_name, bool discretize_, int num_samples_, bool shuffle_, bool class_last, bool debug)
     std::cout << to_string();
 }
 
-map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X)
+map<std::string, int> RawDatasets::discretizeDataset(std::vector<mdlp::samples_t>& X, const std::vector<bool>& is_numeric)
 {
     map<std::string, int> maxes;
     auto fimdlp = mdlp::CPPFImdlp();
     for (int i = 0; i < X.size(); i++) {
-        fimdlp.fit(X[i], yv);
-        mdlp::labels_t& xd = fimdlp.transform(X[i]);
+        mdlp::labels_t xd;
+        if (is_numeric.at(i)) {
+            fimdlp.fit(X[i], yv);
+            xd = fimdlp.transform(X[i]);
+        } else {
+            std::transform(X[i].begin(), X[i].end(), back_inserter(xd), [](const auto& val) {
+                return static_cast<int>(val);
+            });
+        }
         maxes[features[i]] = *max_element(xd.begin(), xd.end()) + 1;
         Xv.push_back(xd);
     }
     return maxes;
 }
 
+map<std::string, std::vector<int>> RawDatasets::loadCatalog()
+{
+    map<std::string, std::vector<int>> catalogNames;
+    ifstream catalog(Paths::datasets() + "all.txt");
+    std::vector<int> numericFeaturesIdx;
+    if (!catalog.is_open()) {
+        throw std::invalid_argument("Unable to open catalog file. [" + Paths::datasets() + "all.txt" + "]");
+    }
+    std::string line;
+    std::vector<std::string> sorted_lines;
+    while (getline(catalog, line)) {
+        if (line.empty() || line[0] == '#') {
+            continue;
+        }
+        sorted_lines.push_back(line);
+    }
+    sort(sorted_lines.begin(), sorted_lines.end(), [](const auto& lhs, const auto& rhs) {
+        const auto result = mismatch(lhs.cbegin(), lhs.cend(), rhs.cbegin(), rhs.cend(), [](const auto& lhs, const auto& rhs) { return tolower(lhs) == tolower(rhs); });
+        return result.second != rhs.cend() && (result.first == lhs.cend() || tolower(*result.first) < tolower(*result.second));
+    });
+    for (const auto& line : sorted_lines) {
+        std::vector<std::string> tokens = split(line, ';');
+        std::string name = tokens[0];
+        std::string className;
+        numericFeaturesIdx.clear();
+        int size = tokens.size();
+        switch (size) {
+            case 1:
+                className = "-1";
+                numericFeaturesIdx.push_back(-1);
+                break;
+            case 2:
+                className = tokens[1];
+                numericFeaturesIdx.push_back(-1);
+                break;
+            case 3:
+            {
+                className = tokens[1];
+                auto numericFeatures = tokens[2];
+                if (numericFeatures == "all") {
+                    numericFeaturesIdx.push_back(-1);
+                } else {
+                    if (numericFeatures != "none") {
+                        auto features = nlohmann::json::parse(numericFeatures);
+                        for (auto& f : features) {
+                            numericFeaturesIdx.push_back(f);
+                        }
+                    }
+                }
+            }
+            break;
+            default:
+                throw std::invalid_argument("Invalid catalog file format.");
+        }
+        catalogNames[name] = numericFeaturesIdx;
+    }
+    catalog.close();
+    if (catalogNames.empty()) {
+        throw std::invalid_argument("Catalog is empty. Please check the catalog file.");
+    }
+    return catalogNames;
+}
 
 void RawDatasets::loadDataset(const std::string& name, bool class_last)
 {
     auto handler = ShuffleArffFiles(num_samples, shuffle);
@@ -101,8 +175,27 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
     className = handler.getClassName();
     auto attributes = handler.getAttributes();
     transform(attributes.begin(), attributes.end(), back_inserter(features), [](const auto& pair) { return pair.first; });
+    auto numericFeaturesIdx = catalog.at(name);
+    std::vector<bool> is_numeric;
+    if (numericFeaturesIdx.empty()) {
+        // no numeric features
+        is_numeric.assign(features.size(), false);
+    } else {
+        if (numericFeaturesIdx[0] == -1) {
+            // all features are numeric
+            is_numeric.assign(features.size(), true);
+        } else {
+            // some features are numeric
+            is_numeric.assign(features.size(), false);
+            for (const auto& idx : numericFeaturesIdx) {
+                if (idx >= 0 && idx < features.size()) {
+                    is_numeric[idx] = true;
+                }
+            }
+        }
+    }
     // Discretize Dataset
-    auto maxValues = discretizeDataset(X);
+    auto maxValues = discretizeDataset(X, is_numeric);
     maxValues[className] = *max_element(yv.begin(), yv.end()) + 1;
     if (discretize) {
         // discretize the tensor as well
@@ -113,13 +206,23 @@ void RawDatasets::loadDataset(const std::string& name, bool class_last)
             Xt.index_put_({ i, "..." }, torch::tensor(Xv[i], torch::kInt32));
         }
+        states[className] = std::vector<int>(maxValues[className]);
+        iota(begin(states.at(className)), end(states.at(className)), 0);
     } else {
         Xt = torch::zeros({ static_cast<int>(X.size()), static_cast<int>(X[0].size()) }, torch::kFloat32);
         for (int i = 0; i < features.size(); ++i) {
             Xt.index_put_({ i, "..." }, torch::tensor(X[i]));
+            if (!is_numeric.at(i)) {
+                states[features[i]] = std::vector<int>(maxValues[features[i]]);
+                iota(begin(states.at(features[i])), end(states.at(features[i])), 0);
+            } else {
+                states[features[i]] = std::vector<int>();
+            }
         }
-        yt = torch::tensor(yv, torch::kInt32);
         int maxy = *max_element(yv.begin(), yv.end()) + 1;
         states[className] = std::vector<int>(maxy);
     }
     iota(begin(states.at(className)), end(states.at(className)), 0);
+    yt = torch::tensor(yv, torch::kInt32);
 }
@@ -28,6 +28,9 @@ public:
     std::vector<string> features;
     std::string className;
     map<std::string, std::vector<int>> states;
+    // catalog maps dataset names to the indices of their numeric features:
+    // {-1} means all features are numeric and an empty vector means none are
+    map<std::string, std::vector<int>> catalog;
     int nSamples, classNumStates;
     double epsilon = 1e-5;
     bool discretize;
@@ -65,8 +68,30 @@ private:
             + "classNumStates: " + std::to_string(classNumStates) + "\n"
             + "states: " + states_ + "\n";
     }
-    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X);
+    std::string trim(const std::string& str)
+    {
+        std::string result = str;
+        result.erase(result.begin(), std::find_if(result.begin(), result.end(), [](int ch) {
+            return !std::isspace(ch);
+        }));
+        result.erase(std::find_if(result.rbegin(), result.rend(), [](int ch) {
+            return !std::isspace(ch);
+        }).base(), result.end());
+        return result;
+    }
+    std::vector<std::string> split(const std::string& text, char delimiter)
+    {
+        std::vector<std::string> result;
+        std::stringstream ss(text);
+        std::string token;
+        while (std::getline(ss, token, delimiter)) {
+            result.push_back(trim(token));
+        }
+        return result;
+    }
+    map<std::string, int> discretizeDataset(std::vector<mdlp::samples_t>& X, const std::vector<bool>& is_numeric);
     void loadDataset(const std::string& name, bool class_last);
+    map<std::string, std::vector<int>> loadCatalog();
 };

 #endif //TEST_UTILS_H
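As a usage note, these helpers let catalog lines tolerate stray whitespace around the ';' separators. A minimal standalone check (free-function copies of the member bodies above, for illustration only):

```cpp
#include <algorithm>
#include <cassert>
#include <cctype>
#include <sstream>
#include <string>
#include <vector>

// Free-function copies of the trim/split members above, for illustration only.
static std::string trim(const std::string& str)
{
    std::string r = str;
    r.erase(r.begin(), std::find_if(r.begin(), r.end(), [](int ch) { return !std::isspace(ch); }));
    r.erase(std::find_if(r.rbegin(), r.rend(), [](int ch) { return !std::isspace(ch); }).base(), r.end());
    return r;
}

static std::vector<std::string> split(const std::string& text, char delimiter)
{
    std::vector<std::string> result;
    std::stringstream ss(text);
    std::string token;
    while (std::getline(ss, token, delimiter)) {
        result.push_back(trim(token));
    }
    return result;
}

int main()
{
    // "balance-scale;class; all" -> {"balance-scale", "class", "all"}
    auto tokens = split("balance-scale;class; all", ';');
    assert(tokens.size() == 3 && tokens[2] == "all");
}
```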
tests/data/adult.arff (new file, 48861 lines)

File diff suppressed because it is too large.
tests/data/all.txt (new file, 27 lines)

@@ -0,0 +1,27 @@
+adult;class;[0,2,4,10,11,12]
+balance-scale;class; all
+breast-w;Class; all
+diabetes;class; all
+ecoli;class; all
+glass;Type; all
+hayes-roth;class; none
+heart-statlog;class; [0,3,4,7,9,11]
+ionosphere;class; all
+iris;class; all
+kdd_JapaneseVowels;speaker; all
+letter;class; all
+liver-disorders;selector; all
+mfeat-factors;class; all
+mfeat-fourier;class; all
+mfeat-karhunen;class; all
+mfeat-morphological;class; all
+mfeat-zernike;class; all
+optdigits;class; all
+page-blocks;class; all
+pendigits;class; all
+segment;class; all
+sonar;Class; all
+spambase;class; all
+vehicle;Class; all
+waveform-5000;class; all
+wine;class; all
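Each line follows name;className;numericFeatures, where the third token is all (every feature numeric, stored as {-1}), none (no numeric features, stored as an empty vector), or a JSON list of numeric feature indices; lines with fewer tokens default to all-numeric, and whitespace around ';' is tolerated because split() trims each token. A minimal sketch of the third-token mapping, independent of loadCatalog() above (parseNumericSpec is illustrative, not project code):

```cpp
#include <cassert>
#include <cctype>
#include <string>
#include <vector>

// Sketch of the numeric-features convention used by the catalog:
// "all" -> {-1}, "none" -> {}, "[i,j,...]" -> the listed indices.
std::vector<int> parseNumericSpec(const std::string& spec)
{
    if (spec == "all") return { -1 }; // -1 marks "all features are numeric"
    if (spec == "none") return {};    // empty vector: no numeric features
    std::vector<int> idx;             // crude bracket-list parse, digits only
    int value = 0;
    bool inNumber = false;
    for (char c : spec) {
        if (std::isdigit(static_cast<unsigned char>(c))) {
            value = value * 10 + (c - '0');
            inNumber = true;
        } else if (inNumber) {
            idx.push_back(value);
            value = 0;
            inNumber = false;
        }
    }
    return idx;
}

int main()
{
    assert(parseNumericSpec("all") == (std::vector<int>{ -1 }));
    assert(parseNumericSpec("none").empty());
    assert(parseNumericSpec("[0,2,4,10,11,12]") == (std::vector<int>{ 0, 2, 4, 10, 11, 12 }));
}
```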