Complete the proposal so that only numeric features are discretized

This commit is contained in:
2025-08-24 01:09:25 +02:00
parent 0c7452e35b
commit 7c01646726
9 changed files with 380 additions and 48883 deletions

View File

@@ -37,6 +37,7 @@ namespace bayesnet {
className = className_;
states = iterativeLocalDiscretization(y, static_cast<KDB*>(this), dataset, features, className, states_, smoothing);
KDB::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}
torch::Tensor KDBLd::predict(torch::Tensor& X)

View File

@@ -118,17 +118,20 @@ namespace bayesnet {
}
return states;
}
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states_)
map<std::string, std::vector<int>> Proposal::fit_local_discretization(const torch::Tensor& y, map<std::string, std::vector<int>> states)
{
// Discretize the continuous input data and build pDataset (Classifier::dataset)
// For each feature, states is expected to hold an empty vector if the feature is numeric, or its vector of states if it is already discretized
int m = Xf.size(1);
int n = Xf.size(0);
map<std::string, std::vector<int>> states;
pDataset = torch::zeros({ n + 1, m }, torch::kInt32);
auto yv = std::vector<int>(y.data_ptr<int>(), y.data_ptr<int>() + y.size(0));
// discretize input data by feature(row)
std::unique_ptr<mdlp::Discretizer> discretizer;
wasNumeric.resize(pFeatures.size());
for (auto i = 0; i < pFeatures.size(); ++i) {
auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
if (discretizationType == discretization_t::BINQ) {
discretizer = std::make_unique<mdlp::BinDisc>(ld_params.proposed_cuts, mdlp::strategy_t::QUANTILE);
} else if (discretizationType == discretization_t::BINU) {
@@ -136,13 +139,19 @@ namespace bayesnet {
} else { // Default is MDLP
discretizer = std::make_unique<mdlp::CPPFImdlp>(ld_params.min_length, ld_params.max_depth, ld_params.proposed_cuts);
}
auto Xt_ptr = Xf.index({ i }).data_ptr<float>();
auto Xt = std::vector<float>(Xt_ptr, Xt_ptr + Xf.size(1));
discretizer->fit(Xt, yv);
pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->transform(Xt)));
auto xStates = std::vector<int>(discretizer->getCutPoints().size() + 1);
iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates;
if (states[pFeatures[i]].empty()) {
// If the feature is numeric, we discretize it
pDataset.index_put_({ i, "..." }, torch::tensor(discretizer->fit_transform(Xt, yv)));
int n_states = discretizer->getCutPoints().size() + 1;
auto xStates = std::vector<int>(n_states);
iota(xStates.begin(), xStates.end(), 0);
states[pFeatures[i]] = xStates;
wasNumeric[i] = true;
} else {
wasNumeric[i] = false;
// If the feature is categorical, we just copy it
pDataset.index_put_({ i, "..." }, Xf[i].to(torch::kInt32));
}
discretizers[pFeatures[i]] = std::move(discretizer);
}
int n_classes = torch::max(y).item<int>() + 1;
@@ -157,8 +166,13 @@ namespace bayesnet {
auto Xtd = torch::zeros_like(X, torch::kInt32);
for (int i = 0; i < X.size(0); ++i) {
auto Xt = std::vector<float>(X[i].data_ptr<float>(), X[i].data_ptr<float>() + X.size(1));
auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
std::vector<int> Xd;
if (wasNumeric[i]) {
auto Xd = discretizers[pFeatures[i]]->transform(Xt);
Xtd.index_put_({ i }, torch::tensor(Xd, torch::kInt32));
} else {
Xtd.index_put_({ i }, Xf[i].to(torch::kInt32));
}
}
return Xtd;
}

View File

@@ -61,6 +61,7 @@ namespace bayesnet {
std::vector<std::string>& notes; // Notes during fit from BaseClassifier
torch::Tensor& pDataset; // (n+1)xm tensor
std::vector<std::string>& pFeatures;
std::vector<bool> wasNumeric;
std::string& pClassName;
enum class discretization_t {
MDLP,

View File

@@ -36,6 +36,7 @@ namespace bayesnet {
className = className_;
states = iterativeLocalDiscretization(y, static_cast<SPODE*>(this), dataset, features, className, states_, smoothing);
SPODE::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}
torch::Tensor SPODELd::predict(torch::Tensor& X)

View File

@@ -35,6 +35,7 @@ namespace bayesnet {
className = className_;
states = iterativeLocalDiscretization(y, static_cast<TAN*>(this), dataset, features, className, states_, smoothing);
TAN::fit(dataset, features, className, states, smoothing);
fitted = true;
return *this;
}
torch::Tensor TANLd::predict(torch::Tensor& X)