Add prior probability p(c) to predict_proba

Fix predict_spode
This commit is contained in:
2025-03-01 20:29:45 +01:00
parent c127cb670a
commit 12e69a7f53
2 changed files with 120 additions and 69 deletions

View File

@@ -76,9 +76,9 @@ namespace platform {
std::shuffle(featureSelection.begin(), featureSelection.end(), g); std::shuffle(featureSelection.begin(), featureSelection.end(), g);
} }
// Remove used features // Remove used features
featureSelection.erase(remove_if(begin(featureSelection), end(featureSelection), [&](auto x) featureSelection.erase(remove_if(featureSelection.begin(), featureSelection.end(), [&](auto x)
{ return std::find(begin(featuresUsed), end(featuresUsed), x) != end(featuresUsed);}), { return std::find(featuresUsed.begin(), featuresUsed.end(), x) != featuresUsed.end();}),
end(featureSelection) featureSelection.end()
); );
int k = bisection ? pow(2, tolerance) : 1; int k = bisection ? pow(2, tolerance) : 1;
int counter = 0; // The model counter of the current pack int counter = 0; // The model counter of the current pack

View File

@@ -60,6 +60,8 @@ namespace platform {
states_.push_back(*max_element(y.begin(), y.end()) + 1); states_.push_back(*max_element(y.begin(), y.end()) + 1);
// //
statesClass_ = states_.back(); statesClass_ = states_.back();
classCounts_.resize(statesClass_, 0.0);
classPriors_.resize(statesClass_, 0.0);
// //
// Initialize data structures // Initialize data structures
// //
@@ -94,9 +96,6 @@ namespace platform {
classFeatureCounts_.resize(feature_offset * statesClass_); classFeatureCounts_.resize(feature_offset * statesClass_);
classFeatureProbs_.resize(feature_offset * statesClass_); classFeatureProbs_.resize(feature_offset * statesClass_);
// classCounts_[c]
classCounts_.resize(statesClass_, 0.0);
matrixState_ = MatrixState::COUNTS; matrixState_ = MatrixState::COUNTS;
// //
// Add samples // Add samples
@@ -187,8 +186,9 @@ namespace platform {
// ------------------------------------------------------- // -------------------------------------------------------
// //
// Once all samples are added in COUNTS mode, call this to: // Once all samples are added in COUNTS mode, call this to:
// 1) compute p(x_i=si | c) => classFeatureProbs_ // 1) compute p(c) => classPriors_
// 2) compute p(x_j=sj | c, x_i=si) => data_ (for i<j) dataOpp_ (for i>j) // 2) compute p(x_i=si | c) => classFeatureProbs_
// 3) compute p(x_j=sj | c, x_i=si) => data_ (for i<j) dataOpp_ (for i>j)
// //
void computeProbabilities() void computeProbabilities()
{ {
@@ -196,32 +196,67 @@ namespace platform {
throw std::logic_error("computeProbabilities: must be in COUNTS mode."); throw std::logic_error("computeProbabilities: must be in COUNTS mode.");
} }
double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0); double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0);
// (1) p(x_i=si | c) => classFeatureProbs_ // (1) p(c)
if (totalCount <= 0.0) {
// fallback => uniform
double unif = 1.0 / statesClass_;
for (int c = 0; c < statesClass_; ++c) {
classPriors_[c] = unif;
}
} else {
for (int c = 0; c < statesClass_; ++c) {
classPriors_[c] = classCounts_[c] / totalCount;
}
}
// (2) p(x_i=si | c) => classFeatureProbs_
int idx, sf; int idx, sf;
double denom, countVal, p; double denom, countVal, p;
// for (int feature = 0; feature < nFeatures_; ++feature) {
// sf = states_[feature];
// for (int c = 0; c < statesClass_; ++c) {
// denom = classCounts_[c] * sf;
// if (denom <= 0.0) {
// // fallback => uniform
// for (int sf_value = 0; sf_value < sf; ++sf_value) {
// idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
// classFeatureProbs_[idx] = 1.0 / sf;
// }
// } else {
// for (int sf_value = 0; sf_value < sf; ++sf_value) {
// idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
// countVal = classFeatureCounts_[idx];
// p = ((countVal + SMOOTHING / (statesClass_ * states_[feature])) / (totalCount + SMOOTHING));
// classFeatureProbs_[idx] = p;
// }
// }
// }
// }
double alpha = SMOOTHING;
for (int feature = 0; feature < nFeatures_; ++feature) { for (int feature = 0; feature < nFeatures_; ++feature) {
sf = states_[feature]; int sf = states_[feature];
for (int c = 0; c < statesClass_; ++c) { for (int c = 0; c < statesClass_; ++c) {
denom = classCounts_[c] * sf; double denom = classCounts_[c] + alpha * sf; // typical Laplace smoothing denominator
if (denom <= 0.0) { if (classCounts_[c] <= 0.0) {
// fallback => uniform // fallback => uniform
for (int sf_value = 0; sf_value < sf; ++sf_value) { for (int sf_value = 0; sf_value < sf; ++sf_value) {
idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; int idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
classFeatureProbs_[idx] = 1.0 / sf; classFeatureProbs_[idx] = 1.0 / sf;
} }
} else { } else {
for (int sf_value = 0; sf_value < sf; ++sf_value) { for (int sf_value = 0; sf_value < sf; ++sf_value) {
idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; int idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c;
countVal = classFeatureCounts_[idx]; double countVal = classFeatureCounts_[idx];
p = ((countVal + SMOOTHING / (statesClass_ * states_[feature])) / (totalCount + SMOOTHING)); // standard NB with Laplace alpha
double p = (countVal + alpha) / denom;
classFeatureProbs_[idx] = p; classFeatureProbs_[idx] = p;
} }
} }
} }
} }
// getCountFromTable(int classVal, int pIndex, int childIndex) // getCountFromTable(int classVal, int pIndex, int childIndex)
// (2) p(x_j=sj | c, x_i=si) => data_(i,si,j,sj,c) // (3) p(x_j=sj | c, x_i=si) => data_(i,si,j,sj,c)
// (2) p(x_i=si | c, x_j=sj) => dataOpp_(j,sj,i,si,c) // (3) p(x_i=si | c, x_j=sj) => dataOpp_(j,sj,i,si,c)
double pccCount, pcCount, ccCount; double pccCount, pcCount, ccCount;
double conditionalProb, oppositeCondProb; double conditionalProb, oppositeCondProb;
int part1, part2, p1, part2_class, p1_class; int part1, part2, p1, part2_class, p1_class;
@@ -231,13 +266,15 @@ namespace platform {
p1 = featureClassOffset_[parent] + sp; p1 = featureClassOffset_[parent] + sp;
part1 = pairOffset_[p1]; part1 = pairOffset_[p1];
p1_class = p1 * statesClass_; p1_class = p1 * statesClass_;
// int parentStates = states_[parent];
for (int child = parent - 1; child >= 0; --child) { for (int child = parent - 1; child >= 0; --child) {
// for (int child = 2; child >= 2; --child) { // for (int child = 2; child >= 2; --child) {
for (int sc = 0; sc < states_[child]; ++sc) { for (int sc = 0; sc < states_[child]; ++sc) {
part2 = featureClassOffset_[child] + sc; part2 = featureClassOffset_[child] + sc;
part2_class = part2 * statesClass_; part2_class = part2 * statesClass_;
for (int c = 0; c < statesClass_; c++) { for (int c = 0; c < statesClass_; c++) {
//idx = compute_index(parent, sp, child, sc, classval);
idx = (part1 + part2) * statesClass_ + c; idx = (part1 + part2) * statesClass_ + c;
// Parent, Child, Class Count // Parent, Child, Class Count
pccCount = data_[idx]; pccCount = data_[idx];
@@ -246,8 +283,19 @@ namespace platform {
// Child, Class count // Child, Class count
ccCount = classFeatureCounts_[part2_class + c]; ccCount = classFeatureCounts_[part2_class + c];
conditionalProb = (pccCount + SMOOTHING / states_[parent]) / (ccCount + SMOOTHING); conditionalProb = (pccCount + SMOOTHING / states_[parent]) / (ccCount + SMOOTHING);
// pcCount = classFeatureCounts_[(featureClassOffset_[parent] + sp) * statesClass_ + c];
// // This is the "parent, class" count
// int childStates = states_[child];
// conditionalProb = (pccCount + alpha) / (pcCount + alpha * childStates);
data_[idx] = conditionalProb; data_[idx] = conditionalProb;
oppositeCondProb = (pccCount + SMOOTHING / states_[child]) / (pcCount + SMOOTHING); oppositeCondProb = (pccCount + SMOOTHING / states_[child]) / (pcCount + SMOOTHING);
// ccCount = classFeatureCounts_[(featureClassOffset_[child] + sc) * statesClass_ + c];
// oppositeCondProb = (pccCount + alpha) / (ccCount + alpha * parentStates);
dataOpp_[idx] = oppositeCondProb; dataOpp_[idx] = oppositeCondProb;
} }
} }
@@ -268,50 +316,55 @@ namespace platform {
// We multiply p(c) * p(x_sp| c) * p(x_i| c, x_sp). // We multiply p(c) * p(x_sp| c) * p(x_i| c, x_sp).
// Then normalize the distribution. // Then normalize the distribution.
// //
std::vector<double> predict_proba_spode(const std::vector<int>& instance, int parent) // std::vector<double> predict_proba_spode(const std::vector<int>& instance, int parent)
{ // {
// accumulates posterior probabilities for each class // // accumulates posterior probabilities for each class
auto probs = std::vector<double>(statesClass_); // auto probs = std::vector<double>(statesClass_);
auto spodeProbs = std::vector<double>(statesClass_); // auto spodeProbs = std::vector<double>(statesClass_);
// Initialize the probabilities with the feature|class probabilities // // Initialize the probabilities with the feature|class probabilities
int localOffset; // int localOffset;
int sp = instance[parent]; // int sp = instance[parent];
localOffset = (featureClassOffset_[parent] + sp) * statesClass_; // localOffset = (featureClassOffset_[parent] + sp) * statesClass_;
for (int c = 0; c < statesClass_; ++c) { // for (int c = 0; c < statesClass_; ++c) {
spodeProbs[c] = classFeatureProbs_[localOffset + c]; // spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c];
} // }
int idx, base, sc, parent_offset; // int idx, base, sc, parent_offset;
sp = instance[parent]; // sp = instance[parent];
parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; // parent_offset = pairOffset_[featureClassOffset_[parent] + sp];
for (int child = 0; child < parent; ++child) { // for (int child = 0; child < parent; ++child) {
sc = instance[child]; // sc = instance[child];
base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; // base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_;
for (int c = 0; c < statesClass_; ++c) { // for (int c = 0; c < statesClass_; ++c) {
/* // /*
* The probability P(xc|xp,c) is stored in dataOpp_, and // * The probability P(xc|xp,c) is stored in dataOpp_, and
* the probability P(xp|xc,c) is stored in data_ // * the probability P(xp|xc,c) is stored in data_
*/ // */
/* // idx = base + c;
int base = pairOffset_[i * nFeatures_ + j]; // spodeProbs[c] *= data_[idx];
int blockSize = states_[i] * states_[j]; // spodeProbs[c] *= dataOpp_[idx];
return base + c * blockSize + (si * states_[j] + sj); // }
*/ // }
// index = compute_index(parent, instance[parent], child, instance[child], classVal); // // Normalize the probabilities
idx = base + c; // normalize(spodeProbs);
spodeProbs[c] *= data_[idx]; // return spodeProbs;
spodeProbs[c] *= dataOpp_[idx]; // }
}
}
// Normalize the probabilities
normalize(spodeProbs);
return spodeProbs;
}
int predict_spode(const std::vector<int>& instance, int parent) int predict_spode(const std::vector<int>& instance, int parent)
{ {
auto probs = predict_proba_spode(instance, parent); auto probs = predict_proba(instance, parent);
return (int)std::distance(probs.begin(), std::max_element(probs.begin(), probs.end())); return (int)std::distance(probs.begin(), std::max_element(probs.begin(), probs.end()));
} }
std::vector<double> predict_proba(const std::vector<int>& instance) // -------------------------------------------------------
// predict_proba
// -------------------------------------------------------
//
// P(c | x) ∝ p(c) * ∏_{i} p(x_i | c) * ∏_{i<j} p(x_j | c, x_i) * p(x_i | c, x_j)
//
// 'instance' should have size == nFeatures_ (no class).
// We multiply p(c) * p(x_i| c) * p(x_j| c, x_i) for all i, j.
// Then normalize the distribution.
//
// if spode != -1, we only return the probabilities for that spode
std::vector<double> predict_proba(const std::vector<int>& instance, int spode = -1)
{ {
// accumulates posterior probabilities for each class // accumulates posterior probabilities for each class
auto probs = std::vector<double>(statesClass_); auto probs = std::vector<double>(statesClass_);
@@ -325,7 +378,7 @@ namespace platform {
} }
localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_; localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_;
for (int c = 0; c < statesClass_; ++c) { for (int c = 0; c < statesClass_; ++c) {
spodeProbs[feature][c] = classFeatureProbs_[localOffset + c]; spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c];
} }
} }
int idx, base, sp, sc, parent_offset; int idx, base, sp, sc, parent_offset;
@@ -344,24 +397,21 @@ namespace platform {
* The probability P(xc|xp,c) is stored in dataOpp_, and * The probability P(xc|xp,c) is stored in dataOpp_, and
* the probability P(xp|xc,c) is stored in data_ * the probability P(xp|xc,c) is stored in data_
*/ */
/*
int base = pairOffset_[i * nFeatures_ + j];
int blockSize = states_[i] * states_[j];
return base + c * blockSize + (si * states_[j] + sj);
*/
// index = compute_index(parent, instance[parent], child, instance[child], classVal);
idx = base + c; idx = base + c;
spodeProbs[child][c] *= data_[idx]; spodeProbs[child][c] *= data_[idx];
// spodeProbs[child][c] *= data_.at(index);
spodeProbs[parent][c] *= dataOpp_[idx]; spodeProbs[parent][c] *= dataOpp_[idx];
// spodeProbs[parent][c] *= dataOpp_.at(index);
} }
} }
} }
if (spode != -1) {
// no need to use significance_models_ if we are predicting with a single spode
normalize(spodeProbs[spode]);
return spodeProbs[spode];
}
/* add all the probabilities for each class */ /* add all the probabilities for each class */
for (int c = 0; c < statesClass_; ++c) { for (int c = 0; c < statesClass_; ++c) {
for (int i = 0; i < nFeatures_; ++i) { for (int i = 0; i < nFeatures_; ++i) {
probs[c] += spodeProbs[i][c]; probs[c] += spodeProbs[i][c] * significance_models_[i];
} }
} }
// Normalize the probabilities // Normalize the probabilities
@@ -433,6 +483,7 @@ namespace platform {
// classCounts_[c] // classCounts_[c]
std::vector<double> classCounts_; std::vector<double> classCounts_;
std::vector<double> classPriors_; // => p(c)
// For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i] // For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i]
std::vector<int> featureClassOffset_; std::vector<int> featureClassOffset_;