diff --git a/src/experimental_clfs/ExpClf.h b/src/experimental_clfs/ExpClf.h index abc7d04..fc6d3ec 100644 --- a/src/experimental_clfs/ExpClf.h +++ b/src/experimental_clfs/ExpClf.h @@ -11,11 +11,11 @@ #include #include #include -#include "bayesnet/ensembles/Boost.h" +#include +#include #include "common/Timer.hpp" #include "CountingSemaphore.hpp" #include "Xaode.hpp" -#include "Xaode2.hpp" namespace platform { class ExpClf : public bayesnet::Boost { @@ -45,8 +45,7 @@ namespace platform { void remove_last_parent(); protected: bool debug = false; - // Xaode aode; - Xaode2 aode_; + Xaode aode_; torch::Tensor weights_; const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted"; inline void normalize_weights(int num_instances) diff --git a/src/experimental_clfs/XA1DE.h b/src/experimental_clfs/XA1DE.h index cede388..0c1aefc 100644 --- a/src/experimental_clfs/XA1DE.h +++ b/src/experimental_clfs/XA1DE.h @@ -8,6 +8,7 @@ #define XA1DE_H #include "Xaode.hpp" #include "ExpClf.h" +#include namespace platform { class XA1DE : public ExpClf { diff --git a/src/experimental_clfs/XBAODE.cpp b/src/experimental_clfs/XBAODE.cpp index dc7d653..27660a4 100644 --- a/src/experimental_clfs/XBAODE.cpp +++ b/src/experimental_clfs/XBAODE.cpp @@ -37,7 +37,7 @@ namespace platform { // Algorithm based on the adaboost algorithm for classification // as explained in Ensemble methods (Zhi-Hua Zhou, 2012) double alpha_t = 0; - weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); + weights_ = torch::full({ m }, 1.0 / static_cast(m), torch::kFloat64); bool finished = false; std::vector featuresUsed; aode_.fit(X_train_, y_train_, features, className, states, weights_, false); @@ -88,8 +88,7 @@ namespace platform { auto feature = featureSelection[0]; featureSelection.erase(featureSelection.begin()); auto model = XSpode(feature); - model.fit(X_train_, y_train_, weights_); - alpha_t = 0.0; + model.fit(X_train_, y_train_, weights_, smoothing); std::vector ypred; if (alpha_block) { // diff --git a/src/experimental_clfs/XSpode.hpp b/src/experimental_clfs/XSpode.hpp index 1f1c10c..030e1d7 100644 --- a/src/experimental_clfs/XSpode.hpp +++ b/src/experimental_clfs/XSpode.hpp @@ -11,22 +11,13 @@ #include #include #include +#include "CountingSemaphore.hpp" + namespace platform { class XSpode { public: - // -------------------------------------- - // The SPODE can be EMPTY (just created), - // in COUNTS mode (accumulating raw counts), - // or in PROBS mode (storing conditional probabilities). - // -------------------------------------- - enum class MatrixState { - EMPTY, - COUNTS, - PROBS - }; - // -------------------------------------- // Constructor // @@ -36,8 +27,8 @@ namespace platform { : superParent_{ spIndex }, nFeatures_{ 0 }, statesClass_{ 0 }, - matrixState_{ MatrixState::EMPTY }, - alpha_{ 1.0 } + alpha_{ 1.0 }, + semaphore_{ CountingSemaphore::getInstance() } { } @@ -61,7 +52,7 @@ namespace platform { // -------------------------------------- void fit(const std::vector>& X, const std::vector& y, - const torch::Tensor& weights) + const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) { int numInstances = static_cast(y.size()); nFeatures_ = static_cast(X.size()); @@ -99,9 +90,6 @@ namespace platform { } childCounts_.resize(totalSize, 0.0); - // Switch to COUNTS mode - matrixState_ = MatrixState::COUNTS; - // Accumulate raw counts for (int n = 0; n < numInstances; n++) { std::vector instance(nFeatures_ + 1); @@ -112,11 +100,20 @@ namespace platform { addSample(instance, weights[n].item()); } - // Laplace smoothing scaled to #instances - alpha_ = 1.0 / static_cast(numInstances); + switch (smoothing) { + case bayesnet::Smoothing_t::ORIGINAL: + alpha_ = 1.0 / numInstances; + break; + case bayesnet::Smoothing_t::LAPLACE: + alpha_ = 1.0; + break; + default: + alpha_ = 0.0; // No smoothing + } initializer_ = initializer_ = std::numeric_limits::max() / (nFeatures_ * nFeatures_); // Convert raw counts to probabilities computeProbabilities(); + fitted_ = true; } // -------------------------------------- @@ -128,9 +125,6 @@ namespace platform { // void addSample(const std::vector& instance, double weight) { - if (matrixState_ != MatrixState::COUNTS) { - throw std::logic_error("addSample: Not in COUNTS mode!"); - } if (weight <= 0.0) return; int c = instance.back(); @@ -167,10 +161,6 @@ namespace platform { // -------------------------------------- void computeProbabilities() { - if (matrixState_ != MatrixState::COUNTS) { - throw std::logic_error("computeProbabilities: must be in COUNTS mode."); - } - double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0); // p(c) => classPriors_ @@ -225,7 +215,6 @@ namespace platform { } } - matrixState_ = MatrixState::PROBS; } // -------------------------------------- @@ -239,10 +228,6 @@ namespace platform { // -------------------------------------- std::vector predict_proba(const std::vector& instance) const { - if (matrixState_ != MatrixState::PROBS) { - throw std::logic_error("predict_proba: the model is not in PROBS mode."); - } - std::vector probs(statesClass_, 0.0); // Multiply p(c) × p(x_sp | c) @@ -270,6 +255,41 @@ namespace platform { normalize(probs); return probs; } + std::vector> predict_proba(const std::vector>& test_data) + { + int test_size = test_data[0].size(); + int sample_size = test_data.size(); + auto probabilities = std::vector>(test_size, std::vector(statesClass_)); + + int chunk_size = std::min(150, int(test_size / semaphore_.getMaxCount()) + 1); + std::vector threads; + auto worker = [&](const std::vector>& samples, int begin, int chunk, int sample_size, std::vector>& predictions) { + std::string threadName = "(V)PWorker-" + std::to_string(begin) + "-" + std::to_string(chunk); +#if defined(__linux__) + pthread_setname_np(pthread_self(), threadName.c_str()); +#else + pthread_setname_np(threadName.c_str()); +#endif + + std::vector instance(sample_size); + for (int sample = begin; sample < begin + chunk; ++sample) { + for (int feature = 0; feature < sample_size; ++feature) { + instance[feature] = samples[feature][sample]; + } + predictions[sample] = predict_proba(instance); + } + semaphore_.release(); + }; + for (int begin = 0; begin < test_size; begin += chunk_size) { + int chunk = std::min(chunk_size, test_size - begin); + semaphore_.acquire(); + threads.emplace_back(worker, test_data, begin, chunk, sample_size, std::ref(probabilities)); + } + for (auto& thread : threads) { + thread.join(); + } + return probabilities; + } // -------------------------------------- // predict @@ -283,13 +303,19 @@ namespace platform { return static_cast(std::distance(p.begin(), std::max_element(p.begin(), p.end()))); } - std::vector predict(const std::vector>& X) const + std::vector predict(std::vector>& test_data) { - std::vector preds; - for (const auto& instance : X) { - preds.push_back(predict(instance)); + if (!fitted_) { + throw std::logic_error(CLASSIFIER_NOT_FITTED); } - return preds; + auto probabilities = predict_proba(test_data); + std::vector predictions(probabilities.size(), 0); + + for (size_t i = 0; i < probabilities.size(); i++) { + predictions[i] = std::distance(probabilities[i].begin(), std::max_element(probabilities[i].begin(), probabilities[i].end())); + } + + return predictions; } // -------------------------------------- @@ -317,9 +343,6 @@ namespace platform { << "nFeatures_ = " << nFeatures_ << "\n" << "superParent_ = " << superParent_ << "\n" << "statesClass_ = " << statesClass_ << "\n" - << "matrixState_ = " - << (matrixState_ == MatrixState::EMPTY ? "EMPTY" - : (matrixState_ == MatrixState::COUNTS ? "COUNTS" : "PROBS")) << "\n"; oss << "States: ["; @@ -366,8 +389,11 @@ namespace platform { int superParent_; // which feature is the single super-parent int nFeatures_; int statesClass_; + bool fitted_ = false; std::vector states_; // [states_feat0, ..., states_feat(N-1)] (class not included in this array) + const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted"; + // Class counts std::vector classCounts_; // [c], accumulative std::vector classPriors_; // [c], after normalization @@ -384,9 +410,9 @@ namespace platform { std::vector childProbs_; std::vector childOffsets_; - MatrixState matrixState_; double alpha_ = 1.0; double initializer_; // for numerical stability + CountingSemaphore& semaphore_; }; } // namespace platform diff --git a/src/experimental_clfs/Xaode.hpp b/src/experimental_clfs/Xaode.hpp index 29c6f35..87d34da 100644 --- a/src/experimental_clfs/Xaode.hpp +++ b/src/experimental_clfs/Xaode.hpp @@ -9,15 +9,17 @@ #ifndef XAODE_H #define XAODE_H #include +#include #include #include #include -#include #include #include #include +#include #include + namespace platform { class Xaode { public: @@ -108,30 +110,39 @@ namespace platform { instance[nFeatures_] = y[n_instance]; addSample(instance, weights[n_instance].item()); } + // alpha_ Laplace smoothing adapted to the number of instances + alpha_ = 1.0 / static_cast(num_instances); + initializer_ = std::numeric_limits::max() / (nFeatures_ * nFeatures_); computeProbabilities(); } - // Optional: print a quick summary - void show() const + std::string to_string() const { - std::cout << "-------- Xaode.show() --------" << std::endl + std::ostringstream ostream; + ostream << "-------- Xaode.status --------" << std::endl << "- nFeatures = " << nFeatures_ << std::endl << "- statesClass = " << statesClass_ << std::endl << "- matrixState = " << (matrixState_ == MatrixState::COUNTS ? "COUNTS" : "PROBS") << std::endl; - std::cout << "- states: size: " << states_.size() << std::endl; - for (int s : states_) std::cout << s << " "; std::cout << std::endl; - std::cout << "- classCounts: size: " << classCounts_.size() << std::endl; - for (double cc : classCounts_) std::cout << cc << " "; std::cout << std::endl; - std::cout << "- classFeatureCounts: size: " << classFeatureCounts_.size() << std::endl; - for (double cfc : classFeatureCounts_) std::cout << cfc << " "; std::cout << std::endl; - std::cout << "- classFeatureProbs: size: " << classFeatureProbs_.size() << std::endl; - for (double cfp : classFeatureProbs_) std::cout << cfp << " "; std::cout << std::endl; - std::cout << "- featureClassOffset: size: " << featureClassOffset_.size() << std::endl; - for (int f : featureClassOffset_) std::cout << f << " "; std::cout << std::endl; - std::cout << "- pairOffset_: size: " << pairOffset_.size() << std::endl; - for (int p : pairOffset_) std::cout << p << " "; std::cout << std::endl; - std::cout << "- data: size: " << data_.size() << std::endl; - for (double d : data_) std::cout << d << " "; std::cout << std::endl; - std::cout << "--------------------------------" << std::endl; + ostream << "- states: size: " << states_.size() << std::endl; + for (int s : states_) ostream << s << " "; ostream << std::endl; + ostream << "- classCounts: size: " << classCounts_.size() << std::endl; + for (double cc : classCounts_) ostream << cc << " "; ostream << std::endl; + ostream << "- classPriors: size: " << classPriors_.size() << std::endl; + for (double cp : classPriors_) ostream << cp << " "; ostream << std::endl; + ostream << "- classFeatureCounts: size: " << classFeatureCounts_.size() << std::endl; + for (double cfc : classFeatureCounts_) ostream << cfc << " "; ostream << std::endl; + ostream << "- classFeatureProbs: size: " << classFeatureProbs_.size() << std::endl; + for (double cfp : classFeatureProbs_) ostream << cfp << " "; ostream << std::endl; + ostream << "- featureClassOffset: size: " << featureClassOffset_.size() << std::endl; + for (int f : featureClassOffset_) ostream << f << " "; ostream << std::endl; + ostream << "- pairOffset_: size: " << pairOffset_.size() << std::endl; + for (int p : pairOffset_) ostream << p << " "; ostream << std::endl; + ostream << "- data: size: " << data_.size() << std::endl; + for (double d : data_) ostream << d << " "; ostream << std::endl; + ostream << "- dataOpp: size: " << dataOpp_.size() << std::endl; + for (double d : dataOpp_) ostream << d << " "; ostream << std::endl; + ostream << "--------------------------------" << std::endl; + std::string output = ostream.str(); + return output; } // ------------------------------------------------------- // addSample (only in COUNTS mode) @@ -146,18 +157,7 @@ namespace platform { // (B) increment feature–class counts => for p(x_i|c) // (C) increment pair (superparent= i, child= j) counts => data_ // - - // if (matrixState_ != MatrixState::COUNTS) { - // throw std::logic_error("addSample: not in COUNTS mode."); - // } - // if (static_cast(instance.size()) != nFeatures_ + 1) { - // throw std::invalid_argument("addSample: instance.size() must be nFeatures_ + 1."); - // } - int c = instance.back(); - // if (c < 0 || c >= statesClass_) { - // throw std::out_of_range("addSample: class index out of range."); - // } if (weight <= 0.0) { return; } @@ -166,17 +166,17 @@ namespace platform { // (B,C) // We'll store raw counts now and turn them into p(child| c, superparent) later. - int idx, fcIndex, si, sj, i_offset; - for (int i = 0; i < nFeatures_; ++i) { - si = instance[i]; + int idx, fcIndex, sp, sc, i_offset; + for (int parent = 0; parent < nFeatures_; ++parent) { + sp = instance[parent]; // (B) increment feature–class counts => for p(x_i|c) - fcIndex = (featureClassOffset_[i] + si) * statesClass_ + c; + fcIndex = (featureClassOffset_[parent] + sp) * statesClass_ + c; classFeatureCounts_[fcIndex] += weight; // (C) increment pair (superparent= i, child= j) counts => data_ - i_offset = pairOffset_[featureClassOffset_[i] + si]; - for (int j = 0; j < i; ++j) { - sj = instance[j]; - idx = (i_offset + featureClassOffset_[j] + sj) * statesClass_ + c; + i_offset = pairOffset_[featureClassOffset_[parent] + sp]; + for (int child = 0; child < parent; ++child) { + sc = instance[child]; + idx = (i_offset + featureClassOffset_[child] + sc) * statesClass_ + c; data_[idx] += weight; } } @@ -205,36 +205,26 @@ namespace platform { } } else { for (int c = 0; c < statesClass_; ++c) { - classPriors_[c] = classCounts_[c] / totalCount; + classPriors_[c] = (classCounts_[c] + alpha_) / (totalCount + alpha_ * statesClass_); } } // (2) p(x_i=si | c) => classFeatureProbs_ int idx, sf; - double denom, countVal, p; + double denom; for (int feature = 0; feature < nFeatures_; ++feature) { sf = states_[feature]; for (int c = 0; c < statesClass_; ++c) { - denom = classCounts_[c] * sf; - if (denom <= 0.0) { - // fallback => uniform - for (int sf_value = 0; sf_value < sf; ++sf_value) { - idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; - classFeatureProbs_[idx] = 1.0 / sf; - } - } else { - for (int sf_value = 0; sf_value < sf; ++sf_value) { - idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; - countVal = classFeatureCounts_[idx]; - p = ((countVal + alpha_ / (statesClass_ * states_[feature])) / (totalCount + alpha_)); - classFeatureProbs_[idx] = p; - } + denom = classCounts_[c] + alpha_ * sf; + for (int sf_value = 0; sf_value < sf; ++sf_value) { + idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; + classFeatureProbs_[idx] = (classFeatureCounts_[idx] + alpha_) / denom; } } } // getCountFromTable(int classVal, int pIndex, int childIndex) // (3) p(x_c=sc | c, x_p=sp) => data_(parent,sp,child,sc,c) // (3) p(x_p=sp | c, x_c=sc) => dataOpp_(child,sc,parent,sp,c) - // C(x_c, x_p, c) + alpha_/Card(xp) + // C(x_c, x_p, c) + alpha_ // P(x_p | x_c, c) = ----------------------------------- // C(x_c, c) + alpha_ double pcc_count, pc_count, cc_count; @@ -258,10 +248,10 @@ namespace platform { // Child, Class count cc_count = classFeatureCounts_[part2_class + c]; // p(x_c=sc | c, x_p=sp) - conditionalProb = (pcc_count + alpha_ / states_[parent]) / (cc_count + alpha_); + conditionalProb = (pcc_count + alpha_) / (pc_count + alpha_ * states_[child]); data_[idx] = conditionalProb; // p(x_p=sp | c, x_c=sc) - oppositeCondProb = (pcc_count + alpha_ / states_[child]) / (pc_count + alpha_); + oppositeCondProb = (pcc_count + alpha_) / (cc_count + alpha_ * states_[parent]); dataOpp_[idx] = oppositeCondProb; } } @@ -286,30 +276,39 @@ namespace platform { { // accumulates posterior probabilities for each class auto probs = std::vector(statesClass_); - auto spodeProbs = std::vector(statesClass_); + auto spodeProbs = std::vector(statesClass_, 0.0); + if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) { + return spodeProbs; + } // Initialize the probabilities with the feature|class probabilities x class priors int localOffset; int sp = instance[parent]; localOffset = (featureClassOffset_[parent] + sp) * statesClass_; for (int c = 0; c < statesClass_; ++c) { - spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c]; + spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_; } int idx, base, sc, parent_offset; - sp = instance[parent]; - parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; for (int child = 0; child < nFeatures_; ++child) { if (child == parent) { continue; } sc = instance[child]; - base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + if (child > parent) { + parent_offset = pairOffset_[featureClassOffset_[child] + sc]; + base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_; + } else { + parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; + base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + } for (int c = 0; c < statesClass_; ++c) { /* * The probability P(xc|xp,c) is stored in dataOpp_, and * the probability P(xp|xc,c) is stored in data_ */ idx = base + c; - spodeProbs[c] *= child < parent ? dataOpp_[idx] : data_[idx]; + double factor = child > parent ? dataOpp_[idx] : data_[idx]; + // double factor = data_[idx]; + spodeProbs[c] *= factor; } } // Normalize the probabilities @@ -345,7 +344,7 @@ namespace platform { } localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_; for (int c = 0; c < statesClass_; ++c) { - spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c]; + spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_; } } int idx, base, sp, sc, parent_offset; @@ -358,15 +357,23 @@ namespace platform { parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; for (int child = 0; child < parent; ++child) { sc = instance[child]; - base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + if (child > parent) { + parent_offset = pairOffset_[featureClassOffset_[child] + sc]; + base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_; + } else { + parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; + base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + } for (int c = 0; c < statesClass_; ++c) { /* * The probability P(xc|xp,c) is stored in dataOpp_, and * the probability P(xp|xc,c) is stored in data_ */ idx = base + c; - spodeProbs[child][c] *= data_[idx]; - spodeProbs[parent][c] *= dataOpp_[idx]; + double factor_child = child > parent ? data_[idx] : dataOpp_[idx]; + double factor_parent = child > parent ? dataOpp_[idx] : data_[idx]; + spodeProbs[child][c] *= factor_child; + spodeProbs[parent][c] *= factor_parent; } } } @@ -454,7 +461,8 @@ namespace platform { MatrixState matrixState_; - double alpha_ = 1.0; + double alpha_ = 1.0; // Laplace smoothing + double initializer_ = 1.0; std::vector active_parents; }; } diff --git a/src/experimental_clfs/Xaode2.hpp b/src/experimental_clfs/Xaode2.hpp deleted file mode 100644 index 520c8e7..0000000 --- a/src/experimental_clfs/Xaode2.hpp +++ /dev/null @@ -1,469 +0,0 @@ -// *************************************************************** -// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez -// SPDX-FileType: SOURCE -// SPDX-License-Identifier: MIT -// *************************************************************** -// Based on the Geoff. I. Webb A1DE java algorithm -// https://weka.sourceforge.io/packageMetaData/AnDE/Latest.html - -#ifndef XAODE2_H -#define XAODE2_H -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace platform { - class Xaode2 { - public: - // ------------------------------------------------------- - // The Xaode can be EMPTY (just created), in COUNTS mode (accumulating raw counts) - // or PROBS mode (storing conditional probabilities). - enum class MatrixState { - EMPTY, - COUNTS, - PROBS - }; - std::vector significance_models_; - Xaode2() : nFeatures_{ 0 }, statesClass_{ 0 }, matrixState_{ MatrixState::EMPTY } {} - // ------------------------------------------------------- - // fit - // ------------------------------------------------------- - // - // Classifiers interface - // all parameter decide if the model is initialized with all the parents active or none of them - // - // states.size() = nFeatures + 1, - // where states.back() = number of class states. - // - // We'll store: - // 1) p(x_i=si | c) in classFeatureProbs_ - // 2) p(x_j=sj | c, x_i=si) in data_, with i i is "superparent," j is "child." - // - // Internally, in COUNTS mode, data_ accumulates raw counts, then - // computeProbabilities(...) normalizes them into conditionals. - void fit(std::vector>& X, std::vector& y, const std::vector& features, const std::string& className, std::map>& states, const torch::Tensor& weights, const bool all_parents) - { - int num_instances = X[0].size(); - nFeatures_ = X.size(); - - significance_models_.resize(nFeatures_, (all_parents ? 1.0 : 0.0)); - for (int i = 0; i < nFeatures_; i++) { - if (all_parents) active_parents.push_back(i); - states_.push_back(*max_element(X[i].begin(), X[i].end()) + 1); - } - states_.push_back(*max_element(y.begin(), y.end()) + 1); - // - statesClass_ = states_.back(); - classCounts_.resize(statesClass_, 0.0); - classPriors_.resize(statesClass_, 0.0); - // - // Initialize data structures - // - active_parents.resize(nFeatures_); - int totalStates = std::accumulate(states_.begin(), states_.end(), 0) - statesClass_; - - // For p(x_i=si | c), we store them in a 1D array classFeatureProbs_ after we compute. - // We'll need the offsets for each feature i in featureClassOffset_. - featureClassOffset_.resize(nFeatures_); - // We'll store p(x_child=sj | c, x_sp=si) for each pair (i instance(nFeatures_ + 1); - for (int n_instance = 0; n_instance < num_instances; n_instance++) { - for (int feature = 0; feature < nFeatures_; feature++) { - instance[feature] = X[feature][n_instance]; - } - instance[nFeatures_] = y[n_instance]; - addSample(instance, weights[n_instance].item()); - } - // alpha_ Laplace smoothing adapted to the number of instances - alpha_ = 1.0 / static_cast(num_instances); - initializer_ = std::numeric_limits::max() / (nFeatures_ * nFeatures_); - computeProbabilities(); - } - std::string to_string() const - { - std::ostringstream ostream; - ostream << "-------- Xaode.status --------" << std::endl - << "- nFeatures = " << nFeatures_ << std::endl - << "- statesClass = " << statesClass_ << std::endl - << "- matrixState = " << (matrixState_ == MatrixState::COUNTS ? "COUNTS" : "PROBS") << std::endl; - ostream << "- states: size: " << states_.size() << std::endl; - for (int s : states_) ostream << s << " "; ostream << std::endl; - ostream << "- classCounts: size: " << classCounts_.size() << std::endl; - for (double cc : classCounts_) ostream << cc << " "; ostream << std::endl; - ostream << "- classPriors: size: " << classPriors_.size() << std::endl; - for (double cp : classPriors_) ostream << cp << " "; ostream << std::endl; - ostream << "- classFeatureCounts: size: " << classFeatureCounts_.size() << std::endl; - for (double cfc : classFeatureCounts_) ostream << cfc << " "; ostream << std::endl; - ostream << "- classFeatureProbs: size: " << classFeatureProbs_.size() << std::endl; - for (double cfp : classFeatureProbs_) ostream << cfp << " "; ostream << std::endl; - ostream << "- featureClassOffset: size: " << featureClassOffset_.size() << std::endl; - for (int f : featureClassOffset_) ostream << f << " "; ostream << std::endl; - ostream << "- pairOffset_: size: " << pairOffset_.size() << std::endl; - for (int p : pairOffset_) ostream << p << " "; ostream << std::endl; - ostream << "- data: size: " << data_.size() << std::endl; - for (double d : data_) ostream << d << " "; ostream << std::endl; - ostream << "- dataOpp: size: " << dataOpp_.size() << std::endl; - for (double d : dataOpp_) ostream << d << " "; ostream << std::endl; - ostream << "--------------------------------" << std::endl; - std::string output = ostream.str(); - return output; - } - // ------------------------------------------------------- - // addSample (only in COUNTS mode) - // ------------------------------------------------------- - // - // instance should have the class at the end. - // - void addSample(const std::vector& instance, double weight) - { - // - // (A) increment classCounts_ - // (B) increment feature–class counts => for p(x_i|c) - // (C) increment pair (superparent= i, child= j) counts => data_ - // - int c = instance.back(); - if (weight <= 0.0) { - return; - } - // (A) increment classCounts_ - classCounts_[c] += weight; - - // (B,C) - // We'll store raw counts now and turn them into p(child| c, superparent) later. - int idx, fcIndex, sp, sc, i_offset; - for (int parent = 0; parent < nFeatures_; ++parent) { - sp = instance[parent]; - // (B) increment feature–class counts => for p(x_i|c) - fcIndex = (featureClassOffset_[parent] + sp) * statesClass_ + c; - classFeatureCounts_[fcIndex] += weight; - // (C) increment pair (superparent= i, child= j) counts => data_ - i_offset = pairOffset_[featureClassOffset_[parent] + sp]; - for (int child = 0; child < parent; ++child) { - sc = instance[child]; - idx = (i_offset + featureClassOffset_[child] + sc) * statesClass_ + c; - data_[idx] += weight; - } - } - } - // ------------------------------------------------------- - // computeProbabilities - // ------------------------------------------------------- - // - // Once all samples are added in COUNTS mode, call this to: - // 1) compute p(c) => classPriors_ - // 2) compute p(x_i=si | c) => classFeatureProbs_ - // 3) compute p(x_j=sj | c, x_i=si) => data_ (for ij) - // - void computeProbabilities() - { - if (matrixState_ != MatrixState::COUNTS) { - throw std::logic_error("computeProbabilities: must be in COUNTS mode."); - } - double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0); - // (1) p(c) - if (totalCount <= 0.0) { - // fallback => uniform - double unif = 1.0 / statesClass_; - for (int c = 0; c < statesClass_; ++c) { - classPriors_[c] = unif; - } - } else { - for (int c = 0; c < statesClass_; ++c) { - classPriors_[c] = (classCounts_[c] + alpha_) / (totalCount + alpha_ * statesClass_); - } - } - // (2) p(x_i=si | c) => classFeatureProbs_ - int idx, sf; - double denom; - for (int feature = 0; feature < nFeatures_; ++feature) { - sf = states_[feature]; - for (int c = 0; c < statesClass_; ++c) { - denom = classCounts_[c] + alpha_ * sf; - for (int sf_value = 0; sf_value < sf; ++sf_value) { - idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; - classFeatureProbs_[idx] = (classFeatureCounts_[idx] + alpha_) / denom; - } - } - } - // getCountFromTable(int classVal, int pIndex, int childIndex) - // (3) p(x_c=sc | c, x_p=sp) => data_(parent,sp,child,sc,c) - // (3) p(x_p=sp | c, x_c=sc) => dataOpp_(child,sc,parent,sp,c) - // C(x_c, x_p, c) + alpha_ - // P(x_p | x_c, c) = ----------------------------------- - // C(x_c, c) + alpha_ - double pcc_count, pc_count, cc_count; - double conditionalProb, oppositeCondProb; - int part1, part2, p1, part2_class, p1_class; - for (int parent = 1; parent < nFeatures_; ++parent) { - for (int sp = 0; sp < states_[parent]; ++sp) { - p1 = featureClassOffset_[parent] + sp; - part1 = pairOffset_[p1]; - p1_class = p1 * statesClass_; - for (int child = 0; child < parent; ++child) { - for (int sc = 0; sc < states_[child]; ++sc) { - part2 = featureClassOffset_[child] + sc; - part2_class = part2 * statesClass_; - for (int c = 0; c < statesClass_; c++) { - idx = (part1 + part2) * statesClass_ + c; - // Parent, Child, Class Count - pcc_count = data_[idx]; - // Parent, Class count - pc_count = classFeatureCounts_[p1_class + c]; - // Child, Class count - cc_count = classFeatureCounts_[part2_class + c]; - // p(x_c=sc | c, x_p=sp) - conditionalProb = (pcc_count + alpha_) / (pc_count + alpha_ * states_[child]); - data_[idx] = conditionalProb; - // p(x_p=sp | c, x_c=sc) - oppositeCondProb = (pcc_count + alpha_) / (cc_count + alpha_ * states_[parent]); - dataOpp_[idx] = oppositeCondProb; - } - } - } - } - } - matrixState_ = MatrixState::PROBS; - } - // ------------------------------------------------------- - // predict_proba_spode - // ------------------------------------------------------- - // - // Single-superparent approach: - // P(c | x) ∝ p(c) * p(x_sp| c) * ∏_{i≠sp} p(x_i | c, x_sp) - // - // 'instance' should have size == nFeatures_ (no class). - // sp in [0..nFeatures_). - // We multiply p(c) * p(x_sp| c) * p(x_i| c, x_sp). - // Then normalize the distribution. - // - std::vector predict_proba_spode(const std::vector& instance, int parent) - { - // accumulates posterior probabilities for each class - auto probs = std::vector(statesClass_); - auto spodeProbs = std::vector(statesClass_, 0.0); - if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) { - return spodeProbs; - } - // Initialize the probabilities with the feature|class probabilities x class priors - int localOffset; - int sp = instance[parent]; - localOffset = (featureClassOffset_[parent] + sp) * statesClass_; - for (int c = 0; c < statesClass_; ++c) { - spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_; - } - int idx, base, sc, parent_offset; - for (int child = 0; child < nFeatures_; ++child) { - if (child == parent) { - continue; - } - sc = instance[child]; - if (child > parent) { - parent_offset = pairOffset_[featureClassOffset_[child] + sc]; - base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_; - } else { - parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; - base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; - } - for (int c = 0; c < statesClass_; ++c) { - /* - * The probability P(xc|xp,c) is stored in dataOpp_, and - * the probability P(xp|xc,c) is stored in data_ - */ - idx = base + c; - double factor = child > parent ? dataOpp_[idx] : data_[idx]; - // double factor = data_[idx]; - spodeProbs[c] *= factor; - } - } - // Normalize the probabilities - normalize(spodeProbs); - return spodeProbs; - } - int predict_spode(const std::vector& instance, int parent) - { - auto probs = predict_proba_spode(instance, parent); - return (int)std::distance(probs.begin(), std::max_element(probs.begin(), probs.end())); - } - // ------------------------------------------------------- - // predict_proba - // ------------------------------------------------------- - // - // P(c | x) ∝ p(c) * ∏_{i} p(x_i | c) * ∏_{i predict_proba(const std::vector& instance) - { - // accumulates posterior probabilities for each class - auto probs = std::vector(statesClass_); - auto spodeProbs = std::vector>(nFeatures_, std::vector(statesClass_)); - // Initialize the probabilities with the feature|class probabilities - int localOffset; - for (int feature = 0; feature < nFeatures_; ++feature) { - // if feature is not in the active_parents, skip it - if (std::find(active_parents.begin(), active_parents.end(), feature) == active_parents.end()) { - continue; - } - localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_; - for (int c = 0; c < statesClass_; ++c) { - spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_; - } - } - int idx, base, sp, sc, parent_offset; - for (int parent = 1; parent < nFeatures_; ++parent) { - // if parent is not in the active_parents, skip it - if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) { - continue; - } - sp = instance[parent]; - parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; - for (int child = 0; child < parent; ++child) { - sc = instance[child]; - if (child > parent) { - parent_offset = pairOffset_[featureClassOffset_[child] + sc]; - base = (parent_offset + featureClassOffset_[parent] + sp) * statesClass_; - } else { - parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; - base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; - } - for (int c = 0; c < statesClass_; ++c) { - /* - * The probability P(xc|xp,c) is stored in dataOpp_, and - * the probability P(xp|xc,c) is stored in data_ - */ - idx = base + c; - double factor_child = child > parent ? data_[idx] : dataOpp_[idx]; - double factor_parent = child > parent ? dataOpp_[idx] : data_[idx]; - spodeProbs[child][c] *= factor_child; - spodeProbs[parent][c] *= factor_parent; - } - } - } - /* add all the probabilities for each class */ - for (int c = 0; c < statesClass_; ++c) { - for (int i = 0; i < nFeatures_; ++i) { - probs[c] += spodeProbs[i][c] * significance_models_[i]; - } - } - // Normalize the probabilities - normalize(probs); - return probs; - } - void normalize(std::vector& probs) const - { - double sum = std::accumulate(probs.begin(), probs.end(), 0.0); - if (std::isnan(sum)) { - throw std::runtime_error("Can't normalize array. Sum is NaN."); - } - if (sum == 0) { - return; - } - for (int i = 0; i < (int)probs.size(); i++) { - probs[i] /= sum; - } - } - // Returns current mode: INIT, COUNTS or PROBS - MatrixState state() const - { - return matrixState_; - } - int statesClass() const - { - return statesClass_; - } - int nFeatures() const - { - return nFeatures_; - } - int getNumberOfStates() const - { - return std::accumulate(states_.begin(), states_.end(), 0) * nFeatures_; - } - int getNumberOfEdges() const - { - return nFeatures_ * (2 * nFeatures_ - 1); - } - int getNumberOfNodes() const - { - return (nFeatures_ + 1) * nFeatures_; - } - void add_active_parent(int active_parent) - { - active_parents.push_back(active_parent); - } - void remove_last_parent() - { - active_parents.pop_back(); - } - - private: - // ----------- - // MEMBER DATA - // ----------- - std::vector states_; // [states_feat0, ..., states_feat(n-1), statesClass_] - int nFeatures_; - int statesClass_; - - // data_ means p(child=sj | c, superparent= si) after normalization. - // But in COUNTS mode, it accumulates raw counts. - std::vector pairOffset_; - // data_ stores p(child=sj | c, superparent=si) for each pair (i data_; - // dataOpp_ stores p(superparent=si | c, child=sj) for each pair (i dataOpp_; - - // classCounts_[c] - std::vector classCounts_; - std::vector classPriors_; // => p(c) - - // For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i] - std::vector featureClassOffset_; - std::vector classFeatureCounts_; - std::vector classFeatureProbs_; // => p(x_i=si | c) after normalization - - MatrixState matrixState_; - - double alpha_ = 1.0; // Laplace smoothing - double initializer_ = 1.0; - std::vector active_parents; - }; -} -#endif // XAODE2_H \ No newline at end of file