From b2002d341c94c94b85d1776280c217200c1b6cfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Mon, 3 Mar 2025 12:38:05 +0100 Subject: [PATCH] Create Xaode2 and add initializer factor in predict --- Makefile | 2 +- src/experimental_clfs/ExpClf.h | 4 +- src/experimental_clfs/XBAODE.h | 1 - src/experimental_clfs/Xaode2.hpp | 464 +++++++++++++++++++++++++++++++ 4 files changed, 468 insertions(+), 3 deletions(-) create mode 100644 src/experimental_clfs/Xaode2.hpp diff --git a/Makefile b/Makefile index 57d53a7..59c603b 100644 --- a/Makefile +++ b/Makefile @@ -38,7 +38,7 @@ setup: ## Install dependencies for tests and coverage fi dest ?= ${HOME}/bin -main: ## Build the main target +main: ## Build only the b_main target @cmake --build $(f_release) -t b_main --parallel @cp $(f_release)/src/b_main $(dest) diff --git a/src/experimental_clfs/ExpClf.h b/src/experimental_clfs/ExpClf.h index 82098ba..abc7d04 100644 --- a/src/experimental_clfs/ExpClf.h +++ b/src/experimental_clfs/ExpClf.h @@ -15,6 +15,7 @@ #include "common/Timer.hpp" #include "CountingSemaphore.hpp" #include "Xaode.hpp" +#include "Xaode2.hpp" namespace platform { class ExpClf : public bayesnet::Boost { @@ -44,7 +45,8 @@ namespace platform { void remove_last_parent(); protected: bool debug = false; - Xaode aode_; + // Xaode aode; + Xaode2 aode_; torch::Tensor weights_; const std::string CLASSIFIER_NOT_FITTED = "Classifier has not been fitted"; inline void normalize_weights(int num_instances) diff --git a/src/experimental_clfs/XBAODE.h b/src/experimental_clfs/XBAODE.h index 13951ac..77bc427 100644 --- a/src/experimental_clfs/XBAODE.h +++ b/src/experimental_clfs/XBAODE.h @@ -18,7 +18,6 @@ namespace platform { class XBAODE : public ExpClf { public: XBAODE(); - virtual ~XBAODE() override = default; std::string getVersion() override { return version; }; protected: void trainModel(const torch::Tensor& weights, const bayesnet::Smoothing_t smoothing) override; diff --git 
a/src/experimental_clfs/Xaode2.hpp b/src/experimental_clfs/Xaode2.hpp new file mode 100644 index 0000000..dd5f15d --- /dev/null +++ b/src/experimental_clfs/Xaode2.hpp @@ -0,0 +1,464 @@ +// *************************************************************** +// SPDX-FileCopyrightText: Copyright 2025 Ricardo Montañana Gómez +// SPDX-FileType: SOURCE +// SPDX-License-Identifier: MIT +// *************************************************************** +// Based on the Geoff. I. Webb A1DE java algorithm +// https://weka.sourceforge.io/packageMetaData/AnDE/Latest.html + +#ifndef XAODE2_H +#define XAODE2_H +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace platform { + class Xaode2 { + public: + // ------------------------------------------------------- + // The Xaode can be EMPTY (just created), in COUNTS mode (accumulating raw counts) + // or PROBS mode (storing conditional probabilities). + enum class MatrixState { + EMPTY, + COUNTS, + PROBS + }; + std::vector significance_models_; + Xaode2() : nFeatures_{ 0 }, statesClass_{ 0 }, matrixState_{ MatrixState::EMPTY } {} + // ------------------------------------------------------- + // fit + // ------------------------------------------------------- + // + // Classifiers interface + // The all_parents parameter decides whether the model is initialized with all the parents active or none of them + // + // states.size() = nFeatures + 1, + // where states.back() = number of class states. + // + // We'll store: + // 1) p(x_i=si | c) in classFeatureProbs_ + // 2) pairwise conditionals p(x_j=sj | c, x_i=si) in data_ and dataOpp_ (one direction each), stored once per feature pair; the conditioning feature is the "superparent," the other is the "child." + // + // Internally, in COUNTS mode, data_ accumulates raw counts, then + // computeProbabilities(...) normalizes them into conditionals. 
+ void fit(std::vector>& X, std::vector& y, const std::vector& features, const std::string& className, std::map>& states, const torch::Tensor& weights, const bool all_parents) + { + int num_instances = X[0].size(); + nFeatures_ = X.size(); + + significance_models_.resize(nFeatures_, (all_parents ? 1.0 : 0.0)); + for (int i = 0; i < nFeatures_; i++) { + if (all_parents) active_parents.push_back(i); + states_.push_back(*max_element(X[i].begin(), X[i].end()) + 1); + } + states_.push_back(*max_element(y.begin(), y.end()) + 1); + // + statesClass_ = states_.back(); + classCounts_.resize(statesClass_, 0.0); + classPriors_.resize(statesClass_, 0.0); + // + // Initialize data structures + // NOTE(review): when all_parents is false the resize below pads active_parents with value-initialized entries -- confirm that is intended + active_parents.resize(nFeatures_); + int totalStates = std::accumulate(states_.begin(), states_.end(), 0) - statesClass_; + + // For p(x_i=si | c), we store them in a 1D array classFeatureProbs_ after we compute. + // We'll need the offsets for each feature i in featureClassOffset_. + featureClassOffset_.resize(nFeatures_); + // We'll store p(x_child=sj | c, x_sp=si) for each pair (i instance(nFeatures_ + 1); + for (int n_instance = 0; n_instance < num_instances; n_instance++) { + for (int feature = 0; feature < nFeatures_; feature++) { + instance[feature] = X[feature][n_instance]; + } + instance[nFeatures_] = y[n_instance]; + addSample(instance, weights[n_instance].item()); + } + //alpha_ = 1 / num_instances; + initializer_ = std::numeric_limits::max() / (nFeatures_ * nFeatures_); + computeProbabilities(); + } + // Optional: print a quick summary + void show() const + { + std::cout << "-------- Xaode.show() --------" << std::endl + << "- nFeatures = " << nFeatures_ << std::endl + << "- statesClass = " << statesClass_ << std::endl + << "- matrixState = " << (matrixState_ == MatrixState::COUNTS ? 
"COUNTS" : "PROBS") << std::endl; + std::cout << "- states: size: " << states_.size() << std::endl; + for (int s : states_) std::cout << s << " "; std::cout << std::endl; + std::cout << "- classCounts: size: " << classCounts_.size() << std::endl; + for (double cc : classCounts_) std::cout << cc << " "; std::cout << std::endl; + std::cout << "- classFeatureCounts: size: " << classFeatureCounts_.size() << std::endl; + for (double cfc : classFeatureCounts_) std::cout << cfc << " "; std::cout << std::endl; + std::cout << "- classFeatureProbs: size: " << classFeatureProbs_.size() << std::endl; + for (double cfp : classFeatureProbs_) std::cout << cfp << " "; std::cout << std::endl; + std::cout << "- featureClassOffset: size: " << featureClassOffset_.size() << std::endl; + for (int f : featureClassOffset_) std::cout << f << " "; std::cout << std::endl; + std::cout << "- pairOffset_: size: " << pairOffset_.size() << std::endl; + for (int p : pairOffset_) std::cout << p << " "; std::cout << std::endl; + std::cout << "- data: size: " << data_.size() << std::endl; + for (double d : data_) std::cout << d << " "; std::cout << std::endl; + std::cout << "--------------------------------" << std::endl; + } + // ------------------------------------------------------- + // addSample (only in COUNTS mode) + // ------------------------------------------------------- + // + // instance should have the class at the end. 
+ // + void addSample(const std::vector& instance, double weight) + { + // + // (A) increment classCounts_ + // (B) increment feature–class counts => for p(x_i|c) + // (C) increment pair (superparent= i, child= j) counts => data_ + // + + // if (matrixState_ != MatrixState::COUNTS) { + // throw std::logic_error("addSample: not in COUNTS mode."); + // } + // if (static_cast(instance.size()) != nFeatures_ + 1) { + // throw std::invalid_argument("addSample: instance.size() must be nFeatures_ + 1."); + // } + + int c = instance.back(); + // if (c < 0 || c >= statesClass_) { + // throw std::out_of_range("addSample: class index out of range."); + // } + if (weight <= 0.0) { + return; + } + // (A) increment classCounts_ + classCounts_[c] += weight; + + // (B,C) + // We'll store raw counts now and turn them into p(child| c, superparent) later. + int idx, fcIndex, si, sj, i_offset; + for (int i = 0; i < nFeatures_; ++i) { + si = instance[i]; + // (B) increment feature–class counts => for p(x_i|c) + fcIndex = (featureClassOffset_[i] + si) * statesClass_ + c; + classFeatureCounts_[fcIndex] += weight; + // (C) increment pair (superparent= i, child= j) counts => data_ + i_offset = pairOffset_[featureClassOffset_[i] + si]; + for (int j = 0; j < i; ++j) { + sj = instance[j]; + idx = (i_offset + featureClassOffset_[j] + sj) * statesClass_ + c; + data_[idx] += weight; + } + } + } + // ------------------------------------------------------- + // computeProbabilities + // ------------------------------------------------------- + // + // Once all samples are added in COUNTS mode, call this to: + // 1) compute p(c) => classPriors_ + // 2) compute p(x_i=si | c) => classFeatureProbs_ + // 3) compute the pairwise conditionals => data_ and dataOpp_ (both directions of each feature pair) + // + void computeProbabilities() + { + if (matrixState_ != MatrixState::COUNTS) { + throw std::logic_error("computeProbabilities: must be in COUNTS mode."); + } + double totalCount = std::accumulate(classCounts_.begin(), classCounts_.end(), 0.0); + // 
(1) p(c) + if (totalCount <= 0.0) { + // fallback => uniform + double unif = 1.0 / statesClass_; + for (int c = 0; c < statesClass_; ++c) { + classPriors_[c] = unif; + } + } else { + for (int c = 0; c < statesClass_; ++c) { + classPriors_[c] = classCounts_[c] / totalCount; + } + } + // (2) p(x_i=si | c) => classFeatureProbs_ + int idx, sf; + double denom, countVal, p; + for (int feature = 0; feature < nFeatures_; ++feature) { + sf = states_[feature]; + for (int c = 0; c < statesClass_; ++c) { + denom = classCounts_[c] * sf; + if (denom <= 0.0) { + // fallback => uniform + for (int sf_value = 0; sf_value < sf; ++sf_value) { + idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; + classFeatureProbs_[idx] = 1.0 / sf; + } + } else { + for (int sf_value = 0; sf_value < sf; ++sf_value) { + idx = (featureClassOffset_[feature] + sf_value) * statesClass_ + c; + countVal = classFeatureCounts_[idx]; + p = ((countVal + alpha_ / (statesClass_ * states_[feature])) / (totalCount + alpha_)); + classFeatureProbs_[idx] = p; + } + } + } + } + // getCountFromTable(int classVal, int pIndex, int childIndex) + // (3) p(x_p=sp | c, x_c=sc) => data_(parent,sp,child,sc,c) + // (3) p(x_c=sc | c, x_p=sp) => dataOpp_ (same index as data_) + // C(x_c, x_p, c) + alpha_/Card(xp) + // P(x_p | x_c, c) = ----------------------------------- + // C(x_c, c) + alpha_ + double pcc_count, pc_count, cc_count; + double conditionalProb, oppositeCondProb; + int part1, part2, p1, part2_class, p1_class; + for (int parent = 1; parent < nFeatures_; ++parent) { + for (int sp = 0; sp < states_[parent]; ++sp) { + p1 = featureClassOffset_[parent] + sp; + part1 = pairOffset_[p1]; + p1_class = p1 * statesClass_; + for (int child = 0; child < parent; ++child) { + for (int sc = 0; sc < states_[child]; ++sc) { + part2 = featureClassOffset_[child] + sc; + part2_class = part2 * statesClass_; + for (int c = 0; c < statesClass_; c++) { + idx = (part1 + part2) * statesClass_ + c; + // Parent, Child, Class Count 
+ pcc_count = data_[idx]; + // Parent, Class count + pc_count = classFeatureCounts_[p1_class + c]; + // Child, Class count + cc_count = classFeatureCounts_[part2_class + c]; + // p(x_p=sp | c, x_c=sc) + conditionalProb = (pcc_count + alpha_ / states_[parent]) / (cc_count + alpha_); + data_[idx] = conditionalProb; + // p(x_c=sc | c, x_p=sp) + oppositeCondProb = (pcc_count + alpha_ / states_[child]) / (pc_count + alpha_); + dataOpp_[idx] = oppositeCondProb; + } + } + } + } + } + matrixState_ = MatrixState::PROBS; + } + // ------------------------------------------------------- + // predict_proba_spode + // ------------------------------------------------------- + // + // Single-superparent approach: + // P(c | x) ∝ p(c) * p(x_sp| c) * ∏_{i≠sp} p(x_i | c, x_sp) + // + // 'instance' should have size == nFeatures_ (no class). + // sp in [0..nFeatures_). + // We multiply p(c) * p(x_sp| c) * p(x_i| c, x_sp). + // Then normalize the distribution. + // + std::vector predict_proba_spode(const std::vector& instance, int parent) + { + // accumulates posterior probabilities for each class + auto probs = std::vector(statesClass_); + auto spodeProbs = std::vector(statesClass_); + // Initialize the probabilities with the feature|class probabilities x class priors + int localOffset; + int sp = instance[parent]; + localOffset = (featureClassOffset_[parent] + sp) * statesClass_; + for (int c = 0; c < statesClass_; ++c) { + spodeProbs[c] = classFeatureProbs_[localOffset + c] * classPriors_[c] * initializer_; + } + int idx, base, sc, parent_offset; + sp = instance[parent]; + parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; + for (int child = 0; child < nFeatures_; ++child) { + if (child == parent) { + continue; + } + sc = instance[child]; + base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + for (int c = 0; c < statesClass_; ++c) { + /* + * The probability P(xc|xp,c) is stored in dataOpp_, and + * the probability P(xp|xc,c) is stored in data_ + */ + 
idx = base + c; + spodeProbs[c] *= child < parent ? dataOpp_[idx] : data_[idx]; + } + } + // Normalize the probabilities + normalize(spodeProbs); + return spodeProbs; + } + int predict_spode(const std::vector& instance, int parent) + { + auto probs = predict_proba_spode(instance, parent); + return (int)std::distance(probs.begin(), std::max_element(probs.begin(), probs.end())); + } + // ------------------------------------------------------- + // predict_proba + // ------------------------------------------------------- + // + // P(c | x) ∝ p(c) * ∏_{i} p(x_i | c) * ∏_{i predict_proba(const std::vector& instance) + { + // accumulates posterior probabilities for each class + auto probs = std::vector(statesClass_); + auto spodeProbs = std::vector>(nFeatures_, std::vector(statesClass_)); + // Initialize the probabilities with the feature|class probabilities + int localOffset; + for (int feature = 0; feature < nFeatures_; ++feature) { + // if feature is not in the active_parents, skip it + if (std::find(active_parents.begin(), active_parents.end(), feature) == active_parents.end()) { + continue; + } + localOffset = (featureClassOffset_[feature] + instance[feature]) * statesClass_; + for (int c = 0; c < statesClass_; ++c) { + spodeProbs[feature][c] = classFeatureProbs_[localOffset + c] * classPriors_[c]; + } + } + int idx, base, sp, sc, parent_offset; + for (int parent = 1; parent < nFeatures_; ++parent) { + // if parent is not in the active_parents, skip it + if (std::find(active_parents.begin(), active_parents.end(), parent) == active_parents.end()) { + continue; + } + sp = instance[parent]; + parent_offset = pairOffset_[featureClassOffset_[parent] + sp]; + for (int child = 0; child < parent; ++child) { + sc = instance[child]; + base = (parent_offset + featureClassOffset_[child] + sc) * statesClass_; + for (int c = 0; c < statesClass_; ++c) { + /* + * The probability P(xc|xp,c) is stored in dataOpp_, and + * the probability P(xp|xc,c) is stored in data_ + */ + idx = 
base + c; + spodeProbs[child][c] *= data_[idx]; + spodeProbs[parent][c] *= dataOpp_[idx]; + } + } + } + /* add all the probabilities for each class */ + for (int c = 0; c < statesClass_; ++c) { + for (int i = 0; i < nFeatures_; ++i) { + probs[c] += spodeProbs[i][c] * significance_models_[i]; + } + } + // Normalize the probabilities + normalize(probs); + return probs; + } + void normalize(std::vector& probs) const + { + double sum = std::accumulate(probs.begin(), probs.end(), 0.0); + if (std::isnan(sum)) { + throw std::runtime_error("Can't normalize array. Sum is NaN."); + } + if (sum == 0) { + return; + } + for (int i = 0; i < (int)probs.size(); i++) { + probs[i] /= sum; + } + } + // Returns current mode: EMPTY, COUNTS or PROBS + MatrixState state() const + { + return matrixState_; + } + int statesClass() const + { + return statesClass_; + } + int nFeatures() const + { + return nFeatures_; + } + int getNumberOfStates() const + { + return std::accumulate(states_.begin(), states_.end(), 0) * nFeatures_; + } + int getNumberOfEdges() const + { + return nFeatures_ * (2 * nFeatures_ - 1); + } + int getNumberOfNodes() const + { + return (nFeatures_ + 1) * nFeatures_; + } + void add_active_parent(int active_parent) + { + active_parents.push_back(active_parent); + } + void remove_last_parent() + { + active_parents.pop_back(); + } + + private: + // ----------- + // MEMBER DATA + // ----------- + std::vector states_; // [states_feat0, ..., states_feat(n-1), statesClass_] + int nFeatures_; + int statesClass_; + + // data_ means p(child=sj | c, superparent= si) after normalization. + // But in COUNTS mode, it accumulates raw counts. 
+ std::vector pairOffset_; + // data_ stores p(child=sj | c, superparent=si) for each pair (i data_; + // dataOpp_ stores p(superparent=si | c, child=sj) for each pair (i dataOpp_; + + // classCounts_[c] + std::vector classCounts_; + std::vector classPriors_; // => p(c) + + // For p(x_i=si| c), we store counts in classFeatureCounts_ => offset by featureClassOffset_[i] + std::vector featureClassOffset_; + std::vector classFeatureCounts_; + std::vector classFeatureProbs_; // => p(x_i=si | c) after normalization + + MatrixState matrixState_; + + double alpha_ = 1.0; + double initializer_ = std::numeric_limits::max(); + std::vector active_parents; + }; +} +#endif // XAODE2_H \ No newline at end of file