diff --git a/.vscode/launch.json b/.vscode/launch.json index e0da5f0..c1275e6 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,9 +25,10 @@ "program": "${workspaceFolder}/build/src/Platform/main", "args": [ "-m", - "TANLd", + "BoostAODE", "-p", "/Users/rmontanana/Code/discretizbench/datasets", + "--discretize", "--stratified", "-d", "iris" diff --git a/CMakeLists.txt b/CMakeLists.txt index c53a3a2..186a175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ add_git_submodule("lib/json") # -------------- add_subdirectory(config) add_subdirectory(lib/Files) +add_subdirectory(lib/FeatureSelect) add_subdirectory(src/BayesNet) add_subdirectory(src/Platform) add_subdirectory(sample) diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt index 5e3412f..fce5b8f 100644 --- a/lib/Files/CMakeLists.txt +++ b/lib/Files/CMakeLists.txt @@ -1,2 +1 @@ -add_library(ArffFiles ArffFiles.cc) -#target_link_libraries(BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file +add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/lib/featureselect/CMakeLists.txt b/lib/featureselect/CMakeLists.txt new file mode 100644 index 0000000..06da1b7 --- /dev/null +++ b/lib/featureselect/CMakeLists.txt @@ -0,0 +1 @@ +add_library(FeatureSelect FeatureSelect.cpp) \ No newline at end of file diff --git a/lib/featureselect/FeatureSelect.cpp b/lib/featureselect/FeatureSelect.cpp new file mode 100644 index 0000000..6659063 --- /dev/null +++ b/lib/featureselect/FeatureSelect.cpp @@ -0,0 +1,119 @@ +#include "FeatureSelect.h" +namespace features { + SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat) + : samples(samples), labels(labels), weights(weights), k(k), nat(nat) + { + if (samples.size() == 0 || samples[0].size() == 0) + throw invalid_argument("features must be a non-empty matrix"); + if (samples.size() != labels.size()) + throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal"); + if (samples.size() != weights.size()) + throw invalid_argument("number of samples and weights must be equal"); + if (k < 1 || k > static_cast(samples[0].size())) + throw invalid_argument("k must be between 1 and number of features"); + numFeatures = 0; + numClasses = 0; + numSamples = 0; + fitted = false; + } + SelectKBestWeighted& SelectKBestWeighted::fit() + { + auto labelsCopy = labels; + numFeatures = samples[0].size(); + numSamples = samples.size(); + // compute number of classes + sort(labelsCopy.begin(), labelsCopy.end()); + auto last = unique(labelsCopy.begin(), labelsCopy.end()); + labelsCopy.erase(last, labelsCopy.end()); + numClasses = labelsCopy.size(); + // compute scores + scores.reserve(numFeatures); + for (int i = 0; i < numFeatures; ++i) { + scores.push_back(MutualInformation(i)); + features.push_back(i); + } + // sort & reduce scores and features + sort(features.begin(), features.end(), [&](int i, int j) + { return scores[i] > scores[j]; }); + sort(scores.begin(), scores.end(), greater()); + features.resize(k); + scores.resize(k); + fitted = true; + return *this; + } + precision_t SelectKBestWeighted::entropyLabel() + { + return entropy(labels); + } + precision_t SelectKBestWeighted::entropy(const sample_t& data) + { + precision_t ventropy = 0, totalWeight = 0; + score_t counts(numClasses + 1, 0); + for (auto i = 0; i < static_cast(data.size()); ++i) { + counts[data[i]] += weights[i]; + totalWeight += weights[i]; + } + for (auto count : counts) { + precision_t p = count / totalWeight; + if (p > 0) { + if (nat) { + ventropy -= p * log(p); + } else { + ventropy -= p * log2(p); + } + } + } + return ventropy; + } + // H(Y|X) = sum_{x in X} p(x) H(Y|X=x) + precision_t SelectKBestWeighted::conditionalEntropy(const int feature) + { + unordered_map featureCounts; + unordered_map> jointCounts; + featureCounts.clear(); + jointCounts.clear(); + precision_t totalWeight = 0; + for (auto i = 0; i < numSamples; i++) { + featureCounts[samples[i][feature]] += weights[i]; + jointCounts[samples[i][feature]][labels[i]] += weights[i]; + totalWeight += weights[i]; + } + if (totalWeight == 0) + throw invalid_argument("Total weight should not be zero"); + precision_t entropy = 0; + for (auto& [feat, count] : featureCounts) { + auto p_f = count / totalWeight; + precision_t entropy_f = 0; + for (auto& [label, jointCount] : jointCounts[feat]) { + auto p_l_f = jointCount / count; + if (p_l_f > 0) { + if (nat) { + entropy_f -= p_l_f * log(p_l_f); + } else { + entropy_f -= p_l_f * log2(p_l_f); + } + } + } + entropy += p_f * entropy_f; + } + return entropy; + } + // I(X;Y) = H(Y) - H(Y|X) + precision_t SelectKBestWeighted::MutualInformation(const int i) + { + return entropyLabel() - conditionalEntropy(i); + } + score_t SelectKBestWeighted::getScores() const + { + if (!fitted) + throw logic_error("score not fitted"); + return scores; + } + //Return the indices of the selected features + labels_t SelectKBestWeighted::getFeatures() const + { + if (!fitted) + throw logic_error("score not fitted"); + return features; + } +} diff --git a/lib/featureselect/FeatureSelect.h b/lib/featureselect/FeatureSelect.h new file mode 100644 index 0000000..18ddd99 --- /dev/null +++ b/lib/featureselect/FeatureSelect.h @@ -0,0 +1,38 @@ +#ifndef SELECT_K_BEST_WEIGHTED_H +#define SELECT_K_BEST_WEIGHTED_H +#include +#include +#include +using namespace std; +namespace features { + typedef float precision_t; + typedef int value_t; + typedef vector sample_t; + typedef vector samples_t; + typedef vector labels_t; + typedef vector score_t, weights_t; + + class SelectKBestWeighted { + private: + const samples_t samples; + const labels_t labels; + const weights_t weights; + const int k; + bool nat; // use natural log or log2 + int numFeatures, numClasses, numSamples; + bool fitted; + score_t scores; // scores of the features + labels_t features; // indices of the selected features + precision_t entropyLabel(); + precision_t entropy(const sample_t&); + precision_t conditionalEntropy(const int); + precision_t MutualInformation(const int); + public: + SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool); + SelectKBestWeighted& fit(); + score_t getScores() const; + labels_t getFeatures() const; //Return the indices of the selected features + static inline string version() { return "0.1.0"; }; + }; +} +#endif \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc index 7da318d..ecf76be 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -178,59 +178,59 @@ int main(int argc, char** argv) cout << "end." << endl; auto score = clf->score(Xd, y); cout << "Score: " << score << endl; - // auto graph = clf->graph(); - // auto dot_file = model_name + "_" + file_name; - // ofstream file(dot_file + ".dot"); - // file << graph; - // file.close(); - // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; - // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; - // string stratified_string = stratified ? " Stratified" : ""; - // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; - // cout << "==========================================" << endl; - // torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - // torch::Tensor yt = torch::tensor(y, torch::kInt32); - // for (int i = 0; i < features.size(); ++i) { - // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - // } - // float total_score = 0, total_score_train = 0, score_train, score_test; - // Fold* fold; - // if (stratified) - // fold = new StratifiedKFold(nFolds, y, seed); - // else - // fold = new KFold(nFolds, y.size(), seed); - // for (auto i = 0; i < nFolds; ++i) { - // auto [train, test] = fold->getFold(i); - // cout << "Fold: " << i + 1 << endl; - // if (tensors) { - // auto ttrain = torch::tensor(train, torch::kInt64); - // auto ttest = torch::tensor(test, torch::kInt64); - // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - // torch::Tensor ytraint = yt.index({ ttrain }); - // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - // torch::Tensor ytestt = yt.index({ ttest }); - // clf->fit(Xtraint, ytraint, features, className, states); - // auto temp = clf->predict(Xtraint); - // score_train = clf->score(Xtraint, ytraint); - // score_test = clf->score(Xtestt, ytestt); - // } else { - // auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - // auto [Xtest, ytest] = extract_indices(test, Xd, y); - // clf->fit(Xtrain, ytrain, features, className, states); - // score_train = clf->score(Xtrain, ytrain); - // score_test = clf->score(Xtest, ytest); - // } - // if (dump_cpt) { - // cout << "--- CPT Tables ---" << endl; - // clf->dump_cpt(); - // } - // total_score_train += score_train; - // total_score += score_test; - // cout << "Score Train: " << score_train << endl; - // cout << "Score Test : " << score_test << endl; - // cout << "-------------------------------------------------------------------------------" << endl; - // } - // cout << "**********************************************************************************" << endl; - // cout << "Average Score Train: " << total_score_train / nFolds << endl; - // cout << "Average Score Test : " << total_score / nFolds << endl;return 0; + auto graph = clf->graph(); + auto dot_file = model_name + "_" + file_name; + ofstream file(dot_file + ".dot"); + file << graph; + file.close(); + cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; + cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; + string stratified_string = stratified ? " Stratified" : ""; + cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; + cout << "==========================================" << endl; + torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + torch::Tensor yt = torch::tensor(y, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + } + float total_score = 0, total_score_train = 0, score_train, score_test; + Fold* fold; + if (stratified) + fold = new StratifiedKFold(nFolds, y, seed); + else + fold = new KFold(nFolds, y.size(), seed); + for (auto i = 0; i < nFolds; ++i) { + auto [train, test] = fold->getFold(i); + cout << "Fold: " << i + 1 << endl; + if (tensors) { + auto ttrain = torch::tensor(train, torch::kInt64); + auto ttest = torch::tensor(test, torch::kInt64); + torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + torch::Tensor ytraint = yt.index({ ttrain }); + torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + torch::Tensor ytestt = yt.index({ ttest }); + clf->fit(Xtraint, ytraint, features, className, states); + auto temp = clf->predict(Xtraint); + score_train = clf->score(Xtraint, ytraint); + score_test = clf->score(Xtestt, ytestt); + } else { + auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + auto [Xtest, ytest] = extract_indices(test, Xd, y); + clf->fit(Xtrain, ytrain, features, className, states); + score_train = clf->score(Xtrain, ytrain); + score_test = clf->score(Xtest, ytest); + } + if (dump_cpt) { + cout << "--- CPT Tables ---" << endl; + clf->dump_cpt(); + } + total_score_train += score_train; + total_score += score_test; + cout << "Score Train: " << score_train << endl; + cout << "Score Test : " << score_test << endl; + cout << "-------------------------------------------------------------------------------" << endl; + } + cout << "**********************************************************************************" << endl; + cout << "Average Score Train: " << total_score_train / nFolds << endl; + cout << "Average Score Test : " << total_score / nFolds << endl;return 0; } \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index cb93141..7d249ee 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -38,12 +38,14 @@ namespace bayesnet { auto source = vector(features); source.push_back(className); auto combinations = doCombinations(source); + double totalWeight = weights.sum().item(); // Compute class prior - auto margin = torch::zeros({ classNumStates }); + auto margin = torch::zeros({ classNumStates }, torch::kFloat); for (int value = 0; value < classNumStates; ++value) { auto mask = samples.index({ -1, "..." }) == value; - margin[value] = mask.sum().item() / samples.size(1); + margin[value] = mask.sum().item() / samples.size(1); } + cout << "Margin: " << margin; for (auto [first, second] : combinations) { int index_first = find(features.begin(), features.end(), first) - features.begin(); int index_second = find(features.begin(), features.end(), second) - features.begin(); @@ -54,7 +56,7 @@ namespace bayesnet { auto second_dataset = samples.index({ index_second, mask }); auto weights_dataset = weights.index({ mask }); auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset); - auto pb = margin[value].item(); + auto pb = margin[value].item(); accumulated += pb * mi; } result.push_back(accumulated); @@ -81,7 +83,7 @@ namespace bayesnet { double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights) { torch::Tensor counts = feature.bincount(weights); - int totalWeight = counts.sum().item(); + double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; @@ -95,7 +97,7 @@ namespace bayesnet { unordered_map> jointCounts; double totalWeight = 0; for (auto i = 0; i < numSamples; i++) { - jointCounts[secondFeature[i].item()][firstFeature[i].item()] += 1; + jointCounts[secondFeature[i].item()][firstFeature[i].item()] += weights[i].item(); totalWeight += weights[i].item(); } if (totalWeight == 0) diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index baafa16..d68ac2a 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,10 +1,32 @@ #include "BoostAODE.h" +#include "FeatureSelect.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { models.clear(); + int n_samples = dataset.size(1); + int n_features = dataset.size(0); + features::samples_t vsamples; + for (auto i = 0; i < n_samples; ++i) { + auto row = dataset.index({ "...", i }); + // convert row to std::vector + auto vrow = vector(row.data_ptr(), row.data_ptr() + row.numel()); + vsamples.push_back(vrow); + } + auto vweights = features::weights_t(n_samples, 1.0 / n_samples); + auto row = dataset.index({ -1, "..." }); + auto yv = features::labels_t(row.data_ptr(), row.data_ptr() + row.numel()); + auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true); + auto features = featureSelection.fit().getFeatures(); + // features = ( + // CSelectKBestWeighted( + // self.X_, self.y_, weights, k = self.n_features_in_ + // ) + // .fit() + // .get_features() + auto scores = features::score_t(n_features, 0.0); for (int i = 0; i < features.size(); ++i) { models.push_back(std::make_unique(i)); } diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index a94d8e9..2f2f631 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -1,7 +1,9 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) +include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc - KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}") \ No newline at end of file + KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc + Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) +target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index 154f1df..4d4ab08 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -43,7 +43,7 @@ namespace bayesnet { { dataset = X; buildDataset(y); - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } // X is nxm where n is the number of features and m the number of samples @@ -55,13 +55,13 @@ namespace bayesnet { } auto ytmp = torch::tensor(y, kInt32); buildDataset(ytmp); - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states) { this->dataset = dataset; - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index b65f570..5753eb8 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -5,7 +5,6 @@ namespace bayesnet { Network::Network() : features(vector()), className(""), classNumStates(0), fitted(false) {} Network::Network(float maxT) : features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} - Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other. getmaxThreads()), fitted(other.fitted) { @@ -174,6 +173,7 @@ namespace bayesnet { void Network::completeFit(const map>& states, const torch::Tensor& weights) { setStates(states); + laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation int maxThreadsRunning = static_cast(std::thread::hardware_concurrency() * maxThreads); if (maxThreadsRunning < 1) { maxThreadsRunning = 1; @@ -347,7 +347,7 @@ namespace bayesnet { } // Normalize result double sum = accumulate(result.begin(), result.end(), 0.0); - transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; }); + transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; }); return result; } vector Network::show() const @@ -435,6 +435,7 @@ namespace bayesnet { { for (auto& node : nodes) { cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl; + cout << node.second->getCPT() << endl; } } } diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h index 5ea94ec..a26e790 100644 --- a/src/BayesNet/Network.h +++ b/src/BayesNet/Network.h @@ -13,7 +13,7 @@ namespace bayesnet { int classNumStates; vector features; // Including classname string className; - int laplaceSmoothing = 1; + double laplaceSmoothing; torch::Tensor samples; // nxm tensor used to fit the model bool isCyclic(const std::string&, std::unordered_set&, std::unordered_set&); vector predict_sample(const vector&); @@ -25,7 +25,6 @@ namespace bayesnet { void setStates(const map>&); public: Network(); - explicit Network(float, int); explicit Network(float); explicit Network(Network&); torch::Tensor& getSamples(); diff --git a/src/BayesNet/Node.cc b/src/BayesNet/Node.cc index 10f26b8..04d2ed2 100644 --- a/src/BayesNet/Node.cc +++ b/src/BayesNet/Node.cc @@ -84,7 +84,7 @@ namespace bayesnet { } return result; } - void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights) + void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const double laplaceSmoothing, const torch::Tensor& weights) { dimensions.clear(); // Get dimensions of the CPT @@ -111,7 +111,7 @@ namespace bayesnet { coordinates.push_back(dataset.index({ parent_index, n_sample })); } // Increment the count of the corresponding coordinate - cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item()); + cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item()); } // Normalize the counts cpTable = cpTable / cpTable.sum(0); diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 83c4b1a..6758c5c 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -26,7 +26,7 @@ namespace bayesnet { vector& getParents(); vector& getChildren(); torch::Tensor& getCPT(); - void computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights); + void computeCPT(const torch::Tensor& dataset, const vector& features, const double laplaceSmoothing, const torch::Tensor& weights); int getNumStates() const; void setNumStates(int); unsigned minFill(); diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index f0728be..3bdfa8e 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -22,6 +22,8 @@ namespace bayesnet { auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class auto weights_matrix = metrics.conditionalEdge(weights); + cout << "*** Weights matrix ***\n"; + cout << weights_matrix << "\n"; // 3. Compute the maximum spanning tree auto mst = metrics.maximumSpanningTree(features, weights_matrix, root); // 4. Add edges from the maximum spanning tree to the model