From af0419c9dab9b3dabf4909833037d674a578e993 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 13 Aug 2023 00:59:02 +0200 Subject: [PATCH 01/20] First approx with const 1 weights --- .vscode/launch.json | 5 +++-- src/BayesNet/Classifier.cc | 3 ++- src/BayesNet/Network.cc | 30 +++++++++++++++++------------- src/BayesNet/Network.h | 10 +++++----- src/BayesNet/Node.cc | 4 ++-- src/BayesNet/Node.h | 2 +- src/BayesNet/Proposal.cc | 3 ++- 7 files changed, 32 insertions(+), 25 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index ba01ca6..a42c076 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,12 +25,13 @@ "program": "${workspaceFolder}/build/src/Platform/main", "args": [ "-m", - "SPODELd", + "SPODE", "-p", "/Users/rmontanana/Code/discretizbench/datasets", "--stratified", + "--discretize", "-d", - "iris" + "letter" ], "cwd": "/Users/rmontanana/Code/discretizbench", }, diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index b3317f4..87bae91 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -37,7 +37,8 @@ namespace bayesnet { } void Classifier::trainModel() { - model.fit(dataset, features, className, states); + const torch::Tensor weights = torch::ones({ m }); + model.fit(dataset, weights, features, className, states); } // X is nxm where n is the number of features and m the number of samples Classifier& Classifier::fit(torch::Tensor& X, torch::Tensor& y, vector& features, string className, map>& states) diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index 8a4106c..fbb62cc 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -104,8 +104,11 @@ namespace bayesnet { { return nodes; } - void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector& featureNames, const string& className, const map>& states) + void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector& featureNames, const string& className, const map>& states, const torch::Tensor& weights) { + if (weights.size(0) != n_samples) { + throw invalid_argument("Weights must have the same number of elements as samples in Network::fit"); + } if (n_samples != n_samples_y) { throw invalid_argument("X and y must have the same number of samples in Network::fit (" + to_string(n_samples) + " != " + to_string(n_samples_y) + ")"); } @@ -136,28 +139,29 @@ namespace bayesnet { classNumStates = nodes[className]->getNumStates(); } // X comes in nxm, where n is the number of features and m the number of samples - void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const vector& featureNames, const string& className, const map>& states) + void Network::fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states) { - checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states); + checkFitData(X.size(1), X.size(0), y.size(0), featureNames, className, states, weights); this->className = className; Tensor ytmp = torch::transpose(y.view({ y.size(0), 1 }), 0, 1); samples = torch::cat({ X , ytmp }, 0); for (int i = 0; i < featureNames.size(); ++i) { auto row_feature = X.index({ i, "..." }); } - completeFit(states); + completeFit(states, weights); } - void Network::fit(const torch::Tensor& samples, const vector& featureNames, const string& className, const map>& states) + void Network::fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states) { - checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states); + checkFitData(samples.size(1), samples.size(0) - 1, samples.size(1), featureNames, className, states, weights); this->className = className; this->samples = samples; - completeFit(states); + completeFit(states, weights); } // input_data comes in nxm, where n is the number of features and m the number of samples - void Network::fit(const vector>& input_data, const vector& labels, const vector& featureNames, const string& className, const map>& states) + void Network::fit(const vector>& input_data, const vector& labels, const vector& weights_, const vector& featureNames, const string& className, const map>& states) { - checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states); + const torch::Tensor weights = torch::tensor(weights_, torch::kFloat64); + checkFitData(input_data[0].size(), input_data.size(), labels.size(), featureNames, className, states, weights); this->className = className; // Build tensor of samples (nxm) (n+1 because of the class) samples = torch::zeros({ static_cast(input_data.size() + 1), static_cast(input_data[0].size()) }, torch::kInt32); @@ -165,9 +169,9 @@ namespace bayesnet { samples.index_put_({ i, "..." }, torch::tensor(input_data[i], torch::kInt32)); } samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); - completeFit(states); + completeFit(states, weights); } - void Network::completeFit(const map>& states) + void Network::completeFit(const map>& states, const torch::Tensor& weights) { setStates(states); int maxThreadsRunning = static_cast(std::thread::hardware_concurrency() * maxThreads); @@ -182,7 +186,7 @@ namespace bayesnet { while (nextNodeIndex < nodes.size()) { unique_lock lock(mtx); cv.wait(lock, [&activeThreads, &maxThreadsRunning]() { return activeThreads < maxThreadsRunning; }); - threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads]() { + threads.emplace_back([this, &nextNodeIndex, &mtx, &cv, &activeThreads, &weights]() { while (true) { unique_lock lock(mtx); if (nextNodeIndex >= nodes.size()) { @@ -191,7 +195,7 @@ namespace bayesnet { auto& pair = *std::next(nodes.begin(), nextNodeIndex); ++nextNodeIndex; lock.unlock(); - pair.second->computeCPT(samples, features, laplaceSmoothing); + pair.second->computeCPT(samples, features, laplaceSmoothing, weights); lock.lock(); nodes[pair.first] = std::move(pair.second); lock.unlock(); diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h index d8db620..5ea94ec 100644 --- a/src/BayesNet/Network.h +++ b/src/BayesNet/Network.h @@ -20,8 +20,8 @@ namespace bayesnet { vector predict_sample(const torch::Tensor&); vector exactInference(map&); double computeFactor(map&); - void completeFit(const map>&); - void checkFitData(int n_features, int n_samples, int n_samples_y, const vector& featureNames, const string& className, const map>&); + void completeFit(const map>& states, const torch::Tensor& weights); + void checkFitData(int n_features, int n_samples, int n_samples_y, const vector& featureNames, const string& className, const map>& states, const torch::Tensor& weights); void setStates(const map>&); public: Network(); @@ -39,9 +39,9 @@ namespace bayesnet { int getNumEdges() const; int getClassNumStates() const; string getClassName() const; - void fit(const vector>&, const vector&, const vector&, const string&, const map>&); - void fit(const torch::Tensor&, const torch::Tensor&, const vector&, const string&, const map>&); - void fit(const torch::Tensor&, const vector&, const string&, const map>&); + void fit(const vector>& input_data, const vector& labels, const vector& weights, const vector& featureNames, const string& className, const map>& states); + void fit(const torch::Tensor& X, const torch::Tensor& y, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states); + void fit(const torch::Tensor& samples, const torch::Tensor& weights, const vector& featureNames, const string& className, const map>& states); vector predict(const vector>&); // Return mx1 vector of predictions torch::Tensor predict(const torch::Tensor&); // Return mx1 tensor of predictions torch::Tensor predict_tensor(const torch::Tensor& samples, const bool proba); diff --git a/src/BayesNet/Node.cc b/src/BayesNet/Node.cc index 6669819..10f26b8 100644 --- a/src/BayesNet/Node.cc +++ b/src/BayesNet/Node.cc @@ -84,7 +84,7 @@ namespace bayesnet { } return result; } - void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing) + void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights) { dimensions.clear(); // Get dimensions of the CPT @@ -111,7 +111,7 @@ namespace bayesnet { coordinates.push_back(dataset.index({ parent_index, n_sample })); } // Increment the count of the corresponding coordinate - cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + 1); + cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item()); } // Normalize the counts cpTable = cpTable / cpTable.sum(0); diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index f4eb320..83c4b1a 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -26,7 +26,7 @@ namespace bayesnet { vector& getParents(); vector& getChildren(); torch::Tensor& getCPT(); - void computeCPT(const torch::Tensor&, const vector&, const int); + void computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights); int getNumStates() const; void setNumStates(int); unsigned minFill(); diff --git a/src/BayesNet/Proposal.cc b/src/BayesNet/Proposal.cc index eef0088..d95e701 100644 --- a/src/BayesNet/Proposal.cc +++ b/src/BayesNet/Proposal.cc @@ -65,7 +65,8 @@ namespace bayesnet { //Update new states of the feature/node states[pFeatures[index]] = xStates; } - model.fit(pDataset, pFeatures, pClassName, states); + const torch::Tensor weights = torch::ones({ pDataset.size(1) }, torch::kFloat); + model.fit(pDataset, weights, pFeatures, pClassName, states); } return states; } From f26ea1f0ace8c65a1f0d49557f605b8dfd39bcb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 13 Aug 2023 12:56:06 +0200 Subject: [PATCH 02/20] Add weights to BayesMetrics --- src/BayesNet/BayesMetrics.cc | 23 ++++++++++++----------- src/BayesNet/BayesMetrics.h | 16 ++++++++-------- src/BayesNet/Classifier.h | 5 +++-- src/BayesNet/KDB.cc | 4 ++-- src/BayesNet/TAN.cc | 6 +++--- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 8952ead..2f0de11 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -32,7 +32,7 @@ namespace bayesnet { } return result; } - torch::Tensor Metrics::conditionalEdge() + torch::Tensor Metrics::conditionalEdge(const torch::Tensor& weights) { auto result = vector(); auto source = vector(features); @@ -52,7 +52,7 @@ namespace bayesnet { auto mask = samples.index({ -1, "..." }) == value; auto first_dataset = samples.index({ index_first, mask }); auto second_dataset = samples.index({ index_second, mask }); - auto mi = mutualInformation(first_dataset, second_dataset); + auto mi = mutualInformation(first_dataset, second_dataset, weights); auto pb = margin[value].item(); accumulated += pb * mi; } @@ -70,15 +70,16 @@ namespace bayesnet { return matrix; } // To use in Python - vector Metrics::conditionalEdgeWeights() + vector Metrics::conditionalEdgeWeights(vector& weights_) { - auto matrix = conditionalEdge(); + const torch::Tensor weights = torch::tensor(weights_); + auto matrix = conditionalEdge(weights); std::vector v(matrix.data_ptr(), matrix.data_ptr() + matrix.numel()); return v; } - double Metrics::entropy(const torch::Tensor& feature) + double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights) { - torch::Tensor counts = feature.bincount(); + torch::Tensor counts = feature.bincount(weights); int totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; torch::Tensor logProbs = torch::log(probs); @@ -86,15 +87,15 @@ namespace bayesnet { return entropy.nansum().item(); } // H(Y|X) = sum_{x in X} p(x) H(Y|X=x) - double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature) + double Metrics::conditionalEntropy(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights) { int numSamples = firstFeature.sizes()[0]; - torch::Tensor featureCounts = secondFeature.bincount(); + torch::Tensor featureCounts = secondFeature.bincount(weights); unordered_map> jointCounts; double totalWeight = 0; for (auto i = 0; i < numSamples; i++) { jointCounts[secondFeature[i].item()][firstFeature[i].item()] += 1; - totalWeight += 1; + totalWeight += weights[i].item(); } if (totalWeight == 0) return 0; @@ -115,9 +116,9 @@ namespace bayesnet { return entropyValue; } // I(X;Y) = H(Y) - H(Y|X) - double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature) + double Metrics::mutualInformation(const torch::Tensor& firstFeature, const torch::Tensor& secondFeature, const torch::Tensor& weights) { - return entropy(firstFeature) - conditionalEntropy(firstFeature, secondFeature); + return entropy(firstFeature, weights) - conditionalEntropy(firstFeature, secondFeature, weights); } /* Compute the maximum spanning tree considering the weights as distances diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 2a2fff3..5bd25b6 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -12,16 +12,16 @@ namespace bayesnet { vector features; string className; int classNumStates = 0; + double entropy(const Tensor& feature, const Tensor& weights); + double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); + vector> doCombinations(const vector&); public: Metrics() = default; - Metrics(const Tensor&, const vector&, const string&, const int); - Metrics(const vector>&, const vector&, const vector&, const string&, const int); - double entropy(const Tensor&); - double conditionalEntropy(const Tensor&, const Tensor&); - double mutualInformation(const Tensor&, const Tensor&); - vector conditionalEdgeWeights(); // To use in Python - Tensor conditionalEdge(); - vector> doCombinations(const vector&); + Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); + Metrics(const vector>& vsamples, const vector& labels, const vector& features, const string& className, const int classNumStates); + double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); + vector conditionalEdgeWeights(vector& weights); // To use in Python + Tensor conditionalEdge(const torch::Tensor& weights); vector> maximumSpanningTree(const vector& features, const Tensor& weights, const int root); }; } diff --git a/src/BayesNet/Classifier.h b/src/BayesNet/Classifier.h index 2e736a3..6d00928 100644 --- a/src/BayesNet/Classifier.h +++ b/src/BayesNet/Classifier.h @@ -14,13 +14,14 @@ namespace bayesnet { Classifier& build(vector& features, string className, map>& states); protected: bool fitted; - Network model; int m, n; // m: number of samples, n: number of features - Tensor dataset; // (n+1)xm tensor + Network model; Metrics metrics; vector features; string className; map> states; + Tensor dataset; // (n+1)xm tensor + Tensor weights; void checkFitParameters(); virtual void buildModel() = 0; void trainModel() override; diff --git a/src/BayesNet/KDB.cc b/src/BayesNet/KDB.cc index 74566b0..874e08a 100644 --- a/src/BayesNet/KDB.cc +++ b/src/BayesNet/KDB.cc @@ -32,10 +32,10 @@ namespace bayesnet { vector mi; for (auto i = 0; i < features.size(); i++) { Tensor firstFeature = dataset.index({ i, "..." }); - mi.push_back(metrics.mutualInformation(firstFeature, y)); + mi.push_back(metrics.mutualInformation(firstFeature, y, weights)); } // 2. Compute class conditional mutual information I(Xi;XjIC), f or each - auto conditionalEdgeWeights = metrics.conditionalEdge(); + auto conditionalEdgeWeights = metrics.conditionalEdge(weights); // 3. Let the used variable list, S, be empty. vector S; // 4. Let the DAG network being constructed, BN, begin with a single diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index 7b3e3a6..843a5e6 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -15,15 +15,15 @@ namespace bayesnet { Tensor class_dataset = dataset.index({ -1, "..." }); for (int i = 0; i < static_cast(features.size()); ++i) { Tensor feature_dataset = dataset.index({ i, "..." }); - auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset); + auto mi_value = metrics.mutualInformation(class_dataset, feature_dataset, weights); mi.push_back({ i, mi_value }); } sort(mi.begin(), mi.end(), [](const auto& left, const auto& right) {return left.second < right.second;}); auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class - auto weights = metrics.conditionalEdge(); + auto weights_matrix = metrics.conditionalEdge(weights); // 3. Compute the maximum spanning tree - auto mst = metrics.maximumSpanningTree(features, weights, root); + auto mst = metrics.maximumSpanningTree(features, weights_matrix, root); // 4. Add edges from the maximum spanning tree to the model for (auto i = 0; i < mst.size(); ++i) { auto [from, to] = mst[i]; From 2729b92f065b30f2da3d57049473bf2515a983e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 13 Aug 2023 16:19:17 +0200 Subject: [PATCH 03/20] Summary list --- .vscode/launch.json | 11 +++++++ src/Platform/CMakeLists.txt | 4 ++- src/Platform/Results.cc | 60 +++++++++++++++++++++++++++++++++++++ src/Platform/Results.h | 38 +++++++++++++++++++++++ src/Platform/main.cc | 2 +- src/Platform/manage.cc | 32 ++++++++++++++++++++ 6 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 src/Platform/Results.cc create mode 100644 src/Platform/Results.h create mode 100644 src/Platform/manage.cc diff --git a/.vscode/launch.json b/.vscode/launch.json index ba01ca6..0a7a483 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -34,6 +34,17 @@ ], "cwd": "/Users/rmontanana/Code/discretizbench", }, + { + "type": "lldb", + "request": "launch", + "name": "manage", + "program": "${workspaceFolder}/build/src/Platform/manage", + "args": [ + "-n", + "20" + ], + "cwd": "/Users/rmontanana/Code/discretizbench", + }, { "name": "Build & debug active file", "type": "cppdbg", diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 3b13abc..0eb26ce 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -5,4 +5,6 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/json/include) add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc Report.cc) -target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file +add_executable(manage manage.cc Results.cc Report.cc) +target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") +target_link_libraries(manage "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc new file mode 100644 index 0000000..ee5315b --- /dev/null +++ b/src/Platform/Results.cc @@ -0,0 +1,60 @@ +#include +#include "platformUtils.h" +#include "Results.h" +namespace platform { + const double REFERENCE_SCORE = 22.109799; + Result::Result(const string& path, const string& filename) + : path(path) + , filename(filename) + { + auto data = load(); + date = data["date"]; + score = 0; + for (const auto& result : data["results"]) { + score += result["score"].get(); + } + score /= REFERENCE_SCORE; + title = data["title"]; + duration = data["duration"]; + model = data["model"]; + } + json Result::load() + { + ifstream resultData(path + "/" + filename); + if (resultData.is_open()) { + json data = json::parse(resultData); + return data; + } + throw invalid_argument("Unable to open result file. [" + path + "/" + filename + "]"); + } + void Results::load() + { + using std::filesystem::directory_iterator; + for (const auto& file : directory_iterator(path)) { + auto filename = file.path().filename().string(); + if (filename.find(".json") != string::npos && filename.find("results_") == 0) { + auto result = Result(path, filename); + files.push_back(result); + } + } + } + string Result::to_string() const + { + stringstream oss; + oss << date << " "; + oss << setw(12) << left << model << " "; + oss << right << setw(9) << setprecision(7) << fixed << score << " "; + oss << setw(9) << setprecision(3) << fixed << duration << " "; + oss << setw(50) << left << title << " "; + return oss.str(); + } + void Results::manage() + { + cout << "Results found: " << files.size() << endl; + cout << "========================" << endl; + for (const auto& result : files) { + cout << result.to_string() << endl; + } + } + +} \ No newline at end of file diff --git a/src/Platform/Results.h b/src/Platform/Results.h new file mode 100644 index 0000000..5d36f32 --- /dev/null +++ b/src/Platform/Results.h @@ -0,0 +1,38 @@ +#ifndef RESULTS_H +#define RESULTS_H +#include +#include +#include +#include +namespace platform { + using namespace std; + using json = nlohmann::json; + + class Result { + public: + Result(const string& path, const string& filename); + json load(); + string to_string() const; + private: + string path; + string filename; + string date; + double score; + string title; + double duration; + string model; + }; + class Results { + public: + explicit Results(const string& path) : path(path) { load(); }; + void manage(); + private: + string path; + vector files; + void load(); // Loads the list of results + void show(); + int menu(); + }; +}; + +#endif \ No newline at end of file diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 24d0a33..7692629 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -14,7 +14,7 @@ const string PATH_DATASETS = "datasets"; argparse::ArgumentParser manageArguments(int argc, char** argv) { auto env = platform::DotEnv(); - argparse::ArgumentParser program("BayesNetSample"); + argparse::ArgumentParser program("main"); program.add_argument("-d", "--dataset").default_value("").help("Dataset file name"); program.add_argument("-p", "--path") .help("folder where the data files are located, default") diff --git a/src/Platform/manage.cc b/src/Platform/manage.cc new file mode 100644 index 0000000..b901601 --- /dev/null +++ b/src/Platform/manage.cc @@ -0,0 +1,32 @@ +#include +#include +#include "platformUtils.h" +#include "Results.h" + +using namespace std; +const string PATH_RESULTS = "results"; + +argparse::ArgumentParser manageArguments(int argc, char** argv) +{ + argparse::ArgumentParser program("manage"); + program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>(); + try { + program.parse_args(argc, argv); + auto number = program.get("number"); + } + catch (const exception& err) { + cerr << err.what() << endl; + cerr << program; + exit(1); + } + return program; +} + +int main(int argc, char** argv) +{ + auto program = manageArguments(argc, argv); + auto number = program.get("number"); + auto results = platform::Results(PATH_RESULTS); + results.manage(); + return 0; +} From 054567c65a252dafb6090a85918bc3c22d4038bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 13 Aug 2023 17:10:18 +0200 Subject: [PATCH 04/20] Add sorting capacity --- src/Platform/Results.cc | 132 +++++++++++++++++++++++++++++++++++++++- src/Platform/Results.h | 19 +++++- src/Platform/manage.cc | 5 +- 3 files changed, 150 insertions(+), 6 deletions(-) diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc index ee5315b..056f0b1 100644 --- a/src/Platform/Results.cc +++ b/src/Platform/Results.cc @@ -1,6 +1,7 @@ #include #include "platformUtils.h" #include "Results.h" +#include "Report.h" namespace platform { const double REFERENCE_SCORE = 22.109799; Result::Result(const string& path, const string& filename) @@ -48,13 +49,140 @@ namespace platform { oss << setw(50) << left << title << " "; return oss.str(); } - void Results::manage() + void Results::show() const { cout << "Results found: " << files.size() << endl; - cout << "========================" << endl; + cout << "-------------------" << endl; + auto i = 0; + cout << " # Date Model Score Duration Title" << endl; + cout << "=== ========== ============ ========= ========= =============================================================" << endl; for (const auto& result : files) { + cout << setw(3) << fixed << right << i++ << " "; cout << result.to_string() << endl; + if (i == max && max != 0) { + break; + } + } } + int Results::getIndex(const string& intent) const + { + cout << "Choose result to " << intent << ": "; + int index; + cin >> index; + if (index >= 0 && index < files.size()) { + return index; + } + + cout << "Invalid index" << endl; + return -1; + } + void Results::menu() + { + cout << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): "; + char option; + int index; + string filename; + cin >> option; + switch (option) { + case 'q': + exit(0); + case 'l': + show(); + menu(); + break; + case 'd': + index = getIndex("delete"); + if (index == -1) + break; + filename = files[index].getFilename(); + cout << "Deleting " << filename << endl; + remove((path + "/" + filename).c_str()); + files.erase(files.begin() + index); + show(); + menu(); + break; + case 'h': + index = getIndex("hide"); + if (index == -1) + break; + filename = files[index].getFilename(); + cout << "Hiding " << filename << endl; + rename((path + "/" + filename).c_str(), (path + "/." + filename).c_str()); + files.erase(files.begin() + index); + show(); + menu(); + break; + case 's': + sortList(); + show(); + menu(); + break; + case 'r': + index = getIndex("report"); + if (index == -1) + break; + filename = files[index].getFilename(); + cout << "Reporting " << filename << endl; + auto data = files[index].load(); + Report report(data); + report.show(); + menu(); + break; + + } + } + void Results::sortList() + { + cout << "Choose sorting field (date='d', score='s', duration='u', model='m'): "; + char option; + cin >> option; + switch (option) { + case 'd': + sortDate(); + break; + case 's': + sortScore(); + break; + case 'u': + sortDuration(); + break; + case 'm': + sortModel(); + break; + default: + cout << "Invalid option" << endl; + } + + } + void Results::sortDate() + { + sort(files.begin(), files.end(), [](const Result& a, const Result& b) { + return a.getDate() > b.getDate(); + }); + } + void Results::sortModel() + { + sort(files.begin(), files.end(), [](const Result& a, const Result& b) { + return a.getModel() > b.getModel(); + }); + } + void Results::sortDuration() + { + sort(files.begin(), files.end(), [](const Result& a, const Result& b) { + return a.getDuration() > b.getDuration(); + }); + } + void Results::sortScore() + { + sort(files.begin(), files.end(), [](const Result& a, const Result& b) { + return a.getScore() > b.getScore(); + }); + } + void Results::manage() + { + show(); + menu(); + } } \ No newline at end of file diff --git a/src/Platform/Results.h b/src/Platform/Results.h index 5d36f32..945901f 100644 --- a/src/Platform/Results.h +++ b/src/Platform/Results.h @@ -13,6 +13,12 @@ namespace platform { Result(const string& path, const string& filename); json load(); string to_string() const; + string getFilename() const { return filename; }; + string getDate() const { return date; }; + double getScore() const { return score; }; + string getTitle() const { return title; }; + double getDuration() const { return duration; }; + string getModel() const { return model; }; private: string path; string filename; @@ -24,14 +30,21 @@ namespace platform { }; class Results { public: - explicit Results(const string& path) : path(path) { load(); }; + explicit Results(const string& path, const int max) : path(path), max(max) { load(); }; void manage(); private: string path; + int max; vector files; void load(); // Loads the list of results - void show(); - int menu(); + void show() const; + int getIndex(const string& intent) const; + void menu(); + void sortList(); + void sortDate(); + void sortScore(); + void sortModel(); + void sortDuration(); }; }; diff --git a/src/Platform/manage.cc b/src/Platform/manage.cc index b901601..f97dae3 100644 --- a/src/Platform/manage.cc +++ b/src/Platform/manage.cc @@ -13,6 +13,9 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) try { program.parse_args(argc, argv); auto number = program.get("number"); + if (number < 0) { + throw runtime_error("Number of results must be greater than or equal to 0"); + } } catch (const exception& err) { cerr << err.what() << endl; @@ -26,7 +29,7 @@ int main(int argc, char** argv) { auto program = manageArguments(argc, argv); auto number = program.get("number"); - auto results = platform::Results(PATH_RESULTS); + auto results = platform::Results(PATH_RESULTS, number); results.manage(); return 0; } From 3691cb4a614b96d0e0145fadb8d365c25f2a23fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sun, 13 Aug 2023 18:13:00 +0200 Subject: [PATCH 05/20] Add totals and filter by scoreName and model --- src/Platform/BestResult.h | 10 ++++++++++ src/Platform/Report.cc | 26 ++++++++++++++++++++++---- src/Platform/Report.h | 2 ++ src/Platform/Results.cc | 24 ++++++++++++++++++------ src/Platform/Results.h | 6 +++++- src/Platform/manage.cc | 8 +++++++- 6 files changed, 64 insertions(+), 12 deletions(-) create mode 100644 src/Platform/BestResult.h diff --git a/src/Platform/BestResult.h b/src/Platform/BestResult.h new file mode 100644 index 0000000..8b3f1cb --- /dev/null +++ b/src/Platform/BestResult.h @@ -0,0 +1,10 @@ +#ifndef BESTRESULT_H +#define BESTRESULT_H +#include +class BestResult { +public: + static std::string title() { return "STree_default (linear-ovo)"; } + static double score() { return 22.109799; } + static std::string scoreName() { return "accuracy"; } +}; +#endif \ No newline at end of file diff --git a/src/Platform/Report.cc b/src/Platform/Report.cc index 3693248..7bd7d69 100644 --- a/src/Platform/Report.cc +++ b/src/Platform/Report.cc @@ -1,4 +1,5 @@ #include "Report.h" +#include "BestResult.h" namespace platform { string headerLine(const string& text) @@ -28,6 +29,7 @@ namespace platform { { header(); body(); + footer(); } void Report::header() { @@ -44,6 +46,8 @@ namespace platform { { cout << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl; cout << "============================== ====== ===== === ======= ======= ======= =============== ================= ===============" << endl; + json lastResult; + totalScore = 0; for (const auto& r : data["results"]) { cout << setw(30) << left << r["dataset"].get() << " "; cout << setw(6) << right << r["samples"].get() << " "; @@ -56,12 +60,26 @@ namespace platform { cout << setw(10) << right << setprecision(6) << fixed << r["test_time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["test_time_std"].get() << " "; cout << " " << r["hyperparameters"].get(); cout << endl; + lastResult = r; + totalScore += r["score_test"].get(); + } + if (data["results"].size() == 1) { cout << string(MAXL, '*') << endl; - cout << headerLine("Train scores: " + fVector(r["scores_train"])); - cout << headerLine("Test scores: " + fVector(r["scores_test"])); - cout << headerLine("Train times: " + fVector(r["times_train"])); - cout << headerLine("Test times: " + fVector(r["times_test"])); + cout << headerLine("Train scores: " + fVector(lastResult["scores_train"])); + cout << headerLine("Test scores: " + fVector(lastResult["scores_test"])); + cout << headerLine("Train times: " + fVector(lastResult["times_train"])); + cout << headerLine("Test times: " + fVector(lastResult["times_test"])); cout << string(MAXL, '*') << endl; } } + void Report::footer() + { + cout << string(MAXL, '*') << endl; + auto score = data["score_name"].get(); + if (score == BestResult::scoreName()) { + cout << headerLine(score + " compared to " + BestResult::title() + " .: " + to_string(totalScore / BestResult::score())); + } + cout << string(MAXL, '*') << endl; + + } } \ No newline at end of file diff --git a/src/Platform/Report.h b/src/Platform/Report.h index c6ea8a1..302ac60 100644 --- a/src/Platform/Report.h +++ b/src/Platform/Report.h @@ -16,8 +16,10 @@ namespace platform { private: void header(); void body(); + void footer(); string fromVector(const string& key); json data; + double totalScore; // Total score of all results in a report }; }; #endif \ No newline at end of file diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc index 056f0b1..c33cf37 100644 --- a/src/Platform/Results.cc +++ b/src/Platform/Results.cc @@ -2,8 +2,8 @@ #include "platformUtils.h" #include "Results.h" #include "Report.h" +#include "BestResult.h" namespace platform { - const double REFERENCE_SCORE = 22.109799; Result::Result(const string& path, const string& filename) : path(path) , filename(filename) @@ -14,7 +14,10 @@ namespace platform { for (const auto& result : data["results"]) { score += result["score"].get(); } - score /= REFERENCE_SCORE; + scoreName = data["score_name"]; + if (scoreName == BestResult::scoreName()) { + score /= BestResult::score(); + } title = data["title"]; duration = data["duration"]; model = data["model"]; @@ -35,7 +38,11 @@ namespace platform { auto filename = file.path().filename().string(); if (filename.find(".json") != string::npos && filename.find("results_") == 0) { auto result = Result(path, filename); - files.push_back(result); + bool addResult = true; + if (model != "any" && result.getModel() != model || scoreName != "any" && scoreName != result.getScoreName()) + addResult = false; + if (addResult) + files.push_back(result); } } } @@ -44,7 +51,8 @@ namespace platform { stringstream oss; oss << date << " "; oss << setw(12) << left << model << " "; - oss << right << setw(9) << setprecision(7) << fixed << score << " "; + oss << setw(11) << left << scoreName << " "; + oss << right << setw(11) << setprecision(7) << fixed << score << " "; oss << setw(9) << setprecision(3) << fixed << duration << " "; oss << setw(50) << left << title << " "; return oss.str(); @@ -54,8 +62,8 @@ namespace platform { cout << "Results found: " << files.size() << endl; cout << "-------------------" << endl; auto i = 0; - cout << " # Date Model Score Duration Title" << endl; - cout << "=== ========== ============ ========= ========= =============================================================" << endl; + cout << " # Date Model Score Name Score Duration Title" << endl; + cout << "=== ========== ============ =========== =========== ========= =============================================================" << endl; for (const auto& result : files) { cout << setw(3) << fixed << right << i++ << " "; cout << result.to_string() << endl; @@ -181,6 +189,10 @@ namespace platform { } void Results::manage() { + if (files.size() == 0) { + cout << "No results found!" << endl; + exit(0); + } show(); menu(); } diff --git a/src/Platform/Results.h b/src/Platform/Results.h index 945901f..bd4768b 100644 --- a/src/Platform/Results.h +++ b/src/Platform/Results.h @@ -19,6 +19,7 @@ namespace platform { string getTitle() const { return title; }; double getDuration() const { return duration; }; string getModel() const { return model; }; + string getScoreName() const { return scoreName; }; private: string path; string filename; @@ -27,14 +28,17 @@ namespace platform { string title; double duration; string model; + string scoreName; }; class Results { public: - explicit Results(const string& path, const int max) : path(path), max(max) { load(); }; + Results(const string& path, const int max, const string& model, const string& score) : path(path), max(max), model(model), scoreName(score) { load(); }; void manage(); private: string path; int max; + string model; + string scoreName; vector files; void load(); // Loads the list of results void show() const; diff --git a/src/Platform/manage.cc b/src/Platform/manage.cc index f97dae3..74e4a2c 100644 --- a/src/Platform/manage.cc +++ b/src/Platform/manage.cc @@ -10,12 +10,16 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) { argparse::ArgumentParser program("manage"); program.add_argument("-n", "--number").default_value(0).help("Number of results to show (0 = all)").scan<'i', int>(); + program.add_argument("-m", "--model").default_value("any").help("Filter results of the selected model)"); + program.add_argument("-s", "--score").default_value("any").help("Filter results of the score name supplied"); try { program.parse_args(argc, argv); auto number = program.get("number"); if (number < 0) { throw runtime_error("Number of results must be greater than or equal to 0"); } + auto model = program.get("model"); + auto score = program.get("score"); } catch (const exception& err) { cerr << err.what() << endl; @@ -29,7 +33,9 @@ int main(int argc, char** argv) { auto program = manageArguments(argc, argv); auto number = program.get("number"); - auto results = platform::Results(PATH_RESULTS, number); + auto model = program.get("model"); + auto score = program.get("score"); + auto results = platform::Results(PATH_RESULTS, number, model, score); results.manage(); return 0; } From 55d21294d5460addd17e339729386ae804896dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 14 Aug 2023 00:40:31 +0200 Subject: [PATCH 06/20] Add class Paths and enhance input --- src/Platform/Paths.h | 10 +++ src/Platform/Results.cc | 129 ++++++++++++++++++++-------------- src/Platform/Results.h | 3 +- src/Platform/main.cc | 9 ++- src/Platform/manage.cc | 4 +- src/Platform/platformUtils.cc | 3 +- 6 files changed, 98 insertions(+), 60 deletions(-) create mode 100644 src/Platform/Paths.h diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h new file mode 100644 index 0000000..756e61a --- /dev/null +++ b/src/Platform/Paths.h @@ -0,0 +1,10 @@ +#ifndef PATHS_H +#define PATHS_H +namespace platform { + class Paths { + public: + static std::string datasets() { return "datasets/"; } + static std::string results() { return "results/"; } + }; +} +#endif \ No newline at end of file diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc index c33cf37..48c7e9a 100644 --- a/src/Platform/Results.cc +++ b/src/Platform/Results.cc @@ -22,7 +22,7 @@ namespace platform { duration = data["duration"]; model = data["model"]; } - json Result::load() + json Result::load() const { ifstream resultData(path + "/" + filename); if (resultData.is_open()) { @@ -70,7 +70,6 @@ namespace platform { if (i == max && max != 0) { break; } - } } int Results::getIndex(const string& intent) const @@ -81,70 +80,98 @@ namespace platform { if (index >= 0 && index < files.size()) { return index; } - cout << "Invalid index" << endl; return -1; } + void Results::report(const int index) const + { + cout << "Reporting " << files.at(index).getFilename() << endl; + auto data = files.at(index).load(); + Report report(data); + report.show(); + } void Results::menu() { - cout << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): "; char option; int index; - string filename; - cin >> option; - switch (option) { - case 'q': - exit(0); - case 'l': - show(); - menu(); - break; - case 'd': - index = getIndex("delete"); - if (index == -1) + bool finished = false; + string filename, line, options = "qldhsr"; + while (!finished) { + cout << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): "; + getline(cin, line); + if (line.size() == 0) + continue; + if (options.find(line[0]) != string::npos) { + if (line.size() > 1) { + cout << "Invalid option" << endl; + continue; + } + option = line[0]; + } else { + index = stoi(line); + if (index >= 0 && index < files.size()) { + report(index); + } else { + cout << "Invalid option" << endl; + } + continue; + } + switch (option) { + case 'q': + finished = true; break; - filename = files[index].getFilename(); - cout << "Deleting " << filename << endl; - remove((path + "/" + filename).c_str()); - files.erase(files.begin() + index); - show(); - menu(); - break; - case 'h': - index = getIndex("hide"); - if (index == -1) + case 'l': + show(); break; - filename = files[index].getFilename(); - cout << "Hiding " << filename << endl; - rename((path + "/" + filename).c_str(), (path + "/." + filename).c_str()); - files.erase(files.begin() + index); - show(); - menu(); - break; - case 's': - sortList(); - show(); - menu(); - break; - case 'r': - index = getIndex("report"); - if (index == -1) + case 'd': + index = getIndex("delete"); + if (index == -1) + break; + filename = files[index].getFilename(); + cout << "Deleting " << filename << endl; + remove((path + "/" + filename).c_str()); + files.erase(files.begin() + index); + show(); break; - filename = files[index].getFilename(); - cout << "Reporting " << filename << endl; - auto data = files[index].load(); - Report report(data); - report.show(); - menu(); - break; - + case 'h': + index = getIndex("hide"); + if (index == -1) + break; + filename = files[index].getFilename(); + cout << "Hiding " << filename << endl; + rename((path + "/" + filename).c_str(), (path + "/." + filename).c_str()); + files.erase(files.begin() + index); + show(); + menu(); + break; + case 's': + sortList(); + show(); + break; + case 'r': + index = getIndex("report"); + if (index == -1) + break; + report(index); + break; + default: + cout << "Invalid option" << endl; + } } } void Results::sortList() { cout << "Choose sorting field (date='d', score='s', duration='u', model='m'): "; + string line; char option; - cin >> option; + getline(cin, line); + if (line.size() == 0) + return; + if (line.size() > 1) { + cout << "Invalid option" << endl; + return; + } + option = line[0]; switch (option) { case 'd': sortDate(); @@ -161,7 +188,6 @@ namespace platform { default: cout << "Invalid option" << endl; } - } void Results::sortDate() { @@ -195,6 +221,7 @@ namespace platform { } show(); menu(); + cout << "Done!" << endl; } } \ No newline at end of file diff --git a/src/Platform/Results.h b/src/Platform/Results.h index bd4768b..e6b1552 100644 --- a/src/Platform/Results.h +++ b/src/Platform/Results.h @@ -11,7 +11,7 @@ namespace platform { class Result { public: Result(const string& path, const string& filename); - json load(); + json load() const; string to_string() const; string getFilename() const { return filename; }; string getDate() const { return date; }; @@ -42,6 +42,7 @@ namespace platform { vector files; void load(); // Loads the list of results void show() const; + void report(const int index) const; int getIndex(const string& intent) const; void menu(); void sortList(); diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 7692629..0618c89 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -6,10 +6,10 @@ #include "DotEnv.h" #include "Models.h" #include "modelRegister.h" +#include "Paths.h" + using namespace std; -const string PATH_RESULTS = "results"; -const string PATH_DATASETS = "datasets"; argparse::ArgumentParser manageArguments(int argc, char** argv) { @@ -18,8 +18,7 @@ argparse::ArgumentParser manageArguments(int argc, char** argv) program.add_argument("-d", "--dataset").default_value("").help("Dataset file name"); program.add_argument("-p", "--path") .help("folder where the data files are located, default") - .default_value(string{ PATH_DATASETS } - ); + .default_value(string{ platform::Paths::datasets() }); program.add_argument("-m", "--model") .help("Model to use " + platform::Models::instance()->toString()) .action([](const std::string& value) { @@ -115,7 +114,7 @@ int main(int argc, char** argv) experiment.go(filesToTest, path); experiment.setDuration(timer.getDuration()); if (saveResults) - experiment.save(PATH_RESULTS); + experiment.save(platform::Paths::results()); else experiment.report(); cout << "Done!" << endl; diff --git a/src/Platform/manage.cc b/src/Platform/manage.cc index 74e4a2c..34e66cd 100644 --- a/src/Platform/manage.cc +++ b/src/Platform/manage.cc @@ -1,10 +1,10 @@ #include #include #include "platformUtils.h" +#include "Paths.h" #include "Results.h" using namespace std; -const string PATH_RESULTS = "results"; argparse::ArgumentParser manageArguments(int argc, char** argv) { @@ -35,7 +35,7 @@ int main(int argc, char** argv) auto number = program.get("number"); auto model = program.get("model"); auto score = program.get("score"); - auto results = platform::Results(PATH_RESULTS, number, model, score); + auto results = platform::Results(platform::Paths::results(), number, model, score); results.manage(); return 0; } diff --git a/src/Platform/platformUtils.cc b/src/Platform/platformUtils.cc index 6fca9d9..74e97fd 100644 --- a/src/Platform/platformUtils.cc +++ b/src/Platform/platformUtils.cc @@ -1,4 +1,5 @@ #include "platformUtils.h" +#include "Paths.h" using namespace torch; @@ -85,7 +86,7 @@ tuple, string, map>> loadData tuple>, vector, vector, string, map>> loadFile(const string& name) { auto handler = ArffFiles(); - handler.load(PATH + static_cast(name) + ".arff"); + handler.load(platform::Paths::datasets() + static_cast(name) + ".arff"); // Get Dataset X, y vector& X = handler.getX(); mdlp::labels_t& y = handler.getY(); From 2a3fc9aa4569d39b9d296868400e038acc547ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 14 Aug 2023 17:03:06 +0200 Subject: [PATCH 07/20] Add colors and enhace input control --- src/Platform/Colors.h | 14 ++++++++++++++ src/Platform/Report.cc | 28 ++++++++++++++++++---------- src/Platform/Report.h | 1 + src/Platform/Results.cc | 30 +++++++++++++++++++++--------- 4 files changed, 54 insertions(+), 19 deletions(-) create mode 100644 src/Platform/Colors.h diff --git a/src/Platform/Colors.h b/src/Platform/Colors.h new file mode 100644 index 0000000..7ab2e08 --- /dev/null +++ b/src/Platform/Colors.h @@ -0,0 +1,14 @@ +#ifndef COLORS_H +#define COLORS_H +class Colors { +public: + static std::string MAGENTA() { return "\033[1;35m"; } + static std::string BLUE() { return "\033[1;34m"; } + static std::string CYAN() { return "\033[1;36m"; } + static std::string GREEN() { return "\033[1;32m"; } + static std::string YELLOW() { return "\033[1;33m"; } + static std::string RED() { return "\033[1;31m"; } + static std::string WHITE() { return "\033[1;37m"; } + static std::string RESET() { return "\033[0m"; } +}; +#endif // COLORS_H \ No newline at end of file diff --git a/src/Platform/Report.cc b/src/Platform/Report.cc index 7bd7d69..a40a482 100644 --- a/src/Platform/Report.cc +++ b/src/Platform/Report.cc @@ -33,7 +33,7 @@ namespace platform { } void Report::header() { - cout << string(MAXL, '*') << endl; + cout << Colors::MAGENTA() << string(MAXL, '*') << endl; cout << headerLine("Report " + data["model"].get() + " ver. " + data["version"].get() + " with " + to_string(data["folds"].get()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get() + " " + data["time"].get()); cout << headerLine(data["title"].get()); cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get() ? "True" : "False")); @@ -44,24 +44,32 @@ namespace platform { } void Report::body() { - cout << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl; - cout << "============================== ====== ===== === ======= ======= ======= =============== ================= ===============" << endl; + cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl; + cout << "============================== ====== ===== === ======= ======= ======= =============== ================== ===============" << endl; json lastResult; totalScore = 0; + bool odd = true; for (const auto& r : data["results"]) { - cout << setw(30) << left << r["dataset"].get() << " "; + auto color = odd ? Colors::CYAN() : Colors::BLUE(); + cout << color << setw(30) << left << r["dataset"].get() << " "; cout << setw(6) << right << r["samples"].get() << " "; cout << setw(5) << right << r["features"].get() << " "; cout << setw(3) << right << r["classes"].get() << " "; cout << setw(7) << setprecision(2) << fixed << r["nodes"].get() << " "; cout << setw(7) << setprecision(2) << fixed << r["leaves"].get() << " "; cout << setw(7) << setprecision(2) << fixed << r["depth"].get() << " "; - cout << setw(8) << right << setprecision(6) << fixed << r["score_test"].get() << "±" << setw(6) << setprecision(4) << fixed << r["score_test_std"].get() << " "; - cout << setw(10) << right << setprecision(6) << fixed << r["test_time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["test_time_std"].get() << " "; - cout << " " << r["hyperparameters"].get(); + cout << setw(8) << right << setprecision(6) << fixed << r["score"].get() << "±" << setw(6) << setprecision(4) << fixed << r["score_std"].get() << " "; + cout << setw(11) << right << setprecision(6) << fixed << r["time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get() << " "; + try { + cout << r["hyperparameters"].get(); + } + catch (const exception& err) { + cout << r["hyperparameters"]; + } cout << endl; lastResult = r; - totalScore += r["score_test"].get(); + totalScore += r["score"].get(); + odd = !odd; } if (data["results"].size() == 1) { cout << string(MAXL, '*') << endl; @@ -74,12 +82,12 @@ namespace platform { } void Report::footer() { - cout << string(MAXL, '*') << endl; + cout << Colors::MAGENTA() << string(MAXL, '*') << endl; auto score = data["score_name"].get(); if (score == BestResult::scoreName()) { cout << headerLine(score + " compared to " + BestResult::title() + " .: " + to_string(totalScore / BestResult::score())); } - cout << string(MAXL, '*') << endl; + cout << string(MAXL, '*') << endl << Colors::RESET(); } } \ No newline at end of file diff --git a/src/Platform/Report.h b/src/Platform/Report.h index 302ac60..5934b2f 100644 --- a/src/Platform/Report.h +++ b/src/Platform/Report.h @@ -3,6 +3,7 @@ #include #include #include +#include "Colors.h" using json = nlohmann::json; const int MAXL = 121; diff --git a/src/Platform/Results.cc b/src/Platform/Results.cc index 48c7e9a..0bf4070 100644 --- a/src/Platform/Results.cc +++ b/src/Platform/Results.cc @@ -3,6 +3,7 @@ #include "Results.h" #include "Report.h" #include "BestResult.h" +#include "Colors.h" namespace platform { Result::Result(const string& path, const string& filename) : path(path) @@ -59,25 +60,35 @@ namespace platform { } void Results::show() const { - cout << "Results found: " << files.size() << endl; + cout << Colors::GREEN() << "Results found: " << files.size() << endl; cout << "-------------------" << endl; auto i = 0; cout << " # Date Model Score Name Score Duration Title" << endl; cout << "=== ========== ============ =========== =========== ========= =============================================================" << endl; + bool odd = true; for (const auto& result : files) { - cout << setw(3) << fixed << right << i++ << " "; + auto color = odd ? Colors::BLUE() : Colors::CYAN(); + cout << color << setw(3) << fixed << right << i++ << " "; cout << result.to_string() << endl; if (i == max && max != 0) { break; } + odd = !odd; } } int Results::getIndex(const string& intent) const { - cout << "Choose result to " << intent << ": "; - int index; - cin >> index; - if (index >= 0 && index < files.size()) { + string color; + if (intent == "delete") { + color = Colors::RED(); + } else { + color = Colors::YELLOW(); + } + cout << color << "Choose result to " << intent << " (cancel=-1): "; + string line; + getline(cin, line); + int index = stoi(line); + if (index >= -1 && index < static_cast(files.size())) { return index; } cout << "Invalid index" << endl; @@ -85,7 +96,7 @@ namespace platform { } void Results::report(const int index) const { - cout << "Reporting " << files.at(index).getFilename() << endl; + cout << Colors::YELLOW() << "Reporting " << files.at(index).getFilename() << endl; auto data = files.at(index).load(); Report report(data); report.show(); @@ -97,7 +108,7 @@ namespace platform { bool finished = false; string filename, line, options = "qldhsr"; while (!finished) { - cout << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): "; + cout << Colors::RESET() << "Choose option (quit='q', list='l', delete='d', hide='h', sort='s', report='r'): "; getline(cin, line); if (line.size() == 0) continue; @@ -131,6 +142,7 @@ namespace platform { cout << "Deleting " << filename << endl; remove((path + "/" + filename).c_str()); files.erase(files.begin() + index); + cout << "File: " + filename + " deleted!" << endl; show(); break; case 'h': @@ -161,7 +173,7 @@ namespace platform { } void Results::sortList() { - cout << "Choose sorting field (date='d', score='s', duration='u', model='m'): "; + cout << Colors::YELLOW() << "Choose sorting field (date='d', score='s', duration='u', model='m'): "; string line; char option; getline(cin, line); From 24b68f9ae23e42a52c821f5c5ad12f1074bad7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 15 Aug 2023 15:04:56 +0200 Subject: [PATCH 08/20] Add weigths as parameter --- .vscode/launch.json | 5 ++--- src/BayesNet/AODE.cc | 2 +- src/BayesNet/AODE.h | 2 +- src/BayesNet/AODELd.cc | 4 ++-- src/BayesNet/AODELd.h | 4 ++-- src/BayesNet/BaseClassifier.h | 2 +- src/BayesNet/BayesMetrics.cc | 3 ++- src/BayesNet/Classifier.cc | 9 +++++---- src/BayesNet/Classifier.h | 5 ++--- src/BayesNet/Ensemble.cc | 2 +- src/BayesNet/Ensemble.h | 2 +- src/BayesNet/KDB.cc | 2 +- src/BayesNet/KDB.h | 3 ++- src/BayesNet/Network.cc | 2 +- src/BayesNet/Proposal.cc | 1 + src/BayesNet/SPODE.cc | 2 +- src/BayesNet/SPODE.h | 2 +- src/BayesNet/TAN.cc | 2 +- src/BayesNet/TAN.h | 2 +- src/Platform/Report.h | 2 +- src/Platform/main.cc | 2 +- 21 files changed, 31 insertions(+), 29 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 828604b..e0da5f0 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,13 +25,12 @@ "program": "${workspaceFolder}/build/src/Platform/main", "args": [ "-m", - "SPODE", + "TANLd", "-p", "/Users/rmontanana/Code/discretizbench/datasets", "--stratified", - "--discretize", "-d", - "letter" + "iris" ], "cwd": "/Users/rmontanana/Code/discretizbench", }, diff --git a/src/BayesNet/AODE.cc b/src/BayesNet/AODE.cc index 7e6a95f..d90c495 100644 --- a/src/BayesNet/AODE.cc +++ b/src/BayesNet/AODE.cc @@ -2,7 +2,7 @@ namespace bayesnet { AODE::AODE() : Ensemble() {} - void AODE::buildModel() + void AODE::buildModel(const torch::Tensor& weights) { models.clear(); for (int i = 0; i < features.size(); ++i) { diff --git a/src/BayesNet/AODE.h b/src/BayesNet/AODE.h index 3d58851..00965f6 100644 --- a/src/BayesNet/AODE.h +++ b/src/BayesNet/AODE.h @@ -5,7 +5,7 @@ namespace bayesnet { class AODE : public Ensemble { protected: - void buildModel() override; + void buildModel(const torch::Tensor& weights) override; public: AODE(); virtual ~AODE() {}; diff --git a/src/BayesNet/AODELd.cc b/src/BayesNet/AODELd.cc index 9f36ed2..cc842be 100644 --- a/src/BayesNet/AODELd.cc +++ b/src/BayesNet/AODELd.cc @@ -19,7 +19,7 @@ namespace bayesnet { return *this; } - void AODELd::buildModel() + void AODELd::buildModel(const torch::Tensor& weights) { models.clear(); for (int i = 0; i < features.size(); ++i) { @@ -27,7 +27,7 @@ namespace bayesnet { } n_models = models.size(); } - void AODELd::trainModel() + void AODELd::trainModel(const torch::Tensor& weights) { for (const auto& model : models) { model->fit(Xf, y, features, className, states); diff --git a/src/BayesNet/AODELd.h b/src/BayesNet/AODELd.h index 14be0c4..aa67247 100644 --- a/src/BayesNet/AODELd.h +++ b/src/BayesNet/AODELd.h @@ -8,8 +8,8 @@ namespace bayesnet { using namespace std; class AODELd : public Ensemble, public Proposal { protected: - void trainModel() override; - void buildModel() override; + void trainModel(const torch::Tensor& weights) override; + void buildModel(const torch::Tensor& weights) override; public: AODELd(); AODELd& fit(torch::Tensor& X_, torch::Tensor& y_, vector& features_, string className_, map>& states_) override; diff --git a/src/BayesNet/BaseClassifier.h b/src/BayesNet/BaseClassifier.h index ff202e1..527b5c5 100644 --- a/src/BayesNet/BaseClassifier.h +++ b/src/BayesNet/BaseClassifier.h @@ -6,7 +6,7 @@ namespace bayesnet { using namespace std; class BaseClassifier { protected: - virtual void trainModel() = 0; + virtual void trainModel(const torch::Tensor& weights) = 0; public: // X is nxm vector, y is nx1 vector virtual BaseClassifier& fit(vector>& X, vector& y, vector& features, string className, map>& states) = 0; diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 2f0de11..cb93141 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -52,7 +52,8 @@ namespace bayesnet { auto mask = samples.index({ -1, "..." }) == value; auto first_dataset = samples.index({ index_first, mask }); auto second_dataset = samples.index({ index_second, mask }); - auto mi = mutualInformation(first_dataset, second_dataset, weights); + auto weights_dataset = weights.index({ mask }); + auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset); auto pb = margin[value].item(); accumulated += pb * mi; } diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index 87bae91..1fab813 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -16,8 +16,10 @@ namespace bayesnet { auto n_classes = states[className].size(); metrics = Metrics(dataset, features, className, n_classes); model.initialize(); - buildModel(); - trainModel(); + // TODO weights can't be ones + const torch::Tensor weights = torch::ones({ m }, torch::kFloat); + buildModel(weights); + trainModel(weights); fitted = true; return *this; } @@ -35,9 +37,8 @@ namespace bayesnet { exit(1); } } - void Classifier::trainModel() + void Classifier::trainModel(const torch::Tensor& weights) { - const torch::Tensor weights = torch::ones({ m }); model.fit(dataset, weights, features, className, states); } // X is nxm where n is the number of features and m the number of samples diff --git a/src/BayesNet/Classifier.h b/src/BayesNet/Classifier.h index 6d00928..3c18295 100644 --- a/src/BayesNet/Classifier.h +++ b/src/BayesNet/Classifier.h @@ -21,10 +21,9 @@ namespace bayesnet { string className; map> states; Tensor dataset; // (n+1)xm tensor - Tensor weights; void checkFitParameters(); - virtual void buildModel() = 0; - void trainModel() override; + virtual void buildModel(const torch::Tensor& weights) = 0; + void trainModel(const torch::Tensor& weights) override; public: Classifier(Network model); virtual ~Classifier() = default; diff --git a/src/BayesNet/Ensemble.cc b/src/BayesNet/Ensemble.cc index 34c6894..926fa5b 100644 --- a/src/BayesNet/Ensemble.cc +++ b/src/BayesNet/Ensemble.cc @@ -5,7 +5,7 @@ namespace bayesnet { Ensemble::Ensemble() : Classifier(Network()) {} - void Ensemble::trainModel() + void Ensemble::trainModel(const torch::Tensor& weights) { n_models = models.size(); for (auto i = 0; i < n_models; ++i) { diff --git a/src/BayesNet/Ensemble.h b/src/BayesNet/Ensemble.h index f0d750b..95c1da6 100644 --- a/src/BayesNet/Ensemble.h +++ b/src/BayesNet/Ensemble.h @@ -14,7 +14,7 @@ namespace bayesnet { protected: unsigned n_models; vector> models; - void trainModel() override; + void trainModel(const torch::Tensor& weights) override; vector voting(Tensor& y_pred); public: Ensemble(); diff --git a/src/BayesNet/KDB.cc b/src/BayesNet/KDB.cc index 874e08a..471f3fd 100644 --- a/src/BayesNet/KDB.cc +++ b/src/BayesNet/KDB.cc @@ -4,7 +4,7 @@ namespace bayesnet { using namespace torch; KDB::KDB(int k, float theta) : Classifier(Network()), k(k), theta(theta) {} - void KDB::buildModel() + void KDB::buildModel(const torch::Tensor& weights) { /* 1. For each feature Xi, compute mutual information, I(X;C), diff --git a/src/BayesNet/KDB.h b/src/BayesNet/KDB.h index e7af8c5..b997cdd 100644 --- a/src/BayesNet/KDB.h +++ b/src/BayesNet/KDB.h @@ -1,5 +1,6 @@ #ifndef KDB_H #define KDB_H +#include #include "Classifier.h" #include "bayesnetUtils.h" namespace bayesnet { @@ -11,7 +12,7 @@ namespace bayesnet { float theta; void add_m_edges(int idx, vector& S, Tensor& weights); protected: - void buildModel() override; + void buildModel(const torch::Tensor& weights) override; public: explicit KDB(int k, float theta = 0.03); virtual ~KDB() {}; diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index fbb62cc..b65f570 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -107,7 +107,7 @@ namespace bayesnet { void Network::checkFitData(int n_samples, int n_features, int n_samples_y, const vector& featureNames, const string& className, const map>& states, const torch::Tensor& weights) { if (weights.size(0) != n_samples) { - throw invalid_argument("Weights must have the same number of elements as samples in Network::fit"); + throw invalid_argument("Weights (" + to_string(weights.size(0)) + ") must have the same number of elements as samples (" + to_string(n_samples) + ") in Network::fit"); } if (n_samples != n_samples_y) { throw invalid_argument("X and y must have the same number of samples in Network::fit (" + to_string(n_samples) + " != " + to_string(n_samples_y) + ")"); diff --git a/src/BayesNet/Proposal.cc b/src/BayesNet/Proposal.cc index d95e701..87767b5 100644 --- a/src/BayesNet/Proposal.cc +++ b/src/BayesNet/Proposal.cc @@ -65,6 +65,7 @@ namespace bayesnet { //Update new states of the feature/node states[pFeatures[index]] = xStates; } + // TODO weights can't be ones const torch::Tensor weights = torch::ones({ pDataset.size(1) }, torch::kFloat); model.fit(pDataset, weights, pFeatures, pClassName, states); } diff --git a/src/BayesNet/SPODE.cc b/src/BayesNet/SPODE.cc index a90e5ef..83c9231 100644 --- a/src/BayesNet/SPODE.cc +++ b/src/BayesNet/SPODE.cc @@ -4,7 +4,7 @@ namespace bayesnet { SPODE::SPODE(int root) : Classifier(Network()), root(root) {} - void SPODE::buildModel() + void SPODE::buildModel(const torch::Tensor& weights) { // 0. Add all nodes to the model addNodes(); diff --git a/src/BayesNet/SPODE.h b/src/BayesNet/SPODE.h index f9b6af0..0a78830 100644 --- a/src/BayesNet/SPODE.h +++ b/src/BayesNet/SPODE.h @@ -7,7 +7,7 @@ namespace bayesnet { private: int root; protected: - void buildModel() override; + void buildModel(const torch::Tensor& weights) override; public: explicit SPODE(int root); virtual ~SPODE() {}; diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index 843a5e6..f0728be 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -5,7 +5,7 @@ namespace bayesnet { TAN::TAN() : Classifier(Network()) {} - void TAN::buildModel() + void TAN::buildModel(const torch::Tensor& weights) { // 0. Add all nodes to the model addNodes(); diff --git a/src/BayesNet/TAN.h b/src/BayesNet/TAN.h index 4c1c5f5..91b5109 100644 --- a/src/BayesNet/TAN.h +++ b/src/BayesNet/TAN.h @@ -7,7 +7,7 @@ namespace bayesnet { class TAN : public Classifier { private: protected: - void buildModel() override; + void buildModel(const torch::Tensor& weights) override; public: TAN(); virtual ~TAN() {}; diff --git a/src/Platform/Report.h b/src/Platform/Report.h index 5934b2f..2708d4e 100644 --- a/src/Platform/Report.h +++ b/src/Platform/Report.h @@ -6,7 +6,7 @@ #include "Colors.h" using json = nlohmann::json; -const int MAXL = 121; +const int MAXL = 122; namespace platform { using namespace std; class Report { diff --git a/src/Platform/main.cc b/src/Platform/main.cc index 0618c89..6f9ce1c 100644 --- a/src/Platform/main.cc +++ b/src/Platform/main.cc @@ -103,7 +103,7 @@ int main(int argc, char** argv) */ auto env = platform::DotEnv(); auto experiment = platform::Experiment(); - experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("1.0.0"); + experiment.setTitle(title).setLanguage("cpp").setLanguageVersion("14.0.3"); experiment.setDiscretized(discretize_dataset).setModel(model_name).setPlatform(env.get("platform")); experiment.setStratified(stratified).setNFolds(n_folds).setScoreName("accuracy"); for (auto seed : seeds) { From fa612c531e1a8b9957ac4ef4bca475e0699a8e83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 15 Aug 2023 15:59:56 +0200 Subject: [PATCH 09/20] Complete Adding weights to Models --- src/BayesNet/BaseClassifier.h | 1 + src/BayesNet/Classifier.cc | 19 ++++++++++++------- src/BayesNet/Classifier.h | 3 ++- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/BayesNet/BaseClassifier.h b/src/BayesNet/BaseClassifier.h index 527b5c5..5f1cbaa 100644 --- a/src/BayesNet/BaseClassifier.h +++ b/src/BayesNet/BaseClassifier.h @@ -13,6 +13,7 @@ namespace bayesnet { // X is nxm tensor, y is nx1 tensor virtual BaseClassifier& fit(torch::Tensor& X, torch::Tensor& y, vector& features, string className, map>& states) = 0; virtual BaseClassifier& fit(torch::Tensor& dataset, vector& features, string className, map>& states) = 0; + virtual BaseClassifier& fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) = 0; virtual ~BaseClassifier() = default; torch::Tensor virtual predict(torch::Tensor& X) = 0; vector virtual predict(vector>& X) = 0; diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index 1fab813..154f1df 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -5,7 +5,7 @@ namespace bayesnet { using namespace torch; Classifier::Classifier(Network model) : model(model), m(0), n(0), metrics(Metrics()), fitted(false) {} - Classifier& Classifier::build(vector& features, string className, map>& states) + Classifier& Classifier::build(vector& features, string className, map>& states, const torch::Tensor& weights) { this->features = features; this->className = className; @@ -16,14 +16,11 @@ namespace bayesnet { auto n_classes = states[className].size(); metrics = Metrics(dataset, features, className, n_classes); model.initialize(); - // TODO weights can't be ones - const torch::Tensor weights = torch::ones({ m }, torch::kFloat); buildModel(weights); trainModel(weights); fitted = true; return *this; } - void Classifier::buildDataset(Tensor& ytmp) { try { @@ -46,7 +43,8 @@ namespace bayesnet { { dataset = X; buildDataset(y); - return build(features, className, states); + const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + return build(features, className, states, weights); } // X is nxm where n is the number of features and m the number of samples Classifier& Classifier::fit(vector>& X, vector& y, vector& features, string className, map>& states) @@ -57,12 +55,19 @@ namespace bayesnet { } auto ytmp = torch::tensor(y, kInt32); buildDataset(ytmp); - return build(features, className, states); + const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states) { this->dataset = dataset; - return build(features, className, states); + const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + return build(features, className, states, weights); + } + Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) + { + this->dataset = dataset; + return build(features, className, states, weights); } void Classifier::checkFitParameters() { diff --git a/src/BayesNet/Classifier.h b/src/BayesNet/Classifier.h index 3c18295..0c2940b 100644 --- a/src/BayesNet/Classifier.h +++ b/src/BayesNet/Classifier.h @@ -11,7 +11,7 @@ namespace bayesnet { class Classifier : public BaseClassifier { private: void buildDataset(torch::Tensor& y); - Classifier& build(vector& features, string className, map>& states); + Classifier& build(vector& features, string className, map>& states, const torch::Tensor& weights); protected: bool fitted; int m, n; // m: number of samples, n: number of features @@ -30,6 +30,7 @@ namespace bayesnet { Classifier& fit(vector>& X, vector& y, vector& features, string className, map>& states) override; Classifier& fit(torch::Tensor& X, torch::Tensor& y, vector& features, string className, map>& states) override; Classifier& fit(torch::Tensor& dataset, vector& features, string className, map>& states) override; + Classifier& fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) override; void addNodes(); int getNumberOfNodes() const override; int getNumberOfEdges() const override; From 4d4780c1d5797084f32c70aac3d735c658f10e9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Tue, 15 Aug 2023 16:16:04 +0200 Subject: [PATCH 10/20] Add BoostAODE model based on AODE --- src/BayesNet/BoostAODE.cc | 16 ++++++++++++++++ src/BayesNet/BoostAODE.h | 15 +++++++++++++++ src/BayesNet/CMakeLists.txt | 2 +- src/Platform/Models.h | 1 + src/Platform/modelRegister.h | 2 ++ 5 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 src/BayesNet/BoostAODE.cc create mode 100644 src/BayesNet/BoostAODE.h diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc new file mode 100644 index 0000000..baafa16 --- /dev/null +++ b/src/BayesNet/BoostAODE.cc @@ -0,0 +1,16 @@ +#include "BoostAODE.h" + +namespace bayesnet { + BoostAODE::BoostAODE() : Ensemble() {} + void BoostAODE::buildModel(const torch::Tensor& weights) + { + models.clear(); + for (int i = 0; i < features.size(); ++i) { + models.push_back(std::make_unique(i)); + } + } + vector BoostAODE::graph(const string& title) const + { + return Ensemble::graph(title); + } +} \ No newline at end of file diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h new file mode 100644 index 0000000..66a871f --- /dev/null +++ b/src/BayesNet/BoostAODE.h @@ -0,0 +1,15 @@ +#ifndef BOOSTAODE_H +#define BOOSTAODE_H +#include "Ensemble.h" +#include "SPODE.h" +namespace bayesnet { + class BoostAODE : public Ensemble { + protected: + void buildModel(const torch::Tensor& weights) override; + public: + BoostAODE(); + virtual ~BoostAODE() {}; + vector graph(const string& title = "BoostAODE") const override; + }; +} +#endif \ No newline at end of file diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index a2b9126..a94d8e9 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -3,5 +3,5 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/Files) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc - KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) + KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Models.h b/src/Platform/Models.h index 0e3184b..6c5d437 100644 --- a/src/Platform/Models.h +++ b/src/Platform/Models.h @@ -10,6 +10,7 @@ #include "KDBLd.h" #include "SPODELd.h" #include "AODELd.h" +#include "BoostAODE.h" namespace platform { class Models { private: diff --git a/src/Platform/modelRegister.h b/src/Platform/modelRegister.h index 6ae9af3..04b48cf 100644 --- a/src/Platform/modelRegister.h +++ b/src/Platform/modelRegister.h @@ -16,4 +16,6 @@ static platform::Registrar registrarA("AODE", [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODE();}); static platform::Registrar registrarALD("AODELd", [](void) -> bayesnet::BaseClassifier* { return new bayesnet::AODELd();}); +static platform::Registrar registrarBA("BoostAODE", + [](void) -> bayesnet::BaseClassifier* { return new bayesnet::BoostAODE();}); #endif \ No newline at end of file From 80b20f35b4d1e4f674eb435dc51284ea61737830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 16 Aug 2023 12:32:51 +0200 Subject: [PATCH 11/20] Fix weights mistakes in computation --- .vscode/launch.json | 3 +- CMakeLists.txt | 1 + lib/Files/CMakeLists.txt | 3 +- lib/featureselect/CMakeLists.txt | 1 + lib/featureselect/FeatureSelect.cpp | 119 ++++++++++++++++++++++++++++ lib/featureselect/FeatureSelect.h | 38 +++++++++ sample/sample.cc | 110 ++++++++++++------------- src/BayesNet/BayesMetrics.cc | 12 +-- src/BayesNet/BoostAODE.cc | 22 +++++ src/BayesNet/CMakeLists.txt | 6 +- src/BayesNet/Classifier.cc | 6 +- src/BayesNet/Network.cc | 5 +- src/BayesNet/Network.h | 3 +- src/BayesNet/Node.cc | 4 +- src/BayesNet/Node.h | 2 +- src/BayesNet/TAN.cc | 2 + 16 files changed, 262 insertions(+), 75 deletions(-) create mode 100644 lib/featureselect/CMakeLists.txt create mode 100644 lib/featureselect/FeatureSelect.cpp create mode 100644 lib/featureselect/FeatureSelect.h diff --git a/.vscode/launch.json b/.vscode/launch.json index e0da5f0..c1275e6 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -25,9 +25,10 @@ "program": "${workspaceFolder}/build/src/Platform/main", "args": [ "-m", - "TANLd", + "BoostAODE", "-p", "/Users/rmontanana/Code/discretizbench/datasets", + "--discretize", "--stratified", "-d", "iris" diff --git a/CMakeLists.txt b/CMakeLists.txt index c53a3a2..186a175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ add_git_submodule("lib/json") # -------------- add_subdirectory(config) add_subdirectory(lib/Files) +add_subdirectory(lib/FeatureSelect) add_subdirectory(src/BayesNet) add_subdirectory(src/Platform) add_subdirectory(sample) diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt index 5e3412f..fce5b8f 100644 --- a/lib/Files/CMakeLists.txt +++ b/lib/Files/CMakeLists.txt @@ -1,2 +1 @@ -add_library(ArffFiles ArffFiles.cc) -#target_link_libraries(BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file +add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/lib/featureselect/CMakeLists.txt b/lib/featureselect/CMakeLists.txt new file mode 100644 index 0000000..06da1b7 --- /dev/null +++ b/lib/featureselect/CMakeLists.txt @@ -0,0 +1 @@ +add_library(FeatureSelect FeatureSelect.cpp) \ No newline at end of file diff --git a/lib/featureselect/FeatureSelect.cpp b/lib/featureselect/FeatureSelect.cpp new file mode 100644 index 0000000..6659063 --- /dev/null +++ b/lib/featureselect/FeatureSelect.cpp @@ -0,0 +1,119 @@ +#include "FeatureSelect.h" +namespace features { + SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat) + : samples(samples), labels(labels), weights(weights), k(k), nat(nat) + { + if (samples.size() == 0 || samples[0].size() == 0) + throw invalid_argument("features must be a non-empty matrix"); + if (samples.size() != labels.size()) + throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal"); + if (samples.size() != weights.size()) + throw invalid_argument("number of samples and weights must be equal"); + if (k < 1 || k > static_cast(samples[0].size())) + throw invalid_argument("k must be between 1 and number of features"); + numFeatures = 0; + numClasses = 0; + numSamples = 0; + fitted = false; + } + SelectKBestWeighted& SelectKBestWeighted::fit() + { + auto labelsCopy = labels; + numFeatures = samples[0].size(); + numSamples = samples.size(); + // compute number of classes + sort(labelsCopy.begin(), labelsCopy.end()); + auto last = unique(labelsCopy.begin(), labelsCopy.end()); + labelsCopy.erase(last, labelsCopy.end()); + numClasses = labelsCopy.size(); + // compute scores + scores.reserve(numFeatures); + for (int i = 0; i < numFeatures; ++i) { + scores.push_back(MutualInformation(i)); + features.push_back(i); + } + // sort & reduce scores and features + sort(features.begin(), features.end(), [&](int i, int j) + { return scores[i] > scores[j]; }); + sort(scores.begin(), scores.end(), greater()); + features.resize(k); + scores.resize(k); + fitted = true; + return *this; + } + precision_t SelectKBestWeighted::entropyLabel() + { + return entropy(labels); + } + precision_t SelectKBestWeighted::entropy(const sample_t& data) + { + precision_t ventropy = 0, totalWeight = 0; + score_t counts(numClasses + 1, 0); + for (auto i = 0; i < static_cast(data.size()); ++i) { + counts[data[i]] += weights[i]; + totalWeight += weights[i]; + } + for (auto count : counts) { + precision_t p = count / totalWeight; + if (p > 0) { + if (nat) { + ventropy -= p * log(p); + } else { + ventropy -= p * log2(p); + } + } + } + return ventropy; + } + // H(Y|X) = sum_{x in X} p(x) H(Y|X=x) + precision_t SelectKBestWeighted::conditionalEntropy(const int feature) + { + unordered_map featureCounts; + unordered_map> jointCounts; + featureCounts.clear(); + jointCounts.clear(); + precision_t totalWeight = 0; + for (auto i = 0; i < numSamples; i++) { + featureCounts[samples[i][feature]] += weights[i]; + jointCounts[samples[i][feature]][labels[i]] += weights[i]; + totalWeight += weights[i]; + } + if (totalWeight == 0) + throw invalid_argument("Total weight should not be zero"); + precision_t entropy = 0; + for (auto& [feat, count] : featureCounts) { + auto p_f = count / totalWeight; + precision_t entropy_f = 0; + for (auto& [label, jointCount] : jointCounts[feat]) { + auto p_l_f = jointCount / count; + if (p_l_f > 0) { + if (nat) { + entropy_f -= p_l_f * log(p_l_f); + } else { + entropy_f -= p_l_f * log2(p_l_f); + } + } + } + entropy += p_f * entropy_f; + } + return entropy; + } + // I(X;Y) = H(Y) - H(Y|X) + precision_t SelectKBestWeighted::MutualInformation(const int i) + { + return entropyLabel() - conditionalEntropy(i); + } + score_t SelectKBestWeighted::getScores() const + { + if (!fitted) + throw logic_error("score not fitted"); + return scores; + } + //Return the indices of the selected features + labels_t SelectKBestWeighted::getFeatures() const + { + if (!fitted) + throw logic_error("score not fitted"); + return features; + } +} diff --git a/lib/featureselect/FeatureSelect.h b/lib/featureselect/FeatureSelect.h new file mode 100644 index 0000000..18ddd99 --- /dev/null +++ b/lib/featureselect/FeatureSelect.h @@ -0,0 +1,38 @@ +#ifndef SELECT_K_BEST_WEIGHTED_H +#define SELECT_K_BEST_WEIGHTED_H +#include +#include +#include +using namespace std; +namespace features { + typedef float precision_t; + typedef int value_t; + typedef vector sample_t; + typedef vector samples_t; + typedef vector labels_t; + typedef vector score_t, weights_t; + + class SelectKBestWeighted { + private: + const samples_t samples; + const labels_t labels; + const weights_t weights; + const int k; + bool nat; // use natural log or log2 + int numFeatures, numClasses, numSamples; + bool fitted; + score_t scores; // scores of the features + labels_t features; // indices of the selected features + precision_t entropyLabel(); + precision_t entropy(const sample_t&); + precision_t conditionalEntropy(const int); + precision_t MutualInformation(const int); + public: + SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool); + SelectKBestWeighted& fit(); + score_t getScores() const; + labels_t getFeatures() const; //Return the indices of the selected features + static inline string version() { return "0.1.0"; }; + }; +} +#endif \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc index 7da318d..ecf76be 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -178,59 +178,59 @@ int main(int argc, char** argv) cout << "end." << endl; auto score = clf->score(Xd, y); cout << "Score: " << score << endl; - // auto graph = clf->graph(); - // auto dot_file = model_name + "_" + file_name; - // ofstream file(dot_file + ".dot"); - // file << graph; - // file.close(); - // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; - // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; - // string stratified_string = stratified ? " Stratified" : ""; - // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; - // cout << "==========================================" << endl; - // torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - // torch::Tensor yt = torch::tensor(y, torch::kInt32); - // for (int i = 0; i < features.size(); ++i) { - // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - // } - // float total_score = 0, total_score_train = 0, score_train, score_test; - // Fold* fold; - // if (stratified) - // fold = new StratifiedKFold(nFolds, y, seed); - // else - // fold = new KFold(nFolds, y.size(), seed); - // for (auto i = 0; i < nFolds; ++i) { - // auto [train, test] = fold->getFold(i); - // cout << "Fold: " << i + 1 << endl; - // if (tensors) { - // auto ttrain = torch::tensor(train, torch::kInt64); - // auto ttest = torch::tensor(test, torch::kInt64); - // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - // torch::Tensor ytraint = yt.index({ ttrain }); - // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - // torch::Tensor ytestt = yt.index({ ttest }); - // clf->fit(Xtraint, ytraint, features, className, states); - // auto temp = clf->predict(Xtraint); - // score_train = clf->score(Xtraint, ytraint); - // score_test = clf->score(Xtestt, ytestt); - // } else { - // auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - // auto [Xtest, ytest] = extract_indices(test, Xd, y); - // clf->fit(Xtrain, ytrain, features, className, states); - // score_train = clf->score(Xtrain, ytrain); - // score_test = clf->score(Xtest, ytest); - // } - // if (dump_cpt) { - // cout << "--- CPT Tables ---" << endl; - // clf->dump_cpt(); - // } - // total_score_train += score_train; - // total_score += score_test; - // cout << "Score Train: " << score_train << endl; - // cout << "Score Test : " << score_test << endl; - // cout << "-------------------------------------------------------------------------------" << endl; - // } - // cout << "**********************************************************************************" << endl; - // cout << "Average Score Train: " << total_score_train / nFolds << endl; - // cout << "Average Score Test : " << total_score / nFolds << endl;return 0; + auto graph = clf->graph(); + auto dot_file = model_name + "_" + file_name; + ofstream file(dot_file + ".dot"); + file << graph; + file.close(); + cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; + cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; + string stratified_string = stratified ? " Stratified" : ""; + cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; + cout << "==========================================" << endl; + torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + torch::Tensor yt = torch::tensor(y, torch::kInt32); + for (int i = 0; i < features.size(); ++i) { + Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + } + float total_score = 0, total_score_train = 0, score_train, score_test; + Fold* fold; + if (stratified) + fold = new StratifiedKFold(nFolds, y, seed); + else + fold = new KFold(nFolds, y.size(), seed); + for (auto i = 0; i < nFolds; ++i) { + auto [train, test] = fold->getFold(i); + cout << "Fold: " << i + 1 << endl; + if (tensors) { + auto ttrain = torch::tensor(train, torch::kInt64); + auto ttest = torch::tensor(test, torch::kInt64); + torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + torch::Tensor ytraint = yt.index({ ttrain }); + torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + torch::Tensor ytestt = yt.index({ ttest }); + clf->fit(Xtraint, ytraint, features, className, states); + auto temp = clf->predict(Xtraint); + score_train = clf->score(Xtraint, ytraint); + score_test = clf->score(Xtestt, ytestt); + } else { + auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + auto [Xtest, ytest] = extract_indices(test, Xd, y); + clf->fit(Xtrain, ytrain, features, className, states); + score_train = clf->score(Xtrain, ytrain); + score_test = clf->score(Xtest, ytest); + } + if (dump_cpt) { + cout << "--- CPT Tables ---" << endl; + clf->dump_cpt(); + } + total_score_train += score_train; + total_score += score_test; + cout << "Score Train: " << score_train << endl; + cout << "Score Test : " << score_test << endl; + cout << "-------------------------------------------------------------------------------" << endl; + } + cout << "**********************************************************************************" << endl; + cout << "Average Score Train: " << total_score_train / nFolds << endl; + cout << "Average Score Test : " << total_score / nFolds << endl;return 0; } \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index cb93141..7d249ee 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -38,12 +38,14 @@ namespace bayesnet { auto source = vector(features); source.push_back(className); auto combinations = doCombinations(source); + double totalWeight = weights.sum().item(); // Compute class prior - auto margin = torch::zeros({ classNumStates }); + auto margin = torch::zeros({ classNumStates }, torch::kFloat); for (int value = 0; value < classNumStates; ++value) { auto mask = samples.index({ -1, "..." }) == value; - margin[value] = mask.sum().item() / samples.size(1); + margin[value] = mask.sum().item() / samples.size(1); } + cout << "Margin: " << margin; for (auto [first, second] : combinations) { int index_first = find(features.begin(), features.end(), first) - features.begin(); int index_second = find(features.begin(), features.end(), second) - features.begin(); @@ -54,7 +56,7 @@ namespace bayesnet { auto second_dataset = samples.index({ index_second, mask }); auto weights_dataset = weights.index({ mask }); auto mi = mutualInformation(first_dataset, second_dataset, weights_dataset); - auto pb = margin[value].item(); + auto pb = margin[value].item(); accumulated += pb * mi; } result.push_back(accumulated); @@ -81,7 +83,7 @@ namespace bayesnet { double Metrics::entropy(const torch::Tensor& feature, const torch::Tensor& weights) { torch::Tensor counts = feature.bincount(weights); - int totalWeight = counts.sum().item(); + double totalWeight = counts.sum().item(); torch::Tensor probs = counts.to(torch::kFloat) / totalWeight; torch::Tensor logProbs = torch::log(probs); torch::Tensor entropy = -probs * logProbs; @@ -95,7 +97,7 @@ namespace bayesnet { unordered_map> jointCounts; double totalWeight = 0; for (auto i = 0; i < numSamples; i++) { - jointCounts[secondFeature[i].item()][firstFeature[i].item()] += 1; + jointCounts[secondFeature[i].item()][firstFeature[i].item()] += weights[i].item(); totalWeight += weights[i].item(); } if (totalWeight == 0) diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index baafa16..d68ac2a 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,10 +1,32 @@ #include "BoostAODE.h" +#include "FeatureSelect.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { models.clear(); + int n_samples = dataset.size(1); + int n_features = dataset.size(0); + features::samples_t vsamples; + for (auto i = 0; i < n_samples; ++i) { + auto row = dataset.index({ "...", i }); + // convert row to std::vector + auto vrow = vector(row.data_ptr(), row.data_ptr() + row.numel()); + vsamples.push_back(vrow); + } + auto vweights = features::weights_t(n_samples, 1.0 / n_samples); + auto row = dataset.index({ -1, "..." }); + auto yv = features::labels_t(row.data_ptr(), row.data_ptr() + row.numel()); + auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true); + auto features = featureSelection.fit().getFeatures(); + // features = ( + // CSelectKBestWeighted( + // self.X_, self.y_, weights, k = self.n_features_in_ + // ) + // .fit() + // .get_features() + auto scores = features::score_t(n_features, 0.0); for (int i = 0; i < features.size(); ++i) { models.push_back(std::make_unique(i)); } diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index a94d8e9..2f2f631 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -1,7 +1,9 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) +include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc - KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp ArffFiles "${TORCH_LIBRARIES}") \ No newline at end of file + KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc + Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) +target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index 154f1df..4d4ab08 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -43,7 +43,7 @@ namespace bayesnet { { dataset = X; buildDataset(y); - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } // X is nxm where n is the number of features and m the number of samples @@ -55,13 +55,13 @@ namespace bayesnet { } auto ytmp = torch::tensor(y, kInt32); buildDataset(ytmp); - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states) { this->dataset = dataset; - const torch::Tensor weights = torch::ones({ dataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) diff --git a/src/BayesNet/Network.cc b/src/BayesNet/Network.cc index b65f570..5753eb8 100644 --- a/src/BayesNet/Network.cc +++ b/src/BayesNet/Network.cc @@ -5,7 +5,6 @@ namespace bayesnet { Network::Network() : features(vector()), className(""), classNumStates(0), fitted(false) {} Network::Network(float maxT) : features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} - Network::Network(float maxT, int smoothing) : laplaceSmoothing(smoothing), features(vector()), className(""), classNumStates(0), maxThreads(maxT), fitted(false) {} Network::Network(Network& other) : laplaceSmoothing(other.laplaceSmoothing), features(other.features), className(other.className), classNumStates(other.getClassNumStates()), maxThreads(other. getmaxThreads()), fitted(other.fitted) { @@ -174,6 +173,7 @@ namespace bayesnet { void Network::completeFit(const map>& states, const torch::Tensor& weights) { setStates(states); + laplaceSmoothing = 1.0 / samples.size(1); // To use in CPT computation int maxThreadsRunning = static_cast(std::thread::hardware_concurrency() * maxThreads); if (maxThreadsRunning < 1) { maxThreadsRunning = 1; @@ -347,7 +347,7 @@ namespace bayesnet { } // Normalize result double sum = accumulate(result.begin(), result.end(), 0.0); - transform(result.begin(), result.end(), result.begin(), [sum](double& value) { return value / sum; }); + transform(result.begin(), result.end(), result.begin(), [sum](const double& value) { return value / sum; }); return result; } vector Network::show() const @@ -435,6 +435,7 @@ namespace bayesnet { { for (auto& node : nodes) { cout << "* " << node.first << ": (" << node.second->getNumStates() << ") : " << node.second->getCPT().sizes() << endl; + cout << node.second->getCPT() << endl; } } } diff --git a/src/BayesNet/Network.h b/src/BayesNet/Network.h index 5ea94ec..a26e790 100644 --- a/src/BayesNet/Network.h +++ b/src/BayesNet/Network.h @@ -13,7 +13,7 @@ namespace bayesnet { int classNumStates; vector features; // Including classname string className; - int laplaceSmoothing = 1; + double laplaceSmoothing; torch::Tensor samples; // nxm tensor used to fit the model bool isCyclic(const std::string&, std::unordered_set&, std::unordered_set&); vector predict_sample(const vector&); @@ -25,7 +25,6 @@ namespace bayesnet { void setStates(const map>&); public: Network(); - explicit Network(float, int); explicit Network(float); explicit Network(Network&); torch::Tensor& getSamples(); diff --git a/src/BayesNet/Node.cc b/src/BayesNet/Node.cc index 10f26b8..04d2ed2 100644 --- a/src/BayesNet/Node.cc +++ b/src/BayesNet/Node.cc @@ -84,7 +84,7 @@ namespace bayesnet { } return result; } - void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights) + void Node::computeCPT(const torch::Tensor& dataset, const vector& features, const double laplaceSmoothing, const torch::Tensor& weights) { dimensions.clear(); // Get dimensions of the CPT @@ -111,7 +111,7 @@ namespace bayesnet { coordinates.push_back(dataset.index({ parent_index, n_sample })); } // Increment the count of the corresponding coordinate - cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item()); + cpTable.index_put_({ coordinates }, cpTable.index({ coordinates }) + weights.index({ n_sample }).item()); } // Normalize the counts cpTable = cpTable / cpTable.sum(0); diff --git a/src/BayesNet/Node.h b/src/BayesNet/Node.h index 83c4b1a..6758c5c 100644 --- a/src/BayesNet/Node.h +++ b/src/BayesNet/Node.h @@ -26,7 +26,7 @@ namespace bayesnet { vector& getParents(); vector& getChildren(); torch::Tensor& getCPT(); - void computeCPT(const torch::Tensor& dataset, const vector& features, const int laplaceSmoothing, const torch::Tensor& weights); + void computeCPT(const torch::Tensor& dataset, const vector& features, const double laplaceSmoothing, const torch::Tensor& weights); int getNumStates() const; void setNumStates(int); unsigned minFill(); diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index f0728be..3bdfa8e 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -22,6 +22,8 @@ namespace bayesnet { auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class auto weights_matrix = metrics.conditionalEdge(weights); + cout << "*** Weights matrix ***\n"; + cout << weights_matrix << "\n"; // 3. Compute the maximum spanning tree auto mst = metrics.maximumSpanningTree(features, weights_matrix, root); // 4. Add edges from the maximum spanning tree to the model From 918a7b4180d8114c9fbe4c5fd19d5af871b00bf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 16 Aug 2023 12:36:38 +0200 Subject: [PATCH 12/20] Remove unneeded output --- src/BayesNet/BayesMetrics.cc | 1 - src/BayesNet/TAN.cc | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 7d249ee..a0b46f1 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -45,7 +45,6 @@ namespace bayesnet { auto mask = samples.index({ -1, "..." }) == value; margin[value] = mask.sum().item() / samples.size(1); } - cout << "Margin: " << margin; for (auto [first, second] : combinations) { int index_first = find(features.begin(), features.end(), first) - features.begin(); int index_second = find(features.begin(), features.end(), second) - features.begin(); diff --git a/src/BayesNet/TAN.cc b/src/BayesNet/TAN.cc index 3bdfa8e..f0728be 100644 --- a/src/BayesNet/TAN.cc +++ b/src/BayesNet/TAN.cc @@ -22,8 +22,6 @@ namespace bayesnet { auto root = mi[mi.size() - 1].first; // 2. Compute mutual information between each feature and the class auto weights_matrix = metrics.conditionalEdge(weights); - cout << "*** Weights matrix ***\n"; - cout << weights_matrix << "\n"; // 3. Compute the maximum spanning tree auto mst = metrics.maximumSpanningTree(features, weights_matrix, root); // 4. Add edges from the maximum spanning tree to the model From a3e665eed6a2d6afd6d9957cc4895aa6a0cd281d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 16 Aug 2023 12:46:09 +0200 Subject: [PATCH 13/20] make weights double --- src/BayesNet/Classifier.cc | 6 +++--- src/BayesNet/Proposal.cc | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/BayesNet/Classifier.cc b/src/BayesNet/Classifier.cc index 4d4ab08..ff25657 100644 --- a/src/BayesNet/Classifier.cc +++ b/src/BayesNet/Classifier.cc @@ -43,7 +43,7 @@ namespace bayesnet { { dataset = X; buildDataset(y); - const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble); return build(features, className, states, weights); } // X is nxm where n is the number of features and m the number of samples @@ -55,13 +55,13 @@ namespace bayesnet { } auto ytmp = torch::tensor(y, kInt32); buildDataset(ytmp); - const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states) { this->dataset = dataset; - const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kFloat); + const torch::Tensor weights = torch::full({ dataset.size(1) }, 1.0 / dataset.size(1), torch::kDouble); return build(features, className, states, weights); } Classifier& Classifier::fit(torch::Tensor& dataset, vector& features, string className, map>& states, const torch::Tensor& weights) diff --git a/src/BayesNet/Proposal.cc b/src/BayesNet/Proposal.cc index 87767b5..c410289 100644 --- a/src/BayesNet/Proposal.cc +++ b/src/BayesNet/Proposal.cc @@ -65,8 +65,7 @@ namespace bayesnet { //Update new states of the feature/node states[pFeatures[index]] = xStates; } - // TODO weights can't be ones - const torch::Tensor weights = torch::ones({ pDataset.size(1) }, torch::kFloat); + const torch::Tensor weights = torch::full({ pDataset.size(1) }, 1.0 / pDataset.size(1), torch::kDouble); model.fit(pDataset, weights, pFeatures, pClassName, states); } return states; From 704dc937be0658f8b87e86f46ac4c8f6bfd53d48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Wed, 16 Aug 2023 19:05:18 +0200 Subject: [PATCH 14/20] Remove FeatureSel, add SelectKBest to BayesMetrics --- CMakeLists.txt | 1 - lib/featureselect/CMakeLists.txt | 1 - lib/featureselect/FeatureSelect.cpp | 119 ---------------------------- lib/featureselect/FeatureSelect.h | 38 --------- src/BayesNet/BayesMetrics.cc | 25 ++++++ src/BayesNet/BayesMetrics.h | 4 + src/BayesNet/BoostAODE.cc | 43 +++++----- src/BayesNet/BoostAODE.h | 1 + src/BayesNet/CMakeLists.txt | 3 +- src/BayesNet/SPODELd.cc | 1 - 10 files changed, 52 insertions(+), 184 deletions(-) delete mode 100644 lib/featureselect/CMakeLists.txt delete mode 100644 lib/featureselect/FeatureSelect.cpp delete mode 100644 lib/featureselect/FeatureSelect.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 186a175..c53a3a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,7 +60,6 @@ add_git_submodule("lib/json") # -------------- add_subdirectory(config) add_subdirectory(lib/Files) -add_subdirectory(lib/FeatureSelect) add_subdirectory(src/BayesNet) add_subdirectory(src/Platform) add_subdirectory(sample) diff --git a/lib/featureselect/CMakeLists.txt b/lib/featureselect/CMakeLists.txt deleted file mode 100644 index 06da1b7..0000000 --- a/lib/featureselect/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(FeatureSelect FeatureSelect.cpp) \ No newline at end of file diff --git a/lib/featureselect/FeatureSelect.cpp b/lib/featureselect/FeatureSelect.cpp deleted file mode 100644 index 6659063..0000000 --- a/lib/featureselect/FeatureSelect.cpp +++ /dev/null @@ -1,119 +0,0 @@ -#include "FeatureSelect.h" -namespace features { - SelectKBestWeighted::SelectKBestWeighted(samples_t& samples, labels_t& labels, weights_t& weights, int k, bool nat) - : samples(samples), labels(labels), weights(weights), k(k), nat(nat) - { - if (samples.size() == 0 || samples[0].size() == 0) - throw invalid_argument("features must be a non-empty matrix"); - if (samples.size() != labels.size()) - throw invalid_argument("number of samples (" + to_string(samples.size()) + ") and labels (" + to_string(labels.size()) + ") must be equal"); - if (samples.size() != weights.size()) - throw invalid_argument("number of samples and weights must be equal"); - if (k < 1 || k > static_cast(samples[0].size())) - throw invalid_argument("k must be between 1 and number of features"); - numFeatures = 0; - numClasses = 0; - numSamples = 0; - fitted = false; - } - SelectKBestWeighted& SelectKBestWeighted::fit() - { - auto labelsCopy = labels; - numFeatures = samples[0].size(); - numSamples = samples.size(); - // compute number of classes - sort(labelsCopy.begin(), labelsCopy.end()); - auto last = unique(labelsCopy.begin(), labelsCopy.end()); - labelsCopy.erase(last, labelsCopy.end()); - numClasses = labelsCopy.size(); - // compute scores - scores.reserve(numFeatures); - for (int i = 0; i < numFeatures; ++i) { - scores.push_back(MutualInformation(i)); - features.push_back(i); - } - // sort & reduce scores and features - sort(features.begin(), features.end(), [&](int i, int j) - { return scores[i] > scores[j]; }); - sort(scores.begin(), scores.end(), greater()); - features.resize(k); - scores.resize(k); - fitted = true; - return *this; - } - precision_t SelectKBestWeighted::entropyLabel() - { - return entropy(labels); - } - precision_t SelectKBestWeighted::entropy(const sample_t& data) - { - precision_t ventropy = 0, totalWeight = 0; - score_t counts(numClasses + 1, 0); - for (auto i = 0; i < static_cast(data.size()); ++i) { - counts[data[i]] += weights[i]; - totalWeight += weights[i]; - } - for (auto count : counts) { - precision_t p = count / totalWeight; - if (p > 0) { - if (nat) { - ventropy -= p * log(p); - } else { - ventropy -= p * log2(p); - } - } - } - return ventropy; - } - // H(Y|X) = sum_{x in X} p(x) H(Y|X=x) - precision_t SelectKBestWeighted::conditionalEntropy(const int feature) - { - unordered_map featureCounts; - unordered_map> jointCounts; - featureCounts.clear(); - jointCounts.clear(); - precision_t totalWeight = 0; - for (auto i = 0; i < numSamples; i++) { - featureCounts[samples[i][feature]] += weights[i]; - jointCounts[samples[i][feature]][labels[i]] += weights[i]; - totalWeight += weights[i]; - } - if (totalWeight == 0) - throw invalid_argument("Total weight should not be zero"); - precision_t entropy = 0; - for (auto& [feat, count] : featureCounts) { - auto p_f = count / totalWeight; - precision_t entropy_f = 0; - for (auto& [label, jointCount] : jointCounts[feat]) { - auto p_l_f = jointCount / count; - if (p_l_f > 0) { - if (nat) { - entropy_f -= p_l_f * log(p_l_f); - } else { - entropy_f -= p_l_f * log2(p_l_f); - } - } - } - entropy += p_f * entropy_f; - } - return entropy; - } - // I(X;Y) = H(Y) - H(Y|X) - precision_t SelectKBestWeighted::MutualInformation(const int i) - { - return entropyLabel() - conditionalEntropy(i); - } - score_t SelectKBestWeighted::getScores() const - { - if (!fitted) - throw logic_error("score not fitted"); - return scores; - } - //Return the indices of the selected features - labels_t SelectKBestWeighted::getFeatures() const - { - if (!fitted) - throw logic_error("score not fitted"); - return features; - } -} diff --git a/lib/featureselect/FeatureSelect.h b/lib/featureselect/FeatureSelect.h deleted file mode 100644 index 18ddd99..0000000 --- a/lib/featureselect/FeatureSelect.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef SELECT_K_BEST_WEIGHTED_H -#define SELECT_K_BEST_WEIGHTED_H -#include -#include -#include -using namespace std; -namespace features { - typedef float precision_t; - typedef int value_t; - typedef vector sample_t; - typedef vector samples_t; - typedef vector labels_t; - typedef vector score_t, weights_t; - - class SelectKBestWeighted { - private: - const samples_t samples; - const labels_t labels; - const weights_t weights; - const int k; - bool nat; // use natural log or log2 - int numFeatures, numClasses, numSamples; - bool fitted; - score_t scores; // scores of the features - labels_t features; // indices of the selected features - precision_t entropyLabel(); - precision_t entropy(const sample_t&); - precision_t conditionalEntropy(const int); - precision_t MutualInformation(const int); - public: - SelectKBestWeighted(samples_t&, labels_t&, weights_t&, int, bool); - SelectKBestWeighted& fit(); - score_t getScores() const; - labels_t getFeatures() const; //Return the indices of the selected features - static inline string version() { return "0.1.0"; }; - }; -} -#endif \ No newline at end of file diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index a0b46f1..88f0306 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -21,6 +21,31 @@ namespace bayesnet { } samples.index_put_({ -1, "..." }, torch::tensor(labels, torch::kInt32)); } + vector Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k) + { + auto n = samples.size(1); + if (k == 0) { + k = n; + } + // compute scores + scoresKBest.reserve(n); + auto label = samples.index({ -1, "..." }); + for (int i = 0; i < n; ++i) { + scoresKBest.push_back(mutualInformation(label, samples.index({ i, "..." }), weights)); + featuresKBest.push_back(i); + } + // sort & reduce scores and features + sort(featuresKBest.begin(), featuresKBest.end(), [&](int i, int j) + { return scoresKBest[i] > scoresKBest[j]; }); + sort(scoresKBest.begin(), scoresKBest.end(), std::greater()); + featuresKBest.resize(k); + scoresKBest.resize(k); + return featuresKBest; + } + vector Metrics::getScoresKBest() const + { + return scoresKBest; + } vector> Metrics::doCombinations(const vector& source) { vector> result; diff --git a/src/BayesNet/BayesMetrics.h b/src/BayesNet/BayesMetrics.h index 5bd25b6..70d33e9 100644 --- a/src/BayesNet/BayesMetrics.h +++ b/src/BayesNet/BayesMetrics.h @@ -12,6 +12,8 @@ namespace bayesnet { vector features; string className; int classNumStates = 0; + vector scoresKBest; + vector featuresKBest; // sorted indices of the features double entropy(const Tensor& feature, const Tensor& weights); double conditionalEntropy(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector> doCombinations(const vector&); @@ -19,6 +21,8 @@ namespace bayesnet { Metrics() = default; Metrics(const torch::Tensor& samples, const vector& features, const string& className, const int classNumStates); Metrics(const vector>& vsamples, const vector& labels, const vector& features, const string& className, const int classNumStates); + vector SelectKBestWeighted(const torch::Tensor& weights, unsigned k = 0); + vector getScoresKBest() const; double mutualInformation(const Tensor& firstFeature, const Tensor& secondFeature, const Tensor& weights); vector conditionalEdgeWeights(vector& weights); // To use in Python Tensor conditionalEdge(const torch::Tensor& weights); diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index d68ac2a..e9b5e62 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -1,36 +1,35 @@ #include "BoostAODE.h" -#include "FeatureSelect.h" +#include "BayesMetrics.h" namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { models.clear(); - int n_samples = dataset.size(1); - int n_features = dataset.size(0); - features::samples_t vsamples; - for (auto i = 0; i < n_samples; ++i) { - auto row = dataset.index({ "...", i }); - // convert row to std::vector - auto vrow = vector(row.data_ptr(), row.data_ptr() + row.numel()); - vsamples.push_back(vrow); - } - auto vweights = features::weights_t(n_samples, 1.0 / n_samples); - auto row = dataset.index({ -1, "..." }); - auto yv = features::labels_t(row.data_ptr(), row.data_ptr() + row.numel()); - auto featureSelection = features::SelectKBestWeighted(vsamples, yv, vweights, n_features, true); - auto features = featureSelection.fit().getFeatures(); - // features = ( - // CSelectKBestWeighted( - // self.X_, self.y_, weights, k = self.n_features_in_ - // ) - // .fit() - // .get_features() - auto scores = features::score_t(n_features, 0.0); for (int i = 0; i < features.size(); ++i) { models.push_back(std::make_unique(i)); } } + void BoostAODE::trainModel(const torch::Tensor& weights) + { + // End building vectors + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kDouble); + auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); + auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted + for (int i = 0; i < features.size(); ++i) { + models[i].fit(dataset, features, className, states, weights_); + auto ypred = models[i].predict(X_); + // em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights) + // am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) + // # Step 3.2: Update weights for next classifier + // weights = [ + // wm * np.exp(am * (ym != yp)) + // for wm, ym, yp in zip(weights, self.y_, y_pred) + // ] + // # Step 4: Add the new model + // self.estimators_.append(estimator) + } + } vector BoostAODE::graph(const string& title) const { return Ensemble::graph(title); diff --git a/src/BayesNet/BoostAODE.h b/src/BayesNet/BoostAODE.h index 66a871f..b14c7c6 100644 --- a/src/BayesNet/BoostAODE.h +++ b/src/BayesNet/BoostAODE.h @@ -6,6 +6,7 @@ namespace bayesnet { class BoostAODE : public Ensemble { protected: void buildModel(const torch::Tensor& weights) override; + void trainModel(const torch::Tensor& weights) override; public: BoostAODE(); virtual ~BoostAODE() {}; diff --git a/src/BayesNet/CMakeLists.txt b/src/BayesNet/CMakeLists.txt index 2f2f631..435511c 100644 --- a/src/BayesNet/CMakeLists.txt +++ b/src/BayesNet/CMakeLists.txt @@ -1,9 +1,8 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/mdlp) include_directories(${BayesNet_SOURCE_DIR}/lib/Files) -include_directories(${BayesNet_SOURCE_DIR}/lib/featureselect) include_directories(${BayesNet_SOURCE_DIR}/src/BayesNet) include_directories(${BayesNet_SOURCE_DIR}/src/Platform) add_library(BayesNet bayesnetUtils.cc Network.cc Node.cc BayesMetrics.cc Classifier.cc KDB.cc TAN.cc SPODE.cc Ensemble.cc AODE.cc TANLd.cc KDBLd.cc SPODELd.cc AODELd.cc BoostAODE.cc Mst.cc Proposal.cc ${BayesNet_SOURCE_DIR}/src/Platform/Models.cc) -target_link_libraries(BayesNet mdlp FeatureSelect "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(BayesNet mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/BayesNet/SPODELd.cc b/src/BayesNet/SPODELd.cc index 8a38160..2711c86 100644 --- a/src/BayesNet/SPODELd.cc +++ b/src/BayesNet/SPODELd.cc @@ -21,7 +21,6 @@ namespace bayesnet { SPODELd& SPODELd::fit(torch::Tensor& dataset, vector& features_, string className_, map>& states_) { Xf = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }).clone(); - cout << "Xf " << Xf.sizes() << " dtype: " << Xf.dtype() << endl; y = dataset.index({ -1, "..." }).clone(); // This first part should go in a Classifier method called fit_local_discretization o fit_float... features = features_; From a6bb22dfb5e4f6fffc43bae9cd3d5959382835ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Fri, 18 Aug 2023 11:50:34 +0200 Subject: [PATCH 15/20] Complete first BoostAODE --- sample/sample.cc | 199 ++++++++++++++++++---------------- src/BayesNet/AODE.cc | 2 + src/BayesNet/BayesMetrics.cc | 2 +- src/BayesNet/BoostAODE.cc | 87 +++++++++++---- src/BayesNet/Ensemble.cc | 4 +- src/BayesNet/Ensemble.h | 1 + src/BayesNet/KDB.cc | 2 +- src/BayesNet/bayesnetUtils.cc | 2 +- src/BayesNet/bayesnetUtils.h | 2 +- 9 files changed, 184 insertions(+), 117 deletions(-) diff --git a/sample/sample.cc b/sample/sample.cc index ecf76be..1045c2f 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -141,96 +141,111 @@ int main(int argc, char** argv) /* * Begin Processing */ - auto handler = ArffFiles(); - handler.load(complete_file_name, class_last); - // Get Dataset X, y - vector& X = handler.getX(); - mdlp::labels_t& y = handler.getY(); - // Get className & Features - auto className = handler.getClassName(); - vector features; - auto attributes = handler.getAttributes(); - transform(attributes.begin(), attributes.end(), back_inserter(features), - [](const pair& item) { return item.first; }); - // Discretize Dataset - auto [Xd, maxes] = discretize(X, y, features); - maxes[className] = *max_element(y.begin(), y.end()) + 1; - map> states; - for (auto feature : features) { - states[feature] = vector(maxes[feature]); - } - states[className] = vector(maxes[className]); - auto clf = platform::Models::instance()->create(model_name); - clf->fit(Xd, y, features, className, states); - if (dump_cpt) { - cout << "--- CPT Tables ---" << endl; - clf->dump_cpt(); - } - auto lines = clf->show(); - for (auto line : lines) { - cout << line << endl; - } - cout << "--- Topological Order ---" << endl; - auto order = clf->topological_order(); - for (auto name : order) { - cout << name << ", "; - } - cout << "end." << endl; - auto score = clf->score(Xd, y); - cout << "Score: " << score << endl; - auto graph = clf->graph(); - auto dot_file = model_name + "_" + file_name; - ofstream file(dot_file + ".dot"); - file << graph; - file.close(); - cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; - cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; - string stratified_string = stratified ? " Stratified" : ""; - cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; - cout << "==========================================" << endl; - torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); - torch::Tensor yt = torch::tensor(y, torch::kInt32); - for (int i = 0; i < features.size(); ++i) { - Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); - } - float total_score = 0, total_score_train = 0, score_train, score_test; - Fold* fold; - if (stratified) - fold = new StratifiedKFold(nFolds, y, seed); - else - fold = new KFold(nFolds, y.size(), seed); - for (auto i = 0; i < nFolds; ++i) { - auto [train, test] = fold->getFold(i); - cout << "Fold: " << i + 1 << endl; - if (tensors) { - auto ttrain = torch::tensor(train, torch::kInt64); - auto ttest = torch::tensor(test, torch::kInt64); - torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); - torch::Tensor ytraint = yt.index({ ttrain }); - torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); - torch::Tensor ytestt = yt.index({ ttest }); - clf->fit(Xtraint, ytraint, features, className, states); - auto temp = clf->predict(Xtraint); - score_train = clf->score(Xtraint, ytraint); - score_test = clf->score(Xtestt, ytestt); - } else { - auto [Xtrain, ytrain] = extract_indices(train, Xd, y); - auto [Xtest, ytest] = extract_indices(test, Xd, y); - clf->fit(Xtrain, ytrain, features, className, states); - score_train = clf->score(Xtrain, ytrain); - score_test = clf->score(Xtest, ytest); - } - if (dump_cpt) { - cout << "--- CPT Tables ---" << endl; - clf->dump_cpt(); - } - total_score_train += score_train; - total_score += score_test; - cout << "Score Train: " << score_train << endl; - cout << "Score Test : " << score_test << endl; - cout << "-------------------------------------------------------------------------------" << endl; - } - cout << "**********************************************************************************" << endl; - cout << "Average Score Train: " << total_score_train / nFolds << endl; - cout << "Average Score Test : " << total_score / nFolds << endl;return 0; + auto ypred = torch::tensor({ 1,2,3,2,2,3,4,5,2,1 }); + auto y = torch::tensor({ 0,0,0,0,2,3,4,0,0,0 }); + auto weights = torch::ones({ 10 }, kDouble); + auto mask = ypred == y; + cout << "ypred:" << ypred << endl; + cout << "y:" << y << endl; + cout << "weights:" << weights << endl; + cout << "mask:" << mask << endl; + double value_to_add = 0.5; + weights += mask.to(torch::kDouble) * value_to_add; + cout << "New weights:" << weights << endl; + auto masked_weights = weights * mask.to(weights.dtype()); + double sum_of_weights = masked_weights.sum().item(); + cout << "Sum of weights: " << sum_of_weights << endl; + //weights.index_put_({ mask }, weights + 10); + // auto handler = ArffFiles(); + // handler.load(complete_file_name, class_last); + // // Get Dataset X, y + // vector& X = handler.getX(); + // mdlp::labels_t& y = handler.getY(); + // // Get className & Features + // auto className = handler.getClassName(); + // vector features; + // auto attributes = handler.getAttributes(); + // transform(attributes.begin(), attributes.end(), back_inserter(features), + // [](const pair& item) { return item.first; }); + // // Discretize Dataset + // auto [Xd, maxes] = discretize(X, y, features); + // maxes[className] = *max_element(y.begin(), y.end()) + 1; + // map> states; + // for (auto feature : features) { + // states[feature] = vector(maxes[feature]); + // } + // states[className] = vector(maxes[className]); + // auto clf = platform::Models::instance()->create(model_name); + // clf->fit(Xd, y, features, className, states); + // if (dump_cpt) { + // cout << "--- CPT Tables ---" << endl; + // clf->dump_cpt(); + // } + // auto lines = clf->show(); + // for (auto line : lines) { + // cout << line << endl; + // } + // cout << "--- Topological Order ---" << endl; + // auto order = clf->topological_order(); + // for (auto name : order) { + // cout << name << ", "; + // } + // cout << "end." << endl; + // auto score = clf->score(Xd, y); + // cout << "Score: " << score << endl; + // auto graph = clf->graph(); + // auto dot_file = model_name + "_" + file_name; + // ofstream file(dot_file + ".dot"); + // file << graph; + // file.close(); + // cout << "Graph saved in " << model_name << "_" << file_name << ".dot" << endl; + // cout << "dot -Tpng -o " + dot_file + ".png " + dot_file + ".dot " << endl; + // string stratified_string = stratified ? " Stratified" : ""; + // cout << nFolds << " Folds" << stratified_string << " Cross validation" << endl; + // cout << "==========================================" << endl; + // torch::Tensor Xt = torch::zeros({ static_cast(Xd.size()), static_cast(Xd[0].size()) }, torch::kInt32); + // torch::Tensor yt = torch::tensor(y, torch::kInt32); + // for (int i = 0; i < features.size(); ++i) { + // Xt.index_put_({ i, "..." }, torch::tensor(Xd[i], torch::kInt32)); + // } + // float total_score = 0, total_score_train = 0, score_train, score_test; + // Fold* fold; + // if (stratified) + // fold = new StratifiedKFold(nFolds, y, seed); + // else + // fold = new KFold(nFolds, y.size(), seed); + // for (auto i = 0; i < nFolds; ++i) { + // auto [train, test] = fold->getFold(i); + // cout << "Fold: " << i + 1 << endl; + // if (tensors) { + // auto ttrain = torch::tensor(train, torch::kInt64); + // auto ttest = torch::tensor(test, torch::kInt64); + // torch::Tensor Xtraint = torch::index_select(Xt, 1, ttrain); + // torch::Tensor ytraint = yt.index({ ttrain }); + // torch::Tensor Xtestt = torch::index_select(Xt, 1, ttest); + // torch::Tensor ytestt = yt.index({ ttest }); + // clf->fit(Xtraint, ytraint, features, className, states); + // auto temp = clf->predict(Xtraint); + // score_train = clf->score(Xtraint, ytraint); + // score_test = clf->score(Xtestt, ytestt); + // } else { + // auto [Xtrain, ytrain] = extract_indices(train, Xd, y); + // auto [Xtest, ytest] = extract_indices(test, Xd, y); + // clf->fit(Xtrain, ytrain, features, className, states); + // score_train = clf->score(Xtrain, ytrain); + // score_test = clf->score(Xtest, ytest); + // } + // if (dump_cpt) { + // cout << "--- CPT Tables ---" << endl; + // clf->dump_cpt(); + // } + // total_score_train += score_train; + // total_score += score_test; + // cout << "Score Train: " << score_train << endl; + // cout << "Score Test : " << score_test << endl; + // cout << "-------------------------------------------------------------------------------" << endl; + // } + // cout << "**********************************************************************************" << endl; + // cout << "Average Score Train: " << total_score_train / nFolds << endl; + // cout << "Average Score Test : " << total_score / nFolds << endl;return 0; } \ No newline at end of file diff --git a/src/BayesNet/AODE.cc b/src/BayesNet/AODE.cc index d90c495..6db843e 100644 --- a/src/BayesNet/AODE.cc +++ b/src/BayesNet/AODE.cc @@ -8,6 +8,8 @@ namespace bayesnet { for (int i = 0; i < features.size(); ++i) { models.push_back(std::make_unique(i)); } + n_models = models.size(); + significanceModels = vector(n_models, 1.0); } vector AODE::graph(const string& title) const { diff --git a/src/BayesNet/BayesMetrics.cc b/src/BayesNet/BayesMetrics.cc index 88f0306..2c08836 100644 --- a/src/BayesNet/BayesMetrics.cc +++ b/src/BayesNet/BayesMetrics.cc @@ -23,7 +23,7 @@ namespace bayesnet { } vector Metrics::SelectKBestWeighted(const torch::Tensor& weights, unsigned k) { - auto n = samples.size(1); + auto n = samples.size(0) - 1; if (k == 0) { k = n; } diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index e9b5e62..b6a535d 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -5,30 +5,79 @@ namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { - models.clear(); - for (int i = 0; i < features.size(); ++i) { - models.push_back(std::make_unique(i)); - } + // models.clear(); + // for (int i = 0; i < features.size(); ++i) { + // models.push_back(std::make_unique(i)); + // } + // n_models = models.size(); } void BoostAODE::trainModel(const torch::Tensor& weights) { - // End building vectors - Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kDouble); + models.clear(); + n_models = 0; + int max_models = .1 * n > 10 ? .1 * n : n; + Tensor weights_ = torch::full({ m }, 1.0 / m, torch::kFloat64); auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); - auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted - for (int i = 0; i < features.size(); ++i) { - models[i].fit(dataset, features, className, states, weights_); - auto ypred = models[i].predict(X_); - // em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights) - // am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1) - // # Step 3.2: Update weights for next classifier - // weights = [ - // wm * np.exp(am * (ym != yp)) - // for wm, ym, yp in zip(weights, self.y_, y_pred) - // ] - // # Step 4: Add the new model - // self.estimators_.append(estimator) + auto y_ = dataset.index({ -1, "..." }); + bool exitCondition = false; + bool repeatSparent = true; + vector featuresUsed; + // Step 0: Set the finish condition + // if not repeatSparent a finish condition is run out of features + // n_models == max_models + int numClasses = states[className].size(); + while (!exitCondition) { + // Step 1: Build ranking with mutual information + auto featureSelection = metrics.SelectKBestWeighted(weights_, n); // Get all the features sorted + auto feature = featureSelection[0]; + unique_ptr model; + if (!repeatSparent) { + if (n_models == 0) { + models.resize(n); // Resize for n==nfeatures SPODEs + significanceModels.resize(n); + } + bool found = false; + for (int i = 0; i < featureSelection.size(); ++i) { + if (find(featuresUsed.begin(), featuresUsed.end(), i) != featuresUsed.end()) { + continue; + } + found = true; + feature = i; + featuresUsed.push_back(feature); + n_models++; + break; + } + if (!found) { + exitCondition = true; + continue; + } + } + model = std::make_unique(feature); + model->fit(dataset, features, className, states, weights_); + auto ypred = model->predict(X_); + // Step 3.1: Compute the classifier amout of say + auto mask_wrong = ypred != y_; + auto masked_weights = weights_ * mask_wrong.to(weights_.dtype()); + double wrongWeights = masked_weights.sum().item(); + double significance = wrongWeights == 0 ? 1 : 0.5 * log((1 - wrongWeights) / wrongWeights); + // Step 3.2: Update weights for next classifier + // Step 3.2.1: Update weights of wrong samples + weights_ += mask_wrong.to(weights_.dtype()) * exp(significance) * weights_; + // Step 3.3: Normalise the weights + double totalWeights = torch::sum(weights_).item(); + weights_ = weights_ / totalWeights; + // Step 3.4: Store classifier and its accuracy to weigh its future vote + if (!repeatSparent) { + models[feature] = std::move(model); + significanceModels[feature] = significance; + } else { + models.push_back(std::move(model)); + significanceModels.push_back(significance); + n_models++; + } + exitCondition = n_models == max_models; } + weights.copy_(weights_); } vector BoostAODE::graph(const string& title) const { diff --git a/src/BayesNet/Ensemble.cc b/src/BayesNet/Ensemble.cc index 926fa5b..33a11a2 100644 --- a/src/BayesNet/Ensemble.cc +++ b/src/BayesNet/Ensemble.cc @@ -18,9 +18,9 @@ namespace bayesnet { auto y_pred_ = y_pred.accessor(); vector y_pred_final; for (int i = 0; i < y_pred.size(0); ++i) { - vector votes(y_pred.size(1), 0); + vector votes(y_pred.size(1), 0); for (int j = 0; j < y_pred.size(1); ++j) { - votes[y_pred_[i][j]] += 1; + votes[y_pred_[i][j]] += significanceModels[j]; } // argsort in descending order auto indices = argsort(votes); diff --git a/src/BayesNet/Ensemble.h b/src/BayesNet/Ensemble.h index 95c1da6..58a1d63 100644 --- a/src/BayesNet/Ensemble.h +++ b/src/BayesNet/Ensemble.h @@ -14,6 +14,7 @@ namespace bayesnet { protected: unsigned n_models; vector> models; + vector significanceModels; void trainModel(const torch::Tensor& weights) override; vector voting(Tensor& y_pred); public: diff --git a/src/BayesNet/KDB.cc b/src/BayesNet/KDB.cc index 471f3fd..cfbbca1 100644 --- a/src/BayesNet/KDB.cc +++ b/src/BayesNet/KDB.cc @@ -29,7 +29,7 @@ namespace bayesnet { // where C is the class. addNodes(); const Tensor& y = dataset.index({ -1, "..." }); - vector mi; + vector mi; for (auto i = 0; i < features.size(); i++) { Tensor firstFeature = dataset.index({ i, "..." }); mi.push_back(metrics.mutualInformation(firstFeature, y, weights)); diff --git a/src/BayesNet/bayesnetUtils.cc b/src/BayesNet/bayesnetUtils.cc index 8b69006..480034b 100644 --- a/src/BayesNet/bayesnetUtils.cc +++ b/src/BayesNet/bayesnetUtils.cc @@ -4,7 +4,7 @@ namespace bayesnet { using namespace std; using namespace torch; // Return the indices in descending order - vector argsort(vector& nums) + vector argsort(vector& nums) { int n = nums.size(); vector indices(n); diff --git a/src/BayesNet/bayesnetUtils.h b/src/BayesNet/bayesnetUtils.h index adfa8d7..b5811f7 100644 --- a/src/BayesNet/bayesnetUtils.h +++ b/src/BayesNet/bayesnetUtils.h @@ -5,7 +5,7 @@ namespace bayesnet { using namespace std; using namespace torch; - vector argsort(vector& nums); + vector argsort(vector& nums); vector> tensorToVector(Tensor& tensor); } #endif //BAYESNET_UTILS_H \ No newline at end of file From 2d7999d5f2ad8cce2e15d7d621ad4b61eea68e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Fri, 18 Aug 2023 13:43:13 +0200 Subject: [PATCH 16/20] Add manage to release targets --- Makefile | 2 +- src/BayesNet/BoostAODE.cc | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 9806dc3..a0ce791 100644 --- a/Makefile +++ b/Makefile @@ -35,7 +35,7 @@ release: ## Build a Release version of the project @if [ -d ./build ]; then rm -rf ./build; fi @mkdir build; cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \ - cmake --build build -t main -t BayesNetSample -j 32; + cmake --build build -t main -t BayesNetSample -t manage -j 32; @echo ">>> Done"; test: ## Run tests diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index b6a535d..a4379b4 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -5,11 +5,7 @@ namespace bayesnet { BoostAODE::BoostAODE() : Ensemble() {} void BoostAODE::buildModel(const torch::Tensor& weights) { - // models.clear(); - // for (int i = 0; i < features.size(); ++i) { - // models.push_back(std::make_unique(i)); - // } - // n_models = models.size(); + // Models shall be built in trainModel } void BoostAODE::trainModel(const torch::Tensor& weights) { From bafcb26bb6779991fd6b09b2af353b26e5fd552a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Fri, 18 Aug 2023 13:43:53 +0200 Subject: [PATCH 17/20] Add manage to build target --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a0ce791..3e097b7 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png build: ## Build the main and BayesNetSample - cmake --build build -t main -t BayesNetSample -j 32 + cmake --build build -t main -t BayesNetSample -t manage -j 32 clean: ## Clean the debug info @echo ">>> Cleaning Debug BayesNet ..."; From 9972738debc097d82283388094ee8e91d7bd6460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 19 Aug 2023 19:05:16 +0200 Subject: [PATCH 18/20] Add list datasets and add locale format --- .vscode/launch.json | 8 ++++ .vscode/tasks.json | 23 +++++++++++ Makefile | 4 +- src/Platform/CMakeLists.txt | 4 +- src/Platform/Datasets.cc | 79 ++++++++++++++++++++++++++----------- src/Platform/Datasets.h | 31 ++++++++------- src/Platform/Paths.h | 1 + src/Platform/Report.cc | 29 ++++++++++---- src/Platform/list.cc | 57 ++++++++++++++++++++++++++ 9 files changed, 190 insertions(+), 46 deletions(-) create mode 100644 src/Platform/list.cc diff --git a/.vscode/launch.json b/.vscode/launch.json index c1275e6..cade330 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -46,6 +46,14 @@ ], "cwd": "/Users/rmontanana/Code/discretizbench", }, + { + "type": "lldb", + "request": "launch", + "name": "list", + "program": "${workspaceFolder}/build/src/Platform/list", + "args": [], + "cwd": "/Users/rmontanana/Code/discretizbench", + }, { "name": "Build & debug active file", "type": "cppdbg", diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 5d92a8f..45cc63d 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -32,6 +32,29 @@ ], "group": "build", "detail": "Task generated by Debugger." + }, + { + "type": "cppbuild", + "label": "C/C++: g++ build active file", + "command": "/usr/bin/g++", + "args": [ + "-fdiagnostics-color=always", + "-g", + "${file}", + "-o", + "${fileDirname}/${fileBasenameNoExtension}" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "detail": "Task generated by Debugger." } ] } \ No newline at end of file diff --git a/Makefile b/Makefile index 3e097b7..b883892 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ dependency: ## Create a dependency graph diagram of the project (build/dependenc cd build && cmake .. --graphviz=dependency.dot && dot -Tpng dependency.dot -o dependency.png build: ## Build the main and BayesNetSample - cmake --build build -t main -t BayesNetSample -t manage -j 32 + cmake --build build -t main -t BayesNetSample -t manage -t list -j 32 clean: ## Clean the debug info @echo ">>> Cleaning Debug BayesNet ..."; @@ -35,7 +35,7 @@ release: ## Build a Release version of the project @if [ -d ./build ]; then rm -rf ./build; fi @mkdir build; cmake -S . -B build -D CMAKE_BUILD_TYPE=Release; \ - cmake --build build -t main -t BayesNetSample -t manage -j 32; + cmake --build build -t main -t BayesNetSample -t manage -t list -j 32; @echo ">>> Done"; test: ## Run tests diff --git a/src/Platform/CMakeLists.txt b/src/Platform/CMakeLists.txt index 0eb26ce..78c6615 100644 --- a/src/Platform/CMakeLists.txt +++ b/src/Platform/CMakeLists.txt @@ -6,5 +6,7 @@ include_directories(${BayesNet_SOURCE_DIR}/lib/argparse/include) include_directories(${BayesNet_SOURCE_DIR}/lib/json/include) add_executable(main main.cc Folding.cc platformUtils.cc Experiment.cc Datasets.cc Models.cc Report.cc) add_executable(manage manage.cc Results.cc Report.cc) +add_executable(list list.cc platformUtils Datasets.cc) target_link_libraries(main BayesNet ArffFiles mdlp "${TORCH_LIBRARIES}") -target_link_libraries(manage "${TORCH_LIBRARIES}") \ No newline at end of file +target_link_libraries(manage "${TORCH_LIBRARIES}") +target_link_libraries(list ArffFiles mdlp "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/src/Platform/Datasets.cc b/src/Platform/Datasets.cc index 6756148..b187be8 100644 --- a/src/Platform/Datasets.cc +++ b/src/Platform/Datasets.cc @@ -24,75 +24,110 @@ namespace platform { transform(datasets.begin(), datasets.end(), back_inserter(result), [](const auto& d) { return d.first; }); return result; } - vector Datasets::getFeatures(string name) + vector Datasets::getFeatures(const string& name) const { - if (datasets[name]->isLoaded()) { - return datasets[name]->getFeatures(); + if (datasets.at(name)->isLoaded()) { + return datasets.at(name)->getFeatures(); } else { throw invalid_argument("Dataset not loaded."); } } - map> Datasets::getStates(string name) + map> Datasets::getStates(const string& name) const { - if (datasets[name]->isLoaded()) { - return datasets[name]->getStates(); + if (datasets.at(name)->isLoaded()) { + return datasets.at(name)->getStates(); } else { throw invalid_argument("Dataset not loaded."); } } - string Datasets::getClassName(string name) + void Datasets::loadDataset(const string& name) const { - if (datasets[name]->isLoaded()) { - return datasets[name]->getClassName(); + if (datasets.at(name)->isLoaded()) { + return; + } else { + datasets.at(name)->load(); + } + } + string Datasets::getClassName(const string& name) const + { + if (datasets.at(name)->isLoaded()) { + return datasets.at(name)->getClassName(); } else { throw invalid_argument("Dataset not loaded."); } } - int Datasets::getNSamples(string name) + int Datasets::getNSamples(const string& name) const { - if (datasets[name]->isLoaded()) { - return datasets[name]->getNSamples(); + if (datasets.at(name)->isLoaded()) { + return datasets.at(name)->getNSamples(); } else { throw invalid_argument("Dataset not loaded."); } } - pair>&, vector&> Datasets::getVectors(string name) + int Datasets::getNClasses(const string& name) + { + if (datasets.at(name)->isLoaded()) { + auto className = datasets.at(name)->getClassName(); + if (discretize) { + auto states = getStates(name); + return states.at(className).size(); + } + auto [Xv, yv] = getVectors(name); + return *max_element(yv.begin(), yv.end()) + 1; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + vector Datasets::getClassesCounts(const string& name) const + { + if (datasets.at(name)->isLoaded()) { + auto [Xv, yv] = datasets.at(name)->getVectors(); + vector counts(*max_element(yv.begin(), yv.end()) + 1); + for (auto y : yv) { + counts[y]++; + } + return counts; + } else { + throw invalid_argument("Dataset not loaded."); + } + } + pair>&, vector&> Datasets::getVectors(const string& name) { if (!datasets[name]->isLoaded()) { datasets[name]->load(); } return datasets[name]->getVectors(); } - pair>&, vector&> Datasets::getVectorsDiscretized(string name) + pair>&, vector&> Datasets::getVectorsDiscretized(const string& name) { if (!datasets[name]->isLoaded()) { datasets[name]->load(); } return datasets[name]->getVectorsDiscretized(); } - pair Datasets::getTensors(string name) + pair Datasets::getTensors(const string& name) { if (!datasets[name]->isLoaded()) { datasets[name]->load(); } return datasets[name]->getTensors(); } - bool Datasets::isDataset(const string& name) + bool Datasets::isDataset(const string& name) const { return datasets.find(name) != datasets.end(); } Dataset::Dataset(const Dataset& dataset) : path(dataset.path), name(dataset.name), className(dataset.className), n_samples(dataset.n_samples), n_features(dataset.n_features), features(dataset.features), states(dataset.states), loaded(dataset.loaded), discretize(dataset.discretize), X(dataset.X), y(dataset.y), Xv(dataset.Xv), Xd(dataset.Xd), yv(dataset.yv), fileType(dataset.fileType) { } - string Dataset::getName() + string Dataset::getName() const { return name; } - string Dataset::getClassName() + string Dataset::getClassName() const { return className; } - vector Dataset::getFeatures() + vector Dataset::getFeatures() const { if (loaded) { return features; @@ -100,7 +135,7 @@ namespace platform { throw invalid_argument("Dataset not loaded."); } } - int Dataset::getNFeatures() + int Dataset::getNFeatures() const { if (loaded) { return n_features; @@ -108,7 +143,7 @@ namespace platform { throw invalid_argument("Dataset not loaded."); } } - int Dataset::getNSamples() + int Dataset::getNSamples() const { if (loaded) { return n_samples; @@ -116,7 +151,7 @@ namespace platform { throw invalid_argument("Dataset not loaded."); } } - map> Dataset::getStates() + map> Dataset::getStates() const { if (loaded) { return states; diff --git a/src/Platform/Datasets.h b/src/Platform/Datasets.h index 4ccd1f0..a99c86e 100644 --- a/src/Platform/Datasets.h +++ b/src/Platform/Datasets.h @@ -29,15 +29,15 @@ namespace platform { public: Dataset(const string& path, const string& name, const string& className, bool discretize, fileType_t fileType) : path(path), name(name), className(className), discretize(discretize), loaded(false), fileType(fileType) {}; explicit Dataset(const Dataset&); - string getName(); - string getClassName(); - vector getFeatures(); - map> getStates(); + string getName() const; + string getClassName() const; + vector getFeatures() const; + map> getStates() const; pair>&, vector&> getVectors(); pair>&, vector&> getVectorsDiscretized(); pair getTensors(); - int getNFeatures(); - int getNSamples(); + int getNFeatures() const; + int getNSamples() const; void load(); const bool inline isLoaded() const { return loaded; }; }; @@ -51,14 +51,17 @@ namespace platform { public: explicit Datasets(const string& path, bool discretize = false, fileType_t fileType = ARFF) : path(path), discretize(discretize), fileType(fileType) { load(); }; vector getNames(); - vector getFeatures(string name); - int getNSamples(string name); - string getClassName(string name); - map> getStates(string name); - pair>&, vector&> getVectors(string name); - pair>&, vector&> getVectorsDiscretized(string name); - pair getTensors(string name); - bool isDataset(const string& name); + vector getFeatures(const string& name) const; + int getNSamples(const string& name) const; + string getClassName(const string& name) const; + int getNClasses(const string& name); + vector getClassesCounts(const string& name) const; + map> getStates(const string& name) const; + pair>&, vector&> getVectors(const string& name); + pair>&, vector&> getVectorsDiscretized(const string& name); + pair getTensors(const string& name); + bool isDataset(const string& name) const; + void loadDataset(const string& name) const; }; }; diff --git a/src/Platform/Paths.h b/src/Platform/Paths.h index 756e61a..fdda25a 100644 --- a/src/Platform/Paths.h +++ b/src/Platform/Paths.h @@ -1,5 +1,6 @@ #ifndef PATHS_H #define PATHS_H +#include namespace platform { class Paths { public: diff --git a/src/Platform/Report.cc b/src/Platform/Report.cc index a40a482..cc3b0a0 100644 --- a/src/Platform/Report.cc +++ b/src/Platform/Report.cc @@ -1,6 +1,9 @@ +#include +#include #include "Report.h" #include "BestResult.h" + namespace platform { string headerLine(const string& text) { @@ -31,21 +34,31 @@ namespace platform { body(); footer(); } + struct separated : numpunct { + char do_decimal_point() const { return ','; } + char do_thousands_sep() const { return '.'; } + string do_grouping() const { return "\03"; } + }; void Report::header() { + locale mylocale(cout.getloc(), new separated); + locale::global(mylocale); + cout.imbue(mylocale); + stringstream oss; cout << Colors::MAGENTA() << string(MAXL, '*') << endl; cout << headerLine("Report " + data["model"].get() + " ver. " + data["version"].get() + " with " + to_string(data["folds"].get()) + " Folds cross validation and " + to_string(data["seeds"].size()) + " random seeds. " + data["date"].get() + " " + data["time"].get()); cout << headerLine(data["title"].get()); cout << headerLine("Random seeds: " + fromVector("seeds") + " Stratified: " + (data["stratified"].get() ? "True" : "False")); - cout << headerLine("Execution took " + to_string(data["duration"].get()) + " seconds, " + to_string(data["duration"].get() / 3600) + " hours, on " + data["platform"].get()); + oss << "Execution took " << setprecision(2) << fixed << data["duration"].get() << " seconds, " << data["duration"].get() / 3600 << " hours, on " << data["platform"].get(); + cout << headerLine(oss.str()); cout << headerLine("Score is " + data["score_name"].get()); cout << string(MAXL, '*') << endl; cout << endl; } void Report::body() { - cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl; - cout << "============================== ====== ===== === ======= ======= ======= =============== ================== ===============" << endl; + cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls Nodes Edges States Score Time Hyperparameters" << endl; + cout << "============================== ====== ===== === ========= ========= ========= =============== ================== ===============" << endl; json lastResult; totalScore = 0; bool odd = true; @@ -55,9 +68,9 @@ namespace platform { cout << setw(6) << right << r["samples"].get() << " "; cout << setw(5) << right << r["features"].get() << " "; cout << setw(3) << right << r["classes"].get() << " "; - cout << setw(7) << setprecision(2) << fixed << r["nodes"].get() << " "; - cout << setw(7) << setprecision(2) << fixed << r["leaves"].get() << " "; - cout << setw(7) << setprecision(2) << fixed << r["depth"].get() << " "; + cout << setw(9) << setprecision(2) << fixed << r["nodes"].get() << " "; + cout << setw(9) << setprecision(2) << fixed << r["leaves"].get() << " "; + cout << setw(9) << setprecision(2) << fixed << r["depth"].get() << " "; cout << setw(8) << right << setprecision(6) << fixed << r["score"].get() << "±" << setw(6) << setprecision(4) << fixed << r["score_std"].get() << " "; cout << setw(11) << right << setprecision(6) << fixed << r["time"].get() << "±" << setw(6) << setprecision(4) << fixed << r["time_std"].get() << " "; try { @@ -85,7 +98,9 @@ namespace platform { cout << Colors::MAGENTA() << string(MAXL, '*') << endl; auto score = data["score_name"].get(); if (score == BestResult::scoreName()) { - cout << headerLine(score + " compared to " + BestResult::title() + " .: " + to_string(totalScore / BestResult::score())); + stringstream oss; + oss << score << " compared to " << BestResult::title() << " .: " << totalScore / BestResult::score(); + cout << headerLine(oss.str()); } cout << string(MAXL, '*') << endl << Colors::RESET(); diff --git a/src/Platform/list.cc b/src/Platform/list.cc new file mode 100644 index 0000000..ed8396d --- /dev/null +++ b/src/Platform/list.cc @@ -0,0 +1,57 @@ +#include +#include +#include "Paths.h" +#include "Colors.h" +#include "Datasets.h" + +using namespace std; +const int BALANCE_LENGTH = 75; + +struct separated : numpunct { + char do_decimal_point() const { return ','; } + char do_thousands_sep() const { return '.'; } + string do_grouping() const { return "\03"; } +}; + +void outputBalance(const string& balance) +{ + auto temp = string(balance); + while (temp.size() > BALANCE_LENGTH - 1) { + auto part = temp.substr(0, BALANCE_LENGTH); + cout << part << endl; + cout << setw(48) << " "; + temp = temp.substr(BALANCE_LENGTH); + } + cout << temp << endl; +} + +int main(int argc, char** argv) +{ + auto data = platform::Datasets(platform::Paths().datasets(), false); + locale mylocale(cout.getloc(), new separated); + locale::global(mylocale); + cout.imbue(mylocale); + cout << Colors::GREEN() << "Dataset Sampl. Feat. Cls. Balance" << endl; + string balanceBars = string(BALANCE_LENGTH, '='); + cout << "============================== ====== ===== === " << balanceBars << endl; + bool odd = true; + for (const auto& dataset : data.getNames()) { + auto color = odd ? Colors::CYAN() : Colors::BLUE(); + cout << color << setw(30) << left << dataset << " "; + data.loadDataset(dataset); + auto nSamples = data.getNSamples(dataset); + cout << setw(6) << right << nSamples << " "; + cout << setw(5) << right << data.getFeatures(dataset).size() << " "; + cout << setw(3) << right << data.getNClasses(dataset) << " "; + stringstream oss; + string sep = ""; + for (auto number : data.getClassesCounts(dataset)) { + oss << sep << setprecision(2) << fixed << (float)number / nSamples * 100.0 << "% (" << number << ")"; + sep = " / "; + } + outputBalance(oss.str()); + odd = !odd; + } + cout << Colors::RESET() << endl; + return 0; +} From 59ffd179f45f083f498c5bac82dca0998ccd95d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 19 Aug 2023 21:26:48 +0200 Subject: [PATCH 19/20] Fix report format --- src/BayesNet/BoostAODE.cc | 2 +- src/Platform/Report.cc | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/BayesNet/BoostAODE.cc b/src/BayesNet/BoostAODE.cc index a4379b4..eb8da07 100644 --- a/src/BayesNet/BoostAODE.cc +++ b/src/BayesNet/BoostAODE.cc @@ -16,7 +16,7 @@ namespace bayesnet { auto X_ = dataset.index({ torch::indexing::Slice(0, dataset.size(0) - 1), "..." }); auto y_ = dataset.index({ -1, "..." }); bool exitCondition = false; - bool repeatSparent = true; + bool repeatSparent = false; vector featuresUsed; // Step 0: Set the finish condition // if not repeatSparent a finish condition is run out of features diff --git a/src/Platform/Report.cc b/src/Platform/Report.cc index cc3b0a0..b52a620 100644 --- a/src/Platform/Report.cc +++ b/src/Platform/Report.cc @@ -13,20 +13,27 @@ namespace platform { } string Report::fromVector(const string& key) { - string result = ""; - + stringstream oss; + string sep = ""; + oss << "[" << fixed << setprecision(16); for (auto& item : data[key]) { - result += to_string(item) + ", "; + oss << sep << item.get(); + sep = ", "; } - return "[" + result.substr(0, result.size() - 2) + "]"; + oss << "]"; + return oss.str(); } - string fVector(const json& data) + string fVector(const string& title, const json& data) { - string result = ""; + stringstream oss; + string sep = ""; + oss << title << "[" << fixed << setprecision(16); for (const auto& item : data) { - result += to_string(item) + ", "; + oss << sep << item.get(); + sep = ", "; } - return "[" + result.substr(0, result.size() - 2) + "]"; + oss << "]"; + return oss.str(); } void Report::show() { @@ -86,10 +93,10 @@ namespace platform { } if (data["results"].size() == 1) { cout << string(MAXL, '*') << endl; - cout << headerLine("Train scores: " + fVector(lastResult["scores_train"])); - cout << headerLine("Test scores: " + fVector(lastResult["scores_test"])); - cout << headerLine("Train times: " + fVector(lastResult["times_train"])); - cout << headerLine("Test times: " + fVector(lastResult["times_test"])); + cout << headerLine(fVector("Train scores: ", lastResult["scores_train"])); + cout << headerLine(fVector("Test scores: ", lastResult["scores_test"])); + cout << headerLine(fVector("Train times: ", lastResult["times_train"])); + cout << headerLine(fVector("Test times: ", lastResult["times_test"])); cout << string(MAXL, '*') << endl; } } From 1a534888d6f864124ab773a3f0194860c5c424ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Sat, 19 Aug 2023 23:30:44 +0200 Subject: [PATCH 20/20] Fix report format --- src/Platform/Report.cc | 16 ++++++++-------- src/Platform/Report.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Platform/Report.cc b/src/Platform/Report.cc index b52a620..5690668 100644 --- a/src/Platform/Report.cc +++ b/src/Platform/Report.cc @@ -15,7 +15,7 @@ namespace platform { { stringstream oss; string sep = ""; - oss << "[" << fixed << setprecision(16); + oss << "["; for (auto& item : data[key]) { oss << sep << item.get(); sep = ", "; @@ -23,13 +23,13 @@ namespace platform { oss << "]"; return oss.str(); } - string fVector(const string& title, const json& data) + string fVector(const string& title, const json& data, const int width, const int precision) { stringstream oss; string sep = ""; - oss << title << "[" << fixed << setprecision(16); + oss << title << "["; for (const auto& item : data) { - oss << sep << item.get(); + oss << sep << fixed << setw(width) << setprecision(precision) << item.get(); sep = ", "; } oss << "]"; @@ -93,10 +93,10 @@ namespace platform { } if (data["results"].size() == 1) { cout << string(MAXL, '*') << endl; - cout << headerLine(fVector("Train scores: ", lastResult["scores_train"])); - cout << headerLine(fVector("Test scores: ", lastResult["scores_test"])); - cout << headerLine(fVector("Train times: ", lastResult["times_train"])); - cout << headerLine(fVector("Test times: ", lastResult["times_test"])); + cout << headerLine(fVector("Train scores: ", lastResult["scores_train"], 14, 12)); + cout << headerLine(fVector("Test scores: ", lastResult["scores_test"], 14, 12)); + cout << headerLine(fVector("Train times: ", lastResult["times_train"], 10, 3)); + cout << headerLine(fVector("Test times: ", lastResult["times_test"], 10, 3)); cout << string(MAXL, '*') << endl; } } diff --git a/src/Platform/Report.h b/src/Platform/Report.h index 2708d4e..105785f 100644 --- a/src/Platform/Report.h +++ b/src/Platform/Report.h @@ -6,7 +6,7 @@ #include "Colors.h" using json = nlohmann::json; -const int MAXL = 122; +const int MAXL = 128; namespace platform { using namespace std; class Report {