diff --git a/.vscode/launch.json b/.vscode/launch.json index ae7831f..f9a89d3 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,7 +1,4 @@ { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ { @@ -20,6 +17,19 @@ "program": "${workspaceFolder}/a.out", "args": [], "cwd": "${workspaceFolder}" + }, + { + "name": "Build & debug active file", + "type": "cppdbg", + "request": "launch", + "program": "${workspaceFolder}/build/bayesnet", + "args": [], + "stopAtEntry": false, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "lldb", + "preLaunchTask": "CMake: build" } ] } \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 86d08cf..4d408cb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -81,7 +81,12 @@ "*.toml": "toml", "utility": "cpp", "__verbose_abort": "cpp", - "bit": "cpp" + "bit": "cpp", + "random": "cpp", + "*.tcc": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory_resource": "cpp" }, "cmake.configureOnOpen": false, "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools" diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 4edc48b..5d92a8f 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -1,16 +1,37 @@ { - "version": "2.0.0", - "tasks": [ - { - "type": "cmake", - "label": "CMake: build", - "command": "build", - "targets": [ - "all" - ], - "group": "build", - "problemMatcher": [], - "detail": "CMake template build task" - } - ] + "version": "2.0.0", + "tasks": [ + { + "type": "cmake", + "label": "CMake: build", + "command": "build", + "targets": [ + "all" + ], + "group": "build", + "problemMatcher": [], + "detail": "CMake template build task" + }, + { + "type": "cppbuild", + "label": "C/C++: clang build active file", + "command": "/usr/bin/clang", + "args": [ + "-fcolor-diagnostics", + "-fansi-escape-codes", + "-g", + "${file}", + "-o", + "${fileDirname}/${fileBasenameNoExtension}" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": "build", + "detail": "Task generated by Debugger." + } + ] } \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f7ffd6..6cd37d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,5 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") # add_library(BayesNet Node.cc Network.cc) -# add_executable(BayesNet main.cc Node.cc Network.cc ArffFiles.cc) -add_executable(BayesNet main.cc ArffFiles.cc Node.cc Network.cc) +add_executable(BayesNet main.cc ArffFiles.cc Node.cc Network.cc CPPFImdlp.cpp Metrics.cpp) target_link_libraries(BayesNet "${TORCH_LIBRARIES}") \ No newline at end of file diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp new file mode 100644 index 0000000..80d6578 --- /dev/null +++ b/CPPFImdlp.cpp @@ -0,0 +1,220 @@ +#include +#include +#include +#include +#include "CPPFImdlp.h" +#include "Metrics.h" + +namespace mdlp { + + CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed) : min_length(min_length_), + max_depth(max_depth_), + proposed_cuts(proposed) + { + } + + CPPFImdlp::CPPFImdlp() = default; + + CPPFImdlp::~CPPFImdlp() = default; + + size_t CPPFImdlp::compute_max_num_cut_points() const + { + // Set the actual maximum number of cut points as a number or as a percentage of the number of samples + if (proposed_cuts == 0) { + return numeric_limits::max(); + } + if (proposed_cuts < 0 || proposed_cuts > static_cast(X.size())) { + throw invalid_argument("wrong proposed num_cuts value"); + } + if (proposed_cuts < 1) + return static_cast(round(static_cast(X.size()) * proposed_cuts)); + return static_cast(proposed_cuts); + } + + void CPPFImdlp::fit(samples_t& X_, labels_t& y_) + { + X = X_; + y = y_; + num_cut_points = compute_max_num_cut_points(); + depth = 0; + cutPoints.clear(); + if (X.size() != y.size()) { + throw invalid_argument("X and y must have the same size"); + } + if (X.empty() || y.empty()) { + throw invalid_argument("X and y must have at least one element"); + } + if (min_length < 3) { + throw invalid_argument("min_length must be greater than 2"); + } + if (max_depth < 1) { + throw invalid_argument("max_depth must be greater than 0"); + } + indices = sortIndices(X_, y_); + metrics.setData(y, indices); + computeCutPoints(0, X.size(), 1); + sort(cutPoints.begin(), cutPoints.end()); + if (num_cut_points > 0) { + // Select the best (with lower entropy) cut points + while (cutPoints.size() > num_cut_points) { + resizeCutPoints(); + } + } + } + + pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) + { + size_t n; + size_t m; + size_t idxPrev = cut - 1 >= start ? cut - 1 : cut; + size_t idxNext = cut + 1 < end ? cut + 1 : cut; + bool backWall; // true if duplicates reach beginning of the interval + precision_t previous; + precision_t actual; + precision_t next; + previous = X[indices[idxPrev]]; + actual = X[indices[cut]]; + next = X[indices[idxNext]]; + // definition 2 of the paper => X[t-1] < X[t] + // get the first equal value of X in the interval + while (idxPrev > start && actual == previous) { + previous = X[indices[--idxPrev]]; + } + backWall = idxPrev == start && actual == previous; + // get the last equal value of X in the interval + while (idxNext < end - 1 && actual == next) { + next = X[indices[++idxNext]]; + } + // # of duplicates before cutpoint + n = cut - 1 - idxPrev; + // # of duplicates after cutpoint + m = idxNext - cut - 1; + // Decide which values to use + cut = cut + (backWall ? m + 1 : -n); + actual = X[indices[cut]]; + return { (actual + previous) / 2, cut }; + } + + void CPPFImdlp::computeCutPoints(size_t start, size_t end, int depth_) + { + size_t cut; + pair result; + // Check if the interval length and the depth are Ok + if (end - start < min_length || depth_ > max_depth) + return; + depth = depth_ > depth ? depth_ : depth; + cut = getCandidate(start, end); + if (cut == numeric_limits::max()) + return; + if (mdlp(start, cut, end)) { + result = valueCutPoint(start, cut, end); + cut = result.second; + cutPoints.push_back(result.first); + computeCutPoints(start, cut, depth_ + 1); + computeCutPoints(cut, end, depth_ + 1); + } + } + + size_t CPPFImdlp::getCandidate(size_t start, size_t end) + { + /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which + E(A, TA; S) is minimal amongst all the candidate cut points. */ + size_t candidate = numeric_limits::max(); + size_t elements = end - start; + bool sameValues = true; + precision_t entropy_left; + precision_t entropy_right; + precision_t minEntropy; + // Check if all the values of the variable in the interval are the same + for (size_t idx = start + 1; idx < end; idx++) { + if (X[indices[idx]] != X[indices[start]]) { + sameValues = false; + break; + } + } + if (sameValues) + return candidate; + minEntropy = metrics.entropy(start, end); + for (size_t idx = start + 1; idx < end; idx++) { + // Cutpoints are always on boundaries (definition 2) + if (y[indices[idx]] == y[indices[idx - 1]]) + continue; + entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); + entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); + if (entropy_left + entropy_right < minEntropy) { + minEntropy = entropy_left + entropy_right; + candidate = idx; + } + } + return candidate; + } + + bool CPPFImdlp::mdlp(size_t start, size_t cut, size_t end) + { + int k; + int k1; + int k2; + precision_t ig; + precision_t delta; + precision_t ent; + precision_t ent1; + precision_t ent2; + auto N = precision_t(end - start); + k = metrics.computeNumClasses(start, end); + k1 = metrics.computeNumClasses(start, cut); + k2 = metrics.computeNumClasses(cut, end); + ent = metrics.entropy(start, end); + ent1 = metrics.entropy(start, cut); + ent2 = metrics.entropy(cut, end); + ig = metrics.informationGain(start, cut, end); + delta = static_cast(log2(pow(3, precision_t(k)) - 2) - + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); + precision_t term = 1 / N * (log2(N - 1) + delta); + return ig > term; + } + + // Argsort from https://stackoverflow.com/questions/1577475/c-sorting-and-keeping-track-of-indexes + indices_t CPPFImdlp::sortIndices(samples_t& X_, labels_t& y_) + { + indices_t idx(X_.size()); + iota(idx.begin(), idx.end(), 0); + stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { + if (X_[i1] == X_[i2]) + return y_[i1] < y_[i2]; + else + return X_[i1] < X_[i2]; + }); + return idx; + } + + void CPPFImdlp::resizeCutPoints() + { + //Compute entropy of each of the whole cutpoint set and discards the biggest value + precision_t maxEntropy = 0; + precision_t entropy; + size_t maxEntropyIdx = 0; + size_t begin = 0; + size_t end; + for (size_t idx = 0; idx < cutPoints.size(); idx++) { + end = begin; + while (X[indices[end]] < cutPoints[idx] && end < X.size()) + end++; + entropy = metrics.entropy(begin, end); + if (entropy > maxEntropy) { + maxEntropy = entropy; + maxEntropyIdx = idx; + } + begin = end; + } + cutPoints.erase(cutPoints.begin() + static_cast(maxEntropyIdx)); + } + labels_t& CPPFImdlp::transform(const samples_t& data) + { + discretizedData.reserve(data.size()); + for (const precision_t& item : data) { + auto upper = upper_bound(cutPoints.begin(), cutPoints.end(), item); + discretizedData.push_back(upper - cutPoints.begin()); + } + return discretizedData; + } +} diff --git a/CPPFImdlp.h b/CPPFImdlp.h new file mode 100644 index 0000000..1fb0cab --- /dev/null +++ b/CPPFImdlp.h @@ -0,0 +1,45 @@ +#ifndef CPPFIMDLP_H +#define CPPFIMDLP_H + +#include "typesFImdlp.h" +#include "Metrics.h" +#include +#include +#include + +namespace mdlp { + class CPPFImdlp { + protected: + size_t min_length = 3; + int depth = 0; + int max_depth = numeric_limits::max(); + float proposed_cuts = 0; + indices_t indices = indices_t(); + samples_t X = samples_t(); + labels_t y = labels_t(); + Metrics metrics = Metrics(y, indices); + cutPoints_t cutPoints; + size_t num_cut_points = numeric_limits::max(); + labels_t discretizedData = labels_t(); + + static indices_t sortIndices(samples_t&, labels_t&); + + void computeCutPoints(size_t, size_t, int); + void resizeCutPoints(); + bool mdlp(size_t, size_t, size_t); + size_t getCandidate(size_t, size_t); + size_t compute_max_num_cut_points() const; + pair valueCutPoint(size_t, size_t, size_t); + + public: + CPPFImdlp(); + CPPFImdlp(size_t, int, float); + ~CPPFImdlp(); + void fit(samples_t&, labels_t&); + inline cutPoints_t getCutPoints() const { return cutPoints; }; + labels_t& transform(const samples_t&); + inline int get_depth() const { return depth; }; + static inline string version() { return "1.1.2"; }; + }; +} +#endif diff --git a/Metrics.cpp b/Metrics.cpp new file mode 100644 index 0000000..71a3c07 --- /dev/null +++ b/Metrics.cpp @@ -0,0 +1,78 @@ +#include "Metrics.h" +#include +#include + +using namespace std; +namespace mdlp { + Metrics::Metrics(labels_t& y_, indices_t& indices_): y(y_), indices(indices_), + numClasses(computeNumClasses(0, indices.size())) + { + } + + int Metrics::computeNumClasses(size_t start, size_t end) + { + set nClasses; + for (auto i = start; i < end; ++i) { + nClasses.insert(y[indices[i]]); + } + return static_cast(nClasses.size()); + } + + void Metrics::setData(const labels_t& y_, const indices_t& indices_) + { + indices = indices_; + y = y_; + numClasses = computeNumClasses(0, indices.size()); + entropyCache.clear(); + igCache.clear(); + } + + precision_t Metrics::entropy(size_t start, size_t end) + { + precision_t p; + precision_t ventropy = 0; + int nElements = 0; + labels_t counts(numClasses + 1, 0); + if (end - start < 2) + return 0; + if (entropyCache.find({ start, end }) != entropyCache.end()) { + return entropyCache[{start, end}]; + } + for (auto i = &indices[start]; i != &indices[end]; ++i) { + counts[y[*i]]++; + nElements++; + } + for (auto count : counts) { + if (count > 0) { + p = static_cast(count) / static_cast(nElements); + ventropy -= p * log2(p); + } + } + entropyCache[{start, end}] = ventropy; + return ventropy; + } + + precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) + { + precision_t iGain; + precision_t entropyInterval; + precision_t entropyLeft; + precision_t entropyRight; + size_t nElementsLeft = cut - start; + size_t nElementsRight = end - cut; + size_t nElements = end - start; + if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { + return igCache[make_tuple(start, cut, end)]; + } + entropyInterval = entropy(start, end); + entropyLeft = entropy(start, cut); + entropyRight = entropy(cut, end); + iGain = entropyInterval - + (static_cast(nElementsLeft) * entropyLeft + + static_cast(nElementsRight) * entropyRight) / + static_cast(nElements); + igCache[make_tuple(start, cut, end)] = iGain; + return iGain; + } + +} \ No newline at end of file diff --git a/Metrics.h b/Metrics.h new file mode 100644 index 0000000..4f8151a --- /dev/null +++ b/Metrics.h @@ -0,0 +1,22 @@ +#ifndef CCMETRICS_H +#define CCMETRICS_H + +#include "typesFImdlp.h" + +namespace mdlp { + class Metrics { + protected: + labels_t& y; + indices_t& indices; + int numClasses; + cacheEnt_t entropyCache = cacheEnt_t(); + cacheIg_t igCache = cacheIg_t(); + public: + Metrics(labels_t&, indices_t&); + void setData(const labels_t&, const indices_t&); + int computeNumClasses(size_t, size_t); + precision_t entropy(size_t, size_t); + precision_t informationGain(size_t, size_t, size_t); + }; +} +#endif \ No newline at end of file diff --git a/Network.cc b/Network.cc index a81c150..9962384 100644 --- a/Network.cc +++ b/Network.cc @@ -1,5 +1,7 @@ #include "Network.h" namespace bayesnet { + Network::Network() : laplaceSmoothing(1), root(nullptr) {} + Network::Network(int smoothing) : laplaceSmoothing(smoothing), root(nullptr) {} Network::~Network() { for (auto& pair : nodes) { @@ -70,46 +72,62 @@ namespace bayesnet { { return nodes; } - void Network::fit(const vector>& dataset, const int smoothing) + void Network::buildNetwork(const vector>& dataset, const vector& labels, const vector& featureNames, const string& className) { - auto jointCounts = [](const vector>& data, const vector& indices, int numStates) { - int size = indices.size(); - vector sizes(size, numStates); - torch::Tensor counts = torch::zeros(sizes, torch::kLong); - - for (const auto& row : data) { - int idx = 0; - for (int i = 0; i < size; ++i) { - idx = idx * numStates + row[indices[i]]; - } - counts.view({ -1 }).add_(idx, 1); - } - - return counts; - }; - - auto marginalCounts = [](const torch::Tensor& jointCounts) { - return jointCounts.sum(-1); - }; - - for (auto& pair : nodes) { - Node* node = pair.second; - - vector indices; - for (const auto& parent : node->getParents()) { - indices.push_back(nodes[parent->getName()]->getId()); - } - indices.push_back(node->getId()); - - for (auto& child : node->getChildren()) { - torch::Tensor counts = jointCounts(dataset, indices, node->getNumStates()) + smoothing; - torch::Tensor parentCounts = marginalCounts(counts); - parentCounts = parentCounts.unsqueeze(-1); - - torch::Tensor cpt = counts.to(torch::kDouble) / parentCounts.to(torch::kDouble); - setCPD(node->getCPDKey(child), cpt); - } + // Add features as nodes to the network + for (int i = 0; i < featureNames.size(); ++i) { + addNode(featureNames[i], *max_element(dataset[i].begin(), dataset[i].end()) + 1); } + // Add class as node to the network + addNode(className, *max_element(labels.begin(), labels.end()) + 1); + // Add edges from class to features => naive Bayes + for (auto feature : featureNames) { + addEdge(className, feature); + } + } + void Network::fit(const vector>& dataset, const vector& labels, const vector& featureNames, const string& className) + { + buildNetwork(dataset, labels, featureNames, className); + //estimateParameters(dataset); + + // auto jointCounts = [](const vector>& data, const vector& indices, int numStates) { + // int size = indices.size(); + // vector sizes(size, numStates); + // torch::Tensor counts = torch::zeros(sizes, torch::kLong); + + // for (const auto& row : data) { + // int idx = 0; + // for (int i = 0; i < size; ++i) { + // idx = idx * numStates + row[indices[i]]; + // } + // counts.view({ -1 }).add_(idx, 1); + // } + + // return counts; + // }; + + // auto marginalCounts = [](const torch::Tensor& jointCounts) { + // return jointCounts.sum(-1); + // }; + + // for (auto& pair : nodes) { + // Node* node = pair.second; + + // vector indices; + // for (const auto& parent : node->getParents()) { + // indices.push_back(nodes[parent->getName()]->getId()); + // } + // indices.push_back(node->getId()); + + // for (auto& child : node->getChildren()) { + // torch::Tensor counts = jointCounts(dataset, indices, node->getNumStates()) + laplaceSmoothing; + // torch::Tensor parentCounts = marginalCounts(counts); + // parentCounts = parentCounts.unsqueeze(-1); + + // torch::Tensor cpt = counts.to(torch::kDouble) / parentCounts.to(torch::kDouble); + // setCPD(node->getCPDKey(child), cpt); + // } + // } } torch::Tensor& Network::getCPD(const string& key) diff --git a/Network.h b/Network.h index 8b287eb..b93df26 100644 --- a/Network.h +++ b/Network.h @@ -8,14 +8,18 @@ namespace bayesnet { private: map nodes; map cpds; // Map from CPD key to CPD tensor - Node* root = nullptr; + Node* root; + int laplaceSmoothing; bool isCyclic(const std::string&, std::unordered_set&, std::unordered_set&); public: + Network(); + Network(int); ~Network(); void addNode(string, int); void addEdge(const string, const string); map& getNodes(); - void fit(const vector>&, const int); + void fit(const vector>&, const vector&, const vector&, const string&); + void buildNetwork(const vector>&, const vector&, const vector&, const string&); torch::Tensor& getCPD(const string&); void setCPD(const string&, const torch::Tensor&); void setRoot(string); diff --git a/main copy.cc b/main copy.cc new file mode 100644 index 0000000..5ec155d --- /dev/null +++ b/main copy.cc @@ -0,0 +1,54 @@ +#include +#include +#include +#include "ArffFiles.h" +#include "Network.h" + +using namespace std; + +int main() +{ + auto handler = ArffFiles(); + handler.load("iris.arff"); + auto X = handler.getX(); + auto y = handler.getY(); + auto className = handler.getClassName(); + vector> edges = { {className, "sepallength"}, {className, "sepalwidth"}, {className, "petallength"}, {className, "petalwidth"} }; + auto network = bayesnet::Network(); + // Add nodes to the network + for (auto feature : handler.getAttributes()) { + cout << "Adding feature: " << feature.first << endl; + network.addNode(feature.first, 7); + } + network.addNode(className, 3); + for (auto item : edges) { + network.addEdge(item.first, item.second); + } + cout << "Hello, Bayesian Networks!" << endl; + torch::Tensor tensor = torch::eye(3); + cout << "Now I'll add a cycle" << endl; + try { + network.addEdge("petallength", className); + } + catch (invalid_argument& e) { + cout << e.what() << endl; + } + cout << tensor << std::endl; + cout << "Nodes:" << endl; + for (auto [name, item] : network.getNodes()) { + cout << "*" << item->getName() << endl; + cout << "-Parents:" << endl; + for (auto parent : item->getParents()) { + cout << " " << parent->getName() << endl; + } + cout << "-Children:" << endl; + for (auto child : item->getChildren()) { + cout << " " << child->getName() << endl; + } + } + cout << "Root: " << network.getRoot()->getName() << endl; + network.setRoot(className); + cout << "Now Root should be class: " << network.getRoot()->getName() << endl; + cout << "PyTorch version: " << TORCH_VERSION << endl; + return 0; +} \ No newline at end of file diff --git a/main.cc b/main.cc index 49bbe75..7714e00 100644 --- a/main.cc +++ b/main.cc @@ -3,40 +3,44 @@ #include #include "ArffFiles.h" #include "Network.h" +#include "CPPFImdlp.h" + using namespace std; +vector discretize(vector& X, mdlp::labels_t& y) +{ + vectorXd; + auto fimdlp = mdlp::CPPFImdlp(); + for (int i = 0; i < X.size(); i++) { + fimdlp.fit(X[i], y); + Xd.push_back(fimdlp.transform(X[i])); + } + return Xd; +} + int main() { auto handler = ArffFiles(); handler.load("iris.arff"); - auto X = handler.getX(); - auto y = handler.getY(); + // Get Dataset X, y + vector& X = handler.getX(); + mdlp::labels_t& y = handler.getY(); + // Get className & Features auto className = handler.getClassName(); - vector> edges = { {className, "sepallength"}, {className, "sepalwidth"}, {className, "petallength"}, {className, "petalwidth"} }; - auto network = bayesnet::Network(); - // Add nodes to the network + vector features; for (auto feature : handler.getAttributes()) { - cout << "Adding feature: " << feature.first << endl; - network.addNode(feature.first, 7); - } - network.addNode(className, 3); - for (auto item : edges) { - network.addEdge(item.first, item.second); + features.push_back(feature.first); } + // Discretize Dataset + vector Xd = discretize(X, y);; + // Build Network + auto network = bayesnet::Network(); + network.fit(Xd, y, features, className); cout << "Hello, Bayesian Networks!" << endl; - torch::Tensor tensor = torch::eye(3); - cout << "Now I'll add a cycle" << endl; - try { - network.addEdge("petallength", className); - } - catch (invalid_argument& e) { - cout << e.what() << endl; - } - cout << tensor << std::endl; cout << "Nodes:" << endl; for (auto [name, item] : network.getNodes()) { - cout << "*" << item->getName() << endl; + cout << "*" << item->getName() << " -> " << item->getNumStates() << endl; cout << "-Parents:" << endl; for (auto parent : item->getParents()) { cout << " " << parent->getName() << endl; @@ -49,6 +53,6 @@ int main() cout << "Root: " << network.getRoot()->getName() << endl; network.setRoot(className); cout << "Now Root should be class: " << network.getRoot()->getName() << endl; - + cout << "PyTorch version: " << TORCH_VERSION << endl; return 0; } \ No newline at end of file diff --git a/typesFImdlp.h b/typesFImdlp.h new file mode 100644 index 0000000..b28b2ca --- /dev/null +++ b/typesFImdlp.h @@ -0,0 +1,18 @@ +#ifndef TYPES_H +#define TYPES_H + +#include +#include +#include + +using namespace std; +namespace mdlp { + typedef float precision_t; + typedef vector samples_t; + typedef vector labels_t; + typedef vector indices_t; + typedef vector cutPoints_t; + typedef map, precision_t> cacheEnt_t; + typedef map, precision_t> cacheIg_t; +} +#endif