diff --git a/CMakeLists.txt b/CMakeLists.txt index 67cee94..152447e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,11 @@ set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -fno-elide-constructors -fno-default-inline") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fprofile-arcs -ftest-coverage -fno-elide-constructors") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") +if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-default-inline") +endif() # Options # ------- @@ -63,7 +66,6 @@ endif (ENABLE_CLANG_TIDY) # include(FetchContent) add_git_submodule("lib/json") add_git_submodule("lib/mdlp") -add_subdirectory("lib/Files") # Subdirectories # -------------- diff --git a/bayesnet/classifiers/Proposal.cc b/bayesnet/classifiers/Proposal.cc index 3ee9cda..d62ed6e 100644 --- a/bayesnet/classifiers/Proposal.cc +++ b/bayesnet/classifiers/Proposal.cc @@ -4,7 +4,7 @@ // SPDX-License-Identifier: MIT // *************************************************************** -#include +#include #include "Proposal.h" namespace bayesnet { diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc deleted file mode 100644 index 299b60c..0000000 --- a/lib/Files/ArffFiles.cc +++ /dev/null @@ -1,171 +0,0 @@ -#include "ArffFiles.h" -#include -#include -#include -#include - -std::vector ArffFiles::getLines() const -{ - return lines; -} - -unsigned long int ArffFiles::getSize() const -{ - return lines.size(); -} - -std::vector> ArffFiles::getAttributes() const -{ - return attributes; -} - -std::string ArffFiles::getClassName() const -{ - return className; -} - -std::string ArffFiles::getClassType() const -{ - return classType; -} - -std::vector>& ArffFiles::getX() -{ - return X; -} - -std::vector& ArffFiles::getY() -{ - return y; -} - -void ArffFiles::loadCommon(const std::string& fileName) -{ - std::ifstream file(fileName); - if (!file.is_open()) { - throw std::invalid_argument("Unable to open file"); - } - std::string line; - std::string keyword; - std::string attribute; - std::string type; - std::string type_w; - // Read file - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - // Skip comments and empty lines - continue; - } - if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { - // Read attributes - std::stringstream ss(line); - ss >> keyword >> attribute; - type = ""; - while (ss >> type_w) - type += type_w + " "; - attributes.emplace_back(trim(attribute), trim(type)); - continue; - } - if (line[0] == '@') { - continue; - } - // Read data - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw std::invalid_argument("No attributes found"); -} - -void ArffFiles::load(const std::string& fileName, bool classLast) -{ - int labelIndex; - loadCommon(fileName); - if (classLast) { - className = std::get<0>(attributes.back()); - classType = std::get<1>(attributes.back()); - attributes.pop_back(); - labelIndex = static_cast(attributes.size()); - } else { - className = std::get<0>(attributes.front()); - classType = std::get<1>(attributes.front()); - attributes.erase(attributes.begin()); - labelIndex = 0; - } - generateDataset(labelIndex); -} -void ArffFiles::load(const std::string& fileName, const std::string& name) -{ - int labelIndex; - loadCommon(fileName); - bool found = false; - for (int i = 0; i < attributes.size(); ++i) { - if (attributes[i].first == name) { - className = std::get<0>(attributes[i]); - classType = std::get<1>(attributes[i]); - attributes.erase(attributes.begin() + i); - labelIndex = i; - found = true; - break; - } - } - if (!found) { - throw std::invalid_argument("Class name not found"); - } - generateDataset(labelIndex); -} - -void ArffFiles::generateDataset(int labelIndex) -{ - X = std::vector>(attributes.size(), std::vector(lines.size())); - auto yy = std::vector(lines.size(), ""); - auto removeLines = std::vector(); // Lines with missing values - size_t numLine = 0; - for (numLine = 0; numLine < lines.size(); numLine++) { - std::stringstream ss(lines[numLine]); - std::string value; - int pos = 0; - int xIndex = 0; - while (getline(ss, value, ',')) { - if (pos++ == labelIndex) { - yy[numLine] = value; - } else { - if (value == "?") { - X[xIndex++][numLine] = -1; - removeLines.push_back(numLine); - } else - X[xIndex++][numLine] = stof(value); - } - } - } - for (auto i : removeLines) { - yy.erase(yy.begin() + i); - for (auto& x : X) { - x.erase(x.begin() + i); - } - } - y = factorize(yy); -} - -std::string ArffFiles::trim(const std::string& source) -{ - std::string s(source); - s.erase(0, s.find_first_not_of(" '\n\r\t")); - s.erase(s.find_last_not_of(" '\n\r\t") + 1); - return s; -} - -std::vector ArffFiles::factorize(const std::vector& labels_t) -{ - std::vector yy; - yy.reserve(labels_t.size()); - std::map labelMap; - int i = 0; - for (const std::string& label : labels_t) { - if (labelMap.find(label) == labelMap.end()) { - labelMap[label] = i++; - } - yy.push_back(labelMap[label]); - } - return yy; -} \ No newline at end of file diff --git a/lib/Files/ArffFiles.h b/lib/Files/ArffFiles.h deleted file mode 100644 index 4602a05..0000000 --- a/lib/Files/ArffFiles.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef ARFFFILES_H -#define ARFFFILES_H - -#include -#include - -class ArffFiles { -public: - ArffFiles() = default; - void load(const std::string&, bool = true); - void load(const std::string&, const std::string&); - std::vector getLines() const; - unsigned long int getSize() const; - std::string getClassName() const; - std::string getClassType() const; - static std::string trim(const std::string&); - std::vector>& getX(); - std::vector& getY(); - std::vector> getAttributes() const; - static std::vector factorize(const std::vector& labels_t); -protected: - std::vector lines; - std::vector> attributes; - std::string className; - std::string classType; - std::vector> X; - std::vector y; - int maxLines = 0; - void generateDataset(int); - void loadCommon(const std::string&); -}; - -#endif \ No newline at end of file diff --git a/lib/Files/ArffFiles.hpp b/lib/Files/ArffFiles.hpp new file mode 100644 index 0000000..7227299 --- /dev/null +++ b/lib/Files/ArffFiles.hpp @@ -0,0 +1,161 @@ +#ifndef ARFFFILES_HPP +#define ARFFFILES_HPP + +#include +#include +#include +#include +#include +#include // std::isdigit +#include // std::all_of +#include + +class ArffFiles { +public: + ArffFiles() = default; + void load(const std::string& fileName, bool classLast = true) + { + int labelIndex; + loadCommon(fileName); + if (classLast) { + className = std::get<0>(attributes.back()); + classType = std::get<1>(attributes.back()); + attributes.pop_back(); + labelIndex = static_cast(attributes.size()); + } else { + className = std::get<0>(attributes.front()); + classType = std::get<1>(attributes.front()); + attributes.erase(attributes.begin()); + labelIndex = 0; + } + generateDataset(labelIndex); + }; + void load(const std::string& fileName, const std::string& name) + { + int labelIndex; + loadCommon(fileName); + bool found = false; + for (int i = 0; i < attributes.size(); ++i) { + if (attributes[i].first == name) { + className = std::get<0>(attributes[i]); + classType = std::get<1>(attributes[i]); + attributes.erase(attributes.begin() + i); + labelIndex = i; + found = true; + break; + } + } + if (!found) { + throw std::invalid_argument("Class name not found"); + } + generateDataset(labelIndex); + }; + std::vector getLines() const { return lines; }; + unsigned long int getSize() const { return lines.size(); }; + std::string getClassName() const { return className; }; + std::string getClassType() const { return classType; }; + std::vector getLabels() const { return labels; } + static std::string trim(const std::string& source) + { + std::string s(source); + s.erase(0, s.find_first_not_of(" '\n\r\t")); + s.erase(s.find_last_not_of(" '\n\r\t") + 1); + return s; + }; + std::vector>& getX() { return X; }; + std::vector& getY() { return y; } + std::vector> getAttributes() const { return attributes; }; + std::vector factorize(const std::vector& labels_t) + { + std::vector yy; + labels.clear(); + yy.reserve(labels_t.size()); + std::map labelMap; + int i = 0; + for (const std::string& label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + bool allDigits = std::all_of(label.begin(), label.end(), isdigit); + if (allDigits) + labels.push_back("Class " + label); + else + labels.push_back(label); + } + yy.push_back(labelMap[label]); + } + return yy; + }; +private: + void generateDataset(int labelIndex) + { + X = std::vector>(attributes.size(), std::vector(lines.size())); + auto yy = std::vector(lines.size(), ""); + auto removeLines = std::vector(); // Lines with missing values + for (size_t i = 0; i < lines.size(); i++) { + std::stringstream ss(lines[i]); + std::string value; + int pos = 0; + int xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + if (value == "?") { + X[xIndex++][i] = -1; + removeLines.push_back(i); + } else + X[xIndex++][i] = stof(value); + } + } + } + for (auto i : removeLines) { + yy.erase(yy.begin() + i); + for (auto& x : X) { + x.erase(x.begin() + i); + } + } + y = factorize(yy); + }; + void loadCommon(std::string fileName) + { + std::ifstream file(fileName); + if (!file.is_open()) { + throw std::invalid_argument("Unable to open file"); + } + std::string line; + std::string keyword; + std::string attribute; + std::string type; + std::string type_w; + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { + std::stringstream ss(line); + ss >> keyword >> attribute; + type = ""; + while (ss >> type_w) + type += type_w + " "; + attributes.emplace_back(trim(attribute), trim(type)); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw std::invalid_argument("No attributes found"); + }; + std::vector lines; + std::vector> attributes; + std::string className; + std::string classType; + std::vector> X; + std::vector y; + std::vector labels; +}; + +#endif \ No newline at end of file diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt deleted file mode 100644 index fce5b8f..0000000 --- a/lib/Files/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/lib/catch2 b/lib/catch2 new file mode 160000 index 0000000..029fe3b --- /dev/null +++ b/lib/catch2 @@ -0,0 +1 @@ +Subproject commit 029fe3b4609dd84cd939b73357f37bbb75bcf82f diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index c95e219..d50030e 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -14,7 +14,6 @@ include_directories( /usr/local/include ) -add_subdirectory(lib/Files) add_subdirectory(lib/mdlp) add_executable(bayesnet_sample sample.cc) -target_link_libraries(bayesnet_sample ArffFiles mdlp "${TORCH_LIBRARIES}" "${BayesNet}") \ No newline at end of file +target_link_libraries(bayesnet_sample mdlp "${TORCH_LIBRARIES}" "${BayesNet}") \ No newline at end of file diff --git a/sample/lib/Files/ArffFiles.cc b/sample/lib/Files/ArffFiles.cc deleted file mode 100644 index d333d1e..0000000 --- a/sample/lib/Files/ArffFiles.cc +++ /dev/null @@ -1,174 +0,0 @@ -// *************************************************************** -// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez -// SPDX-FileType: SOURCE -// SPDX-License-Identifier: MIT -// *************************************************************** - -#include "ArffFiles.h" -#include -#include -#include -#include - -ArffFiles::ArffFiles() = default; - -std::vector ArffFiles::getLines() const -{ - return lines; -} - -unsigned long int ArffFiles::getSize() const -{ - return lines.size(); -} - -std::vector> ArffFiles::getAttributes() const -{ - return attributes; -} - -std::string ArffFiles::getClassName() const -{ - return className; -} - -std::string ArffFiles::getClassType() const -{ - return classType; -} - -std::vector>& ArffFiles::getX() -{ - return X; -} - -std::vector& ArffFiles::getY() -{ - return y; -} - -void ArffFiles::loadCommon(std::string fileName) -{ - std::ifstream file(fileName); - if (!file.is_open()) { - throw std::invalid_argument("Unable to open file"); - } - std::string line; - std::string keyword; - std::string attribute; - std::string type; - std::string type_w; - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { - std::stringstream ss(line); - ss >> keyword >> attribute; - type = ""; - while (ss >> type_w) - type += type_w + " "; - attributes.emplace_back(trim(attribute), trim(type)); - continue; - } - if (line[0] == '@') { - continue; - } - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw std::invalid_argument("No attributes found"); -} - -void ArffFiles::load(const std::string& fileName, bool classLast) -{ - int labelIndex; - loadCommon(fileName); - if (classLast) { - className = std::get<0>(attributes.back()); - classType = std::get<1>(attributes.back()); - attributes.pop_back(); - labelIndex = static_cast(attributes.size()); - } else { - className = std::get<0>(attributes.front()); - classType = std::get<1>(attributes.front()); - attributes.erase(attributes.begin()); - labelIndex = 0; - } - generateDataset(labelIndex); -} -void ArffFiles::load(const std::string& fileName, const std::string& name) -{ - int labelIndex; - loadCommon(fileName); - bool found = false; - for (int i = 0; i < attributes.size(); ++i) { - if (attributes[i].first == name) { - className = std::get<0>(attributes[i]); - classType = std::get<1>(attributes[i]); - attributes.erase(attributes.begin() + i); - labelIndex = i; - found = true; - break; - } - } - if (!found) { - throw std::invalid_argument("Class name not found"); - } - generateDataset(labelIndex); -} - -void ArffFiles::generateDataset(int labelIndex) -{ - X = std::vector>(attributes.size(), std::vector(lines.size())); - auto yy = std::vector(lines.size(), ""); - auto removeLines = std::vector(); // Lines with missing values - for (size_t i = 0; i < lines.size(); i++) { - std::stringstream ss(lines[i]); - std::string value; - int pos = 0; - int xIndex = 0; - while (getline(ss, value, ',')) { - if (pos++ == labelIndex) { - yy[i] = value; - } else { - if (value == "?") { - X[xIndex++][i] = -1; - removeLines.push_back(i); - } else - X[xIndex++][i] = stof(value); - } - } - } - for (auto i : removeLines) { - yy.erase(yy.begin() + i); - for (auto& x : X) { - x.erase(x.begin() + i); - } - } - y = factorize(yy); -} - -std::string ArffFiles::trim(const std::string& source) -{ - std::string s(source); - s.erase(0, s.find_first_not_of(" '\n\r\t")); - s.erase(s.find_last_not_of(" '\n\r\t") + 1); - return s; -} - -std::vector ArffFiles::factorize(const std::vector& labels_t) -{ - std::vector yy; - yy.reserve(labels_t.size()); - std::map labelMap; - int i = 0; - for (const std::string& label : labels_t) { - if (labelMap.find(label) == labelMap.end()) { - labelMap[label] = i++; - } - yy.push_back(labelMap[label]); - } - return yy; -} \ No newline at end of file diff --git a/sample/lib/Files/ArffFiles.h b/sample/lib/Files/ArffFiles.h deleted file mode 100644 index 12206c5..0000000 --- a/sample/lib/Files/ArffFiles.h +++ /dev/null @@ -1,38 +0,0 @@ -// *************************************************************** -// SPDX-FileCopyrightText: Copyright 2024 Ricardo Montañana Gómez -// SPDX-FileType: SOURCE -// SPDX-License-Identifier: MIT -// *************************************************************** - -#ifndef ARFFFILES_H -#define ARFFFILES_H - -#include -#include - -class ArffFiles { -private: - std::vector lines; - std::vector> attributes; - std::string className; - std::string classType; - std::vector> X; - std::vector y; - void generateDataset(int); - void loadCommon(std::string); -public: - ArffFiles(); - void load(const std::string&, bool = true); - void load(const std::string&, const std::string&); - std::vector getLines() const; - unsigned long int getSize() const; - std::string getClassName() const; - std::string getClassType() const; - static std::string trim(const std::string&); - std::vector>& getX(); - std::vector& getY(); - std::vector> getAttributes() const; - static std::vector factorize(const std::vector& labels_t); -}; - -#endif \ No newline at end of file diff --git a/sample/lib/Files/ArffFiles.hpp b/sample/lib/Files/ArffFiles.hpp new file mode 100644 index 0000000..7227299 --- /dev/null +++ b/sample/lib/Files/ArffFiles.hpp @@ -0,0 +1,161 @@ +#ifndef ARFFFILES_HPP +#define ARFFFILES_HPP + +#include +#include +#include +#include +#include +#include // std::isdigit +#include // std::all_of +#include + +class ArffFiles { +public: + ArffFiles() = default; + void load(const std::string& fileName, bool classLast = true) + { + int labelIndex; + loadCommon(fileName); + if (classLast) { + className = std::get<0>(attributes.back()); + classType = std::get<1>(attributes.back()); + attributes.pop_back(); + labelIndex = static_cast(attributes.size()); + } else { + className = std::get<0>(attributes.front()); + classType = std::get<1>(attributes.front()); + attributes.erase(attributes.begin()); + labelIndex = 0; + } + generateDataset(labelIndex); + }; + void load(const std::string& fileName, const std::string& name) + { + int labelIndex; + loadCommon(fileName); + bool found = false; + for (int i = 0; i < attributes.size(); ++i) { + if (attributes[i].first == name) { + className = std::get<0>(attributes[i]); + classType = std::get<1>(attributes[i]); + attributes.erase(attributes.begin() + i); + labelIndex = i; + found = true; + break; + } + } + if (!found) { + throw std::invalid_argument("Class name not found"); + } + generateDataset(labelIndex); + }; + std::vector getLines() const { return lines; }; + unsigned long int getSize() const { return lines.size(); }; + std::string getClassName() const { return className; }; + std::string getClassType() const { return classType; }; + std::vector getLabels() const { return labels; } + static std::string trim(const std::string& source) + { + std::string s(source); + s.erase(0, s.find_first_not_of(" '\n\r\t")); + s.erase(s.find_last_not_of(" '\n\r\t") + 1); + return s; + }; + std::vector>& getX() { return X; }; + std::vector& getY() { return y; } + std::vector> getAttributes() const { return attributes; }; + std::vector factorize(const std::vector& labels_t) + { + std::vector yy; + labels.clear(); + yy.reserve(labels_t.size()); + std::map labelMap; + int i = 0; + for (const std::string& label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + bool allDigits = std::all_of(label.begin(), label.end(), isdigit); + if (allDigits) + labels.push_back("Class " + label); + else + labels.push_back(label); + } + yy.push_back(labelMap[label]); + } + return yy; + }; +private: + void generateDataset(int labelIndex) + { + X = std::vector>(attributes.size(), std::vector(lines.size())); + auto yy = std::vector(lines.size(), ""); + auto removeLines = std::vector(); // Lines with missing values + for (size_t i = 0; i < lines.size(); i++) { + std::stringstream ss(lines[i]); + std::string value; + int pos = 0; + int xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + if (value == "?") { + X[xIndex++][i] = -1; + removeLines.push_back(i); + } else + X[xIndex++][i] = stof(value); + } + } + } + for (auto i : removeLines) { + yy.erase(yy.begin() + i); + for (auto& x : X) { + x.erase(x.begin() + i); + } + } + y = factorize(yy); + }; + void loadCommon(std::string fileName) + { + std::ifstream file(fileName); + if (!file.is_open()) { + throw std::invalid_argument("Unable to open file"); + } + std::string line; + std::string keyword; + std::string attribute; + std::string type; + std::string type_w; + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { + std::stringstream ss(line); + ss >> keyword >> attribute; + type = ""; + while (ss >> type_w) + type += type_w + " "; + attributes.emplace_back(trim(attribute), trim(type)); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw std::invalid_argument("No attributes found"); + }; + std::vector lines; + std::vector> attributes; + std::string className; + std::string classType; + std::vector> X; + std::vector y; + std::vector labels; +}; + +#endif \ No newline at end of file diff --git a/sample/lib/Files/CMakeLists.txt b/sample/lib/Files/CMakeLists.txt deleted file mode 100644 index fce5b8f..0000000 --- a/sample/lib/Files/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/sample/sample.cc b/sample/sample.cc index e58dab7..511230f 100644 --- a/sample/sample.cc +++ b/sample/sample.cc @@ -4,7 +4,7 @@ // SPDX-License-Identifier: MIT // *************************************************************** -#include +#include #include #include