diff --git a/.gitmodules b/.gitmodules index e709952..0122614 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ -[submodule "lib/BayesNet"] - path = lib/BayesNet - url = https://github.com/rmontanana/BayesNet - update = merge + +[submodule "lib/json"] + path = lib/json + url = https://github.com/nlohmann/json.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ebc464..696dd3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,7 @@ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") # ------- option(ENABLE_TESTING "Unit testing build" OFF) option(CODE_COVERAGE "Collect coverage from test library" OFF) +option(INSTALL_GTEST "Enable installation of googletest." OFF) # Boost Library set(Boost_USE_STATIC_LIBS OFF) @@ -64,12 +65,11 @@ endif (ENABLE_CLANG_TIDY) # External libraries - dependencies of PyClassifiers # -------------------------------------------------- -add_git_submodule("lib/BayesNet") +find_library(BayesNet NAMES libBayesNet BayesNet) # Subdirectories # -------------- file(GLOB PyClassifiers_SOURCES CONFIGURE_DEPENDS ${PyClassifiers_SOURCE_DIR}/src/*.cc ${PyClassifiers_SOURCE_DIR}/src/*.hpp) -add_subdirectory(config) add_subdirectory(src) # Testing @@ -78,4 +78,13 @@ if (ENABLE_TESTING) MESSAGE("Testing enabled") include(CTest) add_subdirectory(tests) -endif (ENABLE_TESTING) \ No newline at end of file +endif (ENABLE_TESTING) + +# Installation +# ------------ +install(TARGETS PyClassifiers + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + CONFIGURATIONS Release) +install(DIRECTORY src/ DESTINATION include/pyclassifiers FILES_MATCHING CONFIGURATIONS Release PATTERN "*.h" PATTERN "*.hpp") +install(FILES /usr/local/include/bayesnet/config.h DESTINATION include/pyclassifiers CONFIGURATIONS Release) \ No newline at end of file diff --git a/Makefile b/Makefile index e964637..8fed862 100644 --- a/Makefile +++ b/Makefile @@ -43,6 +43,11 @@ clean: ## Clean the tests info $(call ClearTests) @echo ">>> Done"; +install: ## Install library + @echo ">>> Installing PyClassifiers..."; + @cmake --install $(f_release) + @echo ">>> Done"; + debug: ## Build a debug version of the project @echo ">>> Building Debug PyClassifiers..."; @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi diff --git a/README.md b/README.md index ca900f6..4c8db4f 100644 --- a/README.md +++ b/README.md @@ -59,3 +59,11 @@ export BOOST_ROOT=/path/to/library/own/ ``` Don't forget to add the export BOOST_ROOT statement to .bashrc or wherever it is meant to be. + +## Installation + +```bash +make release +make buildr +sudo make install +``` diff --git a/config/CMakeLists.txt b/config/CMakeLists.txt deleted file mode 100644 index c6c4cde..0000000 --- a/config/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -configure_file( - "config.h.in" - "${CMAKE_BINARY_DIR}/configured_files/include/config.h" ESCAPE_QUOTES -) diff --git a/config/config.h.in b/config/config.h.in deleted file mode 100644 index 3aa005e..0000000 --- a/config/config.h.in +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include - -#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @ -#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @ -#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @ - -static constexpr std::string_view project_name = "@PROJECT_NAME@"; -static constexpr std::string_view project_version = "@PROJECT_VERSION@"; -static constexpr std::string_view project_description = "@PROJECT_DESCRIPTION@"; -static constexpr std::string_view git_sha = "@GIT_SHA@"; -static constexpr std::string_view data_path = "@PyClassifiers_SOURCE_DIR@/tests/data/"; diff --git a/lib/BayesNet b/lib/BayesNet deleted file mode 160000 index cbe15e3..0000000 --- a/lib/BayesNet +++ /dev/null @@ -1 +0,0 @@ -Subproject commit cbe15e317dfe2c06a88b9bce5925421689d8ec58 diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc new file mode 100644 index 0000000..99f29bd --- /dev/null +++ b/lib/Files/ArffFiles.cc @@ -0,0 +1,168 @@ +#include "ArffFiles.h" +#include +#include +#include +#include + +ArffFiles::ArffFiles() = default; + +std::vector ArffFiles::getLines() const +{ + return lines; +} + +unsigned long int ArffFiles::getSize() const +{ + return lines.size(); +} + +std::vector> ArffFiles::getAttributes() const +{ + return attributes; +} + +std::string ArffFiles::getClassName() const +{ + return className; +} + +std::string ArffFiles::getClassType() const +{ + return classType; +} + +std::vector>& ArffFiles::getX() +{ + return X; +} + +std::vector& ArffFiles::getY() +{ + return y; +} + +void ArffFiles::loadCommon(std::string fileName) +{ + std::ifstream file(fileName); + if (!file.is_open()) { + throw std::invalid_argument("Unable to open file"); + } + std::string line; + std::string keyword; + std::string attribute; + std::string type; + std::string type_w; + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { + std::stringstream ss(line); + ss >> keyword >> attribute; + type = ""; + while (ss >> type_w) + type += type_w + " "; + attributes.emplace_back(trim(attribute), trim(type)); + continue; + } + if (line[0] == '@') { + continue; + } + lines.push_back(line); + } + file.close(); + if (attributes.empty()) + throw std::invalid_argument("No attributes found"); +} + +void ArffFiles::load(const std::string& fileName, bool classLast) +{ + int labelIndex; + loadCommon(fileName); + if (classLast) { + className = std::get<0>(attributes.back()); + classType = std::get<1>(attributes.back()); + attributes.pop_back(); + labelIndex = static_cast(attributes.size()); + } else { + className = std::get<0>(attributes.front()); + classType = std::get<1>(attributes.front()); + attributes.erase(attributes.begin()); + labelIndex = 0; + } + generateDataset(labelIndex); +} +void ArffFiles::load(const std::string& fileName, const std::string& name) +{ + int labelIndex; + loadCommon(fileName); + bool found = false; + for (int i = 0; i < attributes.size(); ++i) { + if (attributes[i].first == name) { + className = std::get<0>(attributes[i]); + classType = std::get<1>(attributes[i]); + attributes.erase(attributes.begin() + i); + labelIndex = i; + found = true; + break; + } + } + if (!found) { + throw std::invalid_argument("Class name not found"); + } + generateDataset(labelIndex); +} + +void ArffFiles::generateDataset(int labelIndex) +{ + X = std::vector>(attributes.size(), std::vector(lines.size())); + auto yy = std::vector(lines.size(), ""); + auto removeLines = std::vector(); // Lines with missing values + for (size_t i = 0; i < lines.size(); i++) { + std::stringstream ss(lines[i]); + std::string value; + int pos = 0; + int xIndex = 0; + while (getline(ss, value, ',')) { + if (pos++ == labelIndex) { + yy[i] = value; + } else { + if (value == "?") { + X[xIndex++][i] = -1; + removeLines.push_back(i); + } else + X[xIndex++][i] = stof(value); + } + } + } + for (auto i : removeLines) { + yy.erase(yy.begin() + i); + for (auto& x : X) { + x.erase(x.begin() + i); + } + } + y = factorize(yy); +} + +std::string ArffFiles::trim(const std::string& source) +{ + std::string s(source); + s.erase(0, s.find_first_not_of(" '\n\r\t")); + s.erase(s.find_last_not_of(" '\n\r\t") + 1); + return s; +} + +std::vector ArffFiles::factorize(const std::vector& labels_t) +{ + std::vector yy; + yy.reserve(labels_t.size()); + std::map labelMap; + int i = 0; + for (const std::string& label : labels_t) { + if (labelMap.find(label) == labelMap.end()) { + labelMap[label] = i++; + } + yy.push_back(labelMap[label]); + } + return yy; +} \ No newline at end of file diff --git a/lib/Files/ArffFiles.h b/lib/Files/ArffFiles.h new file mode 100644 index 0000000..25e5a8c --- /dev/null +++ b/lib/Files/ArffFiles.h @@ -0,0 +1,32 @@ +#ifndef ARFFFILES_H +#define ARFFFILES_H + +#include +#include + +class ArffFiles { +private: + std::vector lines; + std::vector> attributes; + std::string className; + std::string classType; + std::vector> X; + std::vector y; + void generateDataset(int); + void loadCommon(std::string); +public: + ArffFiles(); + void load(const std::string&, bool = true); + void load(const std::string&, const std::string&); + std::vector getLines() const; + unsigned long int getSize() const; + std::string getClassName() const; + std::string getClassType() const; + static std::string trim(const std::string&); + std::vector>& getX(); + std::vector& getY(); + std::vector> getAttributes() const; + static std::vector factorize(const std::vector& labels_t); +}; + +#endif \ No newline at end of file diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt new file mode 100644 index 0000000..fce5b8f --- /dev/null +++ b/lib/Files/CMakeLists.txt @@ -0,0 +1 @@ +add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/lib/json b/lib/json new file mode 160000 index 0000000..0457de2 --- /dev/null +++ b/lib/json @@ -0,0 +1 @@ +Subproject commit 0457de21cffb298c22b629e538036bfeb96130b7 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0622516..2050e8c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,14 +1,9 @@ include_directories( ${PyClassifiers_SOURCE_DIR}/lib/Files - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/lib/json/include - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/src - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/src/classifiers - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/src/ensembles - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/src/bayesian_network - ${PyClassifiers_SOURCE_DIR}/lib/BayesNet/src/utils - ${CMAKE_BINARY_DIR}/configured_files/include + ${PyClassifiers_SOURCE_DIR}/lib/json/include ${Python3_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} + /usr/local/include/bayesnet ) -add_library(PyClassifiers SHARED ${PyClassifiers_SOURCES}) -target_link_libraries(PyClassifiers BayesNet ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy) \ No newline at end of file +add_library(PyClassifiers ${PyClassifiers_SOURCES}) +target_link_libraries(PyClassifiers ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy) \ No newline at end of file diff --git a/src/PyClassifier.h b/src/PyClassifier.h index 520794d..fc44bb4 100644 --- a/src/PyClassifier.h +++ b/src/PyClassifier.h @@ -9,7 +9,7 @@ #include #include #include "PyWrap.h" -#include "Classifier.h" +#include "classifiers/Classifier.h" #include "TypeId.h" namespace pywrap { diff --git a/tests/TestPythonClassifiers.cc b/tests/TestPythonClassifiers.cc index 3856ec9..beee7d6 100644 --- a/tests/TestPythonClassifiers.cc +++ b/tests/TestPythonClassifiers.cc @@ -13,6 +13,8 @@ #include "TestUtils.h" #include +const std::string ACTUAL_VERSION = "1.0.4"; + TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") { map , float> scores = { @@ -25,7 +27,6 @@ TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") // Iris {{"iris", "STree"}, 0.99333}, {{"iris", "ODTE"}, 0.98667}, {{"iris", "SVC"}, 0.97333}, {{"iris", "RandomForest"}, 1.0}, }; - std::string name = GENERATE("ODTE", "STree", "SVC", "RandomForest"); map models = { {"ODTE", new pywrap::ODTE()}, @@ -33,17 +34,23 @@ TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") {"SVC", new pywrap::SVC()}, {"RandomForest", new pywrap::RandomForest()} }; + auto clf = models[name]; + SECTION("Test Python Classifier " + name + " score ") { for (std::string file_name : { "glass", "iris", "ecoli", "diabetes" }) { auto raw = RawDatasets(file_name, false); - auto clf = models[name]; clf->fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest); auto score = clf->score(raw.Xt, raw.yt); INFO("File: " + file_name + " Classifier: " + name + " Score: " + to_string(score)); REQUIRE(score == Catch::Approx(scores[{file_name, name}]).epsilon(raw.epsilon)); } } + SECTION("Library check version") + { + INFO("Checking version of " + name + " classifier"); + REQUIRE(clf->getVersion() == ACTUAL_VERSION); + } } TEST_CASE("Classifiers features", "[PyClassifiers]") {