diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 65a5706..0000000 --- a/.gitmodules +++ /dev/null @@ -1,10 +0,0 @@ - -[submodule "lib/json"] - path = lib/json - url = https://github.com/nlohmann/json.git -[submodule "lib/catch2"] - path = lib/catch2 - url = https://github.com/catchorg/Catch2.git -[submodule "lib/mdlp"] - path = lib/mdlp - url = https://github.com/rmontanana/mdlp diff --git a/CMakeLists.txt b/CMakeLists.txt index f48212b..b550eda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,8 @@ endif() find_package(Python3 3.11 COMPONENTS Interpreter Development REQUIRED) message("Python3_LIBRARIES=${Python3_LIBRARIES}") +find_package(nlohmann_json CONFIG REQUIRED) + # CMakes modules # -------------- set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules ${CMAKE_MODULE_PATH}) @@ -64,9 +66,10 @@ endif (ENABLE_CLANG_TIDY) # External libraries - dependencies of PyClassifiers # -------------------------------------------------- -find_library(BayesNet NAMES libBayesNet BayesNet libBayesNet.a PATHS ${PyClassifiers_SOURCE_DIR}/../lib/lib REQUIRED) -find_path(Bayesnet_INCLUDE_DIRS REQUIRED NAMES bayesnet PATHS ${PyClassifiers_SOURCE_DIR}/../lib/include) -message(STATUS "BayesNet=${BayesNet}") +find_library(bayesnet NAMES libbayesnet bayesnet libbayesnet.a PATHS ${PyClassifiers_SOURCE_DIR}/../lib/lib REQUIRED) +find_path(Bayesnet_INCLUDE_DIRS REQUIRED NAMES bayesnet PATHS ../lib/include) + +message(STATUS "BayesNet=${bayesnet}") message(STATUS "Bayesnet_INCLUDE_DIRS=${Bayesnet_INCLUDE_DIRS}") @@ -78,9 +81,8 @@ add_subdirectory(pyclfs) # ------- if (ENABLE_TESTING) MESSAGE("Testing enabled") - add_git_submodule(lib/catch2) - add_git_submodule(lib/mdlp) - add_subdirectory(lib/Files) + find_package(Catch2 CONFIG REQUIRED) + find_package(arff-files CONFIG REQUIRED) include(CTest) add_subdirectory(tests) endif (ENABLE_TESTING) @@ -92,4 +94,4 @@ install(TARGETS PyClassifiers LIBRARY DESTINATION lib CONFIGURATIONS Release) 
install(DIRECTORY pyclfs/ DESTINATION include/pyclassifiers FILES_MATCHING CONFIGURATIONS Release PATTERN "*.h" PATTERN "*.hpp") -install(FILES ${Bayesnet_INCLUDE_DIRS}/bayesnet/config.h DESTINATION include/pyclassifiers CONFIGURATIONS Release) \ No newline at end of file +install(FILES ${Bayesnet_INCLUDE_DIRS}/bayesnet/config.h DESTINATION include/pyclassifiers CONFIGURATIONS Release) diff --git a/Makefile b/Makefile index def64c8..c02ec3b 100644 --- a/Makefile +++ b/Makefile @@ -52,14 +52,14 @@ debug: ## Build a debug version of the project @echo ">>> Building Debug PyClassifiers..."; @if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi @mkdir $(f_debug); - @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON + @cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake @echo ">>> Done"; release: ## Build a Release version of the project @echo ">>> Building Release PyClassifiers..."; @if [ -d ./$(f_release) ]; then rm -rf ./$(f_release); fi @mkdir $(f_release); - @cmake -S . -B $(f_release) -D CMAKE_BUILD_TYPE=Release + @cmake -S . 
-B $(f_release) -D CMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake @echo ">>> Done"; opt = "" diff --git a/lib/Files/ArffFiles.cc b/lib/Files/ArffFiles.cc deleted file mode 100644 index 99f29bd..0000000 --- a/lib/Files/ArffFiles.cc +++ /dev/null @@ -1,168 +0,0 @@ -#include "ArffFiles.h" -#include -#include -#include -#include - -ArffFiles::ArffFiles() = default; - -std::vector ArffFiles::getLines() const -{ - return lines; -} - -unsigned long int ArffFiles::getSize() const -{ - return lines.size(); -} - -std::vector> ArffFiles::getAttributes() const -{ - return attributes; -} - -std::string ArffFiles::getClassName() const -{ - return className; -} - -std::string ArffFiles::getClassType() const -{ - return classType; -} - -std::vector>& ArffFiles::getX() -{ - return X; -} - -std::vector& ArffFiles::getY() -{ - return y; -} - -void ArffFiles::loadCommon(std::string fileName) -{ - std::ifstream file(fileName); - if (!file.is_open()) { - throw std::invalid_argument("Unable to open file"); - } - std::string line; - std::string keyword; - std::string attribute; - std::string type; - std::string type_w; - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { - std::stringstream ss(line); - ss >> keyword >> attribute; - type = ""; - while (ss >> type_w) - type += type_w + " "; - attributes.emplace_back(trim(attribute), trim(type)); - continue; - } - if (line[0] == '@') { - continue; - } - lines.push_back(line); - } - file.close(); - if (attributes.empty()) - throw std::invalid_argument("No attributes found"); -} - -void ArffFiles::load(const std::string& fileName, bool classLast) -{ - int labelIndex; - loadCommon(fileName); - if (classLast) { - className = std::get<0>(attributes.back()); - classType = std::get<1>(attributes.back()); - attributes.pop_back(); - 
labelIndex = static_cast(attributes.size()); - } else { - className = std::get<0>(attributes.front()); - classType = std::get<1>(attributes.front()); - attributes.erase(attributes.begin()); - labelIndex = 0; - } - generateDataset(labelIndex); -} -void ArffFiles::load(const std::string& fileName, const std::string& name) -{ - int labelIndex; - loadCommon(fileName); - bool found = false; - for (int i = 0; i < attributes.size(); ++i) { - if (attributes[i].first == name) { - className = std::get<0>(attributes[i]); - classType = std::get<1>(attributes[i]); - attributes.erase(attributes.begin() + i); - labelIndex = i; - found = true; - break; - } - } - if (!found) { - throw std::invalid_argument("Class name not found"); - } - generateDataset(labelIndex); -} - -void ArffFiles::generateDataset(int labelIndex) -{ - X = std::vector>(attributes.size(), std::vector(lines.size())); - auto yy = std::vector(lines.size(), ""); - auto removeLines = std::vector(); // Lines with missing values - for (size_t i = 0; i < lines.size(); i++) { - std::stringstream ss(lines[i]); - std::string value; - int pos = 0; - int xIndex = 0; - while (getline(ss, value, ',')) { - if (pos++ == labelIndex) { - yy[i] = value; - } else { - if (value == "?") { - X[xIndex++][i] = -1; - removeLines.push_back(i); - } else - X[xIndex++][i] = stof(value); - } - } - } - for (auto i : removeLines) { - yy.erase(yy.begin() + i); - for (auto& x : X) { - x.erase(x.begin() + i); - } - } - y = factorize(yy); -} - -std::string ArffFiles::trim(const std::string& source) -{ - std::string s(source); - s.erase(0, s.find_first_not_of(" '\n\r\t")); - s.erase(s.find_last_not_of(" '\n\r\t") + 1); - return s; -} - -std::vector ArffFiles::factorize(const std::vector& labels_t) -{ - std::vector yy; - yy.reserve(labels_t.size()); - std::map labelMap; - int i = 0; - for (const std::string& label : labels_t) { - if (labelMap.find(label) == labelMap.end()) { - labelMap[label] = i++; - } - yy.push_back(labelMap[label]); - } - return 
yy; -} \ No newline at end of file diff --git a/lib/Files/ArffFiles.h b/lib/Files/ArffFiles.h deleted file mode 100644 index 25e5a8c..0000000 --- a/lib/Files/ArffFiles.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARFFFILES_H -#define ARFFFILES_H - -#include -#include - -class ArffFiles { -private: - std::vector lines; - std::vector> attributes; - std::string className; - std::string classType; - std::vector> X; - std::vector y; - void generateDataset(int); - void loadCommon(std::string); -public: - ArffFiles(); - void load(const std::string&, bool = true); - void load(const std::string&, const std::string&); - std::vector getLines() const; - unsigned long int getSize() const; - std::string getClassName() const; - std::string getClassType() const; - static std::string trim(const std::string&); - std::vector>& getX(); - std::vector& getY(); - std::vector> getAttributes() const; - static std::vector factorize(const std::vector& labels_t); -}; - -#endif \ No newline at end of file diff --git a/lib/Files/CMakeLists.txt b/lib/Files/CMakeLists.txt deleted file mode 100644 index fce5b8f..0000000 --- a/lib/Files/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_library(ArffFiles ArffFiles.cc) \ No newline at end of file diff --git a/lib/catch2 b/lib/catch2 deleted file mode 160000 index 506276c..0000000 --- a/lib/catch2 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 506276c59217429c93abd2fe9507c7f45eb81072 diff --git a/lib/json b/lib/json deleted file mode 160000 index 48e7b4c..0000000 --- a/lib/json +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 48e7b4c23b089c088c11e51c824d78d0f0949b40 diff --git a/lib/mdlp b/lib/mdlp deleted file mode 160000 index 7d62d6a..0000000 --- a/lib/mdlp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7d62d6af4a6ca944a3bbde0b61f651fd4b2d3f57 diff --git a/pyclfs/AdaBoostPy.cc b/pyclfs/AdaBoostPy.cc new file mode 100644 index 0000000..ee27537 --- /dev/null +++ b/pyclfs/AdaBoostPy.cc @@ -0,0 +1,20 @@ +#include "AdaBoostPy.h" + +namespace pywrap { + 
AdaBoostPy::AdaBoostPy() : PyClassifier("sklearn.ensemble", "AdaBoostClassifier", true) + { + validHyperparameters = { "n_estimators", "n_jobs", "random_state" }; + } + int AdaBoostPy::getNumberOfEdges() const + { + return callMethodSumOfItems("get_n_leaves"); + } + int AdaBoostPy::getNumberOfStates() const + { + return callMethodSumOfItems("get_depth"); + } + int AdaBoostPy::getNumberOfNodes() const + { + return callMethodSumOfItems("node_count"); + } +} /* namespace pywrap */ \ No newline at end of file diff --git a/pyclfs/AdaBoostPy.h b/pyclfs/AdaBoostPy.h new file mode 100644 index 0000000..447b48c --- /dev/null +++ b/pyclfs/AdaBoostPy.h @@ -0,0 +1,15 @@ +#ifndef ADABOOSTPY_H +#define ADABOOSTPY_H +#include "PyClassifier.h" + +namespace pywrap { + class AdaBoostPy : public PyClassifier { + public: + AdaBoostPy(); + ~AdaBoostPy() = default; + int getNumberOfEdges() const override; + int getNumberOfStates() const override; + int getNumberOfNodes() const override; + }; +} /* namespace pywrap */ +#endif /* ADABOOSTPY_H */ \ No newline at end of file diff --git a/pyclfs/CMakeLists.txt b/pyclfs/CMakeLists.txt index 541bb7e..b0da972 100644 --- a/pyclfs/CMakeLists.txt +++ b/pyclfs/CMakeLists.txt @@ -4,5 +4,5 @@ include_directories( ${PyClassifiers_SOURCE_DIR}/lib/json/include ${Bayesnet_INCLUDE_DIRS} ) -add_library(PyClassifiers ODTE.cc STree.cc SVC.cc RandomForest.cc XGBoost.cc PyClassifier.cc PyWrap.cc) -target_link_libraries(PyClassifiers ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy) \ No newline at end of file +add_library(PyClassifiers ODTE.cc STree.cc SVC.cc RandomForest.cc XGBoost.cc AdaBoostPy.cc PyClassifier.cc PyWrap.cc) +target_link_libraries(PyClassifiers nlohmann_json::nlohmann_json ${Python3_LIBRARIES} "${TORCH_LIBRARIES}" ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy) \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 44fd486..79384e3 100644 ---
a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -2,9 +2,6 @@ if(ENABLE_TESTING) set(TEST_PYCLASSIFIERS "unit_tests_pyclassifiers") include_directories( ${PyClassifiers_SOURCE_DIR} - ${PyClassifiers_SOURCE_DIR}/lib/Files - ${PyClassifiers_SOURCE_DIR}/lib/mdlp/src - ${PyClassifiers_SOURCE_DIR}/lib/json/include ${Python3_INCLUDE_DIRS} ${TORCH_INCLUDE_DIRS} ${CMAKE_BINARY_DIR}/configured_files/include @@ -13,5 +10,5 @@ if(ENABLE_TESTING) file(GLOB_RECURSE PyClassifiers_SOURCES "${PyClassifiers_SOURCE_DIR}/pyclfs/*.cc") set(TEST_SOURCES_PYCLASSIFIERS TestPythonClassifiers.cc TestUtils.cc ${PyClassifiers_SOURCES}) add_executable(${TEST_PYCLASSIFIERS} ${TEST_SOURCES_PYCLASSIFIERS}) - target_link_libraries(${TEST_PYCLASSIFIERS} PUBLIC "${TORCH_LIBRARIES}" ${Python3_LIBRARIES} ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy ArffFiles fimdlp Catch2::Catch2WithMain) + target_link_libraries(${TEST_PYCLASSIFIERS} PUBLIC "${TORCH_LIBRARIES}" ${Python3_LIBRARIES} ${LIBTORCH_PYTHON} Boost::boost Boost::python Boost::numpy fimdlp Catch2::Catch2WithMain) endif(ENABLE_TESTING) \ No newline at end of file diff --git a/tests/TestPythonClassifiers.cc b/tests/TestPythonClassifiers.cc index 903f468..4f40aee 100644 --- a/tests/TestPythonClassifiers.cc +++ b/tests/TestPythonClassifiers.cc @@ -10,14 +10,16 @@ #include "pyclfs/SVC.h" #include "pyclfs/RandomForest.h" #include "pyclfs/XGBoost.h" +#include "pyclfs/AdaBoostPy.h" #include "pyclfs/ODTE.h" #include "TestUtils.h" +#include TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") { map , float> scores = { // Diabetes - {{"diabetes", "STree"}, 0.81641}, {{"diabetes", "ODTE"}, 0.854166687}, {{"diabetes", "SVC"}, 0.76823}, {{"diabetes", "RandomForest"}, 1.0}, + {{"diabetes", "STree"}, 0.81641}, {{"diabetes", "ODTE"}, 0.856770813f}, {{"diabetes", "SVC"}, 0.76823}, {{"diabetes", "RandomForest"}, 1.0}, // Ecoli {{"ecoli", "STree"}, 0.8125}, {{"ecoli", "ODTE"}, 0.875}, {{"ecoli", "SVC"}, 0.89583}, {{"ecoli", 
"RandomForest"}, 1.0}, // Glass @@ -35,8 +37,8 @@ TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") map versions = { {"ODTE", "1.0.0-1"}, {"STree", "1.4.0"}, - {"SVC", "1.5.1"}, - {"RandomForest", "1.5.1"} + {"SVC", "1.5.2"}, + {"RandomForest", "1.5.2"} }; auto clf = models[name]; @@ -58,6 +60,15 @@ TEST_CASE("Test Python Classifiers score", "[PyClassifiers]") REQUIRE(clf->getVersion() == versions[name]); } } +TEST_CASE("AdaBoostClassifier", "[PyClassifiers]") +{ + auto raw = RawDatasets("iris", false); + auto clf = pywrap::AdaBoostPy(); + clf.fit(raw.Xt, raw.yt, raw.featurest, raw.classNamet, raw.statest); + clf.setHyperparameters(nlohmann::json::parse("{ \"n_estimators\": 100 }")); + auto score = clf.score(raw.Xt, raw.yt); + REQUIRE(score == Catch::Approx(0.9599999f).epsilon(raw.epsilon)); +} TEST_CASE("Classifiers features", "[PyClassifiers]") { auto raw = RawDatasets("iris", false); diff --git a/tests/TestUtils.h b/tests/TestUtils.h index 72954c0..238f0c6 100644 --- a/tests/TestUtils.h +++ b/tests/TestUtils.h @@ -5,8 +5,8 @@ #include #include #include -#include "ArffFiles.h" -#include "CPPFImdlp.h" +#include "ArffFiles/ArffFiles.hpp" +#include "fimdlp/CPPFImdlp.h" bool file_exists(const std::string& name); std::pair, map> discretize(std::vector& X, mdlp::labels_t& y, std::vector features); diff --git a/vcpkg-configuration.json b/vcpkg-configuration.json new file mode 100644 index 0000000..b241242 --- /dev/null +++ b/vcpkg-configuration.json @@ -0,0 +1,21 @@ +{ + "default-registry": { + "kind": "git", + "baseline": "760bfd0c8d7c89ec640aec4df89418b7c2745605", + "repository": "https://github.com/microsoft/vcpkg" + }, + "registries": [ + { + "kind": "git", + "repository": "https://github.com/rmontanana/vcpkg-stash", + "baseline": "1ea69243c0e8b0de77c9d1dd6e1d7593ae7f3627", + "packages": [ + "arff-files", + "bayesnet", + "fimdlp", + "folding", + "libtorch-bin" + ] + } + ] +} \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json new file mode 
100644 index 0000000..14f81e4 --- /dev/null +++ b/vcpkg.json @@ -0,0 +1,47 @@ + { + "name": "platform", + "version-string": "1.1.0", + "dependencies": [ + "arff-files", + "nlohmann-json", + "fimdlp", + "libtorch-bin", + "folding", + "argparse", + "catch2" + ], + "overrides": [ + { + "name": "arff-files", + "version": "1.1.0" + }, + { + "name": "fimdlp", + "version": "2.0.1" + }, + { + "name": "libtorch-bin", + "version": "2.7.0" + }, + { + "name": "bayesnet", + "version": "1.1.1" + }, + { + "name": "folding", + "version": "1.1.1" + }, + { + "name": "argparse", + "version": "3.2" + }, + { + "name": "catch2", + "version": "3.8.1" + }, + { + "name": "nlohmann-json", + "version": "3.11.3" + } + ] + } \ No newline at end of file