From 059fd33b4e235548b405484475045d745561acdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Sat, 28 Jun 2025 01:27:22 +0200 Subject: [PATCH] Begin adding conan dependency manager --- CLAUDE.md | 77 +++++++++++++++++++++++++++++ CMakeLists.txt | 5 +- CMakeUserPresets.json | 9 ++++ conanfile.py | 48 +++++++++++------- test_consumer/CMakeLists.txt | 9 ++++ test_consumer/CMakeUserPresets.json | 9 ++++ test_consumer/conanfile.txt | 9 ++++ test_consumer/test_fimdlp.cpp | 39 +++++++++++++++ 8 files changed, 183 insertions(+), 22 deletions(-) create mode 100644 CLAUDE.md create mode 100644 CMakeUserPresets.json create mode 100644 test_consumer/CMakeLists.txt create mode 100644 test_consumer/CMakeUserPresets.json create mode 100644 test_consumer/conanfile.txt create mode 100644 test_consumer/test_fimdlp.cpp diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..59bb777 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,77 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +This is a C++ implementation of the MDLP (Minimum Description Length Principle) discretization algorithm based on Fayyad & Irani's paper. The library provides discretization methods for continuous-valued attributes in classification learning. + +## Build System + +The project uses CMake with a Makefile wrapper for common tasks: + +### Common Commands +- `make build` - Build release version with sample program +- `make test` - Run full test suite with coverage report +- `make install` - Install the library + +### Build Configurations +- **Release**: Built in `build_release/` directory +- **Debug**: Built in `build_debug/` directory (for testing) + +### Dependencies +- PyTorch (libtorch) - Required dependency +- GoogleTest - Fetched automatically for testing +- Coverage tools: lcov, genhtml + +## Code Architecture + +### Core Components + +1. **Discretizer** (`src/Discretizer.h/cpp`) - Abstract base class for all discretizers +2. **CPPFImdlp** (`src/CPPFImdlp.h/cpp`) - Main MDLP algorithm implementation +3. **BinDisc** (`src/BinDisc.h/cpp`) - K-bins discretization (quantile/uniform strategies) +4. **Metrics** (`src/Metrics.h/cpp`) - Entropy and information gain calculations + +### Key Data Types +- `samples_t` - Input data samples +- `labels_t` - Classification labels +- `indices_t` - Index arrays for sorting/processing +- `precision_t` - Floating-point precision type + +### Algorithm Flow +1. Data is sorted using labels as tie-breakers for identical values +2. MDLP recursively finds optimal cut points using entropy-based criteria +3. Cut points are validated to ensure meaningful splits +4. Transform method maps continuous values to discrete bins + +## Testing + +Tests are built with GoogleTest and include: +- `Metrics_unittest` - Entropy/information gain tests +- `FImdlp_unittest` - Core MDLP algorithm tests +- `BinDisc_unittest` - K-bins discretization tests +- `Discretizer_unittest` - Base class functionality tests + +### Running Tests +```bash +make test # Runs all tests and generates coverage report +cd build_debug/tests && ctest # Run tests directly +``` + +Coverage reports are generated at `build_debug/tests/coverage/index.html`. + +## Sample Usage + +The sample program demonstrates basic usage: +```bash +build_release/sample/sample -f iris -m 2 +``` + +## Development Notes + +- The library uses PyTorch tensors for efficient numerical operations +- Code follows C++17 standards +- Coverage is maintained at 100% +- The implementation handles edge cases like duplicate values and small intervals +- Conan package manager support is available via `conanfile.py` \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d5f6c9c..0c9b73c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ project(fimdlp set(CMAKE_CXX_STANDARD 17) cmake_policy(SET CMP0135 NEW) -find_package(Torch CONFIG REQUIRED) +find_package(Torch REQUIRED) # Options # ------- @@ -41,13 +41,12 @@ if (ENABLE_SAMPLE) endif() include_directories( - ${TORCH_INCLUDE_DIRS} ${fimdlp_SOURCE_DIR}/src ${CMAKE_BINARY_DIR}/configured_files/include ) add_library(fimdlp src/CPPFImdlp.cpp src/Metrics.cpp src/BinDisc.cpp src/Discretizer.cpp) -target_link_libraries(fimdlp "${TORCH_LIBRARIES}") +target_link_libraries(fimdlp torch::torch) # Installation # ------------ diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 0000000..71aeace --- /dev/null +++ b/CMakeUserPresets.json @@ -0,0 +1,9 @@ +{ + "version": 4, + "vendor": { + "conan": {} + }, + "include": [ + "build/Release/generators/CMakePresets.json" + ] +} \ No newline at end of file diff --git a/conanfile.py b/conanfile.py index 310d61d..5f9fc11 100644 --- a/conanfile.py +++ b/conanfile.py @@ -1,5 +1,8 @@ import re -from conan import ConanFile, CMake +import os +from conan import ConanFile +from conan.tools.cmake import CMake, CMakeToolchain, cmake_layout, CMakeDeps +from conan.tools.files import save, load class FimdlpConan(ConanFile): name = "fimdlp" @@ -10,25 +13,29 @@ class FimdlpConan(ConanFile): description = "Discretization algorithm based on the paper by Fayyad & Irani." topics = ("discretization", "classification", "machine learning") settings = "os", "compiler", "build_type", "arch" - generators = "cmake" - exports_sources = "src/*", "CMakeLists.txt", "README.md" + exports_sources = "src/*", "CMakeLists.txt", "README.md", "config/*", "fimdlpConfig.cmake.in" - def init(self): + def set_version(self): # Read the CMakeLists.txt file to get the version - # This is a simple example; you might want to use a more robust method - # to parse the CMakeLists.txt file. - # For example, you could use a regex to extract the version number. - with open("CMakeLists.txt", "r") as f: - lines = f.readlines() - for line in lines: - if "VERSION" in line: - # Extract the version number using regex - match = re.search(r"VERSION\s+(\d+\.\d+\.\d+)", line) - if match: - self.version = match.group(1) + try: + content = load(self, "CMakeLists.txt") + match = re.search(r"VERSION\s+(\d+\.\d+\.\d+)", content) + if match: + self.version = match.group(1) + except Exception: + self.version = "2.0.1" # fallback version def requirements(self): - self.requires("libtorch/2.7.0") # Adjust version as necessary + self.requires("libtorch/2.7.0") + + def layout(self): + cmake_layout(self) + + def generate(self): + deps = CMakeDeps(self) + deps.generate() + tc = CMakeToolchain(self) + tc.generate() def build(self): cmake = CMake(self) @@ -36,10 +43,13 @@ class FimdlpConan(ConanFile): cmake.build() def package(self): - # self.copy("*.h", dst="include", src="src/include") - # self.copy("*fimdlp*", dst="lib", keep_path=False) cmake = CMake(self) cmake.install() def package_info(self): - self.cpp_info.libs = ["fimdlp"] \ No newline at end of file + self.cpp_info.libs = ["fimdlp"] + self.cpp_info.includedirs = ["include"] + self.cpp_info.libdirs = ["lib"] + self.cpp_info.set_property("cmake_find_mode", "both") + self.cpp_info.set_property("cmake_target_name", "fimdlp::fimdlp") + self.cpp_info.set_property("cmake_file_name", "fimdlp") \ No newline at end of file diff --git a/test_consumer/CMakeLists.txt b/test_consumer/CMakeLists.txt new file mode 100644 index 0000000..b64bda5 --- /dev/null +++ b/test_consumer/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required(VERSION 3.20) +project(test_fimdlp) + +set(CMAKE_CXX_STANDARD 17) + +find_package(fimdlp REQUIRED) + +add_executable(test_fimdlp test_fimdlp.cpp) +target_link_libraries(test_fimdlp fimdlp::fimdlp) \ No newline at end of file diff --git a/test_consumer/CMakeUserPresets.json b/test_consumer/CMakeUserPresets.json new file mode 100644 index 0000000..71aeace --- /dev/null +++ b/test_consumer/CMakeUserPresets.json @@ -0,0 +1,9 @@ +{ + "version": 4, + "vendor": { + "conan": {} + }, + "include": [ + "build/Release/generators/CMakePresets.json" + ] +} \ No newline at end of file diff --git a/test_consumer/conanfile.txt b/test_consumer/conanfile.txt new file mode 100644 index 0000000..ab254f4 --- /dev/null +++ b/test_consumer/conanfile.txt @@ -0,0 +1,9 @@ +[requires] +fimdlp/2.0.1 + +[generators] +CMakeDeps +CMakeToolchain + +[layout] +cmake_layout \ No newline at end of file diff --git a/test_consumer/test_fimdlp.cpp b/test_consumer/test_fimdlp.cpp new file mode 100644 index 0000000..f840765 --- /dev/null +++ b/test_consumer/test_fimdlp.cpp @@ -0,0 +1,39 @@ +#include +#include +#include +#include + +int main() { + std::cout << "Testing FIMDLP package..." << std::endl; + + // Test data - simple continuous values with binary classification + mdlp::samples_t data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0}; + mdlp::labels_t labels = {0, 0, 0, 1, 1, 0, 1, 1, 1, 1}; + + std::cout << "Created test data with " << data.size() << " samples" << std::endl; + + // Test MDLP discretizer + mdlp::CPPFImdlp discretizer; + discretizer.fit(data, labels); + + auto cut_points = discretizer.getCutPoints(); + std::cout << "MDLP found " << cut_points.size() << " cut points" << std::endl; + + for (size_t i = 0; i < cut_points.size(); ++i) { + std::cout << "Cut point " << i << ": " << cut_points[i] << std::endl; + } + + // Test BinDisc discretizer + mdlp::BinDisc bin_discretizer(3, mdlp::strategy_t::UNIFORM); // 3 bins, uniform strategy + bin_discretizer.fit(data, labels); + + auto bin_cut_points = bin_discretizer.getCutPoints(); + std::cout << "BinDisc found " << bin_cut_points.size() << " cut points" << std::endl; + + for (size_t i = 0; i < bin_cut_points.size(); ++i) { + std::cout << "Bin cut point " << i << ": " << bin_cut_points[i] << std::endl; + } + + std::cout << "FIMDLP package test completed successfully!" << std::endl; + return 0; +} \ No newline at end of file