diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..96d7957 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,182 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- Conan dependency manager support +- Technical analysis report + +### Changed +- Updated README.md +- Refactored library version and installation system +- Updated config variable names + +### Fixed +- Removed unneeded semicolon + +## [2.0.1] - 2024-07-22 + +### Added +- CMake install target and make install command +- Flag to control sample building in Makefile + +### Changed +- Library name changed to `fimdlp` +- Updated version numbers across test files + +### Fixed +- Version number consistency in tests + +## [2.0.0] - 2024-07-04 + +### Added +- Makefile with build & test actions for easier development +- PyTorch (libtorch) integration for tensor operations + +### Changed +- Major refactoring of build system +- Updated build workflows and CI configuration + +### Fixed +- BinDisc quantile calculation errors (#9) +- Error in percentile method calculation +- Integer type issues in calculations +- Multiple GitHub Actions configuration fixes + +## [1.2.1] - 2024-06-08 + +### Added +- PyTorch tensor methods for discretization +- Improved library build system + +### Changed +- Refactored sample build process + +### Fixed +- Library creation and linking issues +- Multiple GitHub Actions workflow fixes + +## [1.2.0] - 2024-06-05 + +### Added +- **Discretizer** - Abstract base class for all discretization algorithms (#8) +- **BinDisc** - K-bins discretization with quantile and uniform strategies (#7) +- Transform method to discretize values using existing cut points +- Support for multiple datasets in sample program +- Docker development container configuration + +### Changed +- Refactored system types throughout the library +- Improved sample program with better dataset handling +- Enhanced build system with debug options + +### Fixed +- Transform method initialization issues +- ARFF file attribute name extraction +- Sample program library binary separation + +## [1.1.3] - 2024-06-05 + +### Added +- `max_cutpoints` hyperparameter for controlling algorithm complexity +- `max_depth` and `min_length` as configurable hyperparameters +- Enhanced sample program with hyperparameter support +- Additional datasets for testing + +### Changed +- Improved constructor design and parameter handling +- Enhanced test coverage and reporting +- Refactored build system configuration + +### Fixed +- Depth initialization in fit method +- Code quality improvements and smell fixes +- Exception handling in value cut point calculations + +## [1.1.2] - 2023-04-01 + +### Added +- Comprehensive test suite with GitHub Actions CI +- SonarCloud integration for code quality analysis +- Enhanced build system with automated testing + +### Changed +- Improved GitHub Actions workflow configuration +- Updated project structure for better maintainability + +### Fixed +- Build system configuration issues +- Test execution and coverage reporting + +## [1.1.1] - 2023-02-22 + +### Added +- Limits header for proper compilation +- Enhanced build system support + +### Changed +- Updated version numbering system +- Improved SonarCloud configuration + +### Fixed +- ValueCutPoint exception handling (removed unnecessary exception) +- Build system compatibility issues +- GitHub Actions token configuration + +## [1.1.0] - 2023-02-21 + +### Added +- Classic algorithm implementation for performance comparison +- Enhanced ValueCutPoint logic with same_values detection +- Glass dataset support in sample program +- Debug configuration for development + +### Changed +- Refactored ValueCutPoint algorithm for better accuracy +- Improved candidate selection logic +- Enhanced sample program with multiple datasets + +### Fixed +- Sign error in valueCutPoint calculation +- Final cut value computation +- Duplicate dataset handling in sample + +## [1.0.0.0] - 2022-12-21 + +### Added +- Initial release of MDLP (Minimum Description Length Principle) discretization library +- Core CPPFImdlp algorithm implementation based on Fayyad & Irani's paper +- Entropy and information gain calculation methods +- Sample program demonstrating library usage +- CMake build system +- Basic test suite +- ARFF file format support for datasets + +### Features +- Recursive discretization using entropy-based criteria +- Stable sorting with tie-breaking for identical values +- Configurable algorithm parameters +- Cross-platform C++ implementation + +--- + +## Release Notes + +### Version 2.x +- **Breaking Changes**: Library renamed to `fimdlp` +- **Major Enhancement**: PyTorch integration for improved performance +- **New Features**: Comprehensive discretization framework with multiple algorithms + +### Version 1.x +- **Core Algorithm**: MDLP discretization implementation +- **Extensibility**: Hyperparameter support and algorithm variants +- **Quality**: Comprehensive testing and CI/CD pipeline + +### Version 1.0.x +- **Foundation**: Initial stable implementation +- **Algorithm**: Core MDLP discretization functionality \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c9b73c..0db7a9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(fimdlp LANGUAGES CXX DESCRIPTION "Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning." HOMEPAGE_URL "https://github.com/rmontanana/mdlp" - VERSION 2.0.1 + VERSION 2.1.0 ) set(CMAKE_CXX_STANDARD 17) cmake_policy(SET CMP0135 NEW) diff --git a/src/BinDisc.cpp b/src/BinDisc.cpp index 4f13b09..edc13a2 100644 --- a/src/BinDisc.cpp +++ b/src/BinDisc.cpp @@ -22,13 +22,15 @@ namespace mdlp { BinDisc::~BinDisc() = default; void BinDisc::fit(samples_t& X) { - // y is included for compatibility with the Discretizer interface - cutPoints.clear(); + // Input validation if (X.empty()) { - cutPoints.push_back(0.0); - cutPoints.push_back(0.0); - return; + throw std::invalid_argument("Input data X cannot be empty"); } + if (X.size() < static_cast(n_bins)) { + throw std::invalid_argument("Input data size must be at least equal to n_bins"); + } + + cutPoints.clear(); if (strategy == strategy_t::QUANTILE) { direction = bound_dir_t::RIGHT; fit_quantile(X); @@ -39,10 +41,31 @@ namespace mdlp { } void BinDisc::fit(samples_t& X, labels_t& y) { + // Input validation for supervised interface + if (X.size() != y.size()) { + throw std::invalid_argument("X and y must have the same size"); + } + if (X.empty() || y.empty()) { + throw std::invalid_argument("X and y cannot be empty"); + } + + // BinDisc is inherently unsupervised, but we validate inputs for consistency + // Note: y parameter is validated but not used in binning strategy fit(X); } std::vector linspace(precision_t start, precision_t end, int num) { + // Input validation + if (num < 2) { + throw std::invalid_argument("Number of points must be at least 2 for linspace"); + } + if (std::isnan(start) || std::isnan(end)) { + throw std::invalid_argument("Start and end values cannot be NaN"); + } + if (std::isinf(start) || std::isinf(end)) { + throw std::invalid_argument("Start and end values cannot be infinite"); + } + if (start == end) { return { start, end }; } @@ -60,6 +83,14 @@ namespace mdlp { } std::vector percentile(samples_t& data, const std::vector& percentiles) { + // Input validation + if (data.empty()) { + throw std::invalid_argument("Data cannot be empty for percentile calculation"); + } + if (percentiles.empty()) { + throw std::invalid_argument("Percentiles cannot be empty"); + } + // Implementation taken from https://dpilger26.github.io/NumCpp/doxygen/html/percentile_8hpp_source.html std::vector results; bool first = true; diff --git a/src/CPPFImdlp.cpp b/src/CPPFImdlp.cpp index d972604..910fe43 100644 --- a/src/CPPFImdlp.cpp +++ b/src/CPPFImdlp.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "CPPFImdlp.h" namespace mdlp { @@ -18,6 +19,17 @@ namespace mdlp { max_depth(max_depth_), proposed_cuts(proposed) { + // Input validation for constructor parameters + if (min_length_ < 3) { + throw std::invalid_argument("min_length must be greater than 2"); + } + if (max_depth_ < 1) { + throw std::invalid_argument("max_depth must be greater than 0"); + } + if (proposed < 0.0f) { + throw std::invalid_argument("proposed_cuts must be non-negative"); + } + direction = bound_dir_t::RIGHT; } @@ -49,12 +61,6 @@ namespace mdlp { if (X.empty() || y.empty()) { throw invalid_argument("X and y must have at least one element"); } - if (min_length < 3) { - throw invalid_argument("min_length must be greater than 2"); - } - if (max_depth < 1) { - throw invalid_argument("max_depth must be greater than 0"); - } indices = sortIndices(X_, y_); metrics.setData(y, indices); computeCutPoints(0, X.size(), 1); @@ -81,26 +87,32 @@ namespace mdlp { precision_t previous; precision_t actual; precision_t next; - previous = X[indices[idxPrev]]; - actual = X[indices[cut]]; - next = X[indices[idxNext]]; + previous = safe_X_access(idxPrev); + actual = safe_X_access(cut); + next = safe_X_access(idxNext); // definition 2 of the paper => X[t-1] < X[t] // get the first equal value of X in the interval while (idxPrev > start && actual == previous) { - previous = X[indices[--idxPrev]]; + --idxPrev; + previous = safe_X_access(idxPrev); } backWall = idxPrev == start && actual == previous; // get the last equal value of X in the interval while (idxNext < end - 1 && actual == next) { - next = X[indices[++idxNext]]; + ++idxNext; + next = safe_X_access(idxNext); } // # of duplicates before cutpoint - n = cut - 1 - idxPrev; + n = safe_subtract(safe_subtract(cut, 1), idxPrev); // # of duplicates after cutpoint - m = idxNext - cut - 1; + m = safe_subtract(safe_subtract(idxNext, cut), 1); // Decide which values to use - cut = cut + (backWall ? m + 1 : -n); - actual = X[indices[cut]]; + if (backWall) { + cut = cut + m + 1; + } else { + cut = safe_subtract(cut, n); + } + actual = safe_X_access(cut); return { (actual + previous) / 2, cut }; } @@ -109,7 +121,7 @@ namespace mdlp { size_t cut; pair result; // Check if the interval length and the depth are Ok - if (end - start < min_length || depth_ > max_depth) + if (end < start || safe_subtract(end, start) < min_length || depth_ > max_depth) return; depth = depth_ > depth ? depth_ : depth; cut = getCandidate(start, end); @@ -129,14 +141,14 @@ namespace mdlp { /* Definition 1: A binary discretization for A is determined by selecting the cut point TA for which E(A, TA; S) is minimal amongst all the candidate cut points. */ size_t candidate = numeric_limits::max(); - size_t elements = end - start; + size_t elements = safe_subtract(end, start); bool sameValues = true; precision_t entropy_left; precision_t entropy_right; precision_t minEntropy; // Check if all the values of the variable in the interval are the same for (size_t idx = start + 1; idx < end; idx++) { - if (X[indices[idx]] != X[indices[start]]) { + if (safe_X_access(idx) != safe_X_access(start)) { sameValues = false; break; } @@ -146,7 +158,7 @@ namespace mdlp { minEntropy = metrics.entropy(start, end); for (size_t idx = start + 1; idx < end; idx++) { // Cutpoints are always on boundaries (definition 2) - if (y[indices[idx]] == y[indices[idx - 1]]) + if (safe_y_access(idx) == safe_y_access(idx - 1)) continue; entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); @@ -168,7 +180,7 @@ namespace mdlp { precision_t ent; precision_t ent1; precision_t ent2; - auto N = precision_t(end - start); + auto N = precision_t(safe_subtract(end, start)); k = metrics.computeNumClasses(start, end); k1 = metrics.computeNumClasses(start, cut); k2 = metrics.computeNumClasses(cut, end); @@ -188,6 +200,9 @@ namespace mdlp { indices_t idx(X_.size()); std::iota(idx.begin(), idx.end(), 0); stable_sort(idx.begin(), idx.end(), [&X_, &y_](size_t i1, size_t i2) { + if (i1 >= X_.size() || i2 >= X_.size() || i1 >= y_.size() || i2 >= y_.size()) { + throw std::out_of_range("Index out of bounds in sort comparison"); + } if (X_[i1] == X_[i2]) return y_[i1] < y_[i2]; else @@ -206,7 +221,7 @@ namespace mdlp { size_t end; for (size_t idx = 0; idx < cutPoints.size(); idx++) { end = begin; - while (X[indices[end]] < cutPoints[idx] && end < X.size()) + while (end < indices.size() && safe_X_access(end) < cutPoints[idx] && end < X.size()) end++; entropy = metrics.entropy(begin, end); if (entropy > maxEntropy) { diff --git a/src/CPPFImdlp.h b/src/CPPFImdlp.h index 45fa65c..7a8c0ab 100644 --- a/src/CPPFImdlp.h +++ b/src/CPPFImdlp.h @@ -39,6 +39,33 @@ namespace mdlp { size_t getCandidate(size_t, size_t); size_t compute_max_num_cut_points() const; pair valueCutPoint(size_t, size_t, size_t); + private: + inline precision_t safe_X_access(size_t idx) const { + if (idx >= indices.size()) { + throw std::out_of_range("Index out of bounds for indices array"); + } + size_t real_idx = indices[idx]; + if (real_idx >= X.size()) { + throw std::out_of_range("Index out of bounds for X array"); + } + return X[real_idx]; + } + inline label_t safe_y_access(size_t idx) const { + if (idx >= indices.size()) { + throw std::out_of_range("Index out of bounds for indices array"); + } + size_t real_idx = indices[idx]; + if (real_idx >= y.size()) { + throw std::out_of_range("Index out of bounds for y array"); + } + return y[real_idx]; + } + inline size_t safe_subtract(size_t a, size_t b) const { + if (b > a) { + throw std::underflow_error("Subtraction would cause underflow"); + } + return a - b; + } }; } #endif diff --git a/src/Discretizer.cpp b/src/Discretizer.cpp index 1522fb2..f8c1ab0 100644 --- a/src/Discretizer.cpp +++ b/src/Discretizer.cpp @@ -10,6 +10,14 @@ namespace mdlp { labels_t& Discretizer::transform(const samples_t& data) { + // Input validation + if (data.empty()) { + throw std::invalid_argument("Data for transformation cannot be empty"); + } + if (cutPoints.size() < 2) { + throw std::runtime_error("Discretizer not fitted yet or no valid cut points found"); + } + discretizedData.clear(); discretizedData.reserve(data.size()); // CutPoints always have at least two items @@ -31,6 +39,26 @@ namespace mdlp { } void Discretizer::fit_t(const torch::Tensor& X_, const torch::Tensor& y_) { + // Validate tensor properties for security + if (!X_.is_contiguous() || !y_.is_contiguous()) { + throw std::invalid_argument("Tensors must be contiguous"); + } + if (X_.sizes().size() != 1 || y_.sizes().size() != 1) { + throw std::invalid_argument("Only 1D tensors supported"); + } + if (X_.dtype() != torch::kFloat32) { + throw std::invalid_argument("X tensor must be Float32 type"); + } + if (y_.dtype() != torch::kInt32) { + throw std::invalid_argument("y tensor must be Int32 type"); + } + if (X_.numel() != y_.numel()) { + throw std::invalid_argument("X and y tensors must have same number of elements"); + } + if (X_.numel() == 0) { + throw std::invalid_argument("Tensors cannot be empty"); + } + auto num_elements = X_.numel(); samples_t X(X_.data_ptr(), X_.data_ptr() + num_elements); labels_t y(y_.data_ptr(), y_.data_ptr() + num_elements); @@ -38,6 +66,20 @@ namespace mdlp { } torch::Tensor Discretizer::transform_t(const torch::Tensor& X_) { + // Validate tensor properties for security + if (!X_.is_contiguous()) { + throw std::invalid_argument("Tensor must be contiguous"); + } + if (X_.sizes().size() != 1) { + throw std::invalid_argument("Only 1D tensors supported"); + } + if (X_.dtype() != torch::kFloat32) { + throw std::invalid_argument("X tensor must be Float32 type"); + } + if (X_.numel() == 0) { + throw std::invalid_argument("Tensor cannot be empty"); + } + auto num_elements = X_.numel(); samples_t X(X_.data_ptr(), X_.data_ptr() + num_elements); auto result = transform(X); @@ -45,6 +87,26 @@ namespace mdlp { } torch::Tensor Discretizer::fit_transform_t(const torch::Tensor& X_, const torch::Tensor& y_) { + // Validate tensor properties for security + if (!X_.is_contiguous() || !y_.is_contiguous()) { + throw std::invalid_argument("Tensors must be contiguous"); + } + if (X_.sizes().size() != 1 || y_.sizes().size() != 1) { + throw std::invalid_argument("Only 1D tensors supported"); + } + if (X_.dtype() != torch::kFloat32) { + throw std::invalid_argument("X tensor must be Float32 type"); + } + if (y_.dtype() != torch::kInt32) { + throw std::invalid_argument("y tensor must be Int32 type"); + } + if (X_.numel() != y_.numel()) { + throw std::invalid_argument("X and y tensors must have same number of elements"); + } + if (X_.numel() == 0) { + throw std::invalid_argument("Tensors cannot be empty"); + } + auto num_elements = X_.numel(); samples_t X(X_.data_ptr(), X_.data_ptr() + num_elements); labels_t y(y_.data_ptr(), y_.data_ptr() + num_elements); diff --git a/src/Metrics.cpp b/src/Metrics.cpp index b76fbcc..2231e82 100644 --- a/src/Metrics.cpp +++ b/src/Metrics.cpp @@ -26,6 +26,7 @@ namespace mdlp { void Metrics::setData(const labels_t& y_, const indices_t& indices_) { + std::lock_guard lock(cache_mutex); indices = indices_; y = y_; numClasses = computeNumClasses(0, indices.size()); @@ -35,15 +36,23 @@ namespace mdlp { precision_t Metrics::entropy(size_t start, size_t end) { + if (end - start < 2) + return 0; + + // Check cache first with read lock + { + std::lock_guard lock(cache_mutex); + if (entropyCache.find({ start, end }) != entropyCache.end()) { + return entropyCache[{start, end}]; + } + } + + // Compute entropy outside of lock precision_t p; precision_t ventropy = 0; int nElements = 0; labels_t counts(numClasses + 1, 0); - if (end - start < 2) - return 0; - if (entropyCache.find({ start, end }) != entropyCache.end()) { - return entropyCache[{start, end}]; - } + for (auto i = &indices[start]; i != &indices[end]; ++i) { counts[y[*i]]++; nElements++; @@ -54,12 +63,27 @@ namespace mdlp { ventropy -= p * log2(p); } } - entropyCache[{start, end}] = ventropy; + + // Update cache with write lock + { + std::lock_guard lock(cache_mutex); + entropyCache[{start, end}] = ventropy; + } + return ventropy; } precision_t Metrics::informationGain(size_t start, size_t cut, size_t end) { + // Check cache first with read lock + { + std::lock_guard lock(cache_mutex); + if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { + return igCache[make_tuple(start, cut, end)]; + } + } + + // Compute information gain outside of lock precision_t iGain; precision_t entropyInterval; precision_t entropyLeft; @@ -67,9 +91,7 @@ namespace mdlp { size_t nElementsLeft = cut - start; size_t nElementsRight = end - cut; size_t nElements = end - start; - if (igCache.find(make_tuple(start, cut, end)) != igCache.end()) { - return igCache[make_tuple(start, cut, end)]; - } + entropyInterval = entropy(start, end); entropyLeft = entropy(start, cut); entropyRight = entropy(cut, end); @@ -77,7 +99,13 @@ namespace mdlp { (static_cast(nElementsLeft) * entropyLeft + static_cast(nElementsRight) * entropyRight) / static_cast(nElements); - igCache[make_tuple(start, cut, end)] = iGain; + + // Update cache with write lock + { + std::lock_guard lock(cache_mutex); + igCache[make_tuple(start, cut, end)] = iGain; + } + return iGain; } diff --git a/src/Metrics.h b/src/Metrics.h index eea4d4b..d24769b 100644 --- a/src/Metrics.h +++ b/src/Metrics.h @@ -8,6 +8,7 @@ #define CCMETRICS_H #include "typesFImdlp.h" +#include namespace mdlp { class Metrics { @@ -15,6 +16,7 @@ namespace mdlp { labels_t& y; indices_t& indices; int numClasses; + mutable std::mutex cache_mutex; cacheEnt_t entropyCache = cacheEnt_t(); cacheIg_t igCache = cacheIg_t(); public: