2 Commits

Author SHA1 Message Date
6c63bb6e84 Fix entropy occasional error 2025-08-22 19:50:13 +02:00
Ricardo Montañana Gómez
42b91d1391 Create version 2.1.1 (#12)
* Update version and dependencies

* Fix conan and create new version (#11)

* First approach

* Fix debug conan build target

* Add viewcoverage and fix coverage generation

* Add more tests to cover new integrity checks

* Add tests to accomplish 100%

* Fix conan-create makefile target

* Update debug build

* Fix release build

* Update github build workflow

* Update github workflow

* Update github workflow

* Update github workflow

* Update github workflow remove coverage report
2025-07-19 22:04:10 +02:00
9 changed files with 154 additions and 18 deletions

View File

@@ -5,6 +5,16 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [2.1.2] - 2025-08-22
### Added
- make info now gives info about the build status
### Fix
- Mistake in entropy computation
## [2.1.1] - 2025-07-17 ## [2.1.1] - 2025-07-17
### Internal Changes ### Internal Changes

View File

@@ -1,10 +1,10 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.20)
project(fimdlp project(fimdlp
VERSION 2.1.2
LANGUAGES CXX LANGUAGES CXX
DESCRIPTION "Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning." DESCRIPTION "Discretization algorithm based on the paper by Fayyad & Irani Multi-Interval Discretization of Continuous-Valued Attributes for Classification Learning."
HOMEPAGE_URL "https://github.com/rmontanana/mdlp" HOMEPAGE_URL "https://github.com/rmontanana/mdlp"
VERSION 2.1.1
) )
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
cmake_policy(SET CMP0135 NEW) cmake_policy(SET CMP0135 NEW)

View File

@@ -8,6 +8,18 @@ f_release = build_release
genhtml = genhtml genhtml = genhtml
docscdir = docs docscdir = docs
# Set the number of parallel jobs to the number of available processors minus 7
CPUS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null \
|| nproc --all 2>/dev/null \
|| sysctl -n hw.ncpu)
JOBS := $(shell n=$(CPUS); [ $${n} -gt 7 ] && echo $$((n-7)) || echo 1)
# Colors for output
GREEN = \033[0;32m
YELLOW = \033[1;33m
RED = \033[0;31m
NC = \033[0m # No Color
define build_target define build_target
@echo ">>> Building the project for $(1)..." @echo ">>> Building the project for $(1)..."
@if [ -d $(2) ]; then rm -fr $(2); fi @if [ -d $(2) ]; then rm -fr $(2); fi
@@ -16,6 +28,21 @@ define build_target
@cmake --build $(2) --config $(1) -j 8 @cmake --build $(2) --config $(1) -j 8
endef endef
define status_file_folder
@if [ -d $(1) ]; then \
st1="$(GREEN)"; \
else \
st1="$(RED)"; \
fi; \
if [ -f $(1)/libfimdlp.a ]; then \
st2="$(GREEN)"; \
else \
st2="$(RED)"; \
fi; \
printf " $(YELLOW)$(2):$(NC) $$st1 Folder $(NC) $$st2 Library $(NC)\n"
endef
debug: ## Build Debug version of the library debug: ## Build Debug version of the library
@$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON", "-o enable_testing=True") @$(call build_target,"Debug","$(f_debug)", "ENABLE_TESTING=ON", "-o enable_testing=True")
@@ -61,6 +88,22 @@ viewcoverage: ## View the html coverage report
@xdg-open $(docscdir)/coverage/index.html || open $(docscdir)/coverage/index.html 2>/dev/null @xdg-open $(docscdir)/coverage/index.html || open $(docscdir)/coverage/index.html 2>/dev/null
@echo ">>> Done"; @echo ">>> Done";
info: ## Show project information
@version=$$(grep -A1 "project(fimdlp" CMakeLists.txt | grep "VERSION" | sed 's/.*VERSION \([0-9.]*\).*/\1/'); \
printf "$(GREEN)FImdlp Library: $(YELLOW)ver. $$version$(NC)\n"
@echo ""
@printf "$(GREEN)Project folders:$(NC)\n"
$(call status_file_folder, $(f_release), "Build\ Release")
$(call status_file_folder, $(f_debug), "Build\ Debug\ \ ")
@echo ""
@printf "$(GREEN)Build commands:$(NC)\n"
@printf " $(YELLOW)make release$(NC) - Build library for release\n"
@printf " $(YELLOW)make debug$(NC) - Build library for debug\n"
@printf " $(YELLOW)make test$(NC) - Run tests\n"
@printf " $(YELLOW)Usage:$(NC) make help\n"
@echo ""
@printf " $(YELLOW)Parallel Jobs: $(GREEN)$(JOBS)$(NC)\n"
conan-create: ## Create the conan package conan-create: ## Create the conan package
@echo ">>> Creating the conan package..." @echo ">>> Creating the conan package..."
conan create . --build=missing -tf "" -s:a build_type=Release conan create . --build=missing -tf "" -s:a build_type=Release
@@ -69,7 +112,7 @@ conan-create: ## Create the conan package
help: ## Show help message help: ## Show help message
@IFS=$$'\n' ; \ @IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ help_lines=(`grep -Fh "##" $(MAKEFILE_LIST) | grep -Fv fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
printf "%s\n\n" "Usage: make [task]"; \ printf "%s\n\n" "Usage: make [task]"; \
printf "%-20s %s\n" "task" "help" ; \ printf "%-20s %s\n" "task" "help" ; \
printf "%-20s %s\n" "------" "----" ; \ printf "%-20s %s\n" "------" "----" ; \

View File

@@ -98,7 +98,7 @@ namespace mdlp {
const precision_t fraction = const precision_t fraction =
(percentile / 100.0 - percentI) / (percentile / 100.0 - percentI) /
(static_cast<precision_t>(indexLower + 1) / static_cast<precision_t>(data.size() - 1) - percentI); (static_cast<precision_t>(indexLower + 1) / static_cast<precision_t>(data.size() - 1) - percentI);
if (const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; value != results.back() || first) // first needed as results.back() return is undefined for empty vectors if (const auto value = data[indexLower] + (data[indexLower + 1] - data[indexLower]) * fraction; first || results.empty() || value != results.back()) // Check empty before calling back()
results.push_back(value); results.push_back(value);
first = false; first = false;
} }

View File

@@ -41,6 +41,9 @@ namespace mdlp {
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t); pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
inline precision_t safe_X_access(size_t idx) const inline precision_t safe_X_access(size_t idx) const
{ {
if (indices.empty()) {
throw std::out_of_range("Indices array is empty");
}
if (idx >= indices.size()) { if (idx >= indices.size()) {
throw std::out_of_range("Index out of bounds for indices array"); throw std::out_of_range("Index out of bounds for indices array");
} }
@@ -52,6 +55,9 @@ namespace mdlp {
} }
inline label_t safe_y_access(size_t idx) const inline label_t safe_y_access(size_t idx) const
{ {
if (indices.empty()) {
throw std::out_of_range("Indices array is empty");
}
if (idx >= indices.size()) { if (idx >= indices.size()) {
throw std::out_of_range("Index out of bounds for indices array"); throw std::out_of_range("Index out of bounds for indices array");
} }

View File

@@ -7,6 +7,7 @@
#include "Metrics.h" #include "Metrics.h"
#include <set> #include <set>
#include <cmath> #include <cmath>
#include <iostream>
using namespace std; using namespace std;
namespace mdlp { namespace mdlp {
@@ -18,8 +19,13 @@ namespace mdlp {
int Metrics::computeNumClasses(size_t start, size_t end) int Metrics::computeNumClasses(size_t start, size_t end)
{ {
set<int> nClasses; set<int> nClasses;
if (indices.empty() || start >= indices.size() || end > indices.size()) {
return 0;
}
for (auto i = start; i < end; ++i) { for (auto i = start; i < end; ++i) {
nClasses.insert(y[indices[i]]); if (i < indices.size() && indices[i] < y.size()) {
nClasses.insert(y[indices[i]]);
}
} }
return static_cast<int>(nClasses.size()); return static_cast<int>(nClasses.size());
} }
@@ -38,7 +44,7 @@ namespace mdlp {
{ {
if (end - start < 2) if (end - start < 2)
return 0; return 0;
// Check cache first with read lock // Check cache first with read lock
{ {
std::lock_guard<std::mutex> lock(cache_mutex); std::lock_guard<std::mutex> lock(cache_mutex);
@@ -46,15 +52,37 @@ namespace mdlp {
return entropyCache[{start, end}]; return entropyCache[{start, end}];
} }
} }
// Compute entropy outside of lock // Compute entropy outside of lock
precision_t p; precision_t p;
precision_t ventropy = 0; precision_t ventropy = 0;
int nElements = 0; int nElements = 0;
labels_t counts(numClasses + 1, 0);
for (auto i = &indices[start]; i != &indices[end]; ++i) { if (indices.empty() || start >= indices.size() || end > indices.size()) {
counts[y[*i]]++; return 0;
}
// First pass: find max label to size counts array properly
size_t max_label = 0;
for (size_t i = start; i < end; ++i) {
if (i >= indices.size()) break;
size_t idx = indices[i];
if (idx >= y.size()) continue;
size_t label = y[idx];
if (label > max_label) {
max_label = label;
}
}
labels_t counts(max_label + 1, 0);
// Second pass: count occurrences
for (size_t i = start; i < end; ++i) {
if (i >= indices.size()) break;
size_t idx = indices[i];
if (idx >= y.size()) continue;
size_t label = y[idx];
counts[label]++;
nElements++; nElements++;
} }
for (auto count : counts) { for (auto count : counts) {
@@ -63,13 +91,13 @@ namespace mdlp {
ventropy -= p * log2(p); ventropy -= p * log2(p);
} }
} }
// Update cache with write lock // Update cache with write lock
{ {
std::lock_guard<std::mutex> lock(cache_mutex); std::lock_guard<std::mutex> lock(cache_mutex);
entropyCache[{start, end}] = ventropy; entropyCache[{start, end}] = ventropy;
} }
return ventropy; return ventropy;
} }
@@ -82,7 +110,7 @@ namespace mdlp {
return igCache[make_tuple(start, cut, end)]; return igCache[make_tuple(start, cut, end)];
} }
} }
// Compute information gain outside of lock // Compute information gain outside of lock
precision_t iGain; precision_t iGain;
precision_t entropyInterval; precision_t entropyInterval;
@@ -91,7 +119,7 @@ namespace mdlp {
size_t nElementsLeft = cut - start; size_t nElementsLeft = cut - start;
size_t nElementsRight = end - cut; size_t nElementsRight = end - cut;
size_t nElements = end - start; size_t nElements = end - start;
entropyInterval = entropy(start, end); entropyInterval = entropy(start, end);
entropyLeft = entropy(start, cut); entropyLeft = entropy(start, cut);
entropyRight = entropy(cut, end); entropyRight = entropy(cut, end);
@@ -99,13 +127,13 @@ namespace mdlp {
(static_cast<precision_t>(nElementsLeft) * entropyLeft + (static_cast<precision_t>(nElementsLeft) * entropyLeft +
static_cast<precision_t>(nElementsRight) * entropyRight) / static_cast<precision_t>(nElementsRight) * entropyRight) /
static_cast<precision_t>(nElements); static_cast<precision_t>(nElements);
// Update cache with write lock // Update cache with write lock
{ {
std::lock_guard<std::mutex> lock(cache_mutex); std::lock_guard<std::mutex> lock(cache_mutex);
igCache[make_tuple(start, cut, end)] = iGain; igCache[make_tuple(start, cut, end)] = iGain;
} }
return iGain; return iGain;
} }

View File

@@ -41,7 +41,7 @@ namespace mdlp {
Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM); Discretizer* disc = new BinDisc(4, strategy_t::UNIFORM);
auto version = disc->version(); auto version = disc->version();
delete disc; delete disc;
EXPECT_EQ("2.1.1", version); EXPECT_EQ("2.1.2", version);
} }
TEST(Discretizer, BinIrisUniform) TEST(Discretizer, BinIrisUniform)
{ {

View File

@@ -417,6 +417,28 @@ namespace mdlp {
EXPECT_THROW_WITH_MESSAGE(safe_y_access(2), std::out_of_range, "Index out of bounds for y array"); EXPECT_THROW_WITH_MESSAGE(safe_y_access(2), std::out_of_range, "Index out of bounds for y array");
} }
TEST_F(TestFImdlp, SafeXAccessEmptyIndices)
{
// Test safe_X_access with empty indices array
X = { 1.0f, 2.0f, 3.0f };
y = { 1, 2, 3 };
indices = {}; // empty indices array
// This should trigger the indices.empty() exception in safe_X_access
EXPECT_THROW_WITH_MESSAGE(safe_X_access(0), std::out_of_range, "Indices array is empty");
}
TEST_F(TestFImdlp, SafeYAccessEmptyIndices)
{
// Test safe_y_access with empty indices array
X = { 1.0f, 2.0f, 3.0f };
y = { 1, 2, 3 };
indices = {}; // empty indices array
// This should trigger the indices.empty() exception in safe_y_access
EXPECT_THROW_WITH_MESSAGE(safe_y_access(0), std::out_of_range, "Indices array is empty");
}
TEST_F(TestFImdlp, SafeSubtractUnderflow) TEST_F(TestFImdlp, SafeSubtractUnderflow)
{ {
// Test safe_subtract with underflow condition (b > a) // Test safe_subtract with underflow condition (b > a)

View File

@@ -37,12 +37,15 @@ namespace mdlp {
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices); setData(y, indices);
ASSERT_NEAR(0.468996f, entropy(0, 10), precision); ASSERT_NEAR(0.468996f, entropy(0, 10), precision);
y = { 0, 0, 1, 2, 3 };
ASSERT_NEAR(1.5f, entropy(0, 4), precision);
ASSERT_NEAR(1.921928f, entropy(0, 5), precision);
} }
TEST_F(TestMetrics, EntropyDouble) TEST_F(TestMetrics, EntropyDouble)
{ {
y = { 0, 0, 1, 2, 3 }; y = { 0, 0, 1, 2, 3 };
samples_t expected_entropies = { 0.0, 0.0, 0.91829583, 1.5, 1.4575424759098898 }; samples_t expected_entropies = { 0.0, 0.0, 0.91829583, 1.5, 1.9219280948873623 };
for (auto idx = 0; idx < y.size(); ++idx) { for (auto idx = 0; idx < y.size(); ++idx) {
ASSERT_NEAR(expected_entropies[idx], entropy(0, idx + 1), precision); ASSERT_NEAR(expected_entropies[idx], entropy(0, idx + 1), precision);
} }
@@ -56,4 +59,28 @@ namespace mdlp {
setData(y, indices); setData(y, indices);
ASSERT_NEAR(0.108032f, informationGain(0, 5, 10), precision); ASSERT_NEAR(0.108032f, informationGain(0, 5, 10), precision);
} }
TEST_F(TestMetrics, EntropyBoundsChecking)
{
// Test the conditions that cause entropy to return 0
// Test 1: Empty indices array
indices_t empty_indices = {};
labels_t test_y = { 1, 2, 3 };
setData(test_y, empty_indices);
EXPECT_EQ(0, entropy(0, 1)) << "Should return 0 when indices is empty";
// Test 2: start >= indices.size()
indices_t small_indices = { 0, 1 };
setData(test_y, small_indices);
EXPECT_EQ(0, entropy(2, 3)) << "Should return 0 when start >= indices.size()";
EXPECT_EQ(0, entropy(5, 6)) << "Should return 0 when start >> indices.size()";
// Test 3: end > indices.size()
EXPECT_EQ(0, entropy(0, 3)) << "Should return 0 when end > indices.size()";
EXPECT_EQ(0, entropy(1, 5)) << "Should return 0 when end >> indices.size()";
// Test edge case: start == indices.size()
EXPECT_EQ(0, entropy(2, 2)) << "Should return 0 when start == indices.size()";
}
} }