6 Commits
v1.2.0 ... main

12 changed files with 242 additions and 81 deletions

View File

@@ -0,0 +1,12 @@
{
"permissions": {
"allow": [
"Bash(find:*)",
"Bash(mkdir:*)",
"Bash(cmake:*)",
"Bash(make:*)",
"Bash(cat:*)"
],
"deny": []
}
}

3
.gitignore vendored
View File

@@ -37,4 +37,5 @@ build_*/**
cmake-build*/**
.idea
puml/**
.vscode/settings.json
.vscode/settings.json
CMakeUserPresets.json

View File

@@ -10,6 +10,8 @@
#include <cctype> // std::isdigit
#include <algorithm> // std::all_of std::transform
#include <filesystem> // For file size checking
#include "arffFiles_config.h"
// Summary information structure for ARFF files
struct ArffSummary {
@@ -24,28 +26,28 @@ struct ArffSummary {
/**
* @brief Header-only C++17 library for parsing ARFF (Attribute-Relation File Format) files
*
*
* This class provides functionality to load and parse ARFF files, automatically detecting
* numeric vs categorical features and performing factorization of categorical attributes.
*
*
* @warning THREAD SAFETY: This class is NOT thread-safe!
*
*
* Thread Safety Considerations:
* - Multiple instances can be used safely in different threads (each instance is independent)
* - A single instance MUST NOT be accessed concurrently from multiple threads
* - All member functions (including getters) modify or access mutable state
* - Static methods (summary, trim, split) are thread-safe as they don't access instance state
*
*
* Memory Safety:
* - Built-in protection against resource exhaustion with configurable limits
* - File size limit: 100 MB (DEFAULT_MAX_FILE_SIZE)
* - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
* - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
* - Feature count limit: 10,000 features (DEFAULT_MAX_FEATURES)
*
*
* Usage Patterns:
* - Single-threaded: Create one instance, call load(), then access data via getters
* - Multi-threaded: Create separate instances per thread, or use external synchronization
*
*
* @example
* // Thread-safe usage pattern:
* void processFile(const std::string& filename) {
@@ -55,24 +57,23 @@ struct ArffSummary {
* auto y = arff.getY();
* // Process data...
* }
*
* @example
*
* @example
* // UNSAFE usage pattern:
* ArffFiles globalArff; // Global instance
* // Thread 1: globalArff.load("file1.arff"); // UNSAFE!
* // Thread 2: globalArff.load("file2.arff"); // UNSAFE!
*/
class ArffFiles {
const std::string VERSION = "1.1.0";
private:
// Memory usage limits (configurable via environment variables)
static constexpr size_t DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100 MB
static constexpr size_t DEFAULT_MAX_SAMPLES = 1000000; // 1 million samples
static constexpr size_t DEFAULT_MAX_FEATURES = 10000; // 10k features
public:
ArffFiles() = default;
// Move constructor
ArffFiles(ArffFiles&& other) noexcept
: lines(std::move(other.lines))
@@ -86,7 +87,7 @@ public:
{
// Other object is left in a valid but unspecified state
}
// Move assignment operator
ArffFiles& operator=(ArffFiles&& other) noexcept
{
@@ -102,13 +103,13 @@ public:
}
return *this;
}
// Copy constructor (explicitly defaulted)
ArffFiles(const ArffFiles& other) = default;
// Copy assignment operator (explicitly defaulted)
ArffFiles& operator=(const ArffFiles& other) = default;
// Copy constructor (explicitly deleted)
ArffFiles(const ArffFiles& other) = delete;
// Copy assignment operator (explicitly deleted)
ArffFiles& operator=(const ArffFiles& other) = delete;
// Destructor (explicitly defaulted)
~ArffFiles() = default;
void load(const std::string& fileName, bool classLast = true)
@@ -231,7 +232,7 @@ public:
const std::vector<int>& getY() const { return y; }
const std::map<std::string, bool>& getNumericAttributes() const { return numeric_features; }
const std::vector<std::pair<std::string, std::string>>& getAttributes() const { return attributes; };
// Move-enabled getters for efficient data transfer
// WARNING: These methods move data OUT of the object, leaving it in an empty but valid state
// Use these when you want to transfer ownership of large data structures for performance
@@ -241,7 +242,7 @@ public:
std::map<std::string, std::vector<std::string>> moveStates() noexcept { return std::move(states); }
std::vector<std::pair<std::string, std::string>> moveAttributes() noexcept { return std::move(attributes); }
std::map<std::string, bool> moveNumericAttributes() noexcept { return std::move(numeric_features); }
std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
@@ -252,26 +253,27 @@ public:
}
return result;
}
std::string version() const { return VERSION; }
std::string version() const { return ARFFLIB_VERSION; }
private:
// Helper function to validate file path for security
static void validateFilePath(const std::string& fileName) {
static void validateFilePath(const std::string& fileName)
{
if (fileName.empty()) {
throw std::invalid_argument("File path cannot be empty");
}
// Check for path traversal attempts
if (fileName.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected in file path: " + fileName);
}
// Check for absolute paths starting with / (Unix) or drive letters (Windows)
if (fileName[0] == '/' || (fileName.length() >= 3 && fileName[1] == ':')) {
// Allow absolute paths but log a warning - this is for user awareness
// In production, you might want to restrict this based on your security requirements
}
// Check for suspicious characters that could be used in path manipulation
const std::string suspiciousChars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
for (char c : suspiciousChars) {
@@ -279,33 +281,35 @@ private:
throw std::invalid_argument("Invalid character detected in file path");
}
}
// Check for excessively long paths (potential buffer overflow attempts)
constexpr size_t MAX_PATH_LENGTH = 4096; // Common filesystem limit
if (fileName.length() > MAX_PATH_LENGTH) {
throw std::invalid_argument("File path too long (exceeds " + std::to_string(MAX_PATH_LENGTH) + " characters)");
}
// Additional validation using filesystem operations when available
try {
// Check if the file exists and validate its canonical path
if (std::filesystem::exists(fileName)) {
std::filesystem::path normalizedPath = std::filesystem::canonical(fileName);
std::string normalizedStr = normalizedPath.string();
// Check if normalized path still contains traversal attempts
if (normalizedStr.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected after normalization: " + normalizedStr);
}
}
} catch (const std::filesystem::filesystem_error& e) {
}
catch (const std::filesystem::filesystem_error& e) {
// If filesystem operations fail, we can still proceed with basic validation
// This ensures compatibility with systems where filesystem might not be fully available
}
}
// Helper function to validate resource usage limits
static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0) {
static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0)
{
// Check file size limit
try {
if (std::filesystem::exists(fileName)) {
@@ -314,16 +318,17 @@ private:
throw std::invalid_argument("File size (" + std::to_string(fileSize) + " bytes) exceeds maximum allowed size (" + std::to_string(DEFAULT_MAX_FILE_SIZE) + " bytes)");
}
}
} catch (const std::filesystem::filesystem_error&) {
}
catch (const std::filesystem::filesystem_error&) {
// If filesystem operations fail, continue without size checking
// This ensures compatibility with systems where filesystem might not be available
}
// Check sample count limit
if (sampleCount > DEFAULT_MAX_SAMPLES) {
throw std::invalid_argument("Number of samples (" + std::to_string(sampleCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_SAMPLES) + ")");
}
// Check feature count limit
if (featureCount > DEFAULT_MAX_FEATURES) {
throw std::invalid_argument("Number of features (" + std::to_string(featureCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_FEATURES) + ")");
@@ -352,12 +357,12 @@ private:
continue;
auto values = attribute.second;
std::transform(values.begin(), values.end(), values.begin(), ::toupper);
// Enhanced attribute type detection
bool isNumeric = values == "REAL" || values == "INTEGER" || values == "NUMERIC";
bool isDate = values.find("DATE") != std::string::npos;
bool isString = values == "STRING";
// For now, treat DATE and STRING as categorical (non-numeric)
// This provides basic compatibility while maintaining existing functionality
numeric_features[feature] = isNumeric;
@@ -490,7 +495,7 @@ private:
// Validate file path for security
validateFilePath(fileName);
// Validate file size before processing
validateResourceLimits(fileName);
@@ -507,13 +512,13 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
// Skip sparse data format for now (lines starting with '{')
// Future enhancement: implement full sparse data support
if (!line.empty() && line[0] == '{') {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
ss >> keyword >> attribute;
@@ -564,7 +569,7 @@ private:
if (lines.empty()) {
throw std::invalid_argument("No data samples found in file");
}
// Validate loaded data dimensions against limits
validateResourceLimits(fileName, lines.size(), attributes.size());
@@ -621,15 +626,16 @@ private:
}
// Common helper function to parse ARFF file attributes and count samples
static int parseArffFile(const std::string& fileName,
std::vector<std::pair<std::string, std::string>>& attributes,
std::set<std::string>& uniqueClasses,
size_t& sampleCount,
int classIndex = -1,
const std::string& classNameToFind = "") {
static int parseArffFile(const std::string& fileName,
std::vector<std::pair<std::string, std::string>>& attributes,
std::set<std::string>& uniqueClasses,
size_t& sampleCount,
int classIndex = -1,
const std::string& classNameToFind = "")
{
// Validate file path for security
validateFilePath(fileName);
std::ifstream file(fileName);
if (!file.is_open()) {
throw std::invalid_argument("Unable to open file: " + fileName);
@@ -645,12 +651,12 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
// Skip sparse data format for now (lines starting with '{')
if (!line.empty() && line[0] == '{') {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
std::string keyword, attribute, type_w;
@@ -717,7 +723,7 @@ private:
// Use specific index
classValue = trim(tokens[actualClassIndex]);
}
if (!classValue.empty()) {
uniqueClasses.insert(classValue);
sampleCount++;
@@ -726,7 +732,7 @@ private:
}
}
while (getline(file, line));
return actualClassIndex;
}

View File

@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [1.2.1] 2025-07-15 Bug Fixes and Improvements
### Added
- Library version from CMake project to `ArffFiles.hpp`
- Library `catch2` as a conan test requirement
- Install target for CMake
## [1.2.0] 2025-06-27 Refactoring and Improvements
### Added

View File

@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.20)
project(ArffFiles
VERSION 1.2.0
VERSION 1.2.1
DESCRIPTION "Library to read Arff Files and return STL vectors with the data read."
HOMEPAGE_URL "https://github.com/rmontanana/ArffFiles"
LANGUAGES CXX
@@ -41,14 +41,60 @@ add_subdirectory(config)
# -------
if (ENABLE_TESTING)
MESSAGE("Testing enabled")
Include(FetchContent)
FetchContent_Declare(Catch2
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
GIT_TAG v3.3.2
)
FetchContent_MakeAvailable(Catch2)
find_package(Catch2 REQUIRED)
include(CTest)
add_subdirectory(tests)
endif (ENABLE_TESTING)
add_library(ArffFiles INTERFACE ArffFiles.hpp)
target_include_directories(ArffFiles INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/configured_files/include>
$<INSTALL_INTERFACE:include>
)
# Install
# -------
install(TARGETS ArffFiles EXPORT ArffFilesTargets
INCLUDES DESTINATION include
)
install(EXPORT ArffFilesTargets
FILE ArffFilesTargets.cmake
NAMESPACE ArffFiles::
DESTINATION lib/cmake/ArffFiles
)
# Install the main header file
install(FILES ArffFiles.hpp
DESTINATION include
)
# Install the generated configuration header
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/configured_files/include/arffFiles_config.h"
DESTINATION include
)
# Install documentation files
install(FILES LICENSE README.md
DESTINATION share/doc/ArffFiles
)
# Create and install package configuration files
include(CMakePackageConfigHelpers)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/ArffFilesConfigVersion.cmake"
VERSION ${PROJECT_VERSION}
COMPATIBILITY AnyNewerVersion
)
configure_package_config_file(
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/ArffFilesConfig.cmake.in"
"${CMAKE_CURRENT_BINARY_DIR}/ArffFilesConfig.cmake"
INSTALL_DESTINATION lib/cmake/ArffFiles
)
install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/ArffFilesConfig.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/ArffFilesConfigVersion.cmake"
DESTINATION lib/cmake/ArffFiles
)

11
CMakeLists_conan.txt Normal file
View File

@@ -0,0 +1,11 @@
# Minimal CMake project: declares the ArffFiles project metadata and processes
# only the config subdirectory (no tests, no library target, no install rules).
# NOTE(review): presumably a trimmed-down CMakeLists used when packaging with
# Conan, judging by the file name — confirm against the Conan recipe.
cmake_minimum_required(VERSION 3.20)
# The VERSION given here is what populates the PROJECT_VERSION* variables.
project(ArffFiles
VERSION 1.2.1
DESCRIPTION "Library to read Arff Files and return STL vectors with the data read."
HOMEPAGE_URL "https://github.com/rmontanana/ArffFiles"
LANGUAGES CXX
)
# Subdirectories
add_subdirectory(config)

View File

@@ -25,10 +25,12 @@ clean: ## Clean the tests info
@echo ">>> Done";
build: ## Build a debug version of the project
@echo ">>> Building Debug ArffFiles...";
@if [ -d ./$(f_debug) ]; then rm -rf ./$(f_debug); fi
@echo ">>> Building Debug Folding...";
@if [ -d $(f_debug) ]; then rm -rf $(f_debug); fi
@mkdir $(f_debug);
@cmake -S . -B $(f_debug) -D CMAKE_BUILD_TYPE=Debug -D ENABLE_TESTING=ON -D CODE_COVERAGE=ON
conan install . -of $(f_debug) -s build_type=Debug -b missing
cmake -B $(f_debug) -S . -DCMAKE_BUILD_TYPE=Debug -DCMAKE_TOOLCHAIN_FILE=$(f_debug)/conan_toolchain.cmake -DENABLE_TESTING=ON
cmake --build $(f_debug) -t $(test_targets) $(n_procs)
@echo ">>> Done";
opt = ""

View File

@@ -29,10 +29,10 @@ A modern C++17 header-only library to read **ARFF (Attribute-Relation File Forma
```bash
# Add the package to your conanfile.txt
[requires]
arff-files/1.0.1
arff-files/1.2.1
# Or install directly
conan install arff-files/1.0.1@
conan install arff-files/1.2.1@
```
### Manual Installation

View File

@@ -0,0 +1,5 @@
# CMake package configuration template.
# NOTE(review): presumably cmake/ArffFilesConfig.cmake.in, the input passed to
# configure_package_config_file() in the top-level CMakeLists — confirm.
# @PACKAGE_INIT@ is replaced by configure_package_config_file() with its helper
# macros (including check_required_components used below).
@PACKAGE_INIT@
# Load the exported targets file that sits next to this config file.
include("${CMAKE_CURRENT_LIST_DIR}/ArffFilesTargets.cmake")
# Verify that every component requested by the consumer was found.
check_required_components(ArffFiles)

View File

@@ -1,21 +1,29 @@
import re
from conan import ConanFile
from conan.tools.files import copy
from conan.tools.cmake import CMakeToolchain, CMakeDeps
class ArffFilesConan(ConanFile):
name = "arff-files"
version = "X.X.X"
description = (
"Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
)
description = "Header-only library to read ARFF (Attribute-Relation \
File Format) files and return STL vectors with the data read."
url = "https://github.com/rmontanana/ArffFiles"
license = "MIT"
homepage = "https://github.com/rmontanana/ArffFiles"
topics = ("arff", "data-processing", "file-parsing", "header-only", "cpp17")
no_copy_source = True
exports_sources = "ArffFiles.hpp", "LICENSE", "README.md"
exports_sources = (
"ArffFiles.hpp",
"LICENSE",
"README.md",
"CMakeLists.txt",
"config/*",
"cmake/*",
)
package_type = "header-library"
settings = "build_type", "compiler", "arch", "os"
def init(self):
# Read the CMakeLists.txt file to get the version
@@ -28,12 +36,76 @@ class ArffFilesConan(ConanFile):
if match:
self.version = match.group(1)
def build_requirements(self):
self.tool_requires("cmake/[>=3.15]")
self.test_requires("catch2/3.8.1")
def layout(self):
# Only use cmake_layout for conan packaging, not for development builds
# This can be detected by checking if we're in a conan cache folder
if (
hasattr(self, "folders")
and hasattr(self.folders, "base_build")
and self.folders.base_build
and ".conan2" in self.folders.base_build
):
from conan.tools.cmake import cmake_layout
cmake_layout(self)
def generate(self):
# Generate CMake toolchain file
tc = CMakeToolchain(self)
tc.generate()
# Generate CMake dependencies file (needed for test requirements like catch2)
deps = CMakeDeps(self)
deps.generate()
def build(self):
# Use CMake to generate the config file through existing config system
from conan.tools.cmake import CMake
cmake = CMake(self)
# Configure with minimal options - just enough to generate the config file
cmake.configure(
build_script_folder=None,
cli_args=["-DENABLE_TESTING=OFF", "-DCODE_COVERAGE=OFF"],
)
# No need to build anything, just configure to generate the config file
def package(self):
# Copy header file to include directory
copy(self, "*.hpp", src=self.source_folder, dst=self.package_folder, keep_path=False)
# Copy header file
copy(
self,
"ArffFiles.hpp",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
# Copy the generated config file from CMake build folder
copy(
self,
"arffFiles_config.h",
src=f"{self.build_folder}/configured_files/include",
dst=self.package_folder,
keep_path=False,
)
# Copy license and readme for package documentation
copy(self, "LICENSE", src=self.source_folder, dst=self.package_folder, keep_path=False)
copy(self, "README.md", src=self.source_folder, dst=self.package_folder, keep_path=False)
copy(
self,
"LICENSE",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
copy(
self,
"README.md",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
def package_info(self):
# Header-only library configuration

View File

@@ -1,11 +1,10 @@
#pragma once
#include <string>
#include <string_view>
#define ARFFLIB_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
#define ARFFLIB_VERSION_MINOR @PROJECT_VERSION_MINOR@
#define ARFFLIB_VERSION_PATCH @PROJECT_VERSION_PATCH@
#define PROJECT_VERSION_MAJOR @PROJECT_VERSION_MAJOR @
#define PROJECT_VERSION_MINOR @PROJECT_VERSION_MINOR @
#define PROJECT_VERSION_PATCH @PROJECT_VERSION_PATCH @
#define ARFFLIB_VERSION "@PROJECT_VERSION@"
static constexpr std::string_view arffFiles_project_name = "@PROJECT_NAME@";
static constexpr std::string_view arffFiles_project_version = "@PROJECT_VERSION@";

View File

@@ -3,7 +3,6 @@
#include <catch2/generators/catch_generators.hpp>
#include <catch2/matchers/catch_matchers_string.hpp>
#include "ArffFiles.hpp"
#include "arffFiles_config.h"
#include <iostream>
class Paths {
@@ -28,7 +27,7 @@ public:
TEST_CASE("Version Test", "[ArffFiles]")
{
ArffFiles arff;
REQUIRE(arff.version() == "1.1.0");
REQUIRE(arff.version() == "1.2.1");
}
TEST_CASE("Load Test", "[ArffFiles]")
{