Fix version number

This commit is contained in:
2025-07-01 10:39:48 +02:00
parent 4d6cad8f08
commit 81f2e706d0
2 changed files with 82 additions and 54 deletions

View File

@@ -24,28 +24,28 @@ struct ArffSummary {
/** /**
* @brief Header-only C++17 library for parsing ARFF (Attribute-Relation File Format) files * @brief Header-only C++17 library for parsing ARFF (Attribute-Relation File Format) files
* *
* This class provides functionality to load and parse ARFF files, automatically detecting * This class provides functionality to load and parse ARFF files, automatically detecting
* numeric vs categorical features and performing factorization of categorical attributes. * numeric vs categorical features and performing factorization of categorical attributes.
* *
* @warning THREAD SAFETY: This class is NOT thread-safe! * @warning THREAD SAFETY: This class is NOT thread-safe!
* *
* Thread Safety Considerations: * Thread Safety Considerations:
* - Multiple instances can be used safely in different threads (each instance is independent) * - Multiple instances can be used safely in different threads (each instance is independent)
* - A single instance MUST NOT be accessed concurrently from multiple threads * - A single instance MUST NOT be accessed concurrently from multiple threads
* - All member functions (including getters) modify or access mutable state * - All member functions (including getters) modify or access mutable state
* - Static methods (summary, trim, split) are thread-safe as they don't access instance state * - Static methods (summary, trim, split) are thread-safe as they don't access instance state
* *
* Memory Safety: * Memory Safety:
* - Built-in protection against resource exhaustion with configurable limits * - Built-in protection against resource exhaustion with configurable limits
* - File size limit: 100 MB (DEFAULT_MAX_FILE_SIZE) * - File size limit: 100 MB (DEFAULT_MAX_FILE_SIZE)
* - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES) * - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
* - Feature count limit: 10,000 features (DEFAULT_MAX_FEATURES) * - Feature count limit: 10,000 features (DEFAULT_MAX_FEATURES)
* *
* Usage Patterns: * Usage Patterns:
* - Single-threaded: Create one instance, call load(), then access data via getters * - Single-threaded: Create one instance, call load(), then access data via getters
* - Multi-threaded: Create separate instances per thread, or use external synchronization * - Multi-threaded: Create separate instances per thread, or use external synchronization
* *
* @example * @example
* // Thread-safe usage pattern: * // Thread-safe usage pattern:
* void processFile(const std::string& filename) { * void processFile(const std::string& filename) {
@@ -55,24 +55,24 @@ struct ArffSummary {
* auto y = arff.getY(); * auto y = arff.getY();
* // Process data... * // Process data...
* } * }
* *
* @example * @example
* // UNSAFE usage pattern: * // UNSAFE usage pattern:
* ArffFiles globalArff; // Global instance * ArffFiles globalArff; // Global instance
* // Thread 1: globalArff.load("file1.arff"); // UNSAFE! * // Thread 1: globalArff.load("file1.arff"); // UNSAFE!
* // Thread 2: globalArff.load("file2.arff"); // UNSAFE! * // Thread 2: globalArff.load("file2.arff"); // UNSAFE!
*/ */
class ArffFiles { class ArffFiles {
const std::string VERSION = "1.1.0"; const std::string VERSION = "1.2.0";
// Memory usage limits (configurable via environment variables) // Memory usage limits (configurable via environment variables)
static constexpr size_t DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100 MB static constexpr size_t DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100 MB
static constexpr size_t DEFAULT_MAX_SAMPLES = 1000000; // 1 million samples static constexpr size_t DEFAULT_MAX_SAMPLES = 1000000; // 1 million samples
static constexpr size_t DEFAULT_MAX_FEATURES = 10000; // 10k features static constexpr size_t DEFAULT_MAX_FEATURES = 10000; // 10k features
public: public:
ArffFiles() = default; ArffFiles() = default;
// Move constructor // Move constructor
ArffFiles(ArffFiles&& other) noexcept ArffFiles(ArffFiles&& other) noexcept
: lines(std::move(other.lines)) : lines(std::move(other.lines))
@@ -86,7 +86,7 @@ public:
{ {
// Other object is left in a valid but unspecified state // Other object is left in a valid but unspecified state
} }
// Move assignment operator // Move assignment operator
ArffFiles& operator=(ArffFiles&& other) noexcept ArffFiles& operator=(ArffFiles&& other) noexcept
{ {
@@ -102,13 +102,13 @@ public:
} }
return *this; return *this;
} }
// Copy constructor (explicitly defaulted) // Copy constructor (explicitly defaulted)
ArffFiles(const ArffFiles& other) = default; ArffFiles(const ArffFiles& other) = default;
// Copy assignment operator (explicitly defaulted) // Copy assignment operator (explicitly defaulted)
ArffFiles& operator=(const ArffFiles& other) = default; ArffFiles& operator=(const ArffFiles& other) = default;
// Destructor (explicitly defaulted) // Destructor (explicitly defaulted)
~ArffFiles() = default; ~ArffFiles() = default;
void load(const std::string& fileName, bool classLast = true) void load(const std::string& fileName, bool classLast = true)
@@ -231,7 +231,7 @@ public:
const std::vector<int>& getY() const { return y; } const std::vector<int>& getY() const { return y; }
const std::map<std::string, bool>& getNumericAttributes() const { return numeric_features; } const std::map<std::string, bool>& getNumericAttributes() const { return numeric_features; }
const std::vector<std::pair<std::string, std::string>>& getAttributes() const { return attributes; }; const std::vector<std::pair<std::string, std::string>>& getAttributes() const { return attributes; };
// Move-enabled getters for efficient data transfer // Move-enabled getters for efficient data transfer
// WARNING: These methods move data OUT of the object, leaving it in an empty but valid state // WARNING: These methods move data OUT of the object, leaving it in an empty but valid state
// Use these when you want to transfer ownership of large data structures for performance // Use these when you want to transfer ownership of large data structures for performance
@@ -241,7 +241,7 @@ public:
std::map<std::string, std::vector<std::string>> moveStates() noexcept { return std::move(states); } std::map<std::string, std::vector<std::string>> moveStates() noexcept { return std::move(states); }
std::vector<std::pair<std::string, std::string>> moveAttributes() noexcept { return std::move(attributes); } std::vector<std::pair<std::string, std::string>> moveAttributes() noexcept { return std::move(attributes); }
std::map<std::string, bool> moveNumericAttributes() noexcept { return std::move(numeric_features); } std::map<std::string, bool> moveNumericAttributes() noexcept { return std::move(numeric_features); }
std::vector<std::string> split(const std::string& text, char delimiter) std::vector<std::string> split(const std::string& text, char delimiter)
{ {
std::vector<std::string> result; std::vector<std::string> result;
@@ -256,22 +256,23 @@ public:
private: private:
// Helper function to validate file path for security // Helper function to validate file path for security
static void validateFilePath(const std::string& fileName) { static void validateFilePath(const std::string& fileName)
{
if (fileName.empty()) { if (fileName.empty()) {
throw std::invalid_argument("File path cannot be empty"); throw std::invalid_argument("File path cannot be empty");
} }
// Check for path traversal attempts // Check for path traversal attempts
if (fileName.find("..") != std::string::npos) { if (fileName.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected in file path: " + fileName); throw std::invalid_argument("Path traversal detected in file path: " + fileName);
} }
// Check for absolute paths starting with / (Unix) or drive letters (Windows) // Check for absolute paths starting with / (Unix) or drive letters (Windows)
if (fileName[0] == '/' || (fileName.length() >= 3 && fileName[1] == ':')) { if (fileName[0] == '/' || (fileName.length() >= 3 && fileName[1] == ':')) {
// Allow absolute paths but log a warning - this is for user awareness // Allow absolute paths but log a warning - this is for user awareness
// In production, you might want to restrict this based on your security requirements // In production, you might want to restrict this based on your security requirements
} }
// Check for suspicious characters that could be used in path manipulation // Check for suspicious characters that could be used in path manipulation
const std::string suspiciousChars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"; const std::string suspiciousChars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
for (char c : suspiciousChars) { for (char c : suspiciousChars) {
@@ -279,33 +280,35 @@ private:
throw std::invalid_argument("Invalid character detected in file path"); throw std::invalid_argument("Invalid character detected in file path");
} }
} }
// Check for excessively long paths (potential buffer overflow attempts) // Check for excessively long paths (potential buffer overflow attempts)
constexpr size_t MAX_PATH_LENGTH = 4096; // Common filesystem limit constexpr size_t MAX_PATH_LENGTH = 4096; // Common filesystem limit
if (fileName.length() > MAX_PATH_LENGTH) { if (fileName.length() > MAX_PATH_LENGTH) {
throw std::invalid_argument("File path too long (exceeds " + std::to_string(MAX_PATH_LENGTH) + " characters)"); throw std::invalid_argument("File path too long (exceeds " + std::to_string(MAX_PATH_LENGTH) + " characters)");
} }
// Additional validation using filesystem operations when available // Additional validation using filesystem operations when available
try { try {
// Check if the file exists and validate its canonical path // Check if the file exists and validate its canonical path
if (std::filesystem::exists(fileName)) { if (std::filesystem::exists(fileName)) {
std::filesystem::path normalizedPath = std::filesystem::canonical(fileName); std::filesystem::path normalizedPath = std::filesystem::canonical(fileName);
std::string normalizedStr = normalizedPath.string(); std::string normalizedStr = normalizedPath.string();
// Check if normalized path still contains traversal attempts // Check if normalized path still contains traversal attempts
if (normalizedStr.find("..") != std::string::npos) { if (normalizedStr.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected after normalization: " + normalizedStr); throw std::invalid_argument("Path traversal detected after normalization: " + normalizedStr);
} }
} }
} catch (const std::filesystem::filesystem_error& e) { }
catch (const std::filesystem::filesystem_error& e) {
// If filesystem operations fail, we can still proceed with basic validation // If filesystem operations fail, we can still proceed with basic validation
// This ensures compatibility with systems where filesystem might not be fully available // This ensures compatibility with systems where filesystem might not be fully available
} }
} }
// Helper function to validate resource usage limits // Helper function to validate resource usage limits
static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0) { static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0)
{
// Check file size limit // Check file size limit
try { try {
if (std::filesystem::exists(fileName)) { if (std::filesystem::exists(fileName)) {
@@ -314,16 +317,17 @@ private:
throw std::invalid_argument("File size (" + std::to_string(fileSize) + " bytes) exceeds maximum allowed size (" + std::to_string(DEFAULT_MAX_FILE_SIZE) + " bytes)"); throw std::invalid_argument("File size (" + std::to_string(fileSize) + " bytes) exceeds maximum allowed size (" + std::to_string(DEFAULT_MAX_FILE_SIZE) + " bytes)");
} }
} }
} catch (const std::filesystem::filesystem_error&) { }
catch (const std::filesystem::filesystem_error&) {
// If filesystem operations fail, continue without size checking // If filesystem operations fail, continue without size checking
// This ensures compatibility with systems where filesystem might not be available // This ensures compatibility with systems where filesystem might not be available
} }
// Check sample count limit // Check sample count limit
if (sampleCount > DEFAULT_MAX_SAMPLES) { if (sampleCount > DEFAULT_MAX_SAMPLES) {
throw std::invalid_argument("Number of samples (" + std::to_string(sampleCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_SAMPLES) + ")"); throw std::invalid_argument("Number of samples (" + std::to_string(sampleCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_SAMPLES) + ")");
} }
// Check feature count limit // Check feature count limit
if (featureCount > DEFAULT_MAX_FEATURES) { if (featureCount > DEFAULT_MAX_FEATURES) {
throw std::invalid_argument("Number of features (" + std::to_string(featureCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_FEATURES) + ")"); throw std::invalid_argument("Number of features (" + std::to_string(featureCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_FEATURES) + ")");
@@ -352,12 +356,12 @@ private:
continue; continue;
auto values = attribute.second; auto values = attribute.second;
std::transform(values.begin(), values.end(), values.begin(), ::toupper); std::transform(values.begin(), values.end(), values.begin(), ::toupper);
// Enhanced attribute type detection // Enhanced attribute type detection
bool isNumeric = values == "REAL" || values == "INTEGER" || values == "NUMERIC"; bool isNumeric = values == "REAL" || values == "INTEGER" || values == "NUMERIC";
bool isDate = values.find("DATE") != std::string::npos; bool isDate = values.find("DATE") != std::string::npos;
bool isString = values == "STRING"; bool isString = values == "STRING";
// For now, treat DATE and STRING as categorical (non-numeric) // For now, treat DATE and STRING as categorical (non-numeric)
// This provides basic compatibility while maintaining existing functionality // This provides basic compatibility while maintaining existing functionality
numeric_features[feature] = isNumeric; numeric_features[feature] = isNumeric;
@@ -490,7 +494,7 @@ private:
// Validate file path for security // Validate file path for security
validateFilePath(fileName); validateFilePath(fileName);
// Validate file size before processing // Validate file size before processing
validateResourceLimits(fileName); validateResourceLimits(fileName);
@@ -507,13 +511,13 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue; continue;
} }
// Skip sparse data format for now (lines starting with '{') // Skip sparse data format for now (lines starting with '{')
// Future enhancement: implement full sparse data support // Future enhancement: implement full sparse data support
if (!line.empty() && line[0] == '{') { if (!line.empty() && line[0] == '{') {
continue; continue;
} }
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line); std::stringstream ss(line);
ss >> keyword >> attribute; ss >> keyword >> attribute;
@@ -564,7 +568,7 @@ private:
if (lines.empty()) { if (lines.empty()) {
throw std::invalid_argument("No data samples found in file"); throw std::invalid_argument("No data samples found in file");
} }
// Validate loaded data dimensions against limits // Validate loaded data dimensions against limits
validateResourceLimits(fileName, lines.size(), attributes.size()); validateResourceLimits(fileName, lines.size(), attributes.size());
@@ -621,15 +625,16 @@ private:
} }
// Common helper function to parse ARFF file attributes and count samples // Common helper function to parse ARFF file attributes and count samples
static int parseArffFile(const std::string& fileName, static int parseArffFile(const std::string& fileName,
std::vector<std::pair<std::string, std::string>>& attributes, std::vector<std::pair<std::string, std::string>>& attributes,
std::set<std::string>& uniqueClasses, std::set<std::string>& uniqueClasses,
size_t& sampleCount, size_t& sampleCount,
int classIndex = -1, int classIndex = -1,
const std::string& classNameToFind = "") { const std::string& classNameToFind = "")
{
// Validate file path for security // Validate file path for security
validateFilePath(fileName); validateFilePath(fileName);
std::ifstream file(fileName); std::ifstream file(fileName);
if (!file.is_open()) { if (!file.is_open()) {
throw std::invalid_argument("Unable to open file: " + fileName); throw std::invalid_argument("Unable to open file: " + fileName);
@@ -645,12 +650,12 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue; continue;
} }
// Skip sparse data format for now (lines starting with '{') // Skip sparse data format for now (lines starting with '{')
if (!line.empty() && line[0] == '{') { if (!line.empty() && line[0] == '{') {
continue; continue;
} }
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line); std::stringstream ss(line);
std::string keyword, attribute, type_w; std::string keyword, attribute, type_w;
@@ -717,7 +722,7 @@ private:
// Use specific index // Use specific index
classValue = trim(tokens[actualClassIndex]); classValue = trim(tokens[actualClassIndex]);
} }
if (!classValue.empty()) { if (!classValue.empty()) {
uniqueClasses.insert(classValue); uniqueClasses.insert(classValue);
sampleCount++; sampleCount++;
@@ -726,7 +731,7 @@ private:
} }
} }
while (getline(file, line)); while (getline(file, line));
return actualClassIndex; return actualClassIndex;
} }

View File

@@ -6,9 +6,7 @@ from conan.tools.files import copy
class ArffFilesConan(ConanFile): class ArffFilesConan(ConanFile):
name = "arff-files" name = "arff-files"
version = "X.X.X" version = "X.X.X"
description = ( description = "Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
"Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
)
url = "https://github.com/rmontanana/ArffFiles" url = "https://github.com/rmontanana/ArffFiles"
license = "MIT" license = "MIT"
homepage = "https://github.com/rmontanana/ArffFiles" homepage = "https://github.com/rmontanana/ArffFiles"
@@ -30,10 +28,35 @@ class ArffFilesConan(ConanFile):
def package(self): def package(self):
# Copy header file to include directory # Copy header file to include directory
copy(self, "*.hpp", src=self.source_folder, dst=self.package_folder, keep_path=False) copy(
self,
"*.hpp",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
# Copy license and readme for package documentation # Copy license and readme for package documentation
copy(self, "LICENSE", src=self.source_folder, dst=self.package_folder, keep_path=False) copy(
copy(self, "README.md", src=self.source_folder, dst=self.package_folder, keep_path=False) self,
"LICENSE",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
copy(
self,
"README.md",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
copy(
self,
"CMakeLists.txt",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
def package_info(self): def package_info(self):
# Header-only library configuration # Header-only library configuration