Fix version number

This commit is contained in:
2025-07-01 10:39:48 +02:00
parent 4d6cad8f08
commit 81f2e706d0
2 changed files with 82 additions and 54 deletions

View File

@@ -24,28 +24,28 @@ struct ArffSummary {
/**
* @brief Header-only C++17 library for parsing ARFF (Attribute-Relation File Format) files
*
*
* This class provides functionality to load and parse ARFF files, automatically detecting
* numeric vs categorical features and performing factorization of categorical attributes.
*
*
* @warning THREAD SAFETY: This class is NOT thread-safe!
*
*
* Thread Safety Considerations:
* - Multiple instances can be used safely in different threads (each instance is independent)
* - A single instance MUST NOT be accessed concurrently from multiple threads
* - All member functions (including getters) modify or access mutable state
* - Static methods (summary, trim) are thread-safe as they don't access instance state
*   (note: split is declared as a non-static member function, so it is called on an instance)
*
*
* Memory Safety:
* - Built-in protection against resource exhaustion with configurable limits
* - File size limit: 100 MB (DEFAULT_MAX_FILE_SIZE)
* - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
* - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
* - Feature count limit: 10,000 features (DEFAULT_MAX_FEATURES)
*
*
* Usage Patterns:
* - Single-threaded: Create one instance, call load(), then access data via getters
* - Multi-threaded: Create separate instances per thread, or use external synchronization
*
*
* @example
* // Thread-safe usage pattern:
* void processFile(const std::string& filename) {
@@ -55,24 +55,24 @@ struct ArffSummary {
* auto y = arff.getY();
* // Process data...
* }
*
* @example
*
* @example
* // UNSAFE usage pattern:
* ArffFiles globalArff; // Global instance
* // Thread 1: globalArff.load("file1.arff"); // UNSAFE!
* // Thread 2: globalArff.load("file2.arff"); // UNSAFE!
*/
class ArffFiles {
const std::string VERSION = "1.1.0";
const std::string VERSION = "1.2.0";
// Memory usage limits (configurable via environment variables)
static constexpr size_t DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100 MB
static constexpr size_t DEFAULT_MAX_SAMPLES = 1000000; // 1 million samples
static constexpr size_t DEFAULT_MAX_FEATURES = 10000; // 10k features
public:
ArffFiles() = default;
// Move constructor
ArffFiles(ArffFiles&& other) noexcept
: lines(std::move(other.lines))
@@ -86,7 +86,7 @@ public:
{
// Other object is left in a valid but unspecified state
}
// Move assignment operator
ArffFiles& operator=(ArffFiles&& other) noexcept
{
@@ -102,13 +102,13 @@ public:
}
return *this;
}
// Copy constructor (explicitly defaulted): performs a member-wise copy of the source object.
ArffFiles(const ArffFiles& other) = default;
// Copy assignment operator (explicitly defaulted).
// NOTE(review): VERSION is declared as a non-static `const std::string` member of this
// class; a const data member normally makes a defaulted copy assignment operator
// implicitly deleted, so `a = b;` with an lvalue `b` may fail to compile — confirm, and
// consider making VERSION `static` if copy assignment is meant to be usable.
ArffFiles& operator=(const ArffFiles& other) = default;
// Destructor (explicitly defaulted); no custom cleanup is performed here.
~ArffFiles() = default;
void load(const std::string& fileName, bool classLast = true)
@@ -231,7 +231,7 @@ public:
// Read-only accessors. Each one returns a const reference to the corresponding data
// member, so nothing is copied; the reference is only usable while this object is alive.
// Class labels (member `y`).
const std::vector<int>& getY() const { return y; }
// Map from feature name to a flag telling whether that feature is numeric.
const std::map<std::string, bool>& getNumericAttributes() const { return numeric_features; }
// Attribute list as (name, type) string pairs.
const std::vector<std::pair<std::string, std::string>>& getAttributes() const { return attributes; }
// Move-enabled getters for efficient data transfer
// WARNING: These methods move data OUT of the object, leaving it in an empty but valid state
// Use these when you want to transfer ownership of large data structures for performance
@@ -241,7 +241,7 @@ public:
// Transfers ownership of the `states` map to the caller; the member is left moved-from.
std::map<std::string, std::vector<std::string>> moveStates() noexcept
{
    auto extracted = std::move(states);
    return extracted;
}
// Transfers ownership of the `attributes` vector to the caller; the member is left moved-from.
std::vector<std::pair<std::string, std::string>> moveAttributes() noexcept
{
    auto extracted = std::move(attributes);
    return extracted;
}
// Transfers ownership of the `numeric_features` map to the caller; the member is left moved-from.
std::map<std::string, bool> moveNumericAttributes() noexcept
{
    auto extracted = std::move(numeric_features);
    return extracted;
}
std::vector<std::string> split(const std::string& text, char delimiter)
{
std::vector<std::string> result;
@@ -256,22 +256,23 @@ public:
private:
// Helper function to validate file path for security
static void validateFilePath(const std::string& fileName) {
static void validateFilePath(const std::string& fileName)
{
if (fileName.empty()) {
throw std::invalid_argument("File path cannot be empty");
}
// Check for path traversal attempts
if (fileName.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected in file path: " + fileName);
}
// Check for absolute paths starting with / (Unix) or drive letters (Windows)
if (fileName[0] == '/' || (fileName.length() >= 3 && fileName[1] == ':')) {
// Allow absolute paths but log a warning - this is for user awareness
// In production, you might want to restrict this based on your security requirements
}
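// Check for suspicious (control) characters that could be used in path manipulation.
// NOTE(review): the literal below begins with "\x00", and std::string's const char*
// constructor stops at the first NUL, so suspiciousChars ends up empty and the loop over
// it never runs — confirm, and build the set with an explicit length (or test c < 0x20).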
const std::string suspiciousChars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
for (char c : suspiciousChars) {
@@ -279,33 +280,35 @@ private:
throw std::invalid_argument("Invalid character detected in file path");
}
}
// Check for excessively long paths (potential buffer overflow attempts)
constexpr size_t MAX_PATH_LENGTH = 4096; // Common filesystem limit
if (fileName.length() > MAX_PATH_LENGTH) {
throw std::invalid_argument("File path too long (exceeds " + std::to_string(MAX_PATH_LENGTH) + " characters)");
}
// Additional validation using filesystem operations when available
try {
// Check if the file exists and validate its canonical path
if (std::filesystem::exists(fileName)) {
std::filesystem::path normalizedPath = std::filesystem::canonical(fileName);
std::string normalizedStr = normalizedPath.string();
// Check if normalized path still contains traversal attempts
if (normalizedStr.find("..") != std::string::npos) {
throw std::invalid_argument("Path traversal detected after normalization: " + normalizedStr);
}
}
} catch (const std::filesystem::filesystem_error& e) {
}
catch (const std::filesystem::filesystem_error& e) {
// If filesystem operations fail, we can still proceed with basic validation
// This ensures compatibility with systems where filesystem might not be fully available
}
}
// Helper function to validate resource usage limits
static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0) {
static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0)
{
// Check file size limit
try {
if (std::filesystem::exists(fileName)) {
@@ -314,16 +317,17 @@ private:
throw std::invalid_argument("File size (" + std::to_string(fileSize) + " bytes) exceeds maximum allowed size (" + std::to_string(DEFAULT_MAX_FILE_SIZE) + " bytes)");
}
}
} catch (const std::filesystem::filesystem_error&) {
}
catch (const std::filesystem::filesystem_error&) {
// If filesystem operations fail, continue without size checking
// This ensures compatibility with systems where filesystem might not be available
}
// Check sample count limit
if (sampleCount > DEFAULT_MAX_SAMPLES) {
throw std::invalid_argument("Number of samples (" + std::to_string(sampleCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_SAMPLES) + ")");
}
// Check feature count limit
if (featureCount > DEFAULT_MAX_FEATURES) {
throw std::invalid_argument("Number of features (" + std::to_string(featureCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_FEATURES) + ")");
@@ -352,12 +356,12 @@ private:
continue;
auto values = attribute.second;
std::transform(values.begin(), values.end(), values.begin(), ::toupper);
// Enhanced attribute type detection
bool isNumeric = values == "REAL" || values == "INTEGER" || values == "NUMERIC";
bool isDate = values.find("DATE") != std::string::npos;
bool isString = values == "STRING";
// For now, treat DATE and STRING as categorical (non-numeric)
// This provides basic compatibility while maintaining existing functionality
numeric_features[feature] = isNumeric;
@@ -490,7 +494,7 @@ private:
// Validate file path for security
validateFilePath(fileName);
// Validate file size before processing
validateResourceLimits(fileName);
@@ -507,13 +511,13 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
// Skip sparse data format for now (lines starting with '{')
// Future enhancement: implement full sparse data support
if (!line.empty() && line[0] == '{') {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
ss >> keyword >> attribute;
@@ -564,7 +568,7 @@ private:
if (lines.empty()) {
throw std::invalid_argument("No data samples found in file");
}
// Validate loaded data dimensions against limits
validateResourceLimits(fileName, lines.size(), attributes.size());
@@ -621,15 +625,16 @@ private:
}
// Common helper function to parse ARFF file attributes and count samples
static int parseArffFile(const std::string& fileName,
std::vector<std::pair<std::string, std::string>>& attributes,
std::set<std::string>& uniqueClasses,
size_t& sampleCount,
int classIndex = -1,
const std::string& classNameToFind = "") {
static int parseArffFile(const std::string& fileName,
std::vector<std::pair<std::string, std::string>>& attributes,
std::set<std::string>& uniqueClasses,
size_t& sampleCount,
int classIndex = -1,
const std::string& classNameToFind = "")
{
// Validate file path for security
validateFilePath(fileName);
std::ifstream file(fileName);
if (!file.is_open()) {
throw std::invalid_argument("Unable to open file: " + fileName);
@@ -645,12 +650,12 @@ private:
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
// Skip sparse data format for now (lines starting with '{')
if (!line.empty() && line[0] == '{') {
continue;
}
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
std::string keyword, attribute, type_w;
@@ -717,7 +722,7 @@ private:
// Use specific index
classValue = trim(tokens[actualClassIndex]);
}
if (!classValue.empty()) {
uniqueClasses.insert(classValue);
sampleCount++;
@@ -726,7 +731,7 @@ private:
}
}
while (getline(file, line));
return actualClassIndex;
}

View File

@@ -6,9 +6,7 @@ from conan.tools.files import copy
class ArffFilesConan(ConanFile):
name = "arff-files"
version = "X.X.X"
description = (
"Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
)
description = "Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
url = "https://github.com/rmontanana/ArffFiles"
license = "MIT"
homepage = "https://github.com/rmontanana/ArffFiles"
@@ -30,10 +28,35 @@ class ArffFilesConan(ConanFile):
def package(self):
# Copy header file to include directory
copy(self, "*.hpp", src=self.source_folder, dst=self.package_folder, keep_path=False)
copy(
self,
"*.hpp",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
# Copy license and readme for package documentation
copy(self, "LICENSE", src=self.source_folder, dst=self.package_folder, keep_path=False)
copy(self, "README.md", src=self.source_folder, dst=self.package_folder, keep_path=False)
copy(
self,
"LICENSE",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
copy(
self,
"README.md",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
copy(
self,
"CMakeLists.txt",
src=self.source_folder,
dst=self.package_folder,
keep_path=False,
)
def package_info(self):
# Header-only library configuration