Fix version number

2025-07-01 10:39:48 +02:00
parent 4d6cad8f08
commit 81f2e706d0
2 changed files with 82 additions and 54 deletions
--- a/ArffFiles.hpp
+++ b/ArffFiles.hpp
@@ -24,28 +24,28 @@ struct ArffSummary {
 /**
 * @brief Header-only C++17 library for parsing ARFF (Attribute-Relation File Format) files
- * 
+ *
 * This class provides functionality to load and parse ARFF files, automatically detecting
 * numeric vs categorical features and performing factorization of categorical attributes.
- * 
+ *
 * @warning THREAD SAFETY: This class is NOT thread-safe!
- * 
+ *
 * Thread Safety Considerations:
 * - Multiple instances can be used safely in different threads (each instance is independent)
 * - A single instance MUST NOT be accessed concurrently from multiple threads
 * - All member functions (including getters) modify or access mutable state
 * - Static methods (summary, trim, split) are thread-safe as they don't access instance state
- * 
+ *
 * Memory Safety:
 * - Built-in protection against resource exhaustion with configurable limits
 * - File size limit: 100 MB (DEFAULT_MAX_FILE_SIZE)
- * - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)  
+ * - Sample count limit: 1 million samples (DEFAULT_MAX_SAMPLES)
 * - Feature count limit: 10,000 features (DEFAULT_MAX_FEATURES)
- * 
+ *
 * Usage Patterns:
 * - Single-threaded: Create one instance, call load(), then access data via getters
 * - Multi-threaded: Create separate instances per thread, or use external synchronization
- * 
+ *
 * @example
 * // Thread-safe usage pattern:
 * void processFile(const std::string& filename) {
@@ -55,24 +55,24 @@ struct ArffSummary {
 *     auto y = arff.getY();
 *     // Process data...
 * }
- * 
+ *
- * @example  
+ * @example
 * // UNSAFE usage pattern:
 * ArffFiles globalArff;  // Global instance
 * // Thread 1: globalArff.load("file1.arff");  // UNSAFE!
 * // Thread 2: globalArff.load("file2.arff");  // UNSAFE!
 */
 class ArffFiles {
-    const std::string VERSION = "1.1.0";
+    const std::string VERSION = "1.2.0";
-    
+
    // Memory usage limits (configurable via environment variables)
    static constexpr size_t DEFAULT_MAX_FILE_SIZE = 100 * 1024 * 1024; // 100 MB
    static constexpr size_t DEFAULT_MAX_SAMPLES = 1000000; // 1 million samples
    static constexpr size_t DEFAULT_MAX_FEATURES = 10000; // 10k features
-    
+
 public:
    ArffFiles() = default;
-    
+
    // Move constructor
    ArffFiles(ArffFiles&& other) noexcept
        : lines(std::move(other.lines))
@@ -86,7 +86,7 @@ public:
    {
        // Other object is left in a valid but unspecified state
    }
-    
+
    // Move assignment operator
    ArffFiles& operator=(ArffFiles&& other) noexcept
    {
@@ -102,13 +102,13 @@ public:
        }
        return *this;
    }
-    
+
    // Copy constructor (explicitly defaulted)
    ArffFiles(const ArffFiles& other) = default;
-    
+
    // Copy assignment operator (explicitly defaulted)
    ArffFiles& operator=(const ArffFiles& other) = default;
-    
+
    // Destructor (explicitly defaulted)
    ~ArffFiles() = default;
    void load(const std::string& fileName, bool classLast = true)
@@ -231,7 +231,7 @@ public:
    // Read-only accessors. Each is a const member function returning a const reference
    // to the corresponding data member (y, numeric_features, attributes), so no copy is made;
    // the returned reference refers to storage owned by this object.
    const std::vector<int>& getY() const { return y; }
    const std::map<std::string, bool>& getNumericAttributes() const { return numeric_features; }
    const std::vector<std::pair<std::string, std::string>>& getAttributes() const { return attributes; };
-    
+
    // Move-enabled getters for efficient data transfer
    // WARNING: These methods move data OUT of the object, leaving it in an empty but valid state
    // Use these when you want to transfer ownership of large data structures for performance
@@ -241,7 +241,7 @@ public:
    std::map<std::string, std::vector<std::string>> moveStates() noexcept { return std::move(states); }
    std::vector<std::pair<std::string, std::string>> moveAttributes() noexcept { return std::move(attributes); }
    std::map<std::string, bool> moveNumericAttributes() noexcept { return std::move(numeric_features); }
-    
+
    std::vector<std::string> split(const std::string& text, char delimiter)
    {
        std::vector<std::string> result;
@@ -256,22 +256,23 @@ public:
 private:
    // Helper function to validate file path for security
-    static void validateFilePath(const std::string& fileName) {
+    static void validateFilePath(const std::string& fileName)
+    {
        if (fileName.empty()) {
            throw std::invalid_argument("File path cannot be empty");
        }
-        
+
        // Check for path traversal attempts
        if (fileName.find("..") != std::string::npos) {
            throw std::invalid_argument("Path traversal detected in file path: " + fileName);
        }
-        
+
        // Check for absolute paths starting with / (Unix) or drive letters (Windows)
        if (fileName[0] == '/' || (fileName.length() >= 3 && fileName[1] == ':')) {
            // Absolute paths are currently allowed: this branch is empty, so nothing is logged or rejected.
            // In production, you might want to restrict this based on your security requirements
        }
-        
+
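        // Check for suspicious characters that could be used in path manipulation
        // NOTE(review): the literal below starts with "\x00", and std::string's const char* constructor
        // stops at the first NUL, so suspiciousChars is empty and this loop never runs. Construct it with
        // an explicit length (e.g. std::string(lit, sizeof(lit) - 1)) or test each byte of fileName directly.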
        const std::string suspiciousChars = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
        for (char c : suspiciousChars) {
@@ -279,33 +280,35 @@ private:
                throw std::invalid_argument("Invalid character detected in file path");
            }
        }
-        
+
        // Check for excessively long paths (potential buffer overflow attempts)
        constexpr size_t MAX_PATH_LENGTH = 4096; // Common filesystem limit
        if (fileName.length() > MAX_PATH_LENGTH) {
            throw std::invalid_argument("File path too long (exceeds " + std::to_string(MAX_PATH_LENGTH) + " characters)");
        }
-        
+
        // Additional validation using filesystem operations when available
        try {
            // Check if the file exists and validate its canonical path
            if (std::filesystem::exists(fileName)) {
                std::filesystem::path normalizedPath = std::filesystem::canonical(fileName);
                std::string normalizedStr = normalizedPath.string();
-                
+
                // Check if normalized path still contains traversal attempts
                if (normalizedStr.find("..") != std::string::npos) {
                    throw std::invalid_argument("Path traversal detected after normalization: " + normalizedStr);
                }
            }
-        } catch (const std::filesystem::filesystem_error& e) {
+        }
+        catch (const std::filesystem::filesystem_error& e) {
            // If filesystem operations fail, we can still proceed with basic validation
            // This ensures compatibility with systems where filesystem might not be fully available
        }
    }
    // Helper function to validate resource usage limits
-    static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0) {
+    static void validateResourceLimits(const std::string& fileName, size_t sampleCount = 0, size_t featureCount = 0)
+    {
        // Check file size limit
        try {
            if (std::filesystem::exists(fileName)) {
@@ -314,16 +317,17 @@ private:
                    throw std::invalid_argument("File size (" + std::to_string(fileSize) + " bytes) exceeds maximum allowed size (" + std::to_string(DEFAULT_MAX_FILE_SIZE) + " bytes)");
                }
            }
-        } catch (const std::filesystem::filesystem_error&) {
+        }
+        catch (const std::filesystem::filesystem_error&) {
            // If filesystem operations fail, continue without size checking
            // This ensures compatibility with systems where filesystem might not be available
        }
-        
+
        // Check sample count limit
        if (sampleCount > DEFAULT_MAX_SAMPLES) {
            throw std::invalid_argument("Number of samples (" + std::to_string(sampleCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_SAMPLES) + ")");
        }
-        
+
        // Check feature count limit
        if (featureCount > DEFAULT_MAX_FEATURES) {
            throw std::invalid_argument("Number of features (" + std::to_string(featureCount) + ") exceeds maximum allowed (" + std::to_string(DEFAULT_MAX_FEATURES) + ")");
@@ -352,12 +356,12 @@ private:
                continue;
            auto values = attribute.second;
            std::transform(values.begin(), values.end(), values.begin(), ::toupper);
-            
+
            // Enhanced attribute type detection
            bool isNumeric = values == "REAL" || values == "INTEGER" || values == "NUMERIC";
            bool isDate = values.find("DATE") != std::string::npos;
            bool isString = values == "STRING";
-            
+
            // For now, treat DATE and STRING as categorical (non-numeric)
            // This provides basic compatibility while maintaining existing functionality
            numeric_features[feature] = isNumeric;
@@ -490,7 +494,7 @@ private:
        // Validate file path for security
        validateFilePath(fileName);
-        
+
        // Validate file size before processing
        validateResourceLimits(fileName);
@@ -507,13 +511,13 @@ private:
            if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
                continue;
            }
-            
+
            // Skip sparse data format for now (lines starting with '{')
            // Future enhancement: implement full sparse data support
            if (!line.empty() && line[0] == '{') {
                continue;
            }
-            
+
            if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
                std::stringstream ss(line);
                ss >> keyword >> attribute;
@@ -564,7 +568,7 @@ private:
        if (lines.empty()) {
            throw std::invalid_argument("No data samples found in file");
        }
-        
+
        // Validate loaded data dimensions against limits
        validateResourceLimits(fileName, lines.size(), attributes.size());
@@ -621,15 +625,16 @@ private:
    }
    // Common helper function to parse ARFF file attributes and count samples
-    static int parseArffFile(const std::string& fileName, 
+    static int parseArffFile(const std::string& fileName,
-                            std::vector<std::pair<std::string, std::string>>& attributes,
+        std::vector<std::pair<std::string, std::string>>& attributes,
-                            std::set<std::string>& uniqueClasses,
+        std::set<std::string>& uniqueClasses,
-                            size_t& sampleCount,
+        size_t& sampleCount,
-                            int classIndex = -1,
+        int classIndex = -1,
-                            const std::string& classNameToFind = "") {
+        const std::string& classNameToFind = "")
+    {
        // Validate file path for security
        validateFilePath(fileName);
-        
+
        std::ifstream file(fileName);
        if (!file.is_open()) {
            throw std::invalid_argument("Unable to open file: " + fileName);
@@ -645,12 +650,12 @@ private:
            if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
                continue;
            }
-            
+
            // Skip sparse data format for now (lines starting with '{')
            if (!line.empty() && line[0] == '{') {
                continue;
            }
-            
+
            if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
                std::stringstream ss(line);
                std::string keyword, attribute, type_w;
@@ -717,7 +722,7 @@ private:
                        // Use specific index
                        classValue = trim(tokens[actualClassIndex]);
                    }
-                    
+
                    if (!classValue.empty()) {
                        uniqueClasses.insert(classValue);
                        sampleCount++;
@@ -726,7 +731,7 @@ private:
            }
        }
        while (getline(file, line));
-        
+
        return actualClassIndex;
    }
--- a/conanfile.py
+++ b/conanfile.py
@@ -6,9 +6,7 @@ from conan.tools.files import copy
 class ArffFilesConan(ConanFile):
    name = "arff-files"
    version = "X.X.X"
-    description = (
+    description = "Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
-        "Header-only library to read ARFF (Attribute-Relation File Format) files and return STL vectors with the data read."
-    )
    url = "https://github.com/rmontanana/ArffFiles"
    license = "MIT"
    homepage = "https://github.com/rmontanana/ArffFiles"
@@ -30,10 +28,35 @@ class ArffFilesConan(ConanFile):
    def package(self):
        # Copy header file to include directory
-        copy(self, "*.hpp", src=self.source_folder, dst=self.package_folder, keep_path=False)
+        copy(
+            self,
+            "*.hpp",
+            src=self.source_folder,
+            dst=self.package_folder,
+            keep_path=False,
+        )
        # Copy license and readme for package documentation
-        copy(self, "LICENSE", src=self.source_folder, dst=self.package_folder, keep_path=False)
+        copy(
-        copy(self, "README.md", src=self.source_folder, dst=self.package_folder, keep_path=False)
+            self,
+            "LICENSE",
+            src=self.source_folder,
+            dst=self.package_folder,
+            keep_path=False,
+        )
+        copy(
+            self,
+            "README.md",
+            src=self.source_folder,
+            dst=self.package_folder,
+            keep_path=False,
+        )
+        copy(
+            self,
+            "CMakeLists.txt",
+            src=self.source_folder,
+            dst=self.package_folder,
+            keep_path=False,
+        )
    def package_info(self):
        # Header-only library configuration