From 86bd37b4582680af7d25ec5c1e3134f92a36ca2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana=20G=C3=B3mez?= Date: Fri, 27 Jun 2025 20:09:20 +0200 Subject: [PATCH] Refactor sumarizeFile methods to extract duplicated code --- ArffFiles.hpp | 166 +++++++++++++++++++++----------------------------- 1 file changed, 70 insertions(+), 96 deletions(-) diff --git a/ArffFiles.hpp b/ArffFiles.hpp index 6c2fe91..f804dfc 100644 --- a/ArffFiles.hpp +++ b/ArffFiles.hpp @@ -496,19 +496,22 @@ private: return false; } - // Helper function for summary with classLast parameter - static ArffSummary summarizeFile(const std::string& fileName, bool classLast) - { + // Common helper function to parse ARFF file attributes and count samples + static int parseArffFile(const std::string& fileName, + std::vector>& attributes, + std::set& uniqueClasses, + size_t& sampleCount, + int classIndex = -1, + const std::string& classNameToFind = "") { std::ifstream file(fileName); if (!file.is_open()) { throw std::invalid_argument("Unable to open file: " + fileName); } - ArffSummary summary; - std::vector> attributes; - std::set uniqueClasses; std::string line; - size_t sampleCount = 0; + attributes.clear(); + uniqueClasses.clear(); + sampleCount = 0; // Parse header while (getline(file, line)) { @@ -550,6 +553,61 @@ private: throw std::invalid_argument("No attributes found in file"); } + // Find class index if class name is specified + int actualClassIndex = classIndex; + if (!classNameToFind.empty()) { + actualClassIndex = -1; + for (size_t i = 0; i < attributes.size(); ++i) { + if (attributes[i].first == classNameToFind) { + actualClassIndex = static_cast(i); + break; + } + } + if (actualClassIndex == -1) { + throw std::invalid_argument("Class name '" + classNameToFind + "' not found in attributes"); + } + } + + // Count samples and collect unique class values + do { + if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) { + auto tokens = splitStatic(line, ','); + if (!tokens.empty()) { + std::string classValue; + if (actualClassIndex == -1) { + // Use last token (default behavior) + classValue = trim(tokens.back()); + } else if (actualClassIndex == 0) { + // Use first token + classValue = trim(tokens.front()); + } else if (actualClassIndex > 0 && static_cast(actualClassIndex) < tokens.size()) { + // Use specific index + classValue = trim(tokens[actualClassIndex]); + } + + if (!classValue.empty()) { + uniqueClasses.insert(classValue); + sampleCount++; + } + } + } + } + while (getline(file, line)); + + return actualClassIndex; + } + + // Helper function for summary with classLast parameter + static ArffSummary summarizeFile(const std::string& fileName, bool classLast) + { + ArffSummary summary; + std::vector> attributes; + std::set uniqueClasses; + size_t sampleCount = 0; + + // Use common parsing function + parseArffFile(fileName, attributes, uniqueClasses, sampleCount, classLast ? -1 : 0); + // Determine class attribute if (classLast) { summary.className = attributes.back().first; @@ -568,25 +626,6 @@ private: summary.featureInfo.emplace_back(attr.first, attr.second); } - // Count samples and collect unique class values - do { - if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) { - auto tokens = splitStatic(line, ','); - if (!tokens.empty()) { - std::string classValue; - if (classLast) { - classValue = trim(tokens.back()); - } else { - classValue = trim(tokens.front()); - } - if (!classValue.empty()) { - uniqueClasses.insert(classValue); - sampleCount++; - } - } - } - } - while (getline(file, line)); summary.numSamples = sampleCount; @@ -599,67 +638,18 @@ private: // Helper function for summary with className parameter static ArffSummary summarizeFile(const std::string& fileName, const std::string& className) { - std::ifstream file(fileName); - if (!file.is_open()) { - throw std::invalid_argument("Unable to open file: " + fileName); - } - ArffSummary summary; std::vector> attributes; std::set uniqueClasses; - std::string line; size_t sampleCount = 0; int classIndex = -1; - // Parse header - while (getline(file, line)) { - if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { - continue; - } - if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { - std::stringstream ss(line); - std::string keyword, attribute, type_w; - ss >> keyword >> attribute; + // Use common parsing function to find class by name + classIndex = parseArffFile(fileName, attributes, uniqueClasses, sampleCount, -1, className); - if (attribute.empty()) { - throw std::invalid_argument("Empty attribute name in line: " + line); - } - - // Build type string - std::ostringstream typeStream; - while (ss >> type_w) { - if (typeStream.tellp() > 0) typeStream << " "; - typeStream << type_w; - } - std::string type = typeStream.str(); - - if (type.empty()) { - throw std::invalid_argument("Empty attribute type for attribute: " + attribute); - } - - attributes.emplace_back(trim(attribute), trim(type)); - - if (trim(attribute) == className) { - classIndex = attributes.size() - 1; - summary.className = trim(attribute); - summary.classType = trim(type); - } - continue; - } - if (line[0] == '@') { - continue; - } - // Start of data section - break; - } - - if (attributes.empty()) { - throw std::invalid_argument("No attributes found in file"); - } - - if (classIndex == -1) { - throw std::invalid_argument("Class name '" + className + "' not found in attributes"); - } + // Set class information from the found attribute + summary.className = attributes[classIndex].first; + summary.classType = attributes[classIndex].second; // Remove class attribute from features attributes.erase(attributes.begin() + classIndex); @@ -670,22 +660,6 @@ private: summary.featureInfo.emplace_back(attr.first, attr.second); } - // Count samples and collect unique class values - do { - if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) { - auto tokens = splitStatic(line, ','); - if (tokens.size() > static_cast(classIndex)) { - std::string classValue = trim(tokens[classIndex]); - if (!classValue.empty()) { - uniqueClasses.insert(classValue); - sampleCount++; - } - } - } - } - while (getline(file, line)); - - summary.numSamples = sampleCount; summary.numClasses = uniqueClasses.size(); summary.classLabels.assign(uniqueClasses.begin(), uniqueClasses.end());