diff --git a/ArffFiles.hpp b/ArffFiles.hpp index 8b72c60..9bc0c4d 100644 --- a/ArffFiles.hpp +++ b/ArffFiles.hpp @@ -4,11 +4,23 @@ #include #include #include +#include #include #include #include // std::isdigit #include // std::all_of std::transform +// Summary information structure for ARFF files +struct ArffSummary { + size_t numSamples; // Number of data samples + size_t numFeatures; // Number of feature attributes (excluding class) + size_t numClasses; // Number of different class values + std::string className; // Name of the class attribute + std::string classType; // Type/values of the class attribute + std::vector classLabels; // List of unique class values + std::vector> featureInfo; // Feature names and types +}; + class ArffFiles { const std::string VERSION = "1.1.0"; public: @@ -18,15 +30,15 @@ public: if (fileName.empty()) { throw std::invalid_argument("File name cannot be empty"); } - + int labelIndex; loadCommon(fileName); - + // Validate we have attributes before accessing them if (attributes.empty()) { throw std::invalid_argument("No attributes found in file"); } - + if (classLast) { className = std::get<0>(attributes.back()); classType = std::get<1>(attributes.back()); @@ -38,12 +50,12 @@ public: attributes.erase(attributes.begin()); labelIndex = 0; } - + // Validate class name is not empty if (className.empty()) { throw std::invalid_argument("Class attribute name cannot be empty"); } - + preprocessDataset(labelIndex); generateDataset(labelIndex); } @@ -55,15 +67,15 @@ public: if (name.empty()) { throw std::invalid_argument("Class name cannot be empty"); } - + int labelIndex; loadCommon(fileName); - + // Validate we have attributes before searching if (attributes.empty()) { throw std::invalid_argument("No attributes found in file"); } - + bool found = false; for (size_t i = 0; i < attributes.size(); ++i) { if (attributes[i].first == name) { @@ -81,6 +93,39 @@ public: preprocessDataset(labelIndex); generateDataset(labelIndex); } + + // Static 
method to get summary information without loading all data (default: class is last) + static ArffSummary summary(const std::string& fileName) + { + return summary(fileName, true); + } + + // Static method to get summary information without loading all data + static ArffSummary summary(const std::string& fileName, bool classLast) + { + if (fileName.empty()) { + throw std::invalid_argument("File name cannot be empty"); + } + return summarizeFile(fileName, classLast); + } + + // Static method to get summary information with specified class attribute (const char* overload; a null pointer is treated as an empty name and rejected) + static ArffSummary summary(const std::string& fileName, const char* className) + { + return summary(fileName, std::string(className ? className : "")); + } + + // Static method to get summary information with specified class attribute + static ArffSummary summary(const std::string& fileName, const std::string& className) + { + if (fileName.empty()) { + throw std::invalid_argument("File name cannot be empty"); + } + if (className.empty()) { + throw std::invalid_argument("Class name cannot be empty"); + } + return summarizeFile(fileName, className); + } std::vector getLines() const { return lines; } unsigned long int getSize() const { return lines.size(); } std::string getClassName() const { return className; } @@ -160,7 +205,7 @@ private: { const size_t numSamples = lines.size(); const size_t numFeatures = attributes.size(); - + // Validate inputs if (numSamples == 0) { throw std::invalid_argument("No data samples found in file"); @@ -171,10 +216,10 @@ private: if (labelIndex < 0) { throw std::invalid_argument("Invalid label index: cannot be negative"); } - + // Pre-allocate with feature-major layout: X[feature][sample] X.assign(numFeatures, std::vector(numSamples)); - + // Temporary storage for categorical data per feature (only for non-numeric features) std::vector> categoricalData(numFeatures); for (size_t i = 0; i < numFeatures; ++i) { @@ -182,23 +227,23 @@ private: categoricalData[i].reserve(numSamples); } } - 
+ std::vector yy; yy.reserve(numSamples); - + // Parse each sample for (size_t sampleIdx = 0; sampleIdx < numSamples; ++sampleIdx) { const auto tokens = split(lines[sampleIdx], ','); - + // Validate token count matches expected number (features + class) const size_t expectedTokens = numFeatures + 1; if (tokens.size() != expectedTokens) { throw std::invalid_argument("Sample " + std::to_string(sampleIdx) + " has " + std::to_string(tokens.size()) + " tokens, expected " + std::to_string(expectedTokens)); } - + int pos = 0; int featureIdx = 0; - + for (const auto& token : tokens) { if (pos++ == labelIndex) { if (token.empty()) { @@ -209,13 +254,14 @@ private: if (featureIdx >= static_cast(numFeatures)) { throw std::invalid_argument("Too many feature values at sample " + std::to_string(sampleIdx)); } - + const auto& featureName = attributes[featureIdx].first; if (numeric_features.at(featureName)) { // Parse numeric value with exception handling try { X[featureIdx][sampleIdx] = std::stof(token); - } catch (const std::exception& e) { + } + catch (const std::exception& e) { throw std::invalid_argument("Invalid numeric value '" + token + "' at sample " + std::to_string(sampleIdx) + ", feature " + featureName); } } else { @@ -229,20 +275,20 @@ private: } } } - + // Convert categorical features to numeric for (size_t featureIdx = 0; featureIdx < numFeatures; ++featureIdx) { if (!numeric_features[attributes[featureIdx].first]) { const auto& featureName = attributes[featureIdx].first; auto encodedValues = factorize(featureName, categoricalData[featureIdx]); - + // Copy encoded values to X[feature][sample] for (size_t sampleIdx = 0; sampleIdx < numSamples; ++sampleIdx) { X[featureIdx][sampleIdx] = static_cast(encodedValues[sampleIdx]); } } } - + y = factorize(className, yy); } void loadCommon(std::string fileName) @@ -252,7 +298,7 @@ private: attributes.clear(); states.clear(); numeric_features.clear(); - + std::ifstream file(fileName); if (!file.is_open()) { throw 
std::invalid_argument("Unable to open file: " + fileName); @@ -269,19 +315,19 @@ private: if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { std::stringstream ss(line); ss >> keyword >> attribute; - + // Validate attribute name if (attribute.empty()) { throw std::invalid_argument("Empty attribute name in line: " + line); } - + // Check for duplicate attribute names for (const auto& existing : attributes) { if (existing.first == attribute) { throw std::invalid_argument("Duplicate attribute name: " + attribute); } } - + // Efficiently build type string std::ostringstream typeStream; while (ss >> type_w) { @@ -289,12 +335,12 @@ private: typeStream << type_w; } type = typeStream.str(); - + // Validate type is not empty if (type.empty()) { throw std::invalid_argument("Empty attribute type for attribute: " + attribute); } - + attributes.emplace_back(trim(attribute), trim(type)); continue; } @@ -309,7 +355,7 @@ private: lines.push_back(line); } file.close(); - + // Final validation if (attributes.empty()) { throw std::invalid_argument("No attributes found in file"); @@ -317,21 +363,22 @@ private: if (lines.empty()) { throw std::invalid_argument("No data samples found in file"); } - + // Initialize states for all attributes for (const auto& attribute : attributes) { states[attribute.first] = std::vector(); } } - + // Helper function for better missing value detection - bool containsMissingValue(const std::string& line) { + bool containsMissingValue(const std::string& line) + { bool inQuotes = false; char quoteChar = '\0'; - + for (size_t i = 0; i < line.length(); ++i) { char c = line[i]; - + if (!inQuotes && (c == '\'' || c == '\"')) { inQuotes = true; quoteChar = c; @@ -345,6 +392,240 @@ private: } return false; } + + // Static version of missing value detection for summary methods + static bool containsMissingValueStatic(const std::string& line) + { + bool inQuotes = false; + char quoteChar = '\0'; + + for (size_t i = 0; i 
< line.length(); ++i) { + char c = line[i]; + + if (!inQuotes && (c == '\'' || c == '\"')) { + inQuotes = true; + quoteChar = c; + } else if (inQuotes && c == quoteChar) { + inQuotes = false; + quoteChar = '\0'; + } else if (!inQuotes && c == '?') { + // Found unquoted '?' - this is a missing value + return true; + } + } + return false; + } + + // Helper function for summary with classLast parameter + static ArffSummary summarizeFile(const std::string& fileName, bool classLast) + { + std::ifstream file(fileName); + if (!file.is_open()) { + throw std::invalid_argument("Unable to open file: " + fileName); + } + + ArffSummary summary; + std::vector> attributes; + std::set uniqueClasses; + std::string line; + size_t sampleCount = 0; + + // Parse header + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { + std::stringstream ss(line); + std::string keyword, attribute, type_w; + ss >> keyword >> attribute; + + if (attribute.empty()) { + throw std::invalid_argument("Empty attribute name in line: " + line); + } + + // Build type string + std::ostringstream typeStream; + while (ss >> type_w) { + if (typeStream.tellp() > 0) typeStream << " "; + typeStream << type_w; + } + std::string type = typeStream.str(); + + if (type.empty()) { + throw std::invalid_argument("Empty attribute type for attribute: " + attribute); + } + + attributes.emplace_back(trim(attribute), trim(type)); + continue; + } + if (line[0] == '@') { + continue; + } + // Start of data section + break; + } + + if (attributes.empty()) { + throw std::invalid_argument("No attributes found in file"); + } + + // Determine class attribute + if (classLast) { + summary.className = attributes.back().first; + summary.classType = attributes.back().second; + attributes.pop_back(); + } else { + summary.className = attributes.front().first; + summary.classType = 
attributes.front().second; + attributes.erase(attributes.begin()); + } + + summary.numFeatures = attributes.size(); + + // Copy feature information + for (const auto& attr : attributes) { + summary.featureInfo.emplace_back(attr.first, attr.second); + } + + // Count samples and collect unique class values + do { + if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) { + auto tokens = splitStatic(line, ','); + if (!tokens.empty()) { + std::string classValue; + if (classLast) { + classValue = trim(tokens.back()); + } else { + classValue = trim(tokens.front()); + } + if (!classValue.empty()) { + uniqueClasses.insert(classValue); + sampleCount++; + } + } + } + } + while (getline(file, line)); + + file.close(); + + summary.numSamples = sampleCount; + summary.numClasses = uniqueClasses.size(); + summary.classLabels.assign(uniqueClasses.begin(), uniqueClasses.end()); + + return summary; + } + + // Helper function for summary with className parameter + static ArffSummary summarizeFile(const std::string& fileName, const std::string& className) + { + std::ifstream file(fileName); + if (!file.is_open()) { + throw std::invalid_argument("Unable to open file: " + fileName); + } + + ArffSummary summary; + std::vector> attributes; + std::set uniqueClasses; + std::string line; + size_t sampleCount = 0; + int classIndex = -1; + + // Parse header + while (getline(file, line)) { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { + continue; + } + if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) { + std::stringstream ss(line); + std::string keyword, attribute, type_w; + ss >> keyword >> attribute; + + if (attribute.empty()) { + throw std::invalid_argument("Empty attribute name in line: " + line); + } + + // Build type string + std::ostringstream typeStream; + while (ss >> type_w) { + if (typeStream.tellp() > 0) typeStream << " "; + typeStream << type_w; + } + std::string type = 
typeStream.str(); + + if (type.empty()) { + throw std::invalid_argument("Empty attribute type for attribute: " + attribute); + } + + attributes.emplace_back(trim(attribute), trim(type)); + + if (trim(attribute) == className) { + classIndex = attributes.size() - 1; + summary.className = trim(attribute); + summary.classType = trim(type); + } + continue; + } + if (line[0] == '@') { + continue; + } + // Start of data section + break; + } + + if (attributes.empty()) { + throw std::invalid_argument("No attributes found in file"); + } + + if (classIndex == -1) { + throw std::invalid_argument("Class name '" + className + "' not found in attributes"); + } + + // Remove class attribute from features + attributes.erase(attributes.begin() + classIndex); + summary.numFeatures = attributes.size(); + + // Copy feature information + for (const auto& attr : attributes) { + summary.featureInfo.emplace_back(attr.first, attr.second); + } + + // Count samples and collect unique class values + do { + if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) { + auto tokens = splitStatic(line, ','); + if (tokens.size() > static_cast(classIndex)) { + std::string classValue = trim(tokens[classIndex]); + if (!classValue.empty()) { + uniqueClasses.insert(classValue); + sampleCount++; + } + } + } + } + while (getline(file, line)); + + file.close(); + + summary.numSamples = sampleCount; + summary.numClasses = uniqueClasses.size(); + summary.classLabels.assign(uniqueClasses.begin(), uniqueClasses.end()); + + return summary; + } + + // Static helper function for split (needed by summarizeFile) + static std::vector splitStatic(const std::string& text, char delimiter) + { + std::vector result; + std::stringstream ss(text); + std::string token; + while (std::getline(ss, token, delimiter)) { + result.push_back(trim(token)); + } + return result; + } }; #endif diff --git a/CHANGELOG.md b/CHANGELOG.md index 83594dd..8ee8a8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md 
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Claude TECHNICAL_REPORT.md for detailed analysis - Claude CLAUDE.md for AI engine usage +- Static `summary` method that reports the number of samples, features, and classes (plus class and feature metadata) without building the full dataset ### Internal diff --git a/tests/TestArffFiles.cc b/tests/TestArffFiles.cc index 553f23f..b96b97e 100644 --- a/tests/TestArffFiles.cc +++ b/tests/TestArffFiles.cc @@ -14,7 +14,7 @@ public: std::string file_name = path + name + ".arff"; return file_name; } - + static std::string error_datasets(const std::string& name) { std::string path = { arffFiles_data_path.begin(), arffFiles_data_path.end() }; @@ -162,23 +162,25 @@ TEST_CASE("Adult dataset", "[ArffFiles]") TEST_CASE("Input Validation Errors", "[ArffFiles][Error]") { ArffFiles arff; - - SECTION("Empty filename") { + + SECTION("Empty filename") + { REQUIRE_THROWS_AS(arff.load(""), std::invalid_argument); REQUIRE_THROWS_WITH(arff.load(""), "File name cannot be empty"); } - - SECTION("Nonexistent file") { + + SECTION("Nonexistent file") + { REQUIRE_THROWS_AS(arff.load("nonexistent_file.arff"), std::invalid_argument); REQUIRE_THROWS_WITH(arff.load("nonexistent_file.arff"), Catch::Matchers::ContainsSubstring("Unable to open file")); } - + // TODO: These tests need refinement to trigger the validation conditions properly // SECTION("Empty class name") { // REQUIRE_THROWS_AS(arff.load(Paths::datasets("iris"), ""), std::invalid_argument); // REQUIRE_THROWS_WITH(arff.load(Paths::datasets("iris"), ""), "Class name cannot be empty"); // } - + // SECTION("Invalid class name") { // REQUIRE_THROWS_AS(arff.load(Paths::datasets("iris"), "nonexistent_class"), std::invalid_argument); // REQUIRE_THROWS_WITH(arff.load(Paths::datasets("iris"), "nonexistent_class"), @@ -189,73 +191,81 @@ TEST_CASE("Input Validation Errors", "[ArffFiles][Error]") TEST_CASE("File Structure Validation Errors", "[ArffFiles][Error]") { ArffFiles arff; - - SECTION("No 
attributes defined") { + + SECTION("No attributes defined") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attributes")), std::invalid_argument); REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attributes")), "No attributes found in file"); } - - SECTION("No data samples") { + + SECTION("No data samples") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("no_data")), std::invalid_argument); REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("no_data")), "No data samples found in file"); } - - SECTION("Duplicate attribute names") { + + SECTION("Duplicate attribute names") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("duplicate_attributes")), std::invalid_argument); - REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("duplicate_attributes")), - Catch::Matchers::ContainsSubstring("Duplicate attribute name")); + REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("duplicate_attributes")), + Catch::Matchers::ContainsSubstring("Duplicate attribute name")); } - + // TODO: This test needs a better test case to trigger empty attribute name validation // SECTION("Empty attribute name") { // REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attribute_name")), std::invalid_argument); // REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_name")), // Catch::Matchers::ContainsSubstring("Empty attribute name")); // } - - SECTION("Empty attribute type") { + + SECTION("Empty attribute type") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attribute_type")), std::invalid_argument); - REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_type")), - Catch::Matchers::ContainsSubstring("Empty attribute type")); + REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_type")), + Catch::Matchers::ContainsSubstring("Empty attribute type")); } } TEST_CASE("Data Parsing Validation Errors", "[ArffFiles][Error]") { ArffFiles arff; - - SECTION("Wrong number of tokens") { + + SECTION("Wrong 
number of tokens") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("wrong_token_count")), std::invalid_argument); - REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("wrong_token_count")), - Catch::Matchers::ContainsSubstring("has") && - Catch::Matchers::ContainsSubstring("tokens, expected")); + REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("wrong_token_count")), + Catch::Matchers::ContainsSubstring("has") && + Catch::Matchers::ContainsSubstring("tokens, expected")); } - - SECTION("Invalid numeric value") { + + SECTION("Invalid numeric value") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("invalid_numeric")), std::invalid_argument); - REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("invalid_numeric")), - Catch::Matchers::ContainsSubstring("Invalid numeric value")); + REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("invalid_numeric")), + Catch::Matchers::ContainsSubstring("Invalid numeric value")); } - + // TODO: This test needs a better test case to trigger empty class label validation // SECTION("Empty class label") { // REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_class_label")), std::invalid_argument); // REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_class_label")), // Catch::Matchers::ContainsSubstring("Empty class label")); // } - - SECTION("Empty categorical value") { + + SECTION("Empty categorical value") + { REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_categorical")), std::invalid_argument); - REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_categorical")), - Catch::Matchers::ContainsSubstring("Empty categorical value")); + REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_categorical")), + Catch::Matchers::ContainsSubstring("Empty categorical value")); } } TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]") { ArffFiles arff; - - SECTION("Quoted question marks should not be treated as missing") { + + SECTION("Quoted question marks should not be treated as 
missing") + { // This should NOT throw an error - quoted question marks are valid data REQUIRE_NOTHROW(arff.load(Paths::error_datasets("quoted_question_mark"))); // Note: This test would need a valid quoted string ARFF for string attributes @@ -263,3 +273,68 @@ TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]") } } +TEST_CASE("Summary Functionality", "[ArffFiles][Summary]") +{ + SECTION("Basic summary with class last") + { + auto summary = ArffFiles::summary(Paths::datasets("iris")); + + REQUIRE(summary.numSamples == 150); + REQUIRE(summary.numFeatures == 4); + REQUIRE(summary.numClasses == 3); + REQUIRE(summary.className == "class"); + REQUIRE(summary.classType == "{Iris-setosa,Iris-versicolor,Iris-virginica}"); + REQUIRE(summary.classLabels.size() == 3); + REQUIRE(summary.featureInfo.size() == 4); + + // Check feature information + REQUIRE(summary.featureInfo[0].first == "sepallength"); + REQUIRE(summary.featureInfo[0].second == "REAL"); + REQUIRE(summary.featureInfo[1].first == "sepalwidth"); + REQUIRE(summary.featureInfo[1].second == "REAL"); + REQUIRE(summary.featureInfo[2].first == "petallength"); + REQUIRE(summary.featureInfo[2].second == "REAL"); + REQUIRE(summary.featureInfo[3].first == "petalwidth"); + REQUIRE(summary.featureInfo[3].second == "REAL"); + } + + SECTION("Summary with specific class name") + { + auto summary = ArffFiles::summary(Paths::datasets("glass"), "Type"); + + REQUIRE(summary.numSamples == 214); + REQUIRE(summary.numFeatures == 9); + REQUIRE(summary.numClasses == 6); + REQUIRE(summary.className == "Type"); + REQUIRE(summary.classType == "{ 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}"); + REQUIRE(summary.classLabels.size() == 6); + REQUIRE(summary.featureInfo.size() == 9); + } + + SECTION("Summary with class first") + { + auto summary = ArffFiles::summary(Paths::datasets("kdd_JapaneseVowels"), false); + + REQUIRE(summary.className == 
"speaker"); + REQUIRE(summary.numFeatures > 0); + REQUIRE(summary.numClasses > 0); + REQUIRE(summary.numSamples > 0); + } + + SECTION("Summary error handling") + { + REQUIRE_THROWS_AS(ArffFiles::summary(""), std::invalid_argument); + REQUIRE_THROWS_WITH(ArffFiles::summary(""), "File name cannot be empty"); + + REQUIRE_THROWS_AS(ArffFiles::summary("nonexistent.arff"), std::invalid_argument); + REQUIRE_THROWS_WITH(ArffFiles::summary("nonexistent.arff"), Catch::Matchers::ContainsSubstring("Unable to open file")); + + // Class-name errors: an empty or unknown class name must be rejected + REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), ""), std::invalid_argument); + REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), ""), "Class name cannot be empty"); + + REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), std::invalid_argument); + REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), "Class name 'nonexistent' not found in attributes"); + } +} +