Added summary method to ArffFiles and tests

This commit is contained in:
2025-06-27 19:48:56 +02:00
parent 9c1c427620
commit 7a69526409
3 changed files with 426 additions and 69 deletions

View File

@@ -4,11 +4,23 @@
#include <string>
#include <vector>
#include <map>
#include <set>
#include <sstream>
#include <fstream>
#include <stdexcept> // std::invalid_argument
#include <cctype> // std::isdigit
#include <algorithm> // std::all_of std::transform
// Summary information structure for ARFF files.
// Filled in by the static ArffFiles::summary overloads. The counters carry
// default initializers so a default-constructed ArffSummary never exposes
// indeterminate values.
struct ArffSummary {
    size_t numSamples = 0;                 // Number of data samples
    size_t numFeatures = 0;                // Number of feature attributes (excluding class)
    size_t numClasses = 0;                 // Number of different class values
    std::string className;                 // Name of the class attribute
    std::string classType;                 // Type/values of the class attribute
    std::vector<std::string> classLabels;  // List of unique class values
    std::vector<std::pair<std::string, std::string>> featureInfo;  // Feature names and types
};
class ArffFiles {
const std::string VERSION = "1.1.0";
public:
@@ -81,6 +93,39 @@ public:
preprocessDataset(labelIndex);
generateDataset(labelIndex);
}
// Convenience overload: summarizes `fileName` assuming the class attribute is
// the last one declared. Forwards to summary(fileName, bool) with `true`.
static ArffSummary summary(const std::string& fileName)
{
    constexpr bool classIsLast = true;
    return summary(fileName, classIsLast);
}
// Summarizes `fileName`, taking the class attribute from the last declared
// attribute when `classLast` is true and from the first one otherwise.
// Throws std::invalid_argument when `fileName` is empty; any other failure
// comes from summarizeFile.
static ArffSummary summary(const std::string& fileName, bool classLast)
{
    const bool hasFileName = !fileName.empty();
    if (hasFileName) {
        return summarizeFile(fileName, classLast);
    }
    throw std::invalid_argument("File name cannot be empty");
}
// Static method to get summary information with specified class attribute (const char* overload).
// Without this overload a string literal argument would bind to the `bool`
// overload (pointer-to-bool is a standard conversion, std::string needs a
// user-defined one).
// Throws std::invalid_argument when `className` is a null pointer, since
// building a std::string from a null pointer is undefined behaviour.
static ArffSummary summary(const std::string& fileName, const char* className)
{
    if (className == nullptr) {
        throw std::invalid_argument("Class name cannot be null");
    }
    return summary(fileName, std::string(className));
}
// Summarizes `fileName` using the attribute named `className` as the class.
// Arguments are validated in order (file name first, then class name) and the
// first empty one raises std::invalid_argument; the work itself is done by
// summarizeFile.
static ArffSummary summary(const std::string& fileName, const std::string& className)
{
    const char* problem = nullptr;
    if (fileName.empty()) {
        problem = "File name cannot be empty";
    } else if (className.empty()) {
        problem = "Class name cannot be empty";
    }
    if (problem != nullptr) {
        throw std::invalid_argument(problem);
    }
    return summarizeFile(fileName, className);
}
// Returns a copy of the `lines` member.
std::vector<std::string> getLines() const { return lines; }
// Returns the number of entries currently held in `lines`.
unsigned long int getSize() const { return lines.size(); }
// Returns a copy of the `className` member.
std::string getClassName() const { return className; }
@@ -215,7 +260,8 @@ private:
// Parse numeric value with exception handling
try {
X[featureIdx][sampleIdx] = std::stof(token);
} catch (const std::exception& e) {
}
catch (const std::exception& e) {
throw std::invalid_argument("Invalid numeric value '" + token + "' at sample " + std::to_string(sampleIdx) + ", feature " + featureName);
}
} else {
@@ -325,7 +371,8 @@ private:
}
// Helper function for better missing value detection
bool containsMissingValue(const std::string& line) {
bool containsMissingValue(const std::string& line)
{
bool inQuotes = false;
char quoteChar = '\0';
@@ -345,6 +392,240 @@ private:
}
return false;
}
// Static missing-value check used by the summary helpers.
// Returns true when `line` contains a '?' outside of a quoted section.
// A quoted section starts at a single or double quote and ends at the next
// occurrence of that same quote character; the other quote character and any
// '?' inside it are ignored.
static bool containsMissingValueStatic(const std::string& line)
{
    char openQuote = '\0';  // '\0' while we are outside any quoted section
    for (const char ch : line) {
        if (openQuote != '\0') {
            // Inside quotes: only the matching quote character matters.
            if (ch == openQuote) {
                openQuote = '\0';
            }
            continue;
        }
        if (ch == '\'' || ch == '\"') {
            openQuote = ch;
            continue;
        }
        if (ch == '?') {
            // Unquoted '?' marks a missing value
            return true;
        }
    }
    return false;
}
// Helper function for summary with classLast parameter.
//
// Reads `fileName` line by line and builds an ArffSummary without storing the
// data rows. The class attribute is the last declared attribute when
// `classLast` is true, otherwise the first; all remaining attributes are
// reported as features in declaration order.
//
// Header handling: empty lines, lines starting with '%', and lines equal to
// "\r" or " " are skipped; lines containing "@attribute" or "@ATTRIBUTE" are
// parsed as <keyword> <name> <type...>; any other line starting with '@' is
// skipped; the first remaining line is taken as the first data row.
//
// Data handling: rows that are empty, start with '@' or '%', or contain an
// unquoted '?' are ignored. Each remaining row is split on ',' (quotes are not
// considered by the split) and counted when its class token is non-empty.
// classLabels comes out in std::set order (sorted, unique).
//
// Throws std::invalid_argument if the file cannot be opened, an attribute has
// an empty name or type, or no attribute is declared.
static ArffSummary summarizeFile(const std::string& fileName, bool classLast)
{
    std::ifstream file(fileName);
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open file: " + fileName);
    }
    ArffSummary summary;
    std::vector<std::pair<std::string, std::string>> attributes;
    std::set<std::string> uniqueClasses;
    std::string line;
    size_t sampleCount = 0;
    // True only when the header loop stopped on an actual data row. If the
    // loop ends because the stream is exhausted, a failed getline may leave the
    // last header line in `line`; it must not be treated as data.
    bool reachedData = false;
    // Parse header
    while (getline(file, line)) {
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
            std::stringstream ss(line);
            std::string keyword, attribute, type_w;
            ss >> keyword >> attribute;
            if (attribute.empty()) {
                throw std::invalid_argument("Empty attribute name in line: " + line);
            }
            // Build type string: remaining words joined by single spaces
            std::ostringstream typeStream;
            while (ss >> type_w) {
                if (typeStream.tellp() > 0) typeStream << " ";
                typeStream << type_w;
            }
            std::string type = typeStream.str();
            if (type.empty()) {
                throw std::invalid_argument("Empty attribute type for attribute: " + attribute);
            }
            attributes.emplace_back(trim(attribute), trim(type));
            continue;
        }
        if (line[0] == '@') {
            continue;
        }
        // Start of data section: `line` now holds the first data row
        reachedData = true;
        break;
    }
    if (attributes.empty()) {
        throw std::invalid_argument("No attributes found in file");
    }
    // Determine class attribute
    if (classLast) {
        summary.className = attributes.back().first;
        summary.classType = attributes.back().second;
        attributes.pop_back();
    } else {
        summary.className = attributes.front().first;
        summary.classType = attributes.front().second;
        attributes.erase(attributes.begin());
    }
    summary.numFeatures = attributes.size();
    // Copy feature information
    for (const auto& attr : attributes) {
        summary.featureInfo.emplace_back(attr.first, attr.second);
    }
    // Count samples and collect unique class values, starting with the row
    // already read by the header loop.
    if (reachedData) {
        do {
            if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) {
                auto tokens = splitStatic(line, ',');
                if (!tokens.empty()) {
                    std::string classValue;
                    if (classLast) {
                        classValue = trim(tokens.back());
                    } else {
                        classValue = trim(tokens.front());
                    }
                    if (!classValue.empty()) {
                        uniqueClasses.insert(classValue);
                        sampleCount++;
                    }
                }
            }
        } while (getline(file, line));
    }
    file.close();
    summary.numSamples = sampleCount;
    summary.numClasses = uniqueClasses.size();
    summary.classLabels.assign(uniqueClasses.begin(), uniqueClasses.end());
    return summary;
}
// Helper function for summary with className parameter.
//
// Reads `fileName` line by line and builds an ArffSummary without storing the
// data rows. The class attribute is the declared attribute whose trimmed name
// equals `className` (if several match, the last declaration wins); all other
// attributes are reported as features in declaration order.
//
// Header handling: empty lines, lines starting with '%', and lines equal to
// "\r" or " " are skipped; lines containing "@attribute" or "@ATTRIBUTE" are
// parsed as <keyword> <name> <type...>; any other line starting with '@' is
// skipped; the first remaining line is taken as the first data row.
//
// Data handling: rows that are empty, start with '@' or '%', or contain an
// unquoted '?' are ignored. Each remaining row is split on ',' (quotes are not
// considered by the split) and counted when it has a token at the class
// position and that token is non-empty. classLabels comes out in std::set
// order (sorted, unique).
//
// Throws std::invalid_argument if the file cannot be opened, an attribute has
// an empty name or type, no attribute is declared, or `className` is not among
// the declared attributes.
static ArffSummary summarizeFile(const std::string& fileName, const std::string& className)
{
    std::ifstream file(fileName);
    if (!file.is_open()) {
        throw std::invalid_argument("Unable to open file: " + fileName);
    }
    ArffSummary summary;
    std::vector<std::pair<std::string, std::string>> attributes;
    std::set<std::string> uniqueClasses;
    std::string line;
    size_t sampleCount = 0;
    int classIndex = -1;  // position of the class attribute, -1 until found
    // True only when the header loop stopped on an actual data row. If the
    // loop ends because the stream is exhausted, a failed getline may leave the
    // last header line in `line`; it must not be treated as data.
    bool reachedData = false;
    // Parse header
    while (getline(file, line)) {
        if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
            continue;
        }
        if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
            std::stringstream ss(line);
            std::string keyword, attribute, type_w;
            ss >> keyword >> attribute;
            if (attribute.empty()) {
                throw std::invalid_argument("Empty attribute name in line: " + line);
            }
            // Build type string: remaining words joined by single spaces
            std::ostringstream typeStream;
            while (ss >> type_w) {
                if (typeStream.tellp() > 0) typeStream << " ";
                typeStream << type_w;
            }
            std::string type = typeStream.str();
            if (type.empty()) {
                throw std::invalid_argument("Empty attribute type for attribute: " + attribute);
            }
            attributes.emplace_back(trim(attribute), trim(type));
            if (trim(attribute) == className) {
                classIndex = static_cast<int>(attributes.size() - 1);
                summary.className = trim(attribute);
                summary.classType = trim(type);
            }
            continue;
        }
        if (line[0] == '@') {
            continue;
        }
        // Start of data section: `line` now holds the first data row
        reachedData = true;
        break;
    }
    if (attributes.empty()) {
        throw std::invalid_argument("No attributes found in file");
    }
    if (classIndex == -1) {
        throw std::invalid_argument("Class name '" + className + "' not found in attributes");
    }
    // Remove class attribute from features
    attributes.erase(attributes.begin() + classIndex);
    summary.numFeatures = attributes.size();
    // Copy feature information
    for (const auto& attr : attributes) {
        summary.featureInfo.emplace_back(attr.first, attr.second);
    }
    // Count samples and collect unique class values, starting with the row
    // already read by the header loop.
    if (reachedData) {
        do {
            if (!line.empty() && line[0] != '@' && line[0] != '%' && !containsMissingValueStatic(line)) {
                auto tokens = splitStatic(line, ',');
                // Rows too short to contain the class column are ignored
                if (tokens.size() > static_cast<size_t>(classIndex)) {
                    std::string classValue = trim(tokens[classIndex]);
                    if (!classValue.empty()) {
                        uniqueClasses.insert(classValue);
                        sampleCount++;
                    }
                }
            }
        } while (getline(file, line));
    }
    file.close();
    summary.numSamples = sampleCount;
    summary.numClasses = uniqueClasses.size();
    summary.classLabels.assign(uniqueClasses.begin(), uniqueClasses.end());
    return summary;
}
// Static tokenizer used by summarizeFile.
// Cuts `text` at every `delimiter` and returns the pieces, each passed through
// trim(). An empty `text` yields no pieces, and a delimiter at the very end of
// `text` does not produce a trailing empty piece; empty pieces elsewhere are
// kept.
static std::vector<std::string> splitStatic(const std::string& text, char delimiter)
{
    std::vector<std::string> pieces;
    std::string::size_type start = 0;
    while (start < text.size()) {
        std::string::size_type stop = text.find(delimiter, start);
        if (stop == std::string::npos) {
            stop = text.size();
        }
        pieces.push_back(trim(text.substr(start, stop - start)));
        start = stop + 1;  // step over the delimiter
    }
    return pieces;
}
};
#endif

View File

@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Claude TECHNICAL_REPORT.md for detailed analysis
- Claude CLAUDE.md for AI engine usage
- Static `summary` method that returns the number of features, samples, and classes without loading the data
### Internal

View File

@@ -163,12 +163,14 @@ TEST_CASE("Input Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("Empty filename") {
SECTION("Empty filename")
{
REQUIRE_THROWS_AS(arff.load(""), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(""), "File name cannot be empty");
}
SECTION("Nonexistent file") {
SECTION("Nonexistent file")
{
REQUIRE_THROWS_AS(arff.load("nonexistent_file.arff"), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load("nonexistent_file.arff"), Catch::Matchers::ContainsSubstring("Unable to open file"));
}
@@ -190,20 +192,23 @@ TEST_CASE("File Structure Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("No attributes defined") {
SECTION("No attributes defined")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attributes")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attributes")), "No attributes found in file");
}
SECTION("No data samples") {
SECTION("No data samples")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("no_data")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("no_data")), "No data samples found in file");
}
SECTION("Duplicate attribute names") {
SECTION("Duplicate attribute names")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("duplicate_attributes")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("duplicate_attributes")),
Catch::Matchers::ContainsSubstring("Duplicate attribute name"));
Catch::Matchers::ContainsSubstring("Duplicate attribute name"));
}
// TODO: This test needs a better test case to trigger empty attribute name validation
@@ -213,10 +218,11 @@ TEST_CASE("File Structure Validation Errors", "[ArffFiles][Error]")
// Catch::Matchers::ContainsSubstring("Empty attribute name"));
// }
SECTION("Empty attribute type") {
SECTION("Empty attribute type")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attribute_type")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_type")),
Catch::Matchers::ContainsSubstring("Empty attribute type"));
Catch::Matchers::ContainsSubstring("Empty attribute type"));
}
}
@@ -224,17 +230,19 @@ TEST_CASE("Data Parsing Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("Wrong number of tokens") {
SECTION("Wrong number of tokens")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("wrong_token_count")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("wrong_token_count")),
Catch::Matchers::ContainsSubstring("has") &&
Catch::Matchers::ContainsSubstring("tokens, expected"));
Catch::Matchers::ContainsSubstring("has") &&
Catch::Matchers::ContainsSubstring("tokens, expected"));
}
SECTION("Invalid numeric value") {
SECTION("Invalid numeric value")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("invalid_numeric")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("invalid_numeric")),
Catch::Matchers::ContainsSubstring("Invalid numeric value"));
Catch::Matchers::ContainsSubstring("Invalid numeric value"));
}
// TODO: This test needs a better test case to trigger empty class label validation
@@ -244,10 +252,11 @@ TEST_CASE("Data Parsing Validation Errors", "[ArffFiles][Error]")
// Catch::Matchers::ContainsSubstring("Empty class label"));
// }
SECTION("Empty categorical value") {
SECTION("Empty categorical value")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_categorical")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_categorical")),
Catch::Matchers::ContainsSubstring("Empty categorical value"));
Catch::Matchers::ContainsSubstring("Empty categorical value"));
}
}
@@ -255,7 +264,8 @@ TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]")
{
ArffFiles arff;
SECTION("Quoted question marks should not be treated as missing") {
SECTION("Quoted question marks should not be treated as missing")
{
// This should NOT throw an error - quoted question marks are valid data
REQUIRE_NOTHROW(arff.load(Paths::error_datasets("quoted_question_mark")));
// Note: This test would need a valid quoted string ARFF for string attributes
@@ -263,3 +273,68 @@ TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]")
}
}
// Exercises the static ArffFiles::summary overloads: default (class last),
// explicit class name, class first, and argument/file error reporting.
TEST_CASE("Summary Functionality", "[ArffFiles][Summary]")
{
    SECTION("Basic summary with class last")
    {
        auto summary = ArffFiles::summary(Paths::datasets("iris"));
        REQUIRE(summary.numSamples == 150);
        REQUIRE(summary.numFeatures == 4);
        REQUIRE(summary.numClasses == 3);
        REQUIRE(summary.className == "class");
        REQUIRE(summary.classType == "{Iris-setosa,Iris-versicolor,Iris-virginica}");
        REQUIRE(summary.classLabels.size() == 3);
        REQUIRE(summary.featureInfo.size() == 4);
        // Check feature information (names and declared types, in file order)
        REQUIRE(summary.featureInfo[0].first == "sepallength");
        REQUIRE(summary.featureInfo[0].second == "REAL");
        REQUIRE(summary.featureInfo[1].first == "sepalwidth");
        REQUIRE(summary.featureInfo[1].second == "REAL");
        REQUIRE(summary.featureInfo[2].first == "petallength");
        REQUIRE(summary.featureInfo[2].second == "REAL");
        REQUIRE(summary.featureInfo[3].first == "petalwidth");
        REQUIRE(summary.featureInfo[3].second == "REAL");
    }
    SECTION("Summary with specific class name")
    {
        auto summary = ArffFiles::summary(Paths::datasets("glass"), "Type");
        REQUIRE(summary.numSamples == 214);
        REQUIRE(summary.numFeatures == 9);
        REQUIRE(summary.numClasses == 6);
        REQUIRE(summary.className == "Type");
        REQUIRE(summary.classType == "{ 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}");
        REQUIRE(summary.classLabels.size() == 6);
        REQUIRE(summary.featureInfo.size() == 9);
    }
    SECTION("Summary with class first")
    {
        auto summary = ArffFiles::summary(Paths::datasets("kdd_JapaneseVowels"), false);
        REQUIRE(summary.className == "speaker");
        REQUIRE(summary.numFeatures > 0);
        REQUIRE(summary.numClasses > 0);
        REQUIRE(summary.numSamples > 0);
    }
    SECTION("Summary error handling")
    {
        // File name errors
        REQUIRE_THROWS_AS(ArffFiles::summary(""), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(""), "File name cannot be empty");
        REQUIRE_THROWS_AS(ArffFiles::summary("nonexistent.arff"), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary("nonexistent.arff"), Catch::Matchers::ContainsSubstring("Unable to open file"));
        // Class name errors (string literals select the const char* overload)
        REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), ""), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), ""), "Class name cannot be empty");
        REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), "Class name 'nonexistent' not found in attributes");
    }
}