Added ArffFiles summary method and tests

This commit is contained in:
2025-06-27 19:48:56 +02:00
parent 9c1c427620
commit 7a69526409
3 changed files with 426 additions and 69 deletions

View File

@@ -4,11 +4,23 @@
#include <string>
#include <vector>
#include <map>
#include <set>
#include <sstream>
#include <fstream>
#include <cctype> // std::isdigit
#include <algorithm> // std::all_of std::transform
// Summary information structure for ARFF files.
// The counters carry default member initializers so that a default-constructed
// ArffSummary always holds well-defined values (zero counts, empty strings and
// empty containers) even before a summarizer fills it in.
struct ArffSummary {
    size_t numSamples = 0;  // Number of data samples
    size_t numFeatures = 0; // Number of feature attributes (excluding class)
    size_t numClasses = 0;  // Number of different class values
    std::string className;  // Name of the class attribute
    std::string classType;  // Type/values of the class attribute
    std::vector<std::string> classLabels; // List of unique class values
    std::vector<std::pair<std::string, std::string>> featureInfo; // Feature names and types
};
class ArffFiles {
const std::string VERSION = "1.1.0";
public:
@@ -18,15 +30,15 @@ public:
if (fileName.empty()) {
throw std::invalid_argument("File name cannot be empty");
}
int labelIndex;
loadCommon(fileName);
// Validate we have attributes before accessing them
if (attributes.empty()) {
throw std::invalid_argument("No attributes found in file");
}
if (classLast) {
className = std::get<0>(attributes.back());
classType = std::get<1>(attributes.back());
@@ -38,12 +50,12 @@ public:
attributes.erase(attributes.begin());
labelIndex = 0;
}
// Validate class name is not empty
if (className.empty()) {
throw std::invalid_argument("Class attribute name cannot be empty");
}
preprocessDataset(labelIndex);
generateDataset(labelIndex);
}
@@ -55,15 +67,15 @@ public:
if (name.empty()) {
throw std::invalid_argument("Class name cannot be empty");
}
int labelIndex;
loadCommon(fileName);
// Validate we have attributes before searching
if (attributes.empty()) {
throw std::invalid_argument("No attributes found in file");
}
bool found = false;
for (size_t i = 0; i < attributes.size(); ++i) {
if (attributes[i].first == name) {
@@ -81,6 +93,39 @@ public:
preprocessDataset(labelIndex);
generateDataset(labelIndex);
}
// Convenience overload: summarizes `fileName` treating the last declared
// attribute as the class. Simply forwards to summary(fileName, true).
static ArffSummary summary(const std::string& fileName)
{
    const bool classIsLast = true;
    return summary(fileName, classIsLast);
}
// Summarizes `fileName`, taking the class attribute from the last declared
// attribute when `classLast` is true and from the first one otherwise.
// Throws std::invalid_argument when `fileName` is empty; any exception raised
// by summarizeFile propagates unchanged.
static ArffSummary summary(const std::string& fileName, bool classLast)
{
    if (!fileName.empty()) {
        return summarizeFile(fileName, classLast);
    }
    throw std::invalid_argument("File name cannot be empty");
}
// Static method to get summary information with specified class attribute
// (const char* overload).
// This overload exists so that a string literal argument selects the
// class-name path: without it, a `const char*` would convert to `bool` (a
// standard conversion) and pick the (fileName, classLast) overload instead.
// A null `className` is mapped to the empty string, so it is rejected by the
// std::string overload with std::invalid_argument instead of invoking the
// undefined behaviour of constructing a std::string from a null pointer.
static ArffSummary summary(const std::string& fileName, const char* className)
{
    return summary(fileName, std::string(className == nullptr ? "" : className));
}
// Summarizes `fileName` using the attribute called `className` as the class.
// Throws std::invalid_argument when either argument is empty (the file name is
// checked first); any exception raised by summarizeFile propagates unchanged.
static ArffSummary summary(const std::string& fileName, const std::string& className)
{
    const char* problem = nullptr;
    if (fileName.empty()) {
        problem = "File name cannot be empty";
    } else if (className.empty()) {
        problem = "Class name cannot be empty";
    }
    if (problem != nullptr) {
        throw std::invalid_argument(problem);
    }
    return summarizeFile(fileName, className);
}
// Returns a copy of the `lines` member (returned by value).
std::vector<std::string> getLines() const { return lines; }
// Returns the number of entries currently held in `lines`.
unsigned long int getSize() const { return lines.size(); }
// Returns a copy of the `className` member (returned by value).
std::string getClassName() const { return className; }
@@ -160,7 +205,7 @@ private:
{
const size_t numSamples = lines.size();
const size_t numFeatures = attributes.size();
// Validate inputs
if (numSamples == 0) {
throw std::invalid_argument("No data samples found in file");
@@ -171,10 +216,10 @@ private:
if (labelIndex < 0) {
throw std::invalid_argument("Invalid label index: cannot be negative");
}
// Pre-allocate with feature-major layout: X[feature][sample]
X.assign(numFeatures, std::vector<float>(numSamples));
// Temporary storage for categorical data per feature (only for non-numeric features)
std::vector<std::vector<std::string>> categoricalData(numFeatures);
for (size_t i = 0; i < numFeatures; ++i) {
@@ -182,23 +227,23 @@ private:
categoricalData[i].reserve(numSamples);
}
}
std::vector<std::string> yy;
yy.reserve(numSamples);
// Parse each sample
for (size_t sampleIdx = 0; sampleIdx < numSamples; ++sampleIdx) {
const auto tokens = split(lines[sampleIdx], ',');
// Validate token count matches expected number (features + class)
const size_t expectedTokens = numFeatures + 1;
if (tokens.size() != expectedTokens) {
throw std::invalid_argument("Sample " + std::to_string(sampleIdx) + " has " + std::to_string(tokens.size()) + " tokens, expected " + std::to_string(expectedTokens));
}
int pos = 0;
int featureIdx = 0;
for (const auto& token : tokens) {
if (pos++ == labelIndex) {
if (token.empty()) {
@@ -209,13 +254,14 @@ private:
if (featureIdx >= static_cast<int>(numFeatures)) {
throw std::invalid_argument("Too many feature values at sample " + std::to_string(sampleIdx));
}
const auto& featureName = attributes[featureIdx].first;
if (numeric_features.at(featureName)) {
// Parse numeric value with exception handling
try {
X[featureIdx][sampleIdx] = std::stof(token);
} catch (const std::exception& e) {
}
catch (const std::exception& e) {
throw std::invalid_argument("Invalid numeric value '" + token + "' at sample " + std::to_string(sampleIdx) + ", feature " + featureName);
}
} else {
@@ -229,20 +275,20 @@ private:
}
}
}
// Convert categorical features to numeric
for (size_t featureIdx = 0; featureIdx < numFeatures; ++featureIdx) {
if (!numeric_features[attributes[featureIdx].first]) {
const auto& featureName = attributes[featureIdx].first;
auto encodedValues = factorize(featureName, categoricalData[featureIdx]);
// Copy encoded values to X[feature][sample]
for (size_t sampleIdx = 0; sampleIdx < numSamples; ++sampleIdx) {
X[featureIdx][sampleIdx] = static_cast<float>(encodedValues[sampleIdx]);
}
}
}
y = factorize(className, yy);
}
void loadCommon(std::string fileName)
@@ -252,7 +298,7 @@ private:
attributes.clear();
states.clear();
numeric_features.clear();
std::ifstream file(fileName);
if (!file.is_open()) {
throw std::invalid_argument("Unable to open file: " + fileName);
@@ -269,19 +315,19 @@ private:
if (line.find("@attribute") != std::string::npos || line.find("@ATTRIBUTE") != std::string::npos) {
std::stringstream ss(line);
ss >> keyword >> attribute;
// Validate attribute name
if (attribute.empty()) {
throw std::invalid_argument("Empty attribute name in line: " + line);
}
// Check for duplicate attribute names
for (const auto& existing : attributes) {
if (existing.first == attribute) {
throw std::invalid_argument("Duplicate attribute name: " + attribute);
}
}
// Efficiently build type string
std::ostringstream typeStream;
while (ss >> type_w) {
@@ -289,12 +335,12 @@ private:
typeStream << type_w;
}
type = typeStream.str();
// Validate type is not empty
if (type.empty()) {
throw std::invalid_argument("Empty attribute type for attribute: " + attribute);
}
attributes.emplace_back(trim(attribute), trim(type));
continue;
}
@@ -309,7 +355,7 @@ private:
lines.push_back(line);
}
file.close();
// Final validation
if (attributes.empty()) {
throw std::invalid_argument("No attributes found in file");
@@ -317,21 +363,22 @@ private:
if (lines.empty()) {
throw std::invalid_argument("No data samples found in file");
}
// Initialize states for all attributes
for (const auto& attribute : attributes) {
states[attribute.first] = std::vector<std::string>();
}
}
// Helper function for better missing value detection
bool containsMissingValue(const std::string& line) {
bool containsMissingValue(const std::string& line)
{
bool inQuotes = false;
char quoteChar = '\0';
for (size_t i = 0; i < line.length(); ++i) {
char c = line[i];
if (!inQuotes && (c == '\'' || c == '\"')) {
inQuotes = true;
quoteChar = c;
@@ -345,6 +392,240 @@ private:
}
return false;
}
// Static missing-value detector used by the summary helpers.
// Returns true when `line` contains a '?' that is not inside quoted text.
// Quoted text starts at a single or double quote and ends at the next
// occurrence of the same quote character; the other quote kind is treated as
// ordinary content while inside. An unterminated quote hides everything after it.
static bool containsMissingValueStatic(const std::string& line)
{
    char openQuote = '\0'; // '\0' means we are currently outside quoted text
    for (const char ch : line) {
        if (openQuote != '\0') {
            // Inside quotes: only the matching quote character is significant.
            if (ch == openQuote) {
                openQuote = '\0';
            }
            continue;
        }
        if (ch == '\'' || ch == '\"') {
            openQuote = ch;
            continue;
        }
        if (ch == '?') {
            // Unquoted '?' marks a missing value.
            return true;
        }
    }
    return false;
}
// Builds an ArffSummary for `fileName`, taking the class attribute from the
// last declared attribute (classLast == true) or the first one (false).
// The file is read in two phases over the same stream:
//   1. header: collects every "@attribute"/"@ATTRIBUTE" declaration until the
//      first line that is neither blank, a '%' comment, nor an '@' directive;
//   2. data: starting with that line, counts rows and collects the distinct
//      class values. Rows containing an unquoted '?' and rows whose class
//      value is empty after trimming are not counted.
// Throws std::invalid_argument when the file cannot be opened, an attribute
// has an empty name or type, or no attribute is declared.
static ArffSummary summarizeFile(const std::string& fileName, bool classLast)
{
    std::ifstream input(fileName);
    if (!input.is_open()) {
        throw std::invalid_argument("Unable to open file: " + fileName);
    }

    std::vector<std::pair<std::string, std::string>> declared;
    std::string line;

    // Phase 1: header.
    while (std::getline(input, line)) {
        const bool skippable = line.empty() || line[0] == '%' || line == "\r" || line == " ";
        if (skippable) {
            continue;
        }
        const bool isAttribute = line.find("@attribute") != std::string::npos
            || line.find("@ATTRIBUTE") != std::string::npos;
        if (!isAttribute) {
            if (line[0] == '@') {
                continue; // some other directive
            }
            break; // first data row; `line` is carried into phase 2
        }

        std::stringstream words(line);
        std::string keyword;
        std::string attribute;
        words >> keyword >> attribute;
        if (attribute.empty()) {
            throw std::invalid_argument("Empty attribute name in line: " + line);
        }
        // Re-join the remaining whitespace-separated words with single spaces.
        std::string type;
        for (std::string piece; words >> piece;) {
            if (!type.empty()) {
                type += " ";
            }
            type += piece;
        }
        if (type.empty()) {
            throw std::invalid_argument("Empty attribute type for attribute: " + attribute);
        }
        declared.emplace_back(trim(attribute), trim(type));
    }

    if (declared.empty()) {
        throw std::invalid_argument("No attributes found in file");
    }

    // Split the class attribute off; what remains are the features.
    ArffSummary summary;
    const auto classPos = classLast ? declared.end() - 1 : declared.begin();
    summary.className = classPos->first;
    summary.classType = classPos->second;
    declared.erase(classPos);
    summary.numFeatures = declared.size();
    summary.featureInfo.assign(declared.begin(), declared.end());

    // Phase 2: data rows.
    std::set<std::string> labels;
    size_t samples = 0;
    const auto tally = [&](const std::string& row) {
        if (row.empty() || row[0] == '@' || row[0] == '%' || containsMissingValueStatic(row)) {
            return;
        }
        auto fields = splitStatic(row, ',');
        if (fields.empty()) {
            return;
        }
        std::string label = trim(classLast ? fields.back() : fields.front());
        if (label.empty()) {
            return;
        }
        labels.insert(label);
        ++samples;
    };
    tally(line); // the line that ended the header phase (or the last line read)
    while (std::getline(input, line)) {
        tally(line);
    }
    input.close();

    summary.numSamples = samples;
    summary.numClasses = labels.size();
    summary.classLabels.assign(labels.begin(), labels.end());
    return summary;
}
// Builds an ArffSummary for `fileName`, using the attribute whose trimmed name
// equals `className` as the class attribute.
// The file is read in two phases over the same stream:
//   1. header: collects every "@attribute"/"@ATTRIBUTE" declaration and
//      remembers the position of the class attribute (the last match wins),
//      stopping at the first line that is neither blank, a '%' comment, nor
//      an '@' directive;
//   2. data: starting with that line, counts rows and collects the distinct
//      class values found in the class column. Rows containing an unquoted
//      '?', rows too short to reach the class column, and rows whose class
//      value is empty after trimming are not counted.
// Throws std::invalid_argument when the file cannot be opened, an attribute
// has an empty name or type, no attribute is declared, or `className` does not
// match any declared attribute.
static ArffSummary summarizeFile(const std::string& fileName, const std::string& className)
{
    std::ifstream input(fileName);
    if (!input.is_open()) {
        throw std::invalid_argument("Unable to open file: " + fileName);
    }

    ArffSummary summary;
    std::vector<std::pair<std::string, std::string>> declared;
    std::string line;
    int classIndex = -1; // position of the class attribute in declaration order

    // Phase 1: header.
    while (std::getline(input, line)) {
        const bool skippable = line.empty() || line[0] == '%' || line == "\r" || line == " ";
        if (skippable) {
            continue;
        }
        const bool isAttribute = line.find("@attribute") != std::string::npos
            || line.find("@ATTRIBUTE") != std::string::npos;
        if (!isAttribute) {
            if (line[0] == '@') {
                continue; // some other directive
            }
            break; // first data row; `line` is carried into phase 2
        }

        std::stringstream words(line);
        std::string keyword;
        std::string attribute;
        words >> keyword >> attribute;
        if (attribute.empty()) {
            throw std::invalid_argument("Empty attribute name in line: " + line);
        }
        // Re-join the remaining whitespace-separated words with single spaces.
        std::string type;
        for (std::string piece; words >> piece;) {
            if (!type.empty()) {
                type += " ";
            }
            type += piece;
        }
        if (type.empty()) {
            throw std::invalid_argument("Empty attribute type for attribute: " + attribute);
        }
        declared.emplace_back(trim(attribute), trim(type));

        const auto& latest = declared.back();
        if (latest.first == className) {
            classIndex = static_cast<int>(declared.size() - 1);
            summary.className = latest.first;
            summary.classType = latest.second;
        }
    }

    if (declared.empty()) {
        throw std::invalid_argument("No attributes found in file");
    }
    if (classIndex < 0) {
        throw std::invalid_argument("Class name '" + className + "' not found in attributes");
    }

    // Drop the class attribute; what remains are the features.
    declared.erase(declared.begin() + classIndex);
    summary.numFeatures = declared.size();
    summary.featureInfo.assign(declared.begin(), declared.end());

    // Phase 2: data rows.
    std::set<std::string> labels;
    size_t samples = 0;
    const size_t classColumn = static_cast<size_t>(classIndex);
    const auto tally = [&](const std::string& row) {
        if (row.empty() || row[0] == '@' || row[0] == '%' || containsMissingValueStatic(row)) {
            return;
        }
        auto fields = splitStatic(row, ',');
        if (fields.size() <= classColumn) {
            return;
        }
        std::string label = trim(fields[classColumn]);
        if (label.empty()) {
            return;
        }
        labels.insert(label);
        ++samples;
    };
    tally(line); // the line that ended the header phase (or the last line read)
    while (std::getline(input, line)) {
        tally(line);
    }
    input.close();

    summary.numSamples = samples;
    summary.numClasses = labels.size();
    summary.classLabels.assign(labels.begin(), labels.end());
    return summary;
}
// Static splitter used by summarizeFile: cuts `text` at every `delimiter`
// and returns the pieces in order, each one passed through trim().
static std::vector<std::string> splitStatic(const std::string& text, char delimiter)
{
    std::vector<std::string> pieces;
    std::istringstream reader(text);
    for (std::string field; std::getline(reader, field, delimiter);) {
        pieces.push_back(trim(field));
    }
    return pieces;
}
};
#endif

View File

@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Claude TECHNICAL_REPORT.md for detailed analysis
- Claude CLAUDE.md for AI engine usage
- Static `summary` method that returns the number of features, samples, and classes without loading the data
### Internal

View File

@@ -14,7 +14,7 @@ public:
std::string file_name = path + name + ".arff";
return file_name;
}
static std::string error_datasets(const std::string& name)
{
std::string path = { arffFiles_data_path.begin(), arffFiles_data_path.end() };
@@ -162,23 +162,25 @@ TEST_CASE("Adult dataset", "[ArffFiles]")
TEST_CASE("Input Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("Empty filename") {
SECTION("Empty filename")
{
REQUIRE_THROWS_AS(arff.load(""), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(""), "File name cannot be empty");
}
SECTION("Nonexistent file") {
SECTION("Nonexistent file")
{
REQUIRE_THROWS_AS(arff.load("nonexistent_file.arff"), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load("nonexistent_file.arff"), Catch::Matchers::ContainsSubstring("Unable to open file"));
}
// TODO: These tests need refinement to trigger the validation conditions properly
// SECTION("Empty class name") {
// REQUIRE_THROWS_AS(arff.load(Paths::datasets("iris"), ""), std::invalid_argument);
// REQUIRE_THROWS_WITH(arff.load(Paths::datasets("iris"), ""), "Class name cannot be empty");
// }
// SECTION("Invalid class name") {
// REQUIRE_THROWS_AS(arff.load(Paths::datasets("iris"), "nonexistent_class"), std::invalid_argument);
// REQUIRE_THROWS_WITH(arff.load(Paths::datasets("iris"), "nonexistent_class"),
@@ -189,73 +191,81 @@ TEST_CASE("Input Validation Errors", "[ArffFiles][Error]")
TEST_CASE("File Structure Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("No attributes defined") {
SECTION("No attributes defined")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attributes")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attributes")), "No attributes found in file");
}
SECTION("No data samples") {
SECTION("No data samples")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("no_data")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("no_data")), "No data samples found in file");
}
SECTION("Duplicate attribute names") {
SECTION("Duplicate attribute names")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("duplicate_attributes")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("duplicate_attributes")),
Catch::Matchers::ContainsSubstring("Duplicate attribute name"));
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("duplicate_attributes")),
Catch::Matchers::ContainsSubstring("Duplicate attribute name"));
}
// TODO: This test needs a better test case to trigger empty attribute name validation
// SECTION("Empty attribute name") {
// REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attribute_name")), std::invalid_argument);
// REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_name")),
// Catch::Matchers::ContainsSubstring("Empty attribute name"));
// }
SECTION("Empty attribute type") {
SECTION("Empty attribute type")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_attribute_type")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_type")),
Catch::Matchers::ContainsSubstring("Empty attribute type"));
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_attribute_type")),
Catch::Matchers::ContainsSubstring("Empty attribute type"));
}
}
TEST_CASE("Data Parsing Validation Errors", "[ArffFiles][Error]")
{
ArffFiles arff;
SECTION("Wrong number of tokens") {
SECTION("Wrong number of tokens")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("wrong_token_count")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("wrong_token_count")),
Catch::Matchers::ContainsSubstring("has") &&
Catch::Matchers::ContainsSubstring("tokens, expected"));
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("wrong_token_count")),
Catch::Matchers::ContainsSubstring("has") &&
Catch::Matchers::ContainsSubstring("tokens, expected"));
}
SECTION("Invalid numeric value") {
SECTION("Invalid numeric value")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("invalid_numeric")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("invalid_numeric")),
Catch::Matchers::ContainsSubstring("Invalid numeric value"));
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("invalid_numeric")),
Catch::Matchers::ContainsSubstring("Invalid numeric value"));
}
// TODO: This test needs a better test case to trigger empty class label validation
// SECTION("Empty class label") {
// REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_class_label")), std::invalid_argument);
// REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_class_label")),
// Catch::Matchers::ContainsSubstring("Empty class label"));
// }
SECTION("Empty categorical value") {
SECTION("Empty categorical value")
{
REQUIRE_THROWS_AS(arff.load(Paths::error_datasets("empty_categorical")), std::invalid_argument);
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_categorical")),
Catch::Matchers::ContainsSubstring("Empty categorical value"));
REQUIRE_THROWS_WITH(arff.load(Paths::error_datasets("empty_categorical")),
Catch::Matchers::ContainsSubstring("Empty categorical value"));
}
}
TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]")
{
ArffFiles arff;
SECTION("Quoted question marks should not be treated as missing") {
SECTION("Quoted question marks should not be treated as missing")
{
// This should NOT throw an error - quoted question marks are valid data
REQUIRE_NOTHROW(arff.load(Paths::error_datasets("quoted_question_mark")));
// Note: This test would need a valid quoted string ARFF for string attributes
@@ -263,3 +273,68 @@ TEST_CASE("Missing Value Detection", "[ArffFiles][MissingValues]")
}
}
// Exercises the static ArffFiles::summary overloads: class-last default,
// class selected by name, class-first, and the argument/file error paths.
TEST_CASE("Summary Functionality", "[ArffFiles][Summary]")
{
    SECTION("Basic summary with class last")
    {
        auto summary = ArffFiles::summary(Paths::datasets("iris"));

        REQUIRE(summary.numSamples == 150);
        REQUIRE(summary.numFeatures == 4);
        REQUIRE(summary.numClasses == 3);
        REQUIRE(summary.className == "class");
        REQUIRE(summary.classType == "{Iris-setosa,Iris-versicolor,Iris-virginica}");
        REQUIRE(summary.classLabels.size() == 3);
        REQUIRE(summary.featureInfo.size() == 4);

        // Check feature information
        REQUIRE(summary.featureInfo[0].first == "sepallength");
        REQUIRE(summary.featureInfo[0].second == "REAL");
        REQUIRE(summary.featureInfo[1].first == "sepalwidth");
        REQUIRE(summary.featureInfo[1].second == "REAL");
        REQUIRE(summary.featureInfo[2].first == "petallength");
        REQUIRE(summary.featureInfo[2].second == "REAL");
        REQUIRE(summary.featureInfo[3].first == "petalwidth");
        REQUIRE(summary.featureInfo[3].second == "REAL");
    }

    SECTION("Summary with specific class name")
    {
        // The string literal selects the const char* overload (class by name).
        auto summary = ArffFiles::summary(Paths::datasets("glass"), "Type");

        REQUIRE(summary.numSamples == 214);
        REQUIRE(summary.numFeatures == 9);
        REQUIRE(summary.numClasses == 6);
        REQUIRE(summary.className == "Type");
        REQUIRE(summary.classType == "{ 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}");
        REQUIRE(summary.classLabels.size() == 6);
        REQUIRE(summary.featureInfo.size() == 9);
    }

    SECTION("Summary with class first")
    {
        auto summary = ArffFiles::summary(Paths::datasets("kdd_JapaneseVowels"), false);

        REQUIRE(summary.className == "speaker");
        REQUIRE(summary.numFeatures > 0);
        REQUIRE(summary.numClasses > 0);
        REQUIRE(summary.numSamples > 0);
    }

    SECTION("Summary error handling")
    {
        // File name errors
        REQUIRE_THROWS_AS(ArffFiles::summary(""), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(""), "File name cannot be empty");
        REQUIRE_THROWS_AS(ArffFiles::summary("nonexistent.arff"), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary("nonexistent.arff"), Catch::Matchers::ContainsSubstring("Unable to open file"));

        // Class name errors
        REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), ""), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), ""), "Class name cannot be empty");
        REQUIRE_THROWS_AS(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), std::invalid_argument);
        REQUIRE_THROWS_WITH(ArffFiles::summary(Paths::datasets("iris"), "nonexistent"), "Class name 'nonexistent' not found in attributes");
    }
}