From f0845c5bd1dbfbd0af8729cdf743817b8fbdaf92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 18 Mar 2023 18:40:10 +0100 Subject: [PATCH] Fix mistake in class type of ArffFiles Add some type casting to CPPFImdlp Add additional path to datasets in tests Fix some smells in sample Join CMakeLists --- .vscode/launch.json | 8 +++-- .vscode/tasks.json | 29 ---------------- CMakeLists.txt | 4 ++- CPPFImdlp.cpp | 11 +++--- sample/.vscode/launch.json | 19 +++-------- sample/CMakeLists.txt | 2 -- sample/sample.cpp | 70 ++++++++++++++++++++------------------ tests/ArffFiles.cpp | 22 ++++++------ tests/ArffFiles.h | 4 +-- tests/CMakeLists.txt | 7 ++-- tests/FImdlp_unittest.cpp | 22 +++++++++--- tests/datasets/glass.arff | 2 +- 12 files changed, 87 insertions(+), 113 deletions(-) delete mode 100644 .vscode/tasks.json diff --git a/.vscode/launch.json b/.vscode/launch.json index c865479..d84105c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,12 +5,14 @@ "version": "0.2.0", "configurations": [ { - "name": "(lldb) Launch", - "type": "cppdbg", + "name": "lldb samplex", + "type": "lldb", "request": "launch", + "targetArchitecture": "arm64", "program": "${workspaceRoot}/sample/build/sample", "args": [ - "mfeat-factors" + "-f", + "glass" ], "stopAtEntry": false, "cwd": "${workspaceRoot}/sample/build/", diff --git a/.vscode/tasks.json b/.vscode/tasks.json deleted file mode 100644 index 5318667..0000000 --- a/.vscode/tasks.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "tasks": [ - { - "type": "cppbuild", - "label": "C/C++: clang++ build active file", - "command": "/usr/bin/clang++", - "args": [ - "-fcolor-diagnostics", - "-fansi-escape-codes", - "-g", - "${file}", - "-o", - "${fileDirname}/${fileBasenameNoExtension}" - ], - "options": { - "cwd": "${fileDirname}" - }, - "problemMatcher": [ - "$gcc" - ], - "group": { - "kind": "build", - "isDefault": true - }, - "detail": "Task generated by Debugger." - } - ], - "version": "2.0.0" -} \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index ff48211..a711c93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,5 +3,7 @@ project(mdlp) set(CMAKE_CXX_STANDARD 11) -add_library(mdlp CPPFImdlp.cpp Metrics.cpp) +add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp) +add_subdirectory(sample) +add_subdirectory(tests) diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index c7b4a08..0f73173 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include "CPPFImdlp.h" #include "Metrics.h" namespace mdlp { @@ -21,7 +20,7 @@ namespace mdlp { if (proposed_cuts == 0) { return numeric_limits::max(); } - if (proposed_cuts < 0 || proposed_cuts > X.size()) { + if (proposed_cuts < 0 || proposed_cuts > static_cast(X.size())) { throw invalid_argument("wrong proposed num_cuts value"); } if (proposed_cuts < 1) @@ -125,8 +124,8 @@ namespace mdlp { // Cutpoints are always on boundaries (definition 2) if (y[indices[idx]] == y[indices[idx - 1]]) continue; - entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx); - entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end); + entropy_left = precision_t(idx - start) / static_cast(elements) * metrics.entropy(start, idx); + entropy_right = precision_t(end - idx) / static_cast(elements) * metrics.entropy(idx, end); if (entropy_left + entropy_right < minEntropy) { minEntropy = entropy_left + entropy_right; candidate = idx; @@ -148,8 +147,8 @@ namespace mdlp { ent1 = metrics.entropy(start, cut); ent2 = metrics.entropy(cut, end); ig = metrics.informationGain(start, cut, end); - delta = log2(pow(3, precision_t(k)) - 2) - - (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2); + delta = static_cast(log2(pow(3, precision_t(k)) - 2) - + (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2)); precision_t term = 1 / N * (log2(N - 1) + delta); return ig > term; } diff --git a/sample/.vscode/launch.json b/sample/.vscode/launch.json index 30e5379..a573994 100644 --- a/sample/.vscode/launch.json +++ b/sample/.vscode/launch.json @@ -1,30 +1,21 @@ { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ { - "name": "Launch sample", + "name": "lldb puro", "type": "cppdbg", + // "targetArchitecture": "arm64", "request": "launch", "program": "${workspaceRoot}/build/sample", "args": [ "-f", - "glass" - ], - "setupCommands": [ - { - "description": "Enable pretty-printing for gdb", - "text": "-enable-pretty-printing", - "ignoreFailures": true - } + "iris" ], "stopAtEntry": false, "cwd": "${workspaceRoot}/build/", "environment": [], "externalConsole": false, - "MIMode": "gdb", - } + "MIMode": "lldb" + }, ] } \ No newline at end of file diff --git a/sample/CMakeLists.txt b/sample/CMakeLists.txt index 68ba4df..4f6fe11 100644 --- a/sample/CMakeLists.txt +++ b/sample/CMakeLists.txt @@ -1,5 +1,3 @@ -cmake_minimum_required(VERSION 3.20) -project(main) set(CMAKE_CXX_STANDARD 11) diff --git a/sample/sample.cpp b/sample/sample.cpp index 0a25f05..cd2bbde 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -14,39 +14,41 @@ using namespace mdlp; const string PATH = "../../tests/datasets/"; /* print a description of all supported options */ -void usage(const char* path) -{ +void usage(const char *path) { /* take only the last portion of the path */ - const char* basename = strrchr(path, '/'); + const char *basename = strrchr(path, '/'); basename = basename ? basename + 1 : path; cout << "usage: " << basename << "[OPTION]" << endl; cout << " -h, --help\t\t Print this help and exit." << endl; - cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; + cout + << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." + << endl; cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; - cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl; + cout + << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" + << endl; cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl; } -tuple parse_arguments(int argc, char** argv) -{ +tuple parse_arguments(int argc, char **argv) { string file_name; string path = PATH; int max_depth = numeric_limits::max(); int min_length = 3; float max_cutpoints = 0; static struct option long_options[] = { - { "help", no_argument, 0, 'h' }, - { "file", required_argument, 0, 'f' }, - { "path", required_argument, 0, 'p' }, - { "max_depth", required_argument, 0, 'm' }, - { "max_cutpoints", required_argument, 0, 'c' }, - { "min_length", required_argument, 0, 'n' }, - { 0, 0, 0, 0 } + {"help", no_argument, nullptr, 'h'}, + {"file", required_argument, nullptr, 'f'}, + {"path", required_argument, nullptr, 'p'}, + {"max_depth", required_argument, nullptr, 'm'}, + {"max_cutpoints", required_argument, nullptr, 'c'}, + {"min_length", required_argument, nullptr, 'n'}, + {nullptr, 0, nullptr, 0} }; - while (1) { - auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0); + while (true) { + auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr); if (c == -1) break; switch (c) { @@ -57,13 +59,13 @@ tuple parse_arguments(int argc, char** argv) file_name = optarg; break; case 'm': - max_depth = atoi(optarg); + max_depth = (int) strtol(optarg, nullptr, 10); break; case 'n': - min_length = atoi(optarg); + min_length = (int) strtol(optarg, nullptr, 10); break; case 'c': - max_cutpoints = atof(optarg); + max_cutpoints = strtof(optarg, nullptr); break; case 'p': path = optarg; @@ -84,8 +86,8 @@ tuple parse_arguments(int argc, char** argv) return make_tuple(file_name, path, max_depth, min_length, max_cutpoints); } -void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints) -{ +void process_file(const string &path, const string &file_name, bool class_last, int max_depth, int min_length, + float max_cutpoints) { ArffFiles file; file.load(path + file_name + ".arff", class_last); @@ -93,16 +95,16 @@ void process_file(string path, string file_name, bool class_last, int max_depth, int items = file.getSize(); cout << "Number of lines: " << items << endl; cout << "Attributes: " << endl; - for (auto attribute : attributes) { + for (auto attribute: attributes) { cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl; } cout << "Class name: " << file.getClassName() << endl; cout << "Class type: " << file.getClassType() << endl; cout << "Data: " << endl; - vector& X = file.getX(); - labels_t& y = file.getY(); + vector &X = file.getX(); + labels_t &y = file.getY(); for (int i = 0; i < 5; i++) { - for (auto feature : X) { + for (auto feature: X) { cout << fixed << setprecision(1) << feature[i] << " "; } cout << y[i] << endl; @@ -115,7 +117,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth, cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl; cout << "--------------------------" << setprecision(3) << endl; test.fit(X[i], y); - for (auto item : test.getCutPoints()) { + for (auto item: test.getCutPoints()) { cout << item << endl; } total += test.getCutPoints().size(); @@ -124,17 +126,18 @@ void process_file(string path, string file_name, bool class_last, int max_depth, cout << "Total feature states: " << total + attributes.size() << endl; } -void process_all_files(map datasets, string path, int max_depth, int min_length, float max_cutpoints) -{ - cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; +void process_all_files(const map &datasets, const string &path, int max_depth, int min_length, + float max_cutpoints) { + cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: " + << max_cutpoints << endl << endl; printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); printf("==================== ==== ==== ========\n"); - for (auto dataset : datasets) { + for (const auto &dataset: datasets) { ArffFiles file; file.load(path + dataset.first + ".arff", dataset.second); auto attributes = file.getAttributes(); - vector& X = file.getX(); - labels_t& y = file.getY(); + vector &X = file.getX(); + labels_t &y = file.getY(); size_t timing = 0; int cut_points = 0; for (auto i = 0; i < attributes.size(); i++) { @@ -150,8 +153,7 @@ void process_all_files(map datasets, string path, int max_depth, i } -int main(int argc, char** argv) -{ +int main(int argc, char **argv) { map datasets = { {"glass", true}, {"iris", true}, diff --git a/tests/ArffFiles.cpp b/tests/ArffFiles.cpp index 4fbca78..a7286eb 100644 --- a/tests/ArffFiles.cpp +++ b/tests/ArffFiles.cpp @@ -2,13 +2,10 @@ #include #include #include -#include using namespace std; -ArffFiles::ArffFiles() -{ -} +ArffFiles::ArffFiles() = default; vector ArffFiles::getLines() { return lines; @@ -37,19 +34,22 @@ vector& ArffFiles::getY() { return y; } -void ArffFiles::load(string fileName, bool classLast) +void ArffFiles::load(const string fileName, bool classLast) { ifstream file(fileName); if (file.is_open()) { - string line, keyword, attribute, type; + string line, keyword, attribute, type, type_w; while (getline(file, line)) { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { continue; } if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { stringstream ss(line); - ss >> keyword >> attribute >> type; - attributes.push_back({ attribute, type }); + ss >> keyword >> attribute; + type = ""; + while(ss >> type_w) + type += type_w + " "; + attributes.emplace_back(attribute, type ); continue; } if (line[0] == '@') { @@ -77,7 +77,7 @@ void ArffFiles::generateDataset(bool classLast) { X = vector>(attributes.size(), vector(lines.size())); vector yy = vector(lines.size(), ""); - int labelIndex = classLast ? attributes.size() : 0; + int labelIndex = classLast ? static_cast(attributes.size()) : 0; for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; @@ -92,7 +92,7 @@ void ArffFiles::generateDataset(bool classLast) } y = factorize(yy); } -string ArffFiles::trim(const string& source) +string ArffFiles::trim(const string& source) { string s(source); s.erase(0, s.find_first_not_of(" \n\r\t")); @@ -105,7 +105,7 @@ vector ArffFiles::factorize(const vector& labels_t) yy.reserve(labels_t.size()); map labelMap; int i = 0; - for (string label : labels_t) { + for (const string &label : labels_t) { if (labelMap.find(label) == labelMap.end()) { labelMap[label] = i++; } diff --git a/tests/ArffFiles.h b/tests/ArffFiles.h index b56d28d..8590d04 100644 --- a/tests/ArffFiles.h +++ b/tests/ArffFiles.h @@ -18,10 +18,10 @@ public: unsigned long int getSize(); string getClassName(); string getClassType(); - string trim(const string&); + static string trim(const string&); vector>& getX(); vector& getY(); vector> getAttributes(); - vector factorize(const vector& labels_t); + static vector factorize(const vector& labels_t); }; #endif \ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 58b1e86..67cc084 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,8 +1,5 @@ -cmake_minimum_required(VERSION 3.14) -project(FImdlp) - # GoogleTest requires at least C++14 -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) include(FetchContent) include_directories(${GTEST_INCLUDE_DIRS}) @@ -18,7 +15,7 @@ FetchContent_MakeAvailable(googletest) enable_testing() add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp) -add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp) +add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp) target_link_libraries(Metrics_unittest GTest::gtest_main) target_link_libraries(FImdlp_unittest GTest::gtest_main) target_compile_options(Metrics_unittest PRIVATE --coverage) diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index ac805bd..56e6c1e 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -1,8 +1,9 @@ #include "gtest/gtest.h" #include "../Metrics.h" #include "../CPPFImdlp.h" -#include "ArffFiles.h" +#include #include +#include "ArffFiles.h" #define EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) EXPECT_THROW( \ try { \ stmt; \ @@ -17,11 +18,23 @@ namespace mdlp { public: precision_t precision = 0.000001; TestFImdlp(): CPPFImdlp() {} + string data_path; void SetUp() { X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; fit(X, y); + data_path = set_data_path(); + } + string set_data_path() + { + string path = "../datasets/"; + ifstream file(path+"iris.arff"); + if (file.is_open()) { + file.close(); + return path; + } + return "../../tests/datasets/"; } void checkSortedVector() { @@ -37,6 +50,7 @@ namespace mdlp { { EXPECT_EQ(computed.size(), expected.size()); for (unsigned long i = 0; i < computed.size(); i++) { + cout << "(" << computed[i] << ", " << expected[i] << ") "; EXPECT_NEAR(computed[i], expected[i], precision); } } @@ -64,7 +78,7 @@ namespace mdlp { void test_dataset(CPPFImdlp& test, string filename, vector& expected, int depths[]) { ArffFiles file; - file.load("../datasets/" + filename + ".arff", true); + file.load(data_path + filename + ".arff", true); vector& X = file.getX(); labels_t& y = file.getY(); auto attributes = file.getAttributes(); @@ -73,10 +87,8 @@ namespace mdlp { EXPECT_EQ(test.get_depth(), depths[feature]); auto computed = test.getCutPoints(); cout << "Feature " << feature << ": "; - for (auto item : computed) - cout << item << " "; - cout << endl; checkCutPoints(computed, expected[feature]); + cout << endl; } } }; diff --git a/tests/datasets/glass.arff b/tests/datasets/glass.arff index abd9e3c..3bcb091 100755 --- a/tests/datasets/glass.arff +++ b/tests/datasets/glass.arff @@ -114,7 +114,7 @@ @attribute 'Ca' real @attribute 'Ba' real @attribute 'Fe' real -@attribute 'Type' { 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps} +@attribute 'Type' {'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps} @data 1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0,0,'build wind float' 1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0,0,'vehic wind float'