Fix mistake in class type of ArffFiles

Add some type casting to CPPFImdlp
Add an additional path to the datasets in tests
Fix some code smells in sample
Join CMakeLists
commit f0845c5bd1 (parent 1f4abade2c), 2023-03-18 18:40:10 +01:00
12 changed files with 87 additions and 113 deletions

.vscode/launch.json vendored (8 lines changed)

@@ -5,12 +5,14 @@
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "(lldb) Launch",
-            "type": "cppdbg",
+            "name": "lldb samplex",
+            "type": "lldb",
             "request": "launch",
+            "targetArchitecture": "arm64",
             "program": "${workspaceRoot}/sample/build/sample",
             "args": [
-                "mfeat-factors"
+                "-f",
+                "glass"
             ],
             "stopAtEntry": false,
             "cwd": "${workspaceRoot}/sample/build/",

.vscode/tasks.json vendored (file deleted, 29 lines)

@@ -1,29 +0,0 @@
-{
-    "tasks": [
-        {
-            "type": "cppbuild",
-            "label": "C/C++: clang++ build active file",
-            "command": "/usr/bin/clang++",
-            "args": [
-                "-fcolor-diagnostics",
-                "-fansi-escape-codes",
-                "-g",
-                "${file}",
-                "-o",
-                "${fileDirname}/${fileBasenameNoExtension}"
-            ],
-            "options": {
-                "cwd": "${fileDirname}"
-            },
-            "problemMatcher": [
-                "$gcc"
-            ],
-            "group": {
-                "kind": "build",
-                "isDefault": true
-            },
-            "detail": "Task generated by Debugger."
-        }
-    ],
-    "version": "2.0.0"
-}

CMakeLists.txt

@@ -3,5 +3,7 @@ project(mdlp)
 set(CMAKE_CXX_STANDARD 11)
-add_library(mdlp CPPFImdlp.cpp Metrics.cpp)
+add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp)
+add_subdirectory(sample)
+add_subdirectory(tests)

CPPFImdlp.cpp

@@ -3,7 +3,6 @@
 #include <set>
 #include <cmath>
 #include <limits>
-#include <cmath>
 #include "CPPFImdlp.h"
 #include "Metrics.h"
 namespace mdlp {
@@ -21,7 +20,7 @@ namespace mdlp {
         if (proposed_cuts == 0) {
             return numeric_limits<size_t>::max();
         }
-        if (proposed_cuts < 0 || proposed_cuts > X.size()) {
+        if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
             throw invalid_argument("wrong proposed num_cuts value");
         }
         if (proposed_cuts < 1)
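Note: this hunk stops at the "proposed_cuts < 1" branch, so the rest of the handling is not visible here. A minimal sketch of how the value appears to be interpreted, going by the --max_cutpoints help text in sample.cpp (0 = no limit, a decimal below 1 = fraction of the samples, otherwise an absolute count); the fraction/absolute branches are assumptions, only the checks above are confirmed by the diff:

    #include <cstddef>
    #include <limits>
    #include <stdexcept>

    // Sketch, not the library's code: resolve a proposed_cuts value against the
    // number of samples. 0 means "no limit"; negative or larger-than-the-data
    // values are rejected, mirroring the checks shown in the hunk above.
    size_t resolve_max_cuts(float proposed_cuts, size_t n_samples)
    {
        if (proposed_cuts == 0)
            return std::numeric_limits<size_t>::max();
        if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(n_samples))
            throw std::invalid_argument("wrong proposed num_cuts value");
        if (proposed_cuts < 1)                      // assumed: a fraction of the samples
            return static_cast<size_t>(proposed_cuts * static_cast<float>(n_samples));
        return static_cast<size_t>(proposed_cuts); // assumed: an absolute count
    }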
@@ -125,8 +124,8 @@
             // Cutpoints are always on boundaries (definition 2)
             if (y[indices[idx]] == y[indices[idx - 1]])
                 continue;
-            entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
-            entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
+            entropy_left = precision_t(idx - start) / static_cast<float>(elements) * metrics.entropy(start, idx);
+            entropy_right = precision_t(end - idx) / static_cast<float>(elements) * metrics.entropy(idx, end);
             if (entropy_left + entropy_right < minEntropy) {
                 minEntropy = entropy_left + entropy_right;
                 candidate = idx;
@@ -148,8 +147,8 @@
         ent1 = metrics.entropy(start, cut);
         ent2 = metrics.entropy(cut, end);
         ig = metrics.informationGain(start, cut, end);
-        delta = log2(pow(3, precision_t(k)) - 2) -
-            (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
+        delta = static_cast<float>(log2(pow(3, precision_t(k)) - 2) -
+            (precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
         precision_t term = 1 / N * (log2(N - 1) + delta);
         return ig > term;
     }
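Note: the expression wrapped by the new static_cast is the Fayyad-Irani MDLP stopping rule. A self-contained restatement of the same test (double precision used here only for illustration; the library computes it in its own precision_t):

    #include <cmath>

    // Accept a cut when the information gain exceeds the MDL cost per sample:
    // term = (log2(N - 1) + delta) / N, with
    // delta = log2(3^k - 2) - (k*ent - k1*ent1 - k2*ent2),
    // where k, k1, k2 are the class counts of the interval and its two halves.
    bool mdlp_accepts_cut(double ig, double N, int k, int k1, int k2,
                          double ent, double ent1, double ent2)
    {
        double delta = std::log2(std::pow(3.0, k) - 2.0)
            - (k * ent - k1 * ent1 - k2 * ent2);
        double term = (std::log2(N - 1.0) + delta) / N;
        return ig > term;
    }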

sample/.vscode/launch.json

@@ -1,30 +1,21 @@
 {
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
         {
-            "name": "Launch sample",
+            "name": "lldb puro",
             "type": "cppdbg",
+            // "targetArchitecture": "arm64",
             "request": "launch",
             "program": "${workspaceRoot}/build/sample",
             "args": [
                 "-f",
-                "glass"
-            ],
-            "setupCommands": [
-                {
-                    "description": "Enable pretty-printing for gdb",
-                    "text": "-enable-pretty-printing",
-                    "ignoreFailures": true
-                }
+                "iris"
             ],
             "stopAtEntry": false,
             "cwd": "${workspaceRoot}/build/",
             "environment": [],
             "externalConsole": false,
-            "MIMode": "gdb",
-        }
+            "MIMode": "lldb"
+        },
     ]
 }

sample/CMakeLists.txt

@@ -1,5 +1,3 @@
-cmake_minimum_required(VERSION 3.20)
-project(main)
 set(CMAKE_CXX_STANDARD 11)

sample/sample.cpp

@@ -14,39 +14,41 @@ using namespace mdlp;
 const string PATH = "../../tests/datasets/";
 /* print a description of all supported options */
-void usage(const char* path)
-{
+void usage(const char *path) {
     /* take only the last portion of the path */
-    const char* basename = strrchr(path, '/');
+    const char *basename = strrchr(path, '/');
     basename = basename ? basename + 1 : path;
     cout << "usage: " << basename << "[OPTION]" << endl;
     cout << " -h, --help\t\t Print this help and exit." << endl;
-    cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl;
+    cout
+            << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
+            << endl;
     cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
     cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
-    cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl;
+    cout
+            << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any"
+            << endl;
     cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
 }
-tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
-{
+tuple<string, string, int, int, float> parse_arguments(int argc, char **argv) {
     string file_name;
     string path = PATH;
     int max_depth = numeric_limits<int>::max();
     int min_length = 3;
     float max_cutpoints = 0;
     static struct option long_options[] = {
-        { "help", no_argument, 0, 'h' },
-        { "file", required_argument, 0, 'f' },
-        { "path", required_argument, 0, 'p' },
-        { "max_depth", required_argument, 0, 'm' },
-        { "max_cutpoints", required_argument, 0, 'c' },
-        { "min_length", required_argument, 0, 'n' },
-        { 0, 0, 0, 0 }
+            {"help", no_argument, nullptr, 'h'},
+            {"file", required_argument, nullptr, 'f'},
+            {"path", required_argument, nullptr, 'p'},
+            {"max_depth", required_argument, nullptr, 'm'},
+            {"max_cutpoints", required_argument, nullptr, 'c'},
+            {"min_length", required_argument, nullptr, 'n'},
+            {nullptr, 0, nullptr, 0}
     };
-    while (1) {
-        auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0);
+    while (true) {
+        auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
         if (c == -1)
             break;
         switch (c) {
@@ -57,13 +59,13 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
             file_name = optarg;
             break;
         case 'm':
-            max_depth = atoi(optarg);
+            max_depth = (int) strtol(optarg, nullptr, 10);
             break;
         case 'n':
-            min_length = atoi(optarg);
+            min_length = (int) strtol(optarg, nullptr, 10);
             break;
         case 'c':
-            max_cutpoints = atof(optarg);
+            max_cutpoints = strtof(optarg, nullptr);
             break;
         case 'p':
             path = optarg;
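Note: replacing atoi/atof with strtol/strtof, as done above, addresses the error-reporting smell flagged by analyzers but still ignores parse failures (nullptr is passed as the end pointer). Purely as an illustration, not part of sample.cpp, a stricter helper would check errno and the end pointer:

    #include <cerrno>
    #include <cstdlib>
    #include <stdexcept>
    #include <string>

    // Hypothetical helper: parse an int and reject trailing garbage or
    // out-of-range input instead of silently returning 0.
    int parse_int_strict(const char* text)
    {
        char* end = nullptr;
        errno = 0;
        long value = std::strtol(text, &end, 10);
        if (errno != 0 || end == text || *end != '\0')
            throw std::invalid_argument(std::string("not a valid integer: ") + text);
        return static_cast<int>(value);
    }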
@@ -84,8 +86,8 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
     return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
 }
-void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints)
-{
+void process_file(const string &path, const string &file_name, bool class_last, int max_depth, int min_length,
+                  float max_cutpoints) {
     ArffFiles file;
     file.load(path + file_name + ".arff", class_last);
@@ -93,16 +95,16 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
     int items = file.getSize();
     cout << "Number of lines: " << items << endl;
     cout << "Attributes: " << endl;
-    for (auto attribute : attributes) {
+    for (auto attribute: attributes) {
         cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
     }
     cout << "Class name: " << file.getClassName() << endl;
     cout << "Class type: " << file.getClassType() << endl;
     cout << "Data: " << endl;
-    vector<samples_t>& X = file.getX();
-    labels_t& y = file.getY();
+    vector<samples_t> &X = file.getX();
+    labels_t &y = file.getY();
     for (int i = 0; i < 5; i++) {
-        for (auto feature : X) {
+        for (auto feature: X) {
             cout << fixed << setprecision(1) << feature[i] << " ";
         }
         cout << y[i] << endl;
@@ -115,7 +117,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
         cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
         cout << "--------------------------" << setprecision(3) << endl;
         test.fit(X[i], y);
-        for (auto item : test.getCutPoints()) {
+        for (auto item: test.getCutPoints()) {
             cout << item << endl;
         }
         total += test.getCutPoints().size();
@@ -124,17 +126,18 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
     cout << "Total feature states: " << total + attributes.size() << endl;
 }
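Note: for readers following the fit/getCutPoints calls in this file, a minimal usage sketch of the discretizer (default-constructed here for brevity, assuming the default constructor is accessible, as the unit tests' TestFImdlp(): CPPFImdlp() {} suggests; samples_t and labels_t are the library's vector typedefs seen above):

    #include "CPPFImdlp.h"
    #include <iostream>

    int main()
    {
        mdlp::samples_t feature = { 4.7f, 4.8f, 4.9f, 5.1f, 5.6f, 5.9f, 6.0f };
        mdlp::labels_t labels = { 1, 1, 1, 2, 2, 2, 2 };
        mdlp::CPPFImdlp discretizer;
        discretizer.fit(feature, labels);        // one feature column at a time
        for (auto cut : discretizer.getCutPoints())
            std::cout << cut << std::endl;
        return 0;
    }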
-void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length, float max_cutpoints)
-{
-    cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl;
+void process_all_files(const map<string, bool> &datasets, const string &path, int max_depth, int min_length,
+                       float max_cutpoints) {
+    cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
+         << max_cutpoints << endl << endl;
     printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
     printf("==================== ==== ==== ========\n");
-    for (auto dataset : datasets) {
+    for (const auto &dataset: datasets) {
         ArffFiles file;
         file.load(path + dataset.first + ".arff", dataset.second);
         auto attributes = file.getAttributes();
-        vector<samples_t>& X = file.getX();
-        labels_t& y = file.getY();
+        vector<samples_t> &X = file.getX();
+        labels_t &y = file.getY();
         size_t timing = 0;
         int cut_points = 0;
         for (auto i = 0; i < attributes.size(); i++) {
@@ -150,8 +153,7 @@ void process_all_files(map<string, bool> datasets, string path, int max_depth, i
 }
-int main(int argc, char** argv)
-{
+int main(int argc, char **argv) {
     map<string, bool> datasets = {
         {"glass", true},
         {"iris", true},

ArffFiles.cpp

@@ -2,13 +2,10 @@
 #include <fstream>
 #include <sstream>
 #include <map>
-#include <iostream>
 using namespace std;
-ArffFiles::ArffFiles()
-{
-}
+ArffFiles::ArffFiles() = default;
 vector<string> ArffFiles::getLines()
 {
     return lines;
@@ -37,19 +34,22 @@ vector<int>& ArffFiles::getY()
 {
     return y;
 }
-void ArffFiles::load(string fileName, bool classLast)
+void ArffFiles::load(const string fileName, bool classLast)
 {
     ifstream file(fileName);
     if (file.is_open()) {
-        string line, keyword, attribute, type;
+        string line, keyword, attribute, type, type_w;
         while (getline(file, line)) {
             if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
                 continue;
             }
             if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
                 stringstream ss(line);
-                ss >> keyword >> attribute >> type;
-                attributes.push_back({ attribute, type });
+                ss >> keyword >> attribute;
+                type = "";
+                while(ss >> type_w)
+                    type += type_w + " ";
+                attributes.emplace_back(attribute, type );
                 continue;
             }
             if (line[0] == '@') {
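Note: this is the "class type" fix named in the commit message. An ARFF attribute type may contain spaces (for example the nominal Type attribute of glass.arff further down in this commit), so a single "ss >> type" kept only the first word. A standalone sketch of the corrected parsing:

    #include <sstream>
    #include <string>
    #include <utility>

    // Sketch: split "@attribute <name> <type...>" so that multi-word types such
    // as "{'build wind float', ...}" are kept whole instead of truncated.
    std::pair<std::string, std::string> parse_attribute_line(const std::string& line)
    {
        std::stringstream ss(line);
        std::string keyword, name, type, word;
        ss >> keyword >> name;      // "@attribute" and the attribute name
        while (ss >> word)          // everything left on the line is the type
            type += word + " ";
        return { name, type };
    }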
@@ -77,7 +77,7 @@ void ArffFiles::generateDataset(bool classLast)
 {
     X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
     vector<string> yy = vector<string>(lines.size(), "");
-    int labelIndex = classLast ? attributes.size() : 0;
+    int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
     for (size_t i = 0; i < lines.size(); i++) {
         stringstream ss(lines[i]);
         string value;
@@ -105,7 +105,7 @@ vector<int> ArffFiles::factorize(const vector<string>& labels_t)
     yy.reserve(labels_t.size());
     map<string, int> labelMap;
     int i = 0;
-    for (string label : labels_t) {
+    for (const string &label : labels_t) {
         if (labelMap.find(label) == labelMap.end()) {
             labelMap[label] = i++;
         }
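Note: the loop above is the label-encoding step that ArffFiles.h below turns into a static method. A self-contained sketch of the same idea; the push_back of the mapped code sits below the visible hunk, so that part is inferred:

    #include <map>
    #include <string>
    #include <vector>

    // Sketch of factorize(): map each distinct label string to a consecutive
    // integer code, in order of first appearance.
    std::vector<int> factorize_sketch(const std::vector<std::string>& labels)
    {
        std::vector<int> codes;
        codes.reserve(labels.size());
        std::map<std::string, int> labelMap;
        int next = 0;
        for (const std::string& label : labels) {
            if (labelMap.find(label) == labelMap.end())
                labelMap[label] = next++;
            codes.push_back(labelMap[label]);
        }
        return codes;
    }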

ArffFiles.h

@@ -18,10 +18,10 @@ public:
     unsigned long int getSize();
     string getClassName();
     string getClassType();
-    string trim(const string&);
+    static string trim(const string&);
     vector<vector<float>>& getX();
     vector<int>& getY();
     vector<pair<string, string>> getAttributes();
-    vector<int> factorize(const vector<string>& labels_t);
+    static vector<int> factorize(const vector<string>& labels_t);
 };
 #endif

tests/CMakeLists.txt

@@ -1,8 +1,5 @@
-cmake_minimum_required(VERSION 3.14)
-project(FImdlp)
 # GoogleTest requires at least C++14
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 11)
 include(FetchContent)
 include_directories(${GTEST_INCLUDE_DIRS})
@@ -18,7 +15,7 @@ FetchContent_MakeAvailable(googletest)
 enable_testing()
 add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
-add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
+add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
 target_link_libraries(Metrics_unittest GTest::gtest_main)
 target_link_libraries(FImdlp_unittest GTest::gtest_main)
 target_compile_options(Metrics_unittest PRIVATE --coverage)

tests/FImdlp_unittest.cpp

@@ -1,8 +1,9 @@
 #include "gtest/gtest.h"
 #include "../Metrics.h"
 #include "../CPPFImdlp.h"
-#include "ArffFiles.h"
+#include <fstream>
 #include <iostream>
+#include "ArffFiles.h"
 #define EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) EXPECT_THROW( \
     try { \
         stmt; \
@@ -17,11 +18,23 @@ namespace mdlp {
     public:
         precision_t precision = 0.000001;
         TestFImdlp(): CPPFImdlp() {}
+        string data_path;
         void SetUp()
        {
             X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
             y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
             fit(X, y);
+            data_path = set_data_path();
+        }
+        string set_data_path()
+        {
+            string path = "../datasets/";
+            ifstream file(path+"iris.arff");
+            if (file.is_open()) {
+                file.close();
+                return path;
+            }
+            return "../../tests/datasets/";
         }
         void checkSortedVector()
         {
@@ -37,6 +50,7 @@ namespace mdlp {
         {
             EXPECT_EQ(computed.size(), expected.size());
             for (unsigned long i = 0; i < computed.size(); i++) {
+                cout << "(" << computed[i] << ", " << expected[i] << ") ";
                 EXPECT_NEAR(computed[i], expected[i], precision);
             }
         }
@@ -64,7 +78,7 @@ namespace mdlp {
         void test_dataset(CPPFImdlp& test, string filename, vector<cutPoints_t>& expected, int depths[])
         {
             ArffFiles file;
-            file.load("../datasets/" + filename + ".arff", true);
+            file.load(data_path + filename + ".arff", true);
             vector<samples_t>& X = file.getX();
             labels_t& y = file.getY();
             auto attributes = file.getAttributes();
@@ -73,10 +87,8 @@ namespace mdlp {
                 EXPECT_EQ(test.get_depth(), depths[feature]);
                 auto computed = test.getCutPoints();
                 cout << "Feature " << feature << ": ";
-                for (auto item : computed)
-                    cout << item << " ";
-                cout << endl;
                 checkCutPoints(computed, expected[feature]);
+                cout << endl;
             }
         }
     };

tests/datasets/glass.arff

@@ -114,7 +114,7 @@
 @attribute 'Ca' real
 @attribute 'Ba' real
 @attribute 'Fe' real
-@attribute 'Type' { 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}
+@attribute 'Type' {'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}
 @data
 1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0,0,'build wind float'
 1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0,0,'vehic wind float'