Fix mistake in class type of ArffFiles

Add some type casting to CPPFImdlp
Add additional path to datasets in tests
Fix some smells in sample
Join CMakeLists
This commit is contained in:
2023-03-18 18:40:10 +01:00
parent 1f4abade2c
commit f0845c5bd1
12 changed files with 87 additions and 113 deletions

8
.vscode/launch.json vendored
View File

@@ -5,12 +5,14 @@
"version": "0.2.0",
"configurations": [
{
"name": "(lldb) Launch",
"type": "cppdbg",
"name": "lldb samplex",
"type": "lldb",
"request": "launch",
"targetArchitecture": "arm64",
"program": "${workspaceRoot}/sample/build/sample",
"args": [
"mfeat-factors"
"-f",
"glass"
],
"stopAtEntry": false,
"cwd": "${workspaceRoot}/sample/build/",

29
.vscode/tasks.json vendored
View File

@@ -1,29 +0,0 @@
{
"tasks": [
{
"type": "cppbuild",
"label": "C/C++: clang++ build active file",
"command": "/usr/bin/clang++",
"args": [
"-fcolor-diagnostics",
"-fansi-escape-codes",
"-g",
"${file}",
"-o",
"${fileDirname}/${fileBasenameNoExtension}"
],
"options": {
"cwd": "${fileDirname}"
},
"problemMatcher": [
"$gcc"
],
"group": {
"kind": "build",
"isDefault": true
},
"detail": "Task generated by Debugger."
}
],
"version": "2.0.0"
}

View File

@@ -3,5 +3,7 @@ project(mdlp)
set(CMAKE_CXX_STANDARD 11)
add_library(mdlp CPPFImdlp.cpp Metrics.cpp)
add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp)
add_subdirectory(sample)
add_subdirectory(tests)

View File

@@ -3,7 +3,6 @@
#include <set>
#include <cmath>
#include <limits>
#include <cmath>
#include "CPPFImdlp.h"
#include "Metrics.h"
namespace mdlp {
@@ -21,7 +20,7 @@ namespace mdlp {
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
}
if (proposed_cuts < 0 || proposed_cuts > X.size()) {
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
throw invalid_argument("wrong proposed num_cuts value");
}
if (proposed_cuts < 1)
@@ -125,8 +124,8 @@ namespace mdlp {
// Cutpoints are always on boundaries (definition 2)
if (y[indices[idx]] == y[indices[idx - 1]])
continue;
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
entropy_left = precision_t(idx - start) / static_cast<float>(elements) * metrics.entropy(start, idx);
entropy_right = precision_t(end - idx) / static_cast<float>(elements) * metrics.entropy(idx, end);
if (entropy_left + entropy_right < minEntropy) {
minEntropy = entropy_left + entropy_right;
candidate = idx;
@@ -148,8 +147,8 @@ namespace mdlp {
ent1 = metrics.entropy(start, cut);
ent2 = metrics.entropy(cut, end);
ig = metrics.informationGain(start, cut, end);
delta = log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
delta = static_cast<float>(log2(pow(3, precision_t(k)) - 2) -
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
precision_t term = 1 / N * (log2(N - 1) + delta);
return ig > term;
}

View File

@@ -1,30 +1,21 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Launch sample",
"name": "lldb puro",
"type": "cppdbg",
// "targetArchitecture": "arm64",
"request": "launch",
"program": "${workspaceRoot}/build/sample",
"args": [
"-f",
"glass"
],
"setupCommands": [
{
"description": "Enable pretty-printing for gdb",
"text": "-enable-pretty-printing",
"ignoreFailures": true
}
"iris"
],
"stopAtEntry": false,
"cwd": "${workspaceRoot}/build/",
"environment": [],
"externalConsole": false,
"MIMode": "gdb",
}
"MIMode": "lldb"
},
]
}

View File

@@ -1,5 +1,3 @@
cmake_minimum_required(VERSION 3.20)
project(main)
set(CMAKE_CXX_STANDARD 11)

View File

@@ -14,39 +14,41 @@ using namespace mdlp;
const string PATH = "../../tests/datasets/";
/* print a description of all supported options */
void usage(const char* path)
{
void usage(const char *path) {
/* take only the last portion of the path */
const char* basename = strrchr(path, '/');
const char *basename = strrchr(path, '/');
basename = basename ? basename + 1 : path;
cout << "usage: " << basename << "[OPTION]" << endl;
cout << " -h, --help\t\t Print this help and exit." << endl;
cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl;
cout
<< " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
<< endl;
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl;
cout
<< " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any"
<< endl;
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
}
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
{
tuple<string, string, int, int, float> parse_arguments(int argc, char **argv) {
string file_name;
string path = PATH;
int max_depth = numeric_limits<int>::max();
int min_length = 3;
float max_cutpoints = 0;
static struct option long_options[] = {
{ "help", no_argument, 0, 'h' },
{ "file", required_argument, 0, 'f' },
{ "path", required_argument, 0, 'p' },
{ "max_depth", required_argument, 0, 'm' },
{ "max_cutpoints", required_argument, 0, 'c' },
{ "min_length", required_argument, 0, 'n' },
{ 0, 0, 0, 0 }
{"help", no_argument, nullptr, 'h'},
{"file", required_argument, nullptr, 'f'},
{"path", required_argument, nullptr, 'p'},
{"max_depth", required_argument, nullptr, 'm'},
{"max_cutpoints", required_argument, nullptr, 'c'},
{"min_length", required_argument, nullptr, 'n'},
{nullptr, 0, nullptr, 0}
};
while (1) {
auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0);
while (true) {
auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
if (c == -1)
break;
switch (c) {
@@ -57,13 +59,13 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
file_name = optarg;
break;
case 'm':
max_depth = atoi(optarg);
max_depth = (int) strtol(optarg, nullptr, 10);
break;
case 'n':
min_length = atoi(optarg);
min_length = (int) strtol(optarg, nullptr, 10);
break;
case 'c':
max_cutpoints = atof(optarg);
max_cutpoints = strtof(optarg, nullptr);
break;
case 'p':
path = optarg;
@@ -84,8 +86,8 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
}
void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints)
{
void process_file(const string &path, const string &file_name, bool class_last, int max_depth, int min_length,
float max_cutpoints) {
ArffFiles file;
file.load(path + file_name + ".arff", class_last);
@@ -93,16 +95,16 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
int items = file.getSize();
cout << "Number of lines: " << items << endl;
cout << "Attributes: " << endl;
for (auto attribute : attributes) {
for (auto attribute: attributes) {
cout << "Name: " << get<0>(attribute) << " Type: " << get<1>(attribute) << endl;
}
cout << "Class name: " << file.getClassName() << endl;
cout << "Class type: " << file.getClassType() << endl;
cout << "Data: " << endl;
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
vector<samples_t> &X = file.getX();
labels_t &y = file.getY();
for (int i = 0; i < 5; i++) {
for (auto feature : X) {
for (auto feature: X) {
cout << fixed << setprecision(1) << feature[i] << " ";
}
cout << y[i] << endl;
@@ -115,7 +117,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
cout << "--------------------------" << setprecision(3) << endl;
test.fit(X[i], y);
for (auto item : test.getCutPoints()) {
for (auto item: test.getCutPoints()) {
cout << item << endl;
}
total += test.getCutPoints().size();
@@ -124,17 +126,18 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
cout << "Total feature states: " << total + attributes.size() << endl;
}
void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length, float max_cutpoints)
{
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl;
void process_all_files(const map<string, bool> &datasets, const string &path, int max_depth, int min_length,
float max_cutpoints) {
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
<< max_cutpoints << endl << endl;
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
printf("==================== ==== ==== ========\n");
for (auto dataset : datasets) {
for (const auto &dataset: datasets) {
ArffFiles file;
file.load(path + dataset.first + ".arff", dataset.second);
auto attributes = file.getAttributes();
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
vector<samples_t> &X = file.getX();
labels_t &y = file.getY();
size_t timing = 0;
int cut_points = 0;
for (auto i = 0; i < attributes.size(); i++) {
@@ -150,8 +153,7 @@ void process_all_files(map<string, bool> datasets, string path, int max_depth, i
}
int main(int argc, char** argv)
{
int main(int argc, char **argv) {
map<string, bool> datasets = {
{"glass", true},
{"iris", true},

View File

@@ -2,13 +2,10 @@
#include <fstream>
#include <sstream>
#include <map>
#include <iostream>
using namespace std;
ArffFiles::ArffFiles()
{
}
ArffFiles::ArffFiles() = default;
vector<string> ArffFiles::getLines()
{
return lines;
@@ -37,19 +34,22 @@ vector<int>& ArffFiles::getY()
{
return y;
}
void ArffFiles::load(string fileName, bool classLast)
void ArffFiles::load(const string fileName, bool classLast)
{
ifstream file(fileName);
if (file.is_open()) {
string line, keyword, attribute, type;
string line, keyword, attribute, type, type_w;
while (getline(file, line)) {
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue;
}
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
stringstream ss(line);
ss >> keyword >> attribute >> type;
attributes.push_back({ attribute, type });
ss >> keyword >> attribute;
type = "";
while(ss >> type_w)
type += type_w + " ";
attributes.emplace_back(attribute, type );
continue;
}
if (line[0] == '@') {
@@ -77,7 +77,7 @@ void ArffFiles::generateDataset(bool classLast)
{
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0;
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]);
string value;
@@ -92,7 +92,7 @@ void ArffFiles::generateDataset(bool classLast)
}
y = factorize(yy);
}
string ArffFiles::trim(const string& source)
string ArffFiles::trim(const string& source)
{
string s(source);
s.erase(0, s.find_first_not_of(" \n\r\t"));
@@ -105,7 +105,7 @@ vector<int> ArffFiles::factorize(const vector<string>& labels_t)
yy.reserve(labels_t.size());
map<string, int> labelMap;
int i = 0;
for (string label : labels_t) {
for (const string &label : labels_t) {
if (labelMap.find(label) == labelMap.end()) {
labelMap[label] = i++;
}

View File

@@ -18,10 +18,10 @@ public:
unsigned long int getSize();
string getClassName();
string getClassType();
string trim(const string&);
static string trim(const string&);
vector<vector<float>>& getX();
vector<int>& getY();
vector<pair<string, string>> getAttributes();
vector<int> factorize(const vector<string>& labels_t);
static vector<int> factorize(const vector<string>& labels_t);
};
#endif

View File

@@ -1,8 +1,5 @@
cmake_minimum_required(VERSION 3.14)
project(FImdlp)
# GoogleTest requires at least C++14
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 11)
include(FetchContent)
include_directories(${GTEST_INCLUDE_DIRS})
@@ -18,7 +15,7 @@ FetchContent_MakeAvailable(googletest)
enable_testing()
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
target_link_libraries(Metrics_unittest GTest::gtest_main)
target_link_libraries(FImdlp_unittest GTest::gtest_main)
target_compile_options(Metrics_unittest PRIVATE --coverage)

View File

@@ -1,8 +1,9 @@
#include "gtest/gtest.h"
#include "../Metrics.h"
#include "../CPPFImdlp.h"
#include "ArffFiles.h"
#include <fstream>
#include <iostream>
#include "ArffFiles.h"
#define EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) EXPECT_THROW( \
try { \
stmt; \
@@ -17,11 +18,23 @@ namespace mdlp {
public:
precision_t precision = 0.000001;
TestFImdlp(): CPPFImdlp() {}
string data_path;
// Prepares the state used by the tests: loads a small in-memory sample,
// fits the discretizer on it and resolves the directory holding the
// .arff datasets.
// NOTE(review): X, y and fit() are presumably inherited from CPPFImdlp
// (the constructor above delegates to CPPFImdlp()) -- confirm in the header.
void SetUp()
{
// 20 feature values ...
X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
// ... and their 20 labels: fifteen of class 1 followed by five of class 2.
y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
fit(X, y);
// Pick whichever relative dataset folder is reachable from the current
// working directory (see set_data_path()).
data_path = set_data_path();
}
// Returns the relative directory that holds the .arff datasets.
// Tries to open "../datasets/iris.arff"; if that succeeds "../datasets/" is
// returned, otherwise the function falls back to "../../tests/datasets/".
string set_data_path()
{
    const string local_path = "../datasets/";
    const string fallback_path = "../../tests/datasets/";
    // The probe stream is closed by its destructor when the function returns.
    ifstream probe(local_path + "iris.arff");
    return probe.is_open() ? local_path : fallback_path;
}
void checkSortedVector()
{
@@ -37,6 +50,7 @@ namespace mdlp {
{
EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) {
cout << "(" << computed[i] << ", " << expected[i] << ") ";
EXPECT_NEAR(computed[i], expected[i], precision);
}
}
@@ -64,7 +78,7 @@ namespace mdlp {
void test_dataset(CPPFImdlp& test, string filename, vector<cutPoints_t>& expected, int depths[])
{
ArffFiles file;
file.load("../datasets/" + filename + ".arff", true);
file.load(data_path + filename + ".arff", true);
vector<samples_t>& X = file.getX();
labels_t& y = file.getY();
auto attributes = file.getAttributes();
@@ -73,10 +87,8 @@ namespace mdlp {
EXPECT_EQ(test.get_depth(), depths[feature]);
auto computed = test.getCutPoints();
cout << "Feature " << feature << ": ";
for (auto item : computed)
cout << item << " ";
cout << endl;
checkCutPoints(computed, expected[feature]);
cout << endl;
}
}
};

View File

@@ -114,7 +114,7 @@
@attribute 'Ca' real
@attribute 'Ba' real
@attribute 'Fe' real
@attribute 'Type' { 'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}
@attribute 'Type' {'build wind float', 'build wind non-float', 'vehic wind float', 'vehic wind non-float', containers, tableware, headlamps}
@data
1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0,0,'build wind float'
1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0,0,'vehic wind float'