mirror of
https://github.com/rmontanana/mdlp.git
synced 2025-08-15 15:35:55 +00:00
Fix mistake in class type of ArffFiles
Add some type casting to CPPFImdlp. Add additional path to datasets in tests. Fix some smells in sample. Join CMakeLists.
This commit is contained in:
8
.vscode/launch.json
vendored
8
.vscode/launch.json
vendored
@@ -5,12 +5,14 @@
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "(lldb) Launch",
|
||||
"type": "cppdbg",
|
||||
"name": "lldb samplex",
|
||||
"type": "lldb",
|
||||
"request": "launch",
|
||||
"targetArchitecture": "arm64",
|
||||
"program": "${workspaceRoot}/sample/build/sample",
|
||||
"args": [
|
||||
"mfeat-factors"
|
||||
"-f",
|
||||
"glass"
|
||||
],
|
||||
"stopAtEntry": false,
|
||||
"cwd": "${workspaceRoot}/sample/build/",
|
||||
|
29
.vscode/tasks.json
vendored
29
.vscode/tasks.json
vendored
@@ -1,29 +0,0 @@
|
||||
{
|
||||
"tasks": [
|
||||
{
|
||||
"type": "cppbuild",
|
||||
"label": "C/C++: clang++ build active file",
|
||||
"command": "/usr/bin/clang++",
|
||||
"args": [
|
||||
"-fcolor-diagnostics",
|
||||
"-fansi-escape-codes",
|
||||
"-g",
|
||||
"${file}",
|
||||
"-o",
|
||||
"${fileDirname}/${fileBasenameNoExtension}"
|
||||
],
|
||||
"options": {
|
||||
"cwd": "${fileDirname}"
|
||||
},
|
||||
"problemMatcher": [
|
||||
"$gcc"
|
||||
],
|
||||
"group": {
|
||||
"kind": "build",
|
||||
"isDefault": true
|
||||
},
|
||||
"detail": "Task generated by Debugger."
|
||||
}
|
||||
],
|
||||
"version": "2.0.0"
|
||||
}
|
@@ -3,5 +3,7 @@ project(mdlp)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
|
||||
add_library(mdlp CPPFImdlp.cpp Metrics.cpp)
|
||||
add_library(mdlp CPPFImdlp.cpp Metrics.cpp sample/sample.cpp)
|
||||
add_subdirectory(sample)
|
||||
add_subdirectory(tests)
|
||||
|
||||
|
@@ -3,7 +3,6 @@
|
||||
#include <set>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
#include "CPPFImdlp.h"
|
||||
#include "Metrics.h"
|
||||
namespace mdlp {
|
||||
@@ -21,7 +20,7 @@ namespace mdlp {
|
||||
if (proposed_cuts == 0) {
|
||||
return numeric_limits<size_t>::max();
|
||||
}
|
||||
if (proposed_cuts < 0 || proposed_cuts > X.size()) {
|
||||
if (proposed_cuts < 0 || proposed_cuts > static_cast<float>(X.size())) {
|
||||
throw invalid_argument("wrong proposed num_cuts value");
|
||||
}
|
||||
if (proposed_cuts < 1)
|
||||
@@ -125,8 +124,8 @@ namespace mdlp {
|
||||
// Cutpoints are always on boundaries (definition 2)
|
||||
if (y[indices[idx]] == y[indices[idx - 1]])
|
||||
continue;
|
||||
entropy_left = precision_t(idx - start) / elements * metrics.entropy(start, idx);
|
||||
entropy_right = precision_t(end - idx) / elements * metrics.entropy(idx, end);
|
||||
entropy_left = precision_t(idx - start) / static_cast<float>(elements) * metrics.entropy(start, idx);
|
||||
entropy_right = precision_t(end - idx) / static_cast<float>(elements) * metrics.entropy(idx, end);
|
||||
if (entropy_left + entropy_right < minEntropy) {
|
||||
minEntropy = entropy_left + entropy_right;
|
||||
candidate = idx;
|
||||
@@ -148,8 +147,8 @@ namespace mdlp {
|
||||
ent1 = metrics.entropy(start, cut);
|
||||
ent2 = metrics.entropy(cut, end);
|
||||
ig = metrics.informationGain(start, cut, end);
|
||||
delta = log2(pow(3, precision_t(k)) - 2) -
|
||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2);
|
||||
delta = static_cast<float>(log2(pow(3, precision_t(k)) - 2) -
|
||||
(precision_t(k) * ent - precision_t(k1) * ent1 - precision_t(k2) * ent2));
|
||||
precision_t term = 1 / N * (log2(N - 1) + delta);
|
||||
return ig > term;
|
||||
}
|
||||
|
19
sample/.vscode/launch.json
vendored
19
sample/.vscode/launch.json
vendored
@@ -1,30 +1,21 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Launch sample",
|
||||
"name": "lldb puro",
|
||||
"type": "cppdbg",
|
||||
// "targetArchitecture": "arm64",
|
||||
"request": "launch",
|
||||
"program": "${workspaceRoot}/build/sample",
|
||||
"args": [
|
||||
"-f",
|
||||
"glass"
|
||||
],
|
||||
"setupCommands": [
|
||||
{
|
||||
"description": "Enable pretty-printing for gdb",
|
||||
"text": "-enable-pretty-printing",
|
||||
"ignoreFailures": true
|
||||
}
|
||||
"iris"
|
||||
],
|
||||
"stopAtEntry": false,
|
||||
"cwd": "${workspaceRoot}/build/",
|
||||
"environment": [],
|
||||
"externalConsole": false,
|
||||
"MIMode": "gdb",
|
||||
}
|
||||
"MIMode": "lldb"
|
||||
},
|
||||
]
|
||||
}
|
@@ -1,5 +1,3 @@
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
project(main)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
|
||||
|
@@ -14,39 +14,41 @@ using namespace mdlp;
|
||||
const string PATH = "../../tests/datasets/";
|
||||
|
||||
/* print a description of all supported options */
|
||||
void usage(const char* path)
|
||||
{
|
||||
void usage(const char *path) {
|
||||
/* take only the last portion of the path */
|
||||
const char *basename = strrchr(path, '/');
|
||||
basename = basename ? basename + 1 : path;
|
||||
|
||||
cout << "usage: " << basename << "[OPTION]" << endl;
|
||||
cout << " -h, --help\t\t Print this help and exit." << endl;
|
||||
cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl;
|
||||
cout
|
||||
<< " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}."
|
||||
<< endl;
|
||||
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
|
||||
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
|
||||
cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl;
|
||||
cout
|
||||
<< " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any"
|
||||
<< endl;
|
||||
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
|
||||
}
|
||||
|
||||
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
|
||||
{
|
||||
tuple<string, string, int, int, float> parse_arguments(int argc, char **argv) {
|
||||
string file_name;
|
||||
string path = PATH;
|
||||
int max_depth = numeric_limits<int>::max();
|
||||
int min_length = 3;
|
||||
float max_cutpoints = 0;
|
||||
static struct option long_options[] = {
|
||||
{ "help", no_argument, 0, 'h' },
|
||||
{ "file", required_argument, 0, 'f' },
|
||||
{ "path", required_argument, 0, 'p' },
|
||||
{ "max_depth", required_argument, 0, 'm' },
|
||||
{ "max_cutpoints", required_argument, 0, 'c' },
|
||||
{ "min_length", required_argument, 0, 'n' },
|
||||
{ 0, 0, 0, 0 }
|
||||
{"help", no_argument, nullptr, 'h'},
|
||||
{"file", required_argument, nullptr, 'f'},
|
||||
{"path", required_argument, nullptr, 'p'},
|
||||
{"max_depth", required_argument, nullptr, 'm'},
|
||||
{"max_cutpoints", required_argument, nullptr, 'c'},
|
||||
{"min_length", required_argument, nullptr, 'n'},
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
while (1) {
|
||||
auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0);
|
||||
while (true) {
|
||||
auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, nullptr);
|
||||
if (c == -1)
|
||||
break;
|
||||
switch (c) {
|
||||
@@ -57,13 +59,13 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
|
||||
file_name = optarg;
|
||||
break;
|
||||
case 'm':
|
||||
max_depth = atoi(optarg);
|
||||
max_depth = (int) strtol(optarg, nullptr, 10);
|
||||
break;
|
||||
case 'n':
|
||||
min_length = atoi(optarg);
|
||||
min_length = (int) strtol(optarg, nullptr, 10);
|
||||
break;
|
||||
case 'c':
|
||||
max_cutpoints = atof(optarg);
|
||||
max_cutpoints = strtof(optarg, nullptr);
|
||||
break;
|
||||
case 'p':
|
||||
path = optarg;
|
||||
@@ -84,8 +86,8 @@ tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
|
||||
return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
|
||||
}
|
||||
|
||||
void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints)
|
||||
{
|
||||
void process_file(const string &path, const string &file_name, bool class_last, int max_depth, int min_length,
|
||||
float max_cutpoints) {
|
||||
ArffFiles file;
|
||||
|
||||
file.load(path + file_name + ".arff", class_last);
|
||||
@@ -124,12 +126,13 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
|
||||
cout << "Total feature states: " << total + attributes.size() << endl;
|
||||
}
|
||||
|
||||
void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length, float max_cutpoints)
|
||||
{
|
||||
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl;
|
||||
void process_all_files(const map<string, bool> &datasets, const string &path, int max_depth, int min_length,
|
||||
float max_cutpoints) {
|
||||
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << " Max_cutpoints: "
|
||||
<< max_cutpoints << endl << endl;
|
||||
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
|
||||
printf("==================== ==== ==== ========\n");
|
||||
for (auto dataset : datasets) {
|
||||
for (const auto &dataset: datasets) {
|
||||
ArffFiles file;
|
||||
file.load(path + dataset.first + ".arff", dataset.second);
|
||||
auto attributes = file.getAttributes();
|
||||
@@ -150,8 +153,7 @@ void process_all_files(map<string, bool> datasets, string path, int max_depth, i
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int main(int argc, char **argv) {
|
||||
map<string, bool> datasets = {
|
||||
{"glass", true},
|
||||
{"iris", true},
|
||||
|
@@ -2,13 +2,10 @@
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
ArffFiles::ArffFiles()
|
||||
{
|
||||
}
|
||||
ArffFiles::ArffFiles() = default;
|
||||
vector<string> ArffFiles::getLines()
|
||||
{
|
||||
return lines;
|
||||
@@ -37,19 +34,22 @@ vector<int>& ArffFiles::getY()
|
||||
{
|
||||
return y;
|
||||
}
|
||||
void ArffFiles::load(string fileName, bool classLast)
|
||||
void ArffFiles::load(const string fileName, bool classLast)
|
||||
{
|
||||
ifstream file(fileName);
|
||||
if (file.is_open()) {
|
||||
string line, keyword, attribute, type;
|
||||
string line, keyword, attribute, type, type_w;
|
||||
while (getline(file, line)) {
|
||||
if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
|
||||
continue;
|
||||
}
|
||||
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
|
||||
stringstream ss(line);
|
||||
ss >> keyword >> attribute >> type;
|
||||
attributes.push_back({ attribute, type });
|
||||
ss >> keyword >> attribute;
|
||||
type = "";
|
||||
while(ss >> type_w)
|
||||
type += type_w + " ";
|
||||
attributes.emplace_back(attribute, type );
|
||||
continue;
|
||||
}
|
||||
if (line[0] == '@') {
|
||||
@@ -77,7 +77,7 @@ void ArffFiles::generateDataset(bool classLast)
|
||||
{
|
||||
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
|
||||
vector<string> yy = vector<string>(lines.size(), "");
|
||||
int labelIndex = classLast ? attributes.size() : 0;
|
||||
int labelIndex = classLast ? static_cast<int>(attributes.size()) : 0;
|
||||
for (size_t i = 0; i < lines.size(); i++) {
|
||||
stringstream ss(lines[i]);
|
||||
string value;
|
||||
@@ -105,7 +105,7 @@ vector<int> ArffFiles::factorize(const vector<string>& labels_t)
|
||||
yy.reserve(labels_t.size());
|
||||
map<string, int> labelMap;
|
||||
int i = 0;
|
||||
for (string label : labels_t) {
|
||||
for (const string &label : labels_t) {
|
||||
if (labelMap.find(label) == labelMap.end()) {
|
||||
labelMap[label] = i++;
|
||||
}
|
||||
|
@@ -18,10 +18,10 @@ public:
|
||||
unsigned long int getSize();
|
||||
string getClassName();
|
||||
string getClassType();
|
||||
string trim(const string&);
|
||||
static string trim(const string&);
|
||||
vector<vector<float>>& getX();
|
||||
vector<int>& getY();
|
||||
vector<pair<string, string>> getAttributes();
|
||||
vector<int> factorize(const vector<string>& labels_t);
|
||||
static vector<int> factorize(const vector<string>& labels_t);
|
||||
};
|
||||
#endif
|
@@ -1,8 +1,5 @@
|
||||
cmake_minimum_required(VERSION 3.14)
|
||||
project(FImdlp)
|
||||
|
||||
# GoogleTest requires at least C++14
|
||||
set(CMAKE_CXX_STANDARD 14)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
include(FetchContent)
|
||||
|
||||
include_directories(${GTEST_INCLUDE_DIRS})
|
||||
@@ -18,7 +15,7 @@ FetchContent_MakeAvailable(googletest)
|
||||
enable_testing()
|
||||
|
||||
add_executable(Metrics_unittest ../Metrics.cpp Metrics_unittest.cpp)
|
||||
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ../ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
|
||||
add_executable(FImdlp_unittest ../CPPFImdlp.cpp ArffFiles.cpp ../Metrics.cpp FImdlp_unittest.cpp)
|
||||
target_link_libraries(Metrics_unittest GTest::gtest_main)
|
||||
target_link_libraries(FImdlp_unittest GTest::gtest_main)
|
||||
target_compile_options(Metrics_unittest PRIVATE --coverage)
|
||||
|
@@ -1,8 +1,9 @@
|
||||
#include "gtest/gtest.h"
|
||||
#include "../Metrics.h"
|
||||
#include "../CPPFImdlp.h"
|
||||
#include "ArffFiles.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include "ArffFiles.h"
|
||||
#define EXPECT_THROW_WITH_MESSAGE(stmt, etype, whatstring) EXPECT_THROW( \
|
||||
try { \
|
||||
stmt; \
|
||||
@@ -17,11 +18,23 @@ namespace mdlp {
|
||||
public:
|
||||
precision_t precision = 0.000001;
|
||||
TestFImdlp(): CPPFImdlp() {}
|
||||
string data_path;
|
||||
void SetUp()
|
||||
{
|
||||
X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
|
||||
y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
|
||||
fit(X, y);
|
||||
data_path = set_data_path();
|
||||
}
|
||||
string set_data_path()
|
||||
{
|
||||
string path = "../datasets/";
|
||||
ifstream file(path+"iris.arff");
|
||||
if (file.is_open()) {
|
||||
file.close();
|
||||
return path;
|
||||
}
|
||||
return "../../tests/datasets/";
|
||||
}
|
||||
void checkSortedVector()
|
||||
{
|
||||
@@ -37,6 +50,7 @@ namespace mdlp {
|
||||
{
|
||||
EXPECT_EQ(computed.size(), expected.size());
|
||||
for (unsigned long i = 0; i < computed.size(); i++) {
|
||||
cout << "(" << computed[i] << ", " << expected[i] << ") ";
|
||||
EXPECT_NEAR(computed[i], expected[i], precision);
|
||||
}
|
||||
}
|
||||
@@ -64,7 +78,7 @@ namespace mdlp {
|
||||
void test_dataset(CPPFImdlp& test, string filename, vector<cutPoints_t>& expected, int depths[])
|
||||
{
|
||||
ArffFiles file;
|
||||
file.load("../datasets/" + filename + ".arff", true);
|
||||
file.load(data_path + filename + ".arff", true);
|
||||
vector<samples_t>& X = file.getX();
|
||||
labels_t& y = file.getY();
|
||||
auto attributes = file.getAttributes();
|
||||
@@ -73,10 +87,8 @@ namespace mdlp {
|
||||
EXPECT_EQ(test.get_depth(), depths[feature]);
|
||||
auto computed = test.getCutPoints();
|
||||
cout << "Feature " << feature << ": ";
|
||||
for (auto item : computed)
|
||||
cout << item << " ";
|
||||
cout << endl;
|
||||
checkCutPoints(computed, expected[feature]);
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
Reference in New Issue
Block a user