Add max_cutpoints Hyperparameter

This commit is contained in:
2023-03-13 01:17:04 +01:00
parent ed784736ca
commit ffb8df4d1c
7 changed files with 105 additions and 41 deletions

View File

@@ -3,25 +3,42 @@
#include <set> #include <set>
#include <cmath> #include <cmath>
#include <limits> #include <limits>
#include <cmath>
#include "CPPFImdlp.h" #include "CPPFImdlp.h"
#include "Metrics.h" #include "Metrics.h"
namespace mdlp { namespace mdlp {
CPPFImdlp::CPPFImdlp():depth(0), max_depth(numeric_limits<int>::max()), min_length(3), indices(indices_t()), X(samples_t()), y(labels_t()), CPPFImdlp::CPPFImdlp():min_length(3), depth(0), max_depth(numeric_limits<int>::max()), proposed_cuts(0),
metrics(Metrics(y, indices)) indices(indices_t()), X(samples_t()), y(labels_t()),
metrics(Metrics(y, indices)), num_cut_points(numeric_limits<size_t>::max())
{ {
} }
CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_): depth(0), max_depth(max_depth_), min_length(min_length_), indices(indices_t()), X(samples_t()), y(labels_t()), CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_), depth(0),
metrics(Metrics(y, indices)) max_depth(max_depth_), proposed_cuts(proposed), indices(indices_t()), X(samples_t()), y(labels_t()),
metrics(Metrics(y, indices)), num_cut_points(numeric_limits<size_t>::max())
{ {
} }
CPPFImdlp::~CPPFImdlp() = default; CPPFImdlp::~CPPFImdlp() = default;
CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) size_t CPPFImdlp::compute_max_num_cut_points()
{
// Set the actual maximum number of cut points as a number or as a percentage of the number of samples
if (proposed_cuts == 0) {
return numeric_limits<size_t>::max();
}
if (proposed_cuts < 0 || proposed_cuts > X.size()) {
throw invalid_argument("wrong proposed num_cuts value");
}
if (proposed_cuts < 1)
return (int)round(X.size() * proposed_cuts);
return (int)proposed_cuts;
}
void CPPFImdlp::fit(samples_t& X_, labels_t& y_)
{ {
X = X_; X = X_;
y = y_; y = y_;
num_cut_points = compute_max_num_cut_points();
depth = 0; depth = 0;
cutPoints.clear(); cutPoints.clear();
if (X.size() != y.size()) { if (X.size() != y.size()) {
@@ -39,7 +56,6 @@ namespace mdlp {
indices = sortIndices(X_, y_); indices = sortIndices(X_, y_);
metrics.setData(y, indices); metrics.setData(y, indices);
computeCutPoints(0, X.size(), 1); computeCutPoints(0, X.size(), 1);
return *this;
} }
pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) pair<precision_t, size_t> CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end)
@@ -75,6 +91,8 @@ namespace mdlp {
{ {
size_t cut; size_t cut;
pair<precision_t, size_t> result; pair<precision_t, size_t> result;
if (cutPoints.size() == num_cut_points)
return;
// Check if the interval length and the depth are Ok // Check if the interval length and the depth are Ok
if (end - start < min_length || depth_ > max_depth) if (end - start < min_length || depth_ > max_depth)
return; return;
@@ -158,15 +176,8 @@ namespace mdlp {
cutPoints_t CPPFImdlp::getCutPoints() cutPoints_t CPPFImdlp::getCutPoints()
{ {
// Remove duplicates and sort sort(cutPoints.begin(), cutPoints.end());
cutPoints_t output(cutPoints.size()); return cutPoints;
set<precision_t> s;
unsigned size = cutPoints.size();
for (unsigned i = 0; i < size; i++)
s.insert(cutPoints[i]);
output.assign(s.begin(), s.end());
sort(output.begin(), output.end());
return output;
} }
int CPPFImdlp::get_depth() int CPPFImdlp::get_depth()
{ {

View File

@@ -9,22 +9,25 @@ namespace mdlp {
protected: protected:
size_t min_length; size_t min_length;
int depth, max_depth; int depth, max_depth;
float proposed_cuts;
indices_t indices;
samples_t X; samples_t X;
labels_t y; labels_t y;
indices_t indices;
Metrics metrics; Metrics metrics;
cutPoints_t cutPoints; cutPoints_t cutPoints;
size_t num_cut_points;
static indices_t sortIndices(samples_t&, labels_t&); static indices_t sortIndices(samples_t&, labels_t&);
void computeCutPoints(size_t, size_t, int); void computeCutPoints(size_t, size_t, int);
bool mdlp(size_t, size_t, size_t); bool mdlp(size_t, size_t, size_t);
size_t getCandidate(size_t, size_t); size_t getCandidate(size_t, size_t);
size_t compute_max_num_cut_points();
pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t); pair<precision_t, size_t> valueCutPoint(size_t, size_t, size_t);
public: public:
CPPFImdlp(); CPPFImdlp();
CPPFImdlp(size_t, int); CPPFImdlp(size_t, int, float);
~CPPFImdlp(); ~CPPFImdlp();
CPPFImdlp& fit(samples_t&, labels_t&); void fit(samples_t&, labels_t&);
cutPoints_t getCutPoints(); cutPoints_t getCutPoints();
int get_depth(); int get_depth();
inline string version() { return "1.1.1"; }; inline string version() { return "1.1.1"; };

View File

@@ -25,25 +25,28 @@ void usage(const char* path)
cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl;
cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl;
cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl; cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
} }
tuple<string, string, int, int> parse_arguments(int argc, char** argv) tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
{ {
string file_name; string file_name;
string path = PATH; string path = PATH;
int max_depth = numeric_limits<int>::max(); int max_depth = numeric_limits<int>::max();
int min_length = 3; int min_length = 3;
float max_cutpoints = 0;
static struct option long_options[] = { static struct option long_options[] = {
{ "help", no_argument, 0, 'h' }, { "help", no_argument, 0, 'h' },
{ "file", required_argument, 0, 'f' }, { "file", required_argument, 0, 'f' },
{ "path", required_argument, 0, 'p' }, { "path", required_argument, 0, 'p' },
{ "max_depth", required_argument, 0, 'm' }, { "max_depth", required_argument, 0, 'm' },
{ "max_cutpoints", required_argument, 0, 'c' },
{ "min_length", required_argument, 0, 'n' }, { "min_length", required_argument, 0, 'n' },
{ 0, 0, 0, 0 } { 0, 0, 0, 0 }
}; };
while (1) { while (1) {
auto c = getopt_long(argc, argv, "hf:p:m:n:", long_options, 0); auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0);
if (c == -1) if (c == -1)
break; break;
switch (c) { switch (c) {
@@ -59,6 +62,9 @@ tuple<string, string, int, int> parse_arguments(int argc, char** argv)
case 'n': case 'n':
min_length = atoi(optarg); min_length = atoi(optarg);
break; break;
case 'c':
max_cutpoints = atof(optarg);
break;
case 'p': case 'p':
path = optarg; path = optarg;
if (path.back() != '/') if (path.back() != '/')
@@ -75,10 +81,10 @@ tuple<string, string, int, int> parse_arguments(int argc, char** argv)
usage(argv[0]); usage(argv[0]);
exit(1); exit(1);
} }
return make_tuple(file_name, path, max_depth, min_length); return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
} }
void process_file(string path, string file_name, bool class_last, int max_depth, int min_length) void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints)
{ {
ArffFiles file; ArffFiles file;
@@ -101,7 +107,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
} }
cout << y[i] << endl; cout << y[i] << endl;
} }
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth); mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
auto total = 0; auto total = 0;
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
auto min_max = minmax_element(X[i].begin(), X[i].end()); auto min_max = minmax_element(X[i].begin(), X[i].end());
@@ -118,7 +124,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth,
cout << "Total feature states: " << total + attributes.size() << endl; cout << "Total feature states: " << total + attributes.size() << endl;
} }
void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length) void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length, float max_cutpoints)
{ {
cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl;
printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
@@ -132,7 +138,7 @@ void process_all_files(map<string, bool> datasets, string path, int max_depth, i
size_t timing = 0; size_t timing = 0;
int cut_points = 0; int cut_points = 0;
for (auto i = 0; i < attributes.size(); i++) { for (auto i = 0; i < attributes.size(); i++) {
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth); mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
test.fit(X[i], y); test.fit(X[i], y);
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
@@ -157,19 +163,29 @@ int main(int argc, char** argv)
}; };
string file_name, path; string file_name, path;
int max_depth, min_length; int max_depth, min_length;
tie(file_name, path, max_depth, min_length) = parse_arguments(argc, argv); float max_cutpoints;
tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
if (datasets.find(file_name) == datasets.end() && file_name != "all") { if (datasets.find(file_name) == datasets.end() && file_name != "all") {
cout << "Invalid file name: " << file_name << endl; cout << "Invalid file name: " << file_name << endl;
usage(argv[0]); usage(argv[0]);
exit(1); exit(1);
} }
if (file_name == "all") if (file_name == "all")
process_all_files(datasets, path, max_depth, min_length); process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
else { else {
process_file(path, file_name, datasets[file_name], max_depth, min_length); process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
cout << "File name: " << file_name << endl; cout << "File name: " << file_name << endl;
cout << "Max depth: " << max_depth << endl; cout << "Max depth: " << max_depth << endl;
cout << "Min length: " << min_length << endl; cout << "Min length: " << min_length << endl;
} }
mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
samples_t X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
labels_t y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 };
test.fit(X, y);
vector<precision_t> computed = test.getCutPoints();
cout << "Computed cut points: " << endl;
for (auto item : computed) {
cout << item << endl;
}
return 0; return 0;
} }

View File

@@ -40,11 +40,10 @@ vector<int>& ArffFiles::getY()
void ArffFiles::load(string fileName, bool classLast) void ArffFiles::load(string fileName, bool classLast)
{ {
ifstream file(fileName); ifstream file(fileName);
string keyword, attribute, type;
if (file.is_open()) { if (file.is_open()) {
string line; string line, keyword, attribute, type;
while (getline(file, line)) { while (getline(file, line)) {
if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { if (line.empty() || line[0] == '%' || line == "\r" || line == " ") {
continue; continue;
} }
if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) {
@@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast)
X = vector<vector<float>>(attributes.size(), vector<float>(lines.size())); X = vector<vector<float>>(attributes.size(), vector<float>(lines.size()));
vector<string> yy = vector<string>(lines.size(), ""); vector<string> yy = vector<string>(lines.size(), "");
int labelIndex = classLast ? attributes.size() : 0; int labelIndex = classLast ? attributes.size() : 0;
for (int i = 0; i < lines.size(); i++) { for (size_t i = 0; i < lines.size(); i++) {
stringstream ss(lines[i]); stringstream ss(lines[i]);
string value; string value;
int pos = 0, xIndex = 0; int pos = 0, xIndex = 0;

View File

@@ -86,13 +86,22 @@ namespace mdlp {
} }
TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth) TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth)
{ {
auto testLength = CPPFImdlp(2, 10); auto testLength = CPPFImdlp(2, 10, 0);
auto testDepth = CPPFImdlp(3, 0); auto testDepth = CPPFImdlp(3, 0, 0);
X = { 1, 2, 3 }; X = { 1, 2, 3 };
y = { 1, 2, 3 }; y = { 1, 2, 3 };
EXPECT_THROW(testLength.fit(X, y), invalid_argument); EXPECT_THROW(testLength.fit(X, y), invalid_argument);
EXPECT_THROW(testDepth.fit(X, y), invalid_argument); EXPECT_THROW(testDepth.fit(X, y), invalid_argument);
} }
// Verifies that fit() rejects an out-of-range max_cutpoints hyperparameter.
// min_length (3) and max_depth (10) are the values that are NOT the ones
// under test in FitErrorMinLengtMaxDepth, so here only the proposed number
// of cut points can be the cause of the invalid_argument exception.
TEST_F(TestFImdlp, FitErrorMaxCutPoints)
{
    // A negative proposal is neither a valid count nor a valid fraction.
    auto testmin = CPPFImdlp(3, 10, -1);
    // 200 is larger than the number of samples (3) fitted below.
    auto testmax = CPPFImdlp(3, 10, 200);
    X = { 1, 2, 3 };
    y = { 1, 2, 3 };
    EXPECT_THROW(testmin.fit(X, y), invalid_argument);
    EXPECT_THROW(testmax.fit(X, y), invalid_argument);
}
TEST_F(TestFImdlp, SortIndices) TEST_F(TestFImdlp, SortIndices)
{ {
X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 };
@@ -139,10 +148,8 @@ namespace mdlp {
TEST_F(TestFImdlp, TestArtificialDataset) TEST_F(TestFImdlp, TestArtificialDataset)
{ {
fit(X, y); fit(X, y);
computeCutPoints(0, 20, 1);
cutPoints_t expected = { 5.05 }; cutPoints_t expected = { 5.05 };
vector<precision_t> computed = getCutPoints(); vector<precision_t> computed = getCutPoints();
computed = getCutPoints();
int expectedSize = expected.size(); int expectedSize = expected.size();
EXPECT_EQ(computed.size(), expected.size()); EXPECT_EQ(computed.size(), expected.size());
for (unsigned long i = 0; i < computed.size(); i++) { for (unsigned long i = 0; i < computed.size(); i++) {
@@ -194,7 +201,7 @@ namespace mdlp {
TEST_F(TestFImdlp, MaxDepth) TEST_F(TestFImdlp, MaxDepth)
{ {
// Set max_depth to 1 // Set max_depth to 1
auto test = CPPFImdlp(3, 1); auto test = CPPFImdlp(3, 1, 0);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{ 5.45 }, { 5.45 },
{ 3.35 }, { 3.35 },
@@ -206,7 +213,7 @@ namespace mdlp {
} }
TEST_F(TestFImdlp, MinLength) TEST_F(TestFImdlp, MinLength)
{ {
auto test = CPPFImdlp(75, 100); auto test = CPPFImdlp(75, 100, 0);
// Set min_length to 75 // Set min_length to 75
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{ 5.45, 5.75 }, { 5.45, 5.75 },
@@ -220,7 +227,33 @@ namespace mdlp {
TEST_F(TestFImdlp, MinLengthMaxDepth) TEST_F(TestFImdlp, MinLengthMaxDepth)
{ {
// Set min_length to 75 // Set min_length to 75
auto test = CPPFImdlp(75, 2); auto test = CPPFImdlp(75, 2, 0);
vector<cutPoints_t> expected = {
{ 5.45, 5.75 },
{ 2.85, 3.35 },
{ 2.45, 4.75 },
{ 0.8, 1.75 }
};
int depths[] = { 2, 2, 2, 2 };
test_dataset(test, "iris", expected, depths);
}
// Exercises the integer form of the max_cutpoints hyperparameter: the
// discretizer is built with min_length 75, max_depth 2 and a proposed
// maximum of 1 cut point, then checked against the iris dataset.
TEST_F(TestFImdlp, MaxCutPointsInteger)
{
    // With the limit of one cut point, a depth of 1 is expected for each
    // of the four entries.
    int expected_depths[] = { 1, 1, 1, 1 };
    // Exactly one cut point is expected per entry.
    vector<cutPoints_t> expected_cuts = {
        { 5.45 },
        { 3.35 },
        { 2.45 },
        { 0.8 }
    };
    CPPFImdlp discretizer(75, 2, 1);
    test_dataset(discretizer, "iris", expected_cuts, expected_depths);
}
TEST_F(TestFImdlp, MaxCutPointsFloat)
{
// Set min_length to 75
auto test = CPPFImdlp(75, 2, 0.2);
vector<cutPoints_t> expected = { vector<cutPoints_t> expected = {
{ 5.45, 5.75 }, { 5.45, 5.75 },
{ 2.85, 3.35 }, { 2.85, 3.35 },

View File

@@ -36,6 +36,7 @@ namespace mdlp {
TEST_F(TestMetrics, InformationGain) TEST_F(TestMetrics, InformationGain)
{ {
ASSERT_NEAR(1, informationGain(0, 5, 10), precision); ASSERT_NEAR(1, informationGain(0, 5, 10), precision);
ASSERT_NEAR(1, informationGain(0, 5, 10), precision); // For cache
y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 };
setData(y, indices); setData(y, indices);
ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision); ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision);

View File

@@ -13,4 +13,5 @@ rm -fr gcovr-report/* 2>/dev/null
#lcov --capture --directory ./ --output-file lcoverage/main_coverage.info #lcov --capture --directory ./ --output-file lcoverage/main_coverage.info
#lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q #lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q
#lcov --list lcoverage/main_coverage.info #lcov --list lcoverage/main_coverage.info
gcovr --root .. --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=gcovr-report/coverage.xml cd ..
gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml