diff --git a/CPPFImdlp.cpp b/CPPFImdlp.cpp index a2c17ce..7aa0beb 100644 --- a/CPPFImdlp.cpp +++ b/CPPFImdlp.cpp @@ -3,25 +3,42 @@ #include #include #include +#include #include "CPPFImdlp.h" #include "Metrics.h" - namespace mdlp { - CPPFImdlp::CPPFImdlp():depth(0), max_depth(numeric_limits::max()), min_length(3), indices(indices_t()), X(samples_t()), y(labels_t()), - metrics(Metrics(y, indices)) + CPPFImdlp::CPPFImdlp():min_length(3), depth(0), max_depth(numeric_limits::max()), proposed_cuts(0), + indices(indices_t()), X(samples_t()), y(labels_t()), + metrics(Metrics(y, indices)), num_cut_points(numeric_limits::max()) { } - CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_): depth(0), max_depth(max_depth_), min_length(min_length_), indices(indices_t()), X(samples_t()), y(labels_t()), - metrics(Metrics(y, indices)) + CPPFImdlp::CPPFImdlp(size_t min_length_, int max_depth_, float proposed): min_length(min_length_), depth(0), + max_depth(max_depth_), proposed_cuts(proposed), indices(indices_t()), X(samples_t()), y(labels_t()), + metrics(Metrics(y, indices)), num_cut_points(numeric_limits::max()) { } CPPFImdlp::~CPPFImdlp() = default; - CPPFImdlp& CPPFImdlp::fit(samples_t& X_, labels_t& y_) + size_t CPPFImdlp::compute_max_num_cut_points() + { + // Set the actual maximum number of cut points as a number or as a percentage of the number of samples + if (proposed_cuts == 0) { + return numeric_limits::max(); + } + if (proposed_cuts < 0 || proposed_cuts > X.size()) { + throw invalid_argument("wrong proposed num_cuts value"); + } + if (proposed_cuts < 1) + return (int)round(X.size() * proposed_cuts); + return (int)proposed_cuts; + } + + void CPPFImdlp::fit(samples_t& X_, labels_t& y_) { X = X_; y = y_; + num_cut_points = compute_max_num_cut_points(); depth = 0; cutPoints.clear(); if (X.size() != y.size()) { @@ -39,7 +56,6 @@ namespace mdlp { indices = sortIndices(X_, y_); metrics.setData(y, indices); computeCutPoints(0, X.size(), 1); - return *this; } pair CPPFImdlp::valueCutPoint(size_t start, size_t cut, size_t end) @@ -75,6 +91,8 @@ namespace mdlp { { size_t cut; pair result; + if (cutPoints.size() == num_cut_points) + return; // Check if the interval length and the depth are Ok if (end - start < min_length || depth_ > max_depth) return; @@ -158,15 +176,8 @@ namespace mdlp { cutPoints_t CPPFImdlp::getCutPoints() { - // Remove duplicates and sort - cutPoints_t output(cutPoints.size()); - set s; - unsigned size = cutPoints.size(); - for (unsigned i = 0; i < size; i++) - s.insert(cutPoints[i]); - output.assign(s.begin(), s.end()); - sort(output.begin(), output.end()); - return output; + sort(cutPoints.begin(), cutPoints.end()); + return cutPoints; } int CPPFImdlp::get_depth() { diff --git a/CPPFImdlp.h b/CPPFImdlp.h index ac57eb2..a24ffaf 100644 --- a/CPPFImdlp.h +++ b/CPPFImdlp.h @@ -9,22 +9,25 @@ namespace mdlp { protected: size_t min_length; int depth, max_depth; + float proposed_cuts; + indices_t indices; samples_t X; labels_t y; - indices_t indices; Metrics metrics; cutPoints_t cutPoints; + size_t num_cut_points; static indices_t sortIndices(samples_t&, labels_t&); void computeCutPoints(size_t, size_t, int); bool mdlp(size_t, size_t, size_t); size_t getCandidate(size_t, size_t); + size_t compute_max_num_cut_points(); pair valueCutPoint(size_t, size_t, size_t); public: CPPFImdlp(); - CPPFImdlp(size_t, int); + CPPFImdlp(size_t, int, float); ~CPPFImdlp(); - CPPFImdlp& fit(samples_t&, labels_t&); + void fit(samples_t&, labels_t&); cutPoints_t getCutPoints(); int get_depth(); inline string version() { return "1.1.1"; }; diff --git a/sample/sample.cpp b/sample/sample.cpp index d8c7b3f..926d464 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -25,25 +25,28 @@ void usage(const char* path) cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; + cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl; cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl; } -tuple parse_arguments(int argc, char** argv) +tuple parse_arguments(int argc, char** argv) { string file_name; string path = PATH; int max_depth = numeric_limits::max(); int min_length = 3; + float max_cutpoints = 0; static struct option long_options[] = { { "help", no_argument, 0, 'h' }, { "file", required_argument, 0, 'f' }, { "path", required_argument, 0, 'p' }, { "max_depth", required_argument, 0, 'm' }, + { "max_cutpoints", required_argument, 0, 'c' }, { "min_length", required_argument, 0, 'n' }, { 0, 0, 0, 0 } }; while (1) { - auto c = getopt_long(argc, argv, "hf:p:m:n:", long_options, 0); + auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0); if (c == -1) break; switch (c) { @@ -59,6 +62,9 @@ tuple parse_arguments(int argc, char** argv) case 'n': min_length = atoi(optarg); break; + case 'c': + max_cutpoints = atof(optarg); + break; case 'p': path = optarg; if (path.back() != '/') @@ -75,10 +81,10 @@ tuple parse_arguments(int argc, char** argv) usage(argv[0]); exit(1); } - return make_tuple(file_name, path, max_depth, min_length); + return make_tuple(file_name, path, max_depth, min_length, max_cutpoints); } -void process_file(string path, string file_name, bool class_last, int max_depth, int min_length) +void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints) { ArffFiles file; @@ -101,7 +107,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth, } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); auto total = 0; for (auto i = 0; i < attributes.size(); i++) { auto min_max = minmax_element(X[i].begin(), X[i].end()); @@ -118,7 +124,7 @@ void process_file(string path, string file_name, bool class_last, int max_depth, cout << "Total feature states: " << total + attributes.size() << endl; } -void process_all_files(map datasets, string path, int max_depth, int min_length) +void process_all_files(map datasets, string path, int max_depth, int min_length, float max_cutpoints) { cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); @@ -132,7 +138,7 @@ void process_all_files(map datasets, string path, int max_depth, i size_t timing = 0; int cut_points = 0; for (auto i = 0; i < attributes.size(); i++) { - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); test.fit(X[i], y); std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); @@ -157,19 +163,29 @@ int main(int argc, char** argv) }; string file_name, path; int max_depth, min_length; - tie(file_name, path, max_depth, min_length) = parse_arguments(argc, argv); + float max_cutpoints; + tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); if (datasets.find(file_name) == datasets.end() && file_name != "all") { cout << "Invalid file name: " << file_name << endl; usage(argv[0]); exit(1); } if (file_name == "all") - process_all_files(datasets, path, max_depth, min_length); + process_all_files(datasets, path, max_depth, min_length, max_cutpoints); else { - process_file(path, file_name, datasets[file_name], max_depth, min_length); + process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints); cout << "File name: " << file_name << endl; cout << "Max depth: " << max_depth << endl; cout << "Min length: " << min_length << endl; } + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + samples_t X = { 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.8, 4.8, 4.9, 4.95, 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; + labels_t y = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2 }; + test.fit(X, y); + vector computed = test.getCutPoints(); + cout << "Computed cut points: " << endl; + for (auto item : computed) { + cout << item << endl; + } return 0; } \ No newline at end of file diff --git a/tests/ArffFiles.cpp b/tests/ArffFiles.cpp index 470f5fa..4fbca78 100644 --- a/tests/ArffFiles.cpp +++ b/tests/ArffFiles.cpp @@ -40,11 +40,10 @@ vector& ArffFiles::getY() void ArffFiles::load(string fileName, bool classLast) { ifstream file(fileName); - string keyword, attribute, type; if (file.is_open()) { - string line; + string line, keyword, attribute, type; while (getline(file, line)) { - if (line[0] == '%' || line.empty() || line == "\r" || line == " ") { + if (line.empty() || line[0] == '%' || line == "\r" || line == " ") { continue; } if (line.find("@attribute") != string::npos || line.find("@ATTRIBUTE") != string::npos) { @@ -79,7 +78,7 @@ void ArffFiles::generateDataset(bool classLast) X = vector>(attributes.size(), vector(lines.size())); vector yy = vector(lines.size(), ""); int labelIndex = classLast ? attributes.size() : 0; - for (int i = 0; i < lines.size(); i++) { + for (size_t i = 0; i < lines.size(); i++) { stringstream ss(lines[i]); string value; int pos = 0, xIndex = 0; diff --git a/tests/FImdlp_unittest.cpp b/tests/FImdlp_unittest.cpp index e76609a..059c592 100644 --- a/tests/FImdlp_unittest.cpp +++ b/tests/FImdlp_unittest.cpp @@ -86,13 +86,22 @@ namespace mdlp { } TEST_F(TestFImdlp, FitErrorMinLengtMaxDepth) { - auto testLength = CPPFImdlp(2, 10); - auto testDepth = CPPFImdlp(3, 0); + auto testLength = CPPFImdlp(2, 10, 0); + auto testDepth = CPPFImdlp(3, 0, 0); X = { 1, 2, 3 }; y = { 1, 2, 3 }; EXPECT_THROW(testLength.fit(X, y), invalid_argument); EXPECT_THROW(testDepth.fit(X, y), invalid_argument); } + TEST_F(TestFImdlp, FitErrorMaxCutPoints) + { + auto testmin = CPPFImdlp(2, 10, -1); + auto testmax = CPPFImdlp(3, 0, 200); + X = { 1, 2, 3 }; + y = { 1, 2, 3 }; + EXPECT_THROW(testmin.fit(X, y), invalid_argument); + EXPECT_THROW(testmax.fit(X, y), invalid_argument); + } TEST_F(TestFImdlp, SortIndices) { X = { 5.7, 5.3, 5.2, 5.1, 5.0, 5.6, 5.1, 6.0, 5.1, 5.9 }; @@ -139,10 +148,8 @@ namespace mdlp { TEST_F(TestFImdlp, TestArtificialDataset) { fit(X, y); - computeCutPoints(0, 20, 1); cutPoints_t expected = { 5.05 }; vector computed = getCutPoints(); - computed = getCutPoints(); int expectedSize = expected.size(); EXPECT_EQ(computed.size(), expected.size()); for (unsigned long i = 0; i < computed.size(); i++) { @@ -194,7 +201,7 @@ namespace mdlp { TEST_F(TestFImdlp, MaxDepth) { // Set max_depth to 1 - auto test = CPPFImdlp(3, 1); + auto test = CPPFImdlp(3, 1, 0); vector expected = { { 5.45 }, { 3.35 }, @@ -206,7 +213,7 @@ namespace mdlp { } TEST_F(TestFImdlp, MinLength) { - auto test = CPPFImdlp(75, 100); + auto test = CPPFImdlp(75, 100, 0); // Set min_length to 75 vector expected = { { 5.45, 5.75 }, @@ -220,7 +227,33 @@ namespace mdlp { TEST_F(TestFImdlp, MinLengthMaxDepth) { // Set min_length to 75 - auto test = CPPFImdlp(75, 2); + auto test = CPPFImdlp(75, 2, 0); + vector expected = { + { 5.45, 5.75 }, + { 2.85, 3.35 }, + { 2.45, 4.75 }, + { 0.8, 1.75 } + }; + int depths[] = { 2, 2, 2, 2 }; + test_dataset(test, "iris", expected, depths); + } + TEST_F(TestFImdlp, MaxCutPointsInteger) + { + // Set min_length to 75 + auto test = CPPFImdlp(75, 2, 1); + vector expected = { + { 5.45 }, + { 3.35 }, + { 2.45 }, + { 0.8} + }; + int depths[] = { 1, 1, 1, 1 }; + test_dataset(test, "iris", expected, depths); + } + TEST_F(TestFImdlp, MaxCutPointsFloat) + { + // Set min_length to 75 + auto test = CPPFImdlp(75, 2, 0.2); vector expected = { { 5.45, 5.75 }, { 2.85, 3.35 }, diff --git a/tests/Metrics_unittest.cpp b/tests/Metrics_unittest.cpp index c6e3e56..e7d6f8b 100644 --- a/tests/Metrics_unittest.cpp +++ b/tests/Metrics_unittest.cpp @@ -36,6 +36,7 @@ namespace mdlp { TEST_F(TestMetrics, InformationGain) { ASSERT_NEAR(1, informationGain(0, 5, 10), precision); + ASSERT_NEAR(1, informationGain(0, 5, 10), precision); // For cache y = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 1 }; setData(y, indices); ASSERT_NEAR(0.108032, informationGain(0, 5, 10), precision); diff --git a/tests/test b/tests/test index ca7061f..4180150 100755 --- a/tests/test +++ b/tests/test @@ -13,4 +13,5 @@ rm -fr gcovr-report/* 2>/dev/null #lcov --capture --directory ./ --output-file lcoverage/main_coverage.info #lcov --remove lcoverage/main_coverage.info 'v1/*' '/Applications/*' '*/tests/*' --output-file lcoverage/main_coverage.info -q #lcov --list lcoverage/main_coverage.info -gcovr --root .. --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=gcovr-report/coverage.xml +cd .. +gcovr --gcov-filter "CPPFImdlp.cpp" --gcov-filter "Metrics.cpp" --txt --sonarqube=tests/gcovr-report/coverage.xml