From 90428218c2be4093f56e539c2ab2be7557ad2ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Tue, 28 Feb 2023 00:43:37 +0100 Subject: [PATCH] Add dataset to test and add hyperparameters to sample --- sample/sample.cpp | 104 ++++++-- tests/datasets/liver-disorders.arff | 399 ++++++++++++++++++++++++++++ 2 files changed, 487 insertions(+), 16 deletions(-) create mode 100755 tests/datasets/liver-disorders.arff diff --git a/sample/sample.cpp b/sample/sample.cpp index 117caba..93e87a9 100644 --- a/sample/sample.cpp +++ b/sample/sample.cpp @@ -1,31 +1,76 @@ #include #include #include +#include #include "../CPPFImdlp.h" #include "../tests/ArffFiles.h" using namespace std; using namespace mdlp; +/* print a description of all supported options */ +void usage(const char* path) +{ + /* take only the last portion of the path */ + const char* basename = strrchr(path, '/'); + basename = basename ? basename + 1 : path; -int main(int argc, char** argv) + cout << "usage: " << basename << "[OPTION]" << endl; + cout << " -h, --help\t\t Print this help and exit." << endl; + cout << " -f, --file[=FILENAME]\t {mfeat-factors, glass, iris, letter, kdd_JapaneseVowels, liver-disorders, test}." << endl; + cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl; + cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl; +} + +tuple parse_arguments(int argc, char** argv) +{ + string file_name; + int max_depth = numeric_limits::max(); + int min_length = 3; + static struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "file", required_argument, 0, 'f' }, + { "max_depth", required_argument, 0, 'm' }, + { "min_length", required_argument, 0, 'n' }, + { 0, 0, 0, 0 } + }; + while (1) { + auto c = getopt_long(argc, argv, "hf:m:n:", long_options, 0); + if (c == -1) + break; + switch (c) { + case 'h': + usage(argv[0]); + exit(0); + case 'f': + file_name = optarg; + break; + case 'm': + max_depth = atoi(optarg); + break; + case 'n': + min_length = atoi(optarg); + break; + case '?': + usage(argv[0]); + exit(1); + default: + abort(); + } + } + if (file_name.empty()) { + usage(argv[0]); + exit(1); + } + return make_tuple(file_name, max_depth, min_length); +} + +void process_file(string file_name, bool class_last, int max_depth, int min_length) { ArffFiles file; string path = "../../tests/datasets/"; - map datasets = { - {"mfeat-factors", true}, - {"iris", true}, - {"letter", true}, - {"glass", true}, - {"kdd_JapaneseVowels", false}, - {"test", true} - }; - if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { - cout << "Usage: " << argv[0] << " {mfeat-factors, glass, iris, letter, kdd_JapaneseVowels, test}" << endl; - return 1; - } - file.load(path + argv[1] + ".arff", datasets[argv[1]]); + file.load(path + file_name + ".arff", class_last); auto attributes = file.getAttributes(); int items = file.getSize(); cout << "Number of lines: " << items << endl; @@ -44,7 +89,7 @@ int main(int argc, char** argv) } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth); auto total = 0; for (auto i = 0; i < attributes.size(); i++) { auto min_max = minmax_element(X[i].begin(), X[i].end()); @@ -57,6 +102,33 @@ int main(int argc, char** argv) } total += test.getCutPoints().size(); } - cout << "Total cut points: " << total << endl; + cout << "Total cut points ...: " << total << endl; + cout << "Total feature states: " << total + attributes.size() << endl; +} + + +int main(int argc, char** argv) +{ + map datasets = { + {"mfeat-factors", true}, + {"iris", true}, + {"letter", true}, + {"glass", true}, + {"kdd_JapaneseVowels", false}, + {"liver-disorders", true}, + {"test", true} + }; + string file_name; + int max_depth, min_length; + tie(file_name, max_depth, min_length) = parse_arguments(argc, argv); + if (datasets.find(file_name) == datasets.end()) { + cout << "Invalid file name: " << file_name << endl; + usage(argv[0]); + exit(1); + } + process_file(file_name, datasets[file_name], max_depth, min_length); + cout << "File name: " << file_name << endl; + cout << "Max depth: " << max_depth << endl; + cout << "Min length: " << min_length << endl; return 0; } diff --git a/tests/datasets/liver-disorders.arff b/tests/datasets/liver-disorders.arff new file mode 100755 index 0000000..fe967df --- /dev/null +++ b/tests/datasets/liver-disorders.arff @@ -0,0 +1,399 @@ +% 1. Title: BUPA liver disorders +% +% 2. Source information: +% -- Creators: BUPA Medical Research Ltd. +% -- Donor: Richard S. Forsyth +% 8 Grosvenor Avenue +% Mapperley Park +% Nottingham NG3 5DX +% 0602-621676 +% -- Date: 5/15/1990 +% +% 3. Past usage: +% -- None known other than what is shown in the PC/BEAGLE User's Guide +% (written by Richard S. Forsyth). +% +% 4. Relevant information: +% -- The first 5 variables are all blood tests which are thought +% to be sensitive to liver disorders that might arise from +% excessive alcohol consumption. Each line in the bupa.data file +% constitutes the record of a single male individual. +% -- It appears that drinks>5 is some sort of a selector on this database. +% See the PC/BEAGLE User's Guide for more information. +% +% 5. Number of instances: 345 +% +% 6. Number of attributes: 7 overall +% +% 7. Attribute information: +% 1. mcv mean corpuscular volume +% 2. alkphos alkaline phosphotase +% 3. sgpt alamine aminotransferase +% 4. sgot aspartate aminotransferase +% 5. gammagt gamma-glutamyl transpeptidase +% 6. drinks number of half-pint equivalents of alcoholic beverages +% drunk per day +% 7. selector field used to split data into two sets +% +% 8. Missing values: none% +% Information about the dataset +% CLASSTYPE: nominal +% CLASSINDEX: last +% + +@relation liver-disorders + +@attribute mcv INTEGER +@attribute alkphos INTEGER +@attribute sgpt INTEGER +@attribute sgot INTEGER +@attribute gammagt INTEGER +@attribute drinks REAL +@attribute selector {1,2} + +@data +85,92,45,27,31,0.0,1 +85,64,59,32,23,0.0,2 +86,54,33,16,54,0.0,2 +91,78,34,24,36,0.0,2 +87,70,12,28,10,0.0,2 +98,55,13,17,17,0.0,2 +88,62,20,17,9,0.5,1 +88,67,21,11,11,0.5,1 +92,54,22,20,7,0.5,1 +90,60,25,19,5,0.5,1 +89,52,13,24,15,0.5,1 +82,62,17,17,15,0.5,1 +90,64,61,32,13,0.5,1 +86,77,25,19,18,0.5,1 +96,67,29,20,11,0.5,1 +91,78,20,31,18,0.5,1 +89,67,23,16,10,0.5,1 +89,79,17,17,16,0.5,1 +91,107,20,20,56,0.5,1 +94,116,11,33,11,0.5,1 +92,59,35,13,19,0.5,1 +93,23,35,20,20,0.5,1 +90,60,23,27,5,0.5,1 +96,68,18,19,19,0.5,1 +84,80,47,33,97,0.5,1 +92,70,24,13,26,0.5,1 +90,47,28,15,18,0.5,1 +88,66,20,21,10,0.5,1 +91,102,17,13,19,0.5,1 +87,41,31,19,16,0.5,1 +86,79,28,16,17,0.5,1 +91,57,31,23,42,0.5,1 +93,77,32,18,29,0.5,1 +88,96,28,21,40,0.5,1 +94,65,22,18,11,0.5,1 +91,72,155,68,82,0.5,2 +85,54,47,33,22,0.5,2 +79,39,14,19,9,0.5,2 +85,85,25,26,30,0.5,2 +89,63,24,20,38,0.5,2 +84,92,68,37,44,0.5,2 +89,68,26,39,42,0.5,2 +89,101,18,25,13,0.5,2 +86,84,18,14,16,0.5,2 +85,65,25,14,18,0.5,2 +88,61,19,21,13,0.5,2 +92,56,14,16,10,0.5,2 +95,50,29,25,50,0.5,2 +91,75,24,22,11,0.5,2 +83,40,29,25,38,0.5,2 +89,74,19,23,16,0.5,2 +85,64,24,22,11,0.5,2 +92,57,64,36,90,0.5,2 +94,48,11,23,43,0.5,2 +87,52,21,19,30,0.5,2 +85,65,23,29,15,0.5,2 +84,82,21,21,19,0.5,2 +88,49,20,22,19,0.5,2 +96,67,26,26,36,0.5,2 +90,63,24,24,24,0.5,2 +90,45,33,34,27,0.5,2 +90,72,14,15,18,0.5,2 +91,55,4,8,13,0.5,2 +91,52,15,22,11,0.5,2 +87,71,32,19,27,1.0,1 +89,77,26,20,19,1.0,1 +89,67,5,17,14,1.0,2 +85,51,26,24,23,1.0,2 +103,75,19,30,13,1.0,2 +90,63,16,21,14,1.0,2 +90,63,29,23,57,2.0,1 +90,67,35,19,35,2.0,1 +87,66,27,22,9,2.0,1 +90,73,34,21,22,2.0,1 +86,54,20,21,16,2.0,1 +90,80,19,14,42,2.0,1 +87,90,43,28,156,2.0,2 +96,72,28,19,30,2.0,2 +91,55,9,25,16,2.0,2 +95,78,27,25,30,2.0,2 +92,101,34,30,64,2.0,2 +89,51,41,22,48,2.0,2 +91,99,42,33,16,2.0,2 +94,58,21,18,26,2.0,2 +92,60,30,27,297,2.0,2 +94,58,21,18,26,2.0,2 +88,47,33,26,29,2.0,2 +92,65,17,25,9,2.0,2 +92,79,22,20,11,3.0,1 +84,83,20,25,7,3.0,1 +88,68,27,21,26,3.0,1 +86,48,20,20,6,3.0,1 +99,69,45,32,30,3.0,1 +88,66,23,12,15,3.0,1 +89,62,42,30,20,3.0,1 +90,51,23,17,27,3.0,1 +81,61,32,37,53,3.0,2 +89,89,23,18,104,3.0,2 +89,65,26,18,36,3.0,2 +92,75,26,26,24,3.0,2 +85,59,25,20,25,3.0,2 +92,61,18,13,81,3.0,2 +89,63,22,27,10,4.0,1 +90,84,18,23,13,4.0,1 +88,95,25,19,14,4.0,1 +89,35,27,29,17,4.0,1 +91,80,37,23,27,4.0,1 +91,109,33,15,18,4.0,1 +91,65,17,5,7,4.0,1 +88,107,29,20,50,4.0,2 +87,76,22,55,9,4.0,2 +87,86,28,23,21,4.0,2 +87,42,26,23,17,4.0,2 +88,80,24,25,17,4.0,2 +90,96,34,49,169,4.0,2 +86,67,11,15,8,4.0,2 +92,40,19,20,21,4.0,2 +85,60,17,21,14,4.0,2 +89,90,15,17,25,4.0,2 +91,57,15,16,16,4.0,2 +96,55,48,39,42,4.0,2 +79,101,17,27,23,4.0,2 +90,134,14,20,14,4.0,2 +89,76,14,21,24,4.0,2 +88,93,29,27,31,4.0,2 +90,67,10,16,16,4.0,2 +92,73,24,21,48,4.0,2 +91,55,28,28,82,4.0,2 +83,45,19,21,13,4.0,2 +90,74,19,14,22,4.0,2 +92,66,21,16,33,5.0,1 +93,63,26,18,18,5.0,1 +86,78,47,39,107,5.0,2 +97,44,113,45,150,5.0,2 +87,59,15,19,12,5.0,2 +86,44,21,11,15,5.0,2 +87,64,16,20,24,5.0,2 +92,57,21,23,22,5.0,2 +90,70,25,23,112,5.0,2 +99,59,17,19,11,5.0,2 +92,80,10,26,20,6.0,1 +95,60,26,22,28,6.0,1 +91,63,25,26,15,6.0,1 +92,62,37,21,36,6.0,1 +95,50,13,14,15,6.0,1 +90,76,37,19,50,6.0,1 +96,70,70,26,36,6.0,1 +95,62,64,42,76,6.0,1 +92,62,20,23,20,6.0,1 +91,63,25,26,15,6.0,1 +82,56,67,38,92,6.0,2 +92,82,27,24,37,6.0,2 +90,63,12,26,21,6.0,2 +88,37,9,15,16,6.0,2 +100,60,29,23,76,6.0,2 +98,43,35,23,69,6.0,2 +91,74,87,50,67,6.0,2 +92,87,57,25,44,6.0,2 +93,99,36,34,48,6.0,2 +90,72,17,19,19,6.0,2 +97,93,21,20,68,6.0,2 +93,50,18,25,17,6.0,2 +90,57,20,26,33,6.0,2 +92,76,31,28,41,6.0,2 +88,55,19,17,14,6.0,2 +89,63,24,29,29,6.0,2 +92,79,70,32,84,7.0,1 +92,93,58,35,120,7.0,1 +93,84,58,47,62,7.0,2 +97,71,29,22,52,8.0,1 +84,99,33,19,26,8.0,1 +96,44,42,23,73,8.0,1 +90,62,22,21,21,8.0,1 +92,94,18,17,6,8.0,1 +90,67,77,39,114,8.0,1 +97,71,29,22,52,8.0,1 +91,69,25,25,66,8.0,2 +93,59,17,20,14,8.0,2 +92,95,85,48,200,8.0,2 +90,50,26,22,53,8.0,2 +91,62,59,47,60,8.0,2 +92,93,22,28,123,9.0,1 +92,77,86,41,31,10.0,1 +86,66,22,24,26,10.0,2 +98,57,31,34,73,10.0,2 +95,80,50,64,55,10.0,2 +92,108,53,33,94,12.0,2 +97,92,22,28,49,12.0,2 +93,77,39,37,108,16.0,1 +94,83,81,34,201,20.0,1 +87,75,25,21,14,0.0,1 +88,56,23,18,12,0.0,1 +84,97,41,20,32,0.0,2 +94,91,27,20,15,0.5,1 +97,62,17,13,5,0.5,1 +92,85,25,20,12,0.5,1 +82,48,27,15,12,0.5,1 +88,74,31,25,15,0.5,1 +95,77,30,14,21,0.5,1 +88,94,26,18,8,0.5,1 +91,70,19,19,22,0.5,1 +83,54,27,15,12,0.5,1 +91,105,40,26,56,0.5,1 +86,79,37,28,14,0.5,1 +91,96,35,22,135,0.5,1 +89,82,23,14,35,0.5,1 +90,73,24,23,11,0.5,1 +90,87,19,25,19,0.5,1 +89,82,33,32,18,0.5,1 +85,79,17,8,9,0.5,1 +85,119,30,26,17,0.5,1 +78,69,24,18,31,0.5,1 +88,107,34,21,27,0.5,1 +89,115,17,27,7,0.5,1 +92,67,23,15,12,0.5,1 +89,101,27,34,14,0.5,1 +91,84,11,12,10,0.5,1 +94,101,41,20,53,0.5,2 +88,46,29,22,18,0.5,2 +88,122,35,29,42,0.5,2 +84,88,28,25,35,0.5,2 +90,79,18,15,24,0.5,2 +87,69,22,26,11,0.5,2 +65,63,19,20,14,0.5,2 +90,64,12,17,14,0.5,2 +85,58,18,24,16,0.5,2 +88,81,41,27,36,0.5,2 +86,78,52,29,62,0.5,2 +82,74,38,28,48,0.5,2 +86,58,36,27,59,0.5,2 +94,56,30,18,27,0.5,2 +87,57,30,30,22,0.5,2 +98,74,148,75,159,0.5,2 +94,75,20,25,38,0.5,2 +83,68,17,20,71,0.5,2 +93,56,25,21,33,0.5,2 +101,65,18,21,22,0.5,2 +92,65,25,20,31,0.5,2 +92,58,14,16,13,0.5,2 +86,58,16,23,23,0.5,2 +85,62,15,13,22,0.5,2 +86,57,13,20,13,0.5,2 +86,54,26,30,13,0.5,2 +81,41,33,27,34,1.0,1 +91,67,32,26,13,1.0,1 +91,80,21,19,14,1.0,1 +92,60,23,15,19,1.0,1 +91,60,32,14,8,1.0,1 +93,65,28,22,10,1.0,1 +90,63,45,24,85,1.0,2 +87,92,21,22,37,1.0,2 +83,78,31,19,115,1.0,2 +95,62,24,23,14,1.0,2 +93,59,41,30,48,1.0,2 +84,82,43,32,38,2.0,1 +87,71,33,20,22,2.0,1 +86,44,24,15,18,2.0,1 +86,66,28,24,21,2.0,1 +88,58,31,17,17,2.0,1 +90,61,28,29,31,2.0,1 +88,69,70,24,64,2.0,1 +93,87,18,17,26,2.0,1 +98,58,33,21,28,2.0,1 +91,44,18,18,23,2.0,2 +87,75,37,19,70,2.0,2 +94,91,30,26,25,2.0,2 +88,85,14,15,10,2.0,2 +89,109,26,25,27,2.0,2 +87,59,37,27,34,2.0,2 +93,58,20,23,18,2.0,2 +88,57,9,15,16,2.0,2 +94,65,38,27,17,3.0,1 +91,71,12,22,11,3.0,1 +90,55,20,20,16,3.0,1 +91,64,21,17,26,3.0,2 +88,47,35,26,33,3.0,2 +82,72,31,20,84,3.0,2 +85,58,83,49,51,3.0,2 +91,54,25,22,35,4.0,1 +98,50,27,25,53,4.0,2 +86,62,29,21,26,4.0,2 +89,48,32,22,14,4.0,2 +82,68,20,22,9,4.0,2 +83,70,17,19,23,4.0,2 +96,70,21,26,21,4.0,2 +94,117,77,56,52,4.0,2 +93,45,11,14,21,4.0,2 +93,49,27,21,29,4.0,2 +84,73,46,32,39,4.0,2 +91,63,17,17,46,4.0,2 +90,57,31,18,37,4.0,2 +87,45,19,13,16,4.0,2 +91,68,14,20,19,4.0,2 +86,55,29,35,108,4.0,2 +91,86,52,47,52,4.0,2 +88,46,15,33,55,4.0,2 +85,52,22,23,34,4.0,2 +89,72,33,27,55,4.0,2 +95,59,23,18,19,4.0,2 +94,43,154,82,121,4.0,2 +96,56,38,26,23,5.0,2 +90,52,10,17,12,5.0,2 +94,45,20,16,12,5.0,2 +99,42,14,21,49,5.0,2 +93,102,47,23,37,5.0,2 +94,71,25,26,31,5.0,2 +92,73,33,34,115,5.0,2 +87,54,41,29,23,6.0,1 +92,67,15,14,14,6.0,1 +98,101,31,26,32,6.0,1 +92,53,51,33,92,6.0,1 +97,94,43,43,82,6.0,1 +93,43,11,16,54,6.0,1 +93,68,24,18,19,6.0,1 +95,36,38,19,15,6.0,1 +99,86,58,42,203,6.0,1 +98,66,103,57,114,6.0,1 +92,80,10,26,20,6.0,1 +96,74,27,25,43,6.0,2 +95,93,21,27,47,6.0,2 +86,109,16,22,28,6.0,2 +91,46,30,24,39,7.0,2 +102,82,34,78,203,7.0,2 +85,50,12,18,14,7.0,2 +91,57,33,23,12,8.0,1 +91,52,76,32,24,8.0,1 +93,70,46,30,33,8.0,1 +87,55,36,19,25,8.0,1 +98,123,28,24,31,8.0,1 +82,55,18,23,44,8.0,2 +95,73,20,25,225,8.0,2 +97,80,17,20,53,8.0,2 +100,83,25,24,28,8.0,2 +88,91,56,35,126,9.0,2 +91,138,45,21,48,10.0,1 +92,41,37,22,37,10.0,1 +86,123,20,25,23,10.0,2 +91,93,35,34,37,10.0,2 +87,87,15,23,11,10.0,2 +87,56,52,43,55,10.0,2 +99,75,26,24,41,12.0,1 +96,69,53,43,203,12.0,2 +98,77,55,35,89,15.0,1 +91,68,27,26,14,16.0,1 +98,99,57,45,65,20.0,1