diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3f41728..d10d2a9 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.20) project(main) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 11) add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp) diff --git a/samples/sample.cpp b/samples/sample.cpp index 7410445..61f8321 100644 --- a/samples/sample.cpp +++ b/samples/sample.cpp @@ -1,28 +1,94 @@ -#include "../src/cppmdlp/tests/ArffFiles.h" #include #include #include +#include +#include +#include +#include #include "../src/cppmdlp/CPPFImdlp.h" +#include "../src/cppmdlp/tests/ArffFiles.h" using namespace std; +using namespace mdlp; -int main(int argc, char** argv) +const string PATH = "../../src/cppmdlp/tests/datasets/"; + +/* print a description of all supported options */ +void usage(const char* path) +{ + /* take only the last portion of the path */ + const char* basename = strrchr(path, '/'); + basename = basename ? basename + 1 : path; + + cout << "usage: " << basename << " [OPTION]" << endl; + cout << " -h, --help\t\t Print this help and exit." << endl; + cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl; + cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl; + cout << " -m, --max_depth=INT\t max_depth passed to discretizer. Default = MAX_INT" << endl; + cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number of cut points. Default = 0 = any" << endl; + cout << " -n, --min_length=INT\t interval min_length passed to discretizer. 
Default = 3" << endl; +} + +tuple parse_arguments(int argc, char** argv) +{ + string file_name; + string path = PATH; + int max_depth = numeric_limits::max(); + int min_length = 3; + float max_cutpoints = 0; + static struct option long_options[] = { + { "help", no_argument, 0, 'h' }, + { "file", required_argument, 0, 'f' }, + { "path", required_argument, 0, 'p' }, + { "max_depth", required_argument, 0, 'm' }, + { "max_cutpoints", required_argument, 0, 'c' }, + { "min_length", required_argument, 0, 'n' }, + { 0, 0, 0, 0 } + }; + while (1) { + auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0); + if (c == -1) + break; + switch (c) { + case 'h': + usage(argv[0]); + exit(0); + case 'f': + file_name = optarg; + break; + case 'm': + max_depth = atoi(optarg); + break; + case 'n': + min_length = atoi(optarg); + break; + case 'c': + max_cutpoints = atof(optarg); + break; + case 'p': + path = optarg; + if (path.back() != '/') + path += '/'; + break; + case '?': + usage(argv[0]); + exit(1); + default: + abort(); + } + } + if (file_name.empty()) { + usage(argv[0]); + exit(1); + } + return make_tuple(file_name, path, max_depth, min_length, max_cutpoints); +} + +void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints) { ArffFiles file; - vector lines; - string path = "../../src/cppmdlp/tests/datasets/"; - map datasets = { - {"mfeat-factors", true}, - {"iris", true}, - {"letter", true}, - {"kdd_JapaneseVowels", false} - }; - if (argc != 2 || datasets.find(argv[1]) == datasets.end()) { - cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl; - return 1; - } - file.load(path + argv[1] + ".arff", datasets[argv[1]]); + file.load(path + file_name + ".arff", class_last); auto attributes = file.getAttributes(); int items = file.getSize(); cout << "Number of lines: " << items << endl; @@ -33,22 +99,85 @@ int main(int argc, char** argv) cout << "Class name: " << 
file.getClassName() << endl; cout << "Class type: " << file.getClassType() << endl; cout << "Data: " << endl; - vector>& X = file.getX(); - vector& y = file.getY(); - for (int i = 0; i < 50; i++) { + vector& X = file.getX(); + labels_t& y = file.getY(); + for (int i = 0; i < 5; i++) { for (auto feature : X) { cout << fixed << setprecision(1) << feature[i] << " "; } cout << y[i] << endl; } - mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0); + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + auto total = 0; for (auto i = 0; i < attributes.size(); i++) { + auto min_max = minmax_element(X[i].begin(), X[i].end()); cout << "Cut points for " << get<0>(attributes[i]) << endl; + cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl; cout << "--------------------------" << setprecision(3) << endl; test.fit(X[i], y); for (auto item : test.getCutPoints()) { cout << item << endl; } + total += test.getCutPoints().size(); + } + cout << "Total cut points ...: " << total << endl; + cout << "Total feature states: " << total + attributes.size() << endl; +} + +void process_all_files(map datasets, string path, int max_depth, int min_length, float max_cutpoints) +{ + cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl; + printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)"); + printf("==================== ==== ==== ========\n"); + for (auto dataset : datasets) { + ArffFiles file; + file.load(path + dataset.first + ".arff", dataset.second); + auto attributes = file.getAttributes(); + vector& X = file.getX(); + labels_t& y = file.getY(); + size_t timing = 0; + int cut_points = 0; + for (auto i = 0; i < attributes.size(); i++) { + mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints); + std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now(); + test.fit(X[i], y); + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + 
timing += std::chrono::duration_cast(end - begin).count(); + cut_points += test.getCutPoints().size(); + } + printf("%-20s %4lu %4d %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing); + } +} + + +int main(int argc, char** argv) +{ + map datasets = { + {"glass", true}, + {"iris", true}, + {"kdd_JapaneseVowels", false}, + {"letter", true}, + {"liver-disorders", true}, + {"mfeat-factors", true}, + {"test", true} + }; + string file_name, path; + int max_depth, min_length; + float max_cutpoints; + tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv); + if (datasets.find(file_name) == datasets.end() && file_name != "all") { + cout << "Invalid file name: " << file_name << endl; + usage(argv[0]); + exit(1); + } + if (file_name == "all") + process_all_files(datasets, path, max_depth, min_length, max_cutpoints); + else { + process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints); + cout << "File name ....: " << file_name << endl; + cout << "Max depth ....: " << max_depth << endl; + cout << "Min length ...: " << min_length << endl; + cout << "Max cutpoints : " << max_cutpoints << endl; } return 0; -} +} \ No newline at end of file diff --git a/samples/sample.py b/samples/sample.py index b02bb32..d671ddb 100644 --- a/samples/sample.py +++ b/samples/sample.py @@ -9,13 +9,19 @@ from fimdlp.mdlp import FImdlp datasets = { "mfeat-factors": True, "iris": True, + "glass": True, + "liver-disorders": True, "letter": True, "kdd_JapaneseVowels": False, } ap = argparse.ArgumentParser() ap.add_argument( - "--alternative", dest="proposal", action="store_const", const=1 + "--min_length", type=int, default=3, help="Minimum length of interval" +) +ap.add_argument("--max_depth", type=int, default=9999, help="Maximum depth") +ap.add_argument( + "--max_cuts", type=float, default=0, help="Maximum number of cut points" ) ap.add_argument("dataset", type=str, choices=datasets.keys()) args = ap.parse_args() @@ 
-30,7 +36,11 @@ class_name = df.columns.to_list()[class_column] X = df.drop(class_name, axis=1) y, _ = pd.factorize(df[class_name]) X = X.to_numpy() -test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0) +test = FImdlp( + min_length=args.min_length, + max_depth=args.max_depth, + max_cuts=args.max_cuts, +) now = time.time() test.fit(X, y) fit_time = time.time() diff --git a/src/cppmdlp b/src/cppmdlp index ed74336..770502c 160000 --- a/src/cppmdlp +++ b/src/cppmdlp @@ -1 +1 @@ -Subproject commit ed7433672d98745115fb5f0bc49fcbd7bf035427 +Subproject commit 770502c8e57a3ea57a722091ce05e4eb08c444d4