Mirror of https://github.com/Doctorado-ML/FImdlp.git, synced 2025-08-17 16:35:52 +00:00

Commit: Update samples
CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.20)
project(main)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD 11)

add_executable(sample sample.cpp ../src/cppmdlp/tests/ArffFiles.cpp ../src/cppmdlp/Metrics.cpp ../src/cppmdlp/CPPFImdlp.cpp)
sample.cpp
@@ -1,28 +1,94 @@
#include "../src/cppmdlp/tests/ArffFiles.h"
#include <iostream>
#include <vector>
#include <iomanip>
#include <chrono>
#include <algorithm>
#include <cstring>
#include <getopt.h>
#include "../src/cppmdlp/CPPFImdlp.h"
#include "../src/cppmdlp/tests/ArffFiles.h"

using namespace std;
using namespace mdlp;

int main(int argc, char** argv)
const string PATH = "../../src/cppmdlp/tests/datasets/";

/* print a description of all supported options */
void usage(const char* path)
{
    ArffFiles file;
    vector<string> lines;
    string path = "../../src/cppmdlp/tests/datasets/";
    map<string, bool > datasets = {
        {"mfeat-factors", true},
        {"iris", true},
        {"letter", true},
        {"kdd_JapaneseVowels", false}
    };
    if (argc != 2 || datasets.find(argv[1]) == datasets.end()) {
        cout << "Usage: " << argv[0] << " {mfeat-factors, iris, letter, kdd_JapaneseVowels}" << endl;
        return 1;
    /* take only the last portion of the path */
    const char* basename = strrchr(path, '/');
    basename = basename ? basename + 1 : path;

    cout << "usage: " << basename << "[OPTION]" << endl;
    cout << " -h, --help\t\t Print this help and exit." << endl;
    cout << " -f, --file[=FILENAME]\t {all, glass, iris, kdd_JapaneseVowels, letter, liver-disorders, mfeat-factors, test}." << endl;
    cout << " -p, --path[=FILENAME]\t folder where the arff dataset is located, default " << PATH << endl;
    cout << " -m, --max_depth=INT\t max_depth pased to discretizer. Default = MAX_INT" << endl;
    cout << " -c, --max_cutpoints=FLOAT\t percentage of lines expressed in decimal or integer number or cut points. Default = 0 = any" << endl;
    cout << " -n, --min_length=INT\t interval min_length pased to discretizer. Default = 3" << endl;
}

    file.load(path + argv[1] + ".arff", datasets[argv[1]]);
tuple<string, string, int, int, float> parse_arguments(int argc, char** argv)
{
    string file_name;
    string path = PATH;
    int max_depth = numeric_limits<int>::max();
    int min_length = 3;
    float max_cutpoints = 0;
    static struct option long_options[] = {
        { "help", no_argument, 0, 'h' },
        { "file", required_argument, 0, 'f' },
        { "path", required_argument, 0, 'p' },
        { "max_depth", required_argument, 0, 'm' },
        { "max_cutpoints", required_argument, 0, 'c' },
        { "min_length", required_argument, 0, 'n' },
        { 0, 0, 0, 0 }
    };
    while (1) {
        auto c = getopt_long(argc, argv, "hf:p:m:c:n:", long_options, 0);
        if (c == -1)
            break;
        switch (c) {
            case 'h':
                usage(argv[0]);
                exit(0);
            case 'f':
                file_name = optarg;
                break;
            case 'm':
                max_depth = atoi(optarg);
                break;
            case 'n':
                min_length = atoi(optarg);
                break;
            case 'c':
                max_cutpoints = atof(optarg);
                break;
            case 'p':
                path = optarg;
                if (path.back() != '/')
                    path += '/';
                break;
            case '?':
                usage(argv[0]);
                exit(1);
            default:
                abort();
        }
    }
    if (file_name.empty()) {
        usage(argv[0]);
        exit(1);
    }
    return make_tuple(file_name, path, max_depth, min_length, max_cutpoints);
}

void process_file(string path, string file_name, bool class_last, int max_depth, int min_length, float max_cutpoints)
{
    ArffFiles file;

    file.load(path + file_name + ".arff", class_last);
    auto attributes = file.getAttributes();
    int items = file.getSize();
    cout << "Number of lines: " << items << endl;
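For orientation: the hunk above replaces the old positional-argument main() with a getopt_long based parse_arguments(). The following is a minimal, self-contained sketch of that same parsing pattern, reduced to two options; the option letters and the use of optarg follow the diff, everything else (defaults, messages) is illustrative only, not the sample's actual code.

#include <cstdlib>
#include <getopt.h>
#include <iostream>
#include <string>

// Sketch of the getopt_long pattern used by parse_arguments() above:
// long option names map onto the same short letters as the optstring "hf:m:".
int main(int argc, char** argv)
{
    std::string file_name;
    int max_depth = 999999; // illustrative default; the sample uses numeric_limits<int>::max()
    static struct option long_options[] = {
        { "help", no_argument, 0, 'h' },
        { "file", required_argument, 0, 'f' },
        { "max_depth", required_argument, 0, 'm' },
        { 0, 0, 0, 0 }
    };
    int c;
    while ((c = getopt_long(argc, argv, "hf:m:", long_options, 0)) != -1) {
        switch (c) {
            case 'h': std::cout << "usage: sketch [-f FILE] [-m INT]\n"; return 0;
            case 'f': file_name = optarg; break;           // optarg points at the option's argument
            case 'm': max_depth = std::atoi(optarg); break;
            default:  return 1;                            // unknown option: getopt already printed a message
        }
    }
    std::cout << "file=" << file_name << " max_depth=" << max_depth << "\n";
    return 0;
}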
@@ -33,22 +99,85 @@ int main(int argc, char** argv)
    cout << "Class name: " << file.getClassName() << endl;
    cout << "Class type: " << file.getClassType() << endl;
    cout << "Data: " << endl;
    vector<vector<float>>& X = file.getX();
    vector<int>& y = file.getY();
    for (int i = 0; i < 50; i++) {
    vector<samples_t>& X = file.getX();
    labels_t& y = file.getY();
    for (int i = 0; i < 5; i++) {
        for (auto feature : X) {
            cout << fixed << setprecision(1) << feature[i] << " ";
        }
        cout << y[i] << endl;
    }
    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(0);
    mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
    auto total = 0;
    for (auto i = 0; i < attributes.size(); i++) {
        auto min_max = minmax_element(X[i].begin(), X[i].end());
        cout << "Cut points for " << get<0>(attributes[i]) << endl;
        cout << "Min: " << *min_max.first << " Max: " << *min_max.second << endl;
        cout << "--------------------------" << setprecision(3) << endl;
        test.fit(X[i], y);
        for (auto item : test.getCutPoints()) {
            cout << item << endl;
        }
        total += test.getCutPoints().size();
    }
    cout << "Total cut points ...: " << total << endl;
    cout << "Total feature states: " << total + attributes.size() << endl;
}

void process_all_files(map<string, bool> datasets, string path, int max_depth, int min_length, float max_cutpoints)
{
    cout << "Results: " << "Max_depth: " << max_depth << " Min_length: " << min_length << endl << endl;
    printf("%-20s %4s %4s\n", "Dataset", "Feat", "Cuts Time(ms)");
    printf("==================== ==== ==== ========\n");
    for (auto dataset : datasets) {
        ArffFiles file;
        file.load(path + dataset.first + ".arff", dataset.second);
        auto attributes = file.getAttributes();
        vector<samples_t>& X = file.getX();
        labels_t& y = file.getY();
        size_t timing = 0;
        int cut_points = 0;
        for (auto i = 0; i < attributes.size(); i++) {
            mdlp::CPPFImdlp test = mdlp::CPPFImdlp(min_length, max_depth, max_cutpoints);
            std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
            test.fit(X[i], y);
            std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
            timing += std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count();
            cut_points += test.getCutPoints().size();
        }
        printf("%-20s %4lu %4d %8zu\n", dataset.first.c_str(), attributes.size(), cut_points, timing);
    }
}


int main(int argc, char** argv)
{
    map<string, bool> datasets = {
        {"glass", true},
        {"iris", true},
        {"kdd_JapaneseVowels", false},
        {"letter", true},
        {"liver-disorders", true},
        {"mfeat-factors", true},
        {"test", true}
    };
    string file_name, path;
    int max_depth, min_length;
    float max_cutpoints;
    tie(file_name, path, max_depth, min_length, max_cutpoints) = parse_arguments(argc, argv);
    if (datasets.find(file_name) == datasets.end() && file_name != "all") {
        cout << "Invalid file name: " << file_name << endl;
        usage(argv[0]);
        exit(1);
    }
    if (file_name == "all")
        process_all_files(datasets, path, max_depth, min_length, max_cutpoints);
    else {
        process_file(path, file_name, datasets[file_name], max_depth, min_length, max_cutpoints);
        cout << "File name ....: " << file_name << endl;
        cout << "Max depth ....: " << max_depth << endl;
        cout << "Min length ...: " << min_length << endl;
        cout << "Max cutpoints : " << max_cutpoints << endl;
    }
    return 0;
}
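The core of the updated sample boils down to one CPPFImdlp discretizer per feature column: fit it on that column plus the labels, then read back the cut points. A stripped-down sketch follows; the headers, types (samples_t, labels_t), constructor arguments and default values (min_length=3, max_depth=numeric_limits<int>::max(), max_cutpoints=0) are taken from the diff above, while the hard-coded iris path and the assumption that it is compiled next to sample.cpp (so the relative includes resolve, linked against the same sources listed in CMakeLists.txt) are illustrative.

#include <iostream>
#include <limits>
#include <vector>
#include "../src/cppmdlp/CPPFImdlp.h"
#include "../src/cppmdlp/tests/ArffFiles.h"

using namespace std;
using namespace mdlp;

int main()
{
    // Load one ARFF dataset; 'true' means the class attribute is last,
    // as recorded for iris in the datasets map above.
    ArffFiles file;
    file.load("../../src/cppmdlp/tests/datasets/iris.arff", true);
    auto attributes = file.getAttributes();
    vector<samples_t>& X = file.getX();
    labels_t& y = file.getY();

    // One discretizer per feature: fit on the column, then count its cut points,
    // mirroring the loops in process_file() / process_all_files().
    size_t total = 0;
    for (size_t i = 0; i < attributes.size(); i++) {
        CPPFImdlp test = CPPFImdlp(3, numeric_limits<int>::max(), 0);
        test.fit(X[i], y);
        total += test.getCutPoints().size();
    }
    cout << "Total cut points: " << total << endl;
    return 0;
}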
Python sample script
@@ -9,13 +9,19 @@ from fimdlp.mdlp import FImdlp
datasets = {
    "mfeat-factors": True,
    "iris": True,
    "glass": True,
    "liver-disorders": True,
    "letter": True,
    "kdd_JapaneseVowels": False,
}

ap = argparse.ArgumentParser()
ap.add_argument(
    "--alternative", dest="proposal", action="store_const", const=1
    "--min_length", type=int, default=3, help="Minimum length of interval"
)
ap.add_argument("--max_depth", type=int, default=9999, help="Maximum depth")
ap.add_argument(
    "--max_cuts", type=float, default=0, help="Maximum number of cut points"
)
ap.add_argument("dataset", type=str, choices=datasets.keys())
args = ap.parse_args()

@@ -30,7 +36,11 @@ class_name = df.columns.to_list()[class_column]
X = df.drop(class_name, axis=1)
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
test = FImdlp(algorithm=args.proposal if args.proposal is not None else 0)
test = FImdlp(
    min_length=args.min_length,
    max_depth=args.max_depth,
    max_cuts=args.max_cuts,
)
now = time.time()
test.fit(X, y)
fit_time = time.time()
Submodule src/cppmdlp updated: ed7433672d...770502c8e5