Add report score for stree

update param_analysis for stree only
2021-03-23 00:50:04 +01:00
parent 08fb237001
commit 47078208bc
4 changed files with 291 additions and 49 deletions

report_score.py (new file, 236 lines)

@@ -0,0 +1,236 @@
import argparse
import random
import time
from datetime import datetime
import json
import numpy as np
from stree import Stree
from sklearn.model_selection import KFold, cross_validate
from experimentation.Sets import Datasets
from experimentation.Database import MySQL
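

# Example invocation (illustrative): process every dataset in the selected
# set with auto hyperparameters and write report_score.sql:
#   python report_score.py -d all -p -s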
def parse_arguments():
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-S",
        "--set-of-files",
        type=str,
        choices=["aaai", "tanveer"],
        required=False,
        default="tanveer",
    )
    ap.add_argument(
        "-m",
        "--model",
        type=str,
        required=False,
        default="stree",
        help="model name, default stree",
    )
    ap.add_argument(
        "-d",
        "--dataset",
        type=str,
        required=True,
        help="dataset to process, or 'all' to process every dataset",
    )
    ap.add_argument(
        "-s",
        "--sql",
        action="store_true",  # type=bool treats any non-empty string as True
        required=False,
        help="generate report_score.sql",
    )
    ap.add_argument(
        "-p",
        "--param",
        action="store_true",  # same fix as --sql: a proper boolean flag
        required=False,
        help="auto generate hyperparameters",
    )
    args = ap.parse_args()
    return (args.set_of_files, args.model, args.dataset, args.sql, args.param)
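

# Heuristic hyperparameters for Stree: cap the solver iterations, soften C,
# and switch the split criterion to max_samples on multiclass problems.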
def compute_auto_hyperparams(X, y):
    params = {"max_iter": 10_000, "C": 0.1}  # int, not 1e4: max_iter is a count
    classes = len(np.unique(y))
    if classes > 2:
        params["split_criteria"] = "max_samples"
    return params
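

# 5-fold cross-validation repeated over every random seed; relies on the
# module-level globals dt, set_of_files and random_seeds defined below.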
def process_dataset(dataset, verbose, model, auto_params):
    X, y = dt.load(dataset)
    scores = []
    times = []
    if verbose:
        print(
            f"* Processing dataset [{dataset}] from Set: {set_of_files} with "
            f"{model}"
        )
        print(f"X.shape: {X.shape}")
        print(f"{X[:4]}")
        print(f"Random seeds: {random_seeds}")
    if auto_params:
        hyperparameters = compute_auto_hyperparams(X, y)
    else:
        hyperparameters = {}
    for random_state in random_seeds:
        random.seed(random_state)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
        clf = Stree(random_state=random_state)
        clf.set_params(**hyperparameters)
        res = cross_validate(clf, X, y, cv=kfold)
        scores.append(res["test_score"])
        times.append(res["fit_time"])
        if verbose:
            print(
                f"Random seed: {random_state:5d} Accuracy: "
                f"{res['test_score'].mean():6.4f}±"
                f"{res['test_score'].std():6.4f} "
                f"{res['fit_time'].mean():5.3f}s"
            )
    return scores, times, json.dumps(hyperparameters)
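

# Build a REPLACE INTO statement for the results table; the norm/stand columns
# record the normalize and standardize flags used to load the datasets.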
def store_string(dataset, model, accuracy, time_spent, hyperparameters):
    attributes = [
        "date",
        "time",
        "type",
        "accuracy",
        "accuracy_std",
        "dataset",
        "classifier",
        "norm",
        "stand",
        "time_spent",
        "time_spent_std",
        "parameters",
    ]
    command_insert = (
        "replace into results ("
        + ",".join(attributes)
        + ") values("
        + ("'%s'," * len(attributes))[:-1]
        + ");"
    )
    now = datetime.now()
    date = now.strftime("%Y-%m-%d")
    time_ = now.strftime("%H:%M:%S")  # avoid shadowing the time module
    values = (
        date,
        time_,
        "crossval",
        np.mean(accuracy),
        np.std(accuracy),
        dataset,
        model,
        normalize,  # was hard-coded True; keep in sync with the loader flags
        standardize,  # was hard-coded False
        np.mean(time_spent),
        np.std(time_spent),
        hyperparameters,
    )
    result = command_insert % values
    return result
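

# --- script body ---
# Fixed seeds keep the 10 cross-validation runs reproducible.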
random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
normalize = True
standardize = False
(set_of_files, model, dataset, sql, auto_params) = parse_arguments()
dbh = MySQL()
if sql:
    sql_output = open("report_score.sql", "w")
database = dbh.get_connection()
dt = Datasets(normalize, standardize, set_of_files)
start = time.time()
if dataset == "all":
    print(
        f"* Process all datasets set with {model}: {set_of_files} "
        f"norm: {normalize} std: {standardize}"
    )
    print(f"5 Fold Cross Validation with 10 random seeds {random_seeds}\n")
    print(
        "{0:30s} {5:4s} {6:3s} {7:2s} {1:13s} {2:13s} {3:8s} {4:90s}".format(
            "Dataset",
            "Acc. computed",
            "Best Accuracy",
            "Diff.",
            "Best accuracy hyperparameters",
            "Samp",
            "Var",
            "Cls",
        )
    )
    print("=" * 30, end=" ")
    print("=" * 4, end=" ")
    print("=" * 3, end=" ")
    print("=" * 3, end=" ")
    print("=" * 13, end=" ")
    print("=" * 13, end=" ")
    print("=" * 8, end=" ")
    print("=" * 90)
    for dataset in dt:
        X, y = dt.load(dataset[0])  # type: ignore
        samples, features = X.shape
        classes = len(np.unique(y))
        print(
            f"{dataset[0]:30s} {samples:4d} {features:3d} {classes:3d} ",
            end="",
        )
        scores, times, hyperparameters = process_dataset(
            dataset[0], verbose=False, model=model, auto_params=auto_params
        )
        record = dbh.find_best(dataset[0], model, "crossval")
        if record is not None:
            parameters = json.loads(record[8] if record[8] != "" else "{}")
            parameters.pop("random_state", None)
            accuracy_best = record[5]
            acc_best_std = record[11]
        else:
            parameters = {}
            accuracy_best = 0.0
            acc_best_std = 0.0
        accuracy_computed = np.mean(scores)
        diff = accuracy_best - accuracy_computed
        print(
            f"{accuracy_computed:6.4f}±{np.std(scores):6.4f} "
            f"{accuracy_best:6.4f}±{acc_best_std:6.4f} {diff:8.5f} "
            f"{json.dumps(parameters):40s}"
        )
        if sql:
            command = store_string(
                dataset[0], model, scores, times, hyperparameters
            )
            print(command, file=sql_output)
else:
    scores, times, hyperparameters = process_dataset(
        dataset, verbose=True, model=model, auto_params=auto_params
    )
    record = dbh.find_best(dataset, model, "crossval")
    accuracy = np.mean(scores)
    accuracy_best = record[5] if record is not None else 0.0
    acc_best_std = record[11] if record is not None else 0.0
    print(
        f"* Accuracy Computed : {accuracy:6.4f}±{np.std(scores):6.4f} "
        f"{np.mean(times):5.3f}s"
    )
    print(f"* Accuracy Best ....: {accuracy_best:6.4f}±{acc_best_std:6.4f}")
    print(f"* Difference .......: {accuracy_best - accuracy:6.4f}")
stop = time.time()
print(f"- Auto Hyperparams .: {hyperparameters}")
hours, rem = divmod(stop - start, 3600)
minutes, seconds = divmod(rem, 60)
print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
if sql:
    sql_output.close()
dbh.close()