Fix column-wise normalization & standardization

2025-08-15 15:36:01 +00:00 · 2021-04-15 00:30:16 +02:00
parent 59f59e6c4f
commit 84795b4c43
4 changed files with 109 additions and 22 deletions
--- a/checknormalize.py
+++ b/checknormalize.py
@@ -24,21 +24,20 @@ def header():
    print("Processing Datasets with stree default.\n")
    print(
        f"{'Dataset':30s} {'No Norm.':9s} {'Normaliz.':9s} "
-        f"{'Col.Norm.':9s} {'Best score in crossval':25s}"
+        f"{'Col.Norm.':9s} {'Context B':9s} {'Best score in crossval':25s}"
    )
-    print("=" * 30 + " " + ("=" * 9 + " ") * 3 + "=" * 25)
+    print("=" * 30 + " " + ("=" * 9 + " ") * 4 + "=" * 25)


-def process_dataset(X, y):
+def process_dataset(X, y, normalize):
    scores = []
    # return random.uniform(0, 1)
-    # Get the optimized parameters
    for random_state in random_seeds:
        random.seed(random_state)
+        clf_test = Stree(random_state=random_state, normalize=normalize)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
-        clf = Stree(random_state=random_state)
-        res = cross_validate(clf, X, y, cv=kfold, return_estimator=True)
+        res = cross_validate(clf_test, X, y, cv=kfold, return_estimator=True)
        scores.append(res["test_score"])
    return np.mean(scores)

@@ -58,7 +57,7 @@ database = dbh.get_connection()
 random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
 header()
-total = [0, 0, 0]
+total = [0, 0, 0, 0]
 line = TextColor.LINE1
 for data in dt:
    name = data[0]
@@ -66,22 +65,31 @@ for data in dt:
    record = dbh.find_best(name, models_tree, "crossval")
    X2 = normalize(X)
    X3 = normalize_rows(X)
-    clf = Stree(random_state=1)
-    ac1 = process_dataset(X, y)
-    ac2 = process_dataset(X2, y)
-    ac3 = process_dataset(X3, y)
-    max_value = max(ac1, ac2, ac3)
+    ac1 = process_dataset(X, y, False)
+    ac2 = process_dataset(X2, y, False)
+    ac3 = process_dataset(X3, y, False)
+    ac4 = process_dataset(X, y, True)
+    max_value = round(max(ac1, ac2, ac3, ac4), 6)
    line = TextColor.LINE2 if line == TextColor.LINE1 else TextColor.LINE1
    print(line + f"{name:30s} ", end="", flush=True)
-    total[np.argmax([ac1, ac2, ac3])] += 1
+    total[np.argmax([ac1, ac2, ac3, ac4])] += 1
    color1 = TextColor.SUCCESS if ac1 == max_value else line
    color2 = TextColor.SUCCESS if ac2 == max_value else line
    color3 = TextColor.SUCCESS if ac3 == max_value else line
+    color4 = TextColor.SUCCESS if ac4 == max_value else line
    print(color1 + f"{ac1:9.6f} " + TextColor.ENDC, end="", flush=True)
    print(color2 + f"{ac2:9.6f} " + TextColor.ENDC, end="", flush=True)
-    print(color3 + f"{ac3:9.6f}" + TextColor.ENDC, end="", flush=True)
-    print(line + f"{record[5]:9.6f} {record[3]}" + TextColor.ENDC)
-print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d}")
+    print(color3 + f"{ac3:9.6f} " + TextColor.ENDC, end="", flush=True)
+    print(color4 + f"{ac4:9.6f}" + TextColor.ENDC, end="", flush=True)
+    best_accuracy = round(record[5], 6)
+    best_color = TextColor.UNDERLINE if best_accuracy >= max_value else ""
+    print(
+        line
+        + best_color
+        + f"{best_accuracy:9.6f} {record[3]}"
+        + TextColor.ENDC
+    )
+print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d} {total[3]:9d}")
 stop = time.time()
 hours, rem = divmod(stop - start, 3600)
 minutes, seconds = divmod(rem, 60)