Add normalization test of datasets

2025-08-15 23:46:03 +00:00 · 2021-04-12 22:56:14 +02:00
parent c8bf92954b
commit 59f59e6c4f
1 changed files with 89 additions and 0 deletions
--- a/checknormalize.py
+++ b/checknormalize.py
@@ -0,0 +1,89 @@
+import random
+import time
+import numpy as np
+from sklearn.model_selection import KFold, cross_validate
+from experimentation.Sets import Datasets
+from stree import Stree
+from experimentation.Utils import TextColor
+from experimentation.Database import MySQL
+
+
+def normalize(data: np.array) -> np.array:
+    """Min-max scale ``data`` into [0, 1] with its global minimum and maximum.
+
+    The minimum and maximum are taken over the whole array, so every
+    element is shifted and scaled by the same amount.
+
+    :param data: numeric array to scale
+    :return: the scaled array; all zeros when every element of ``data`` is
+        equal, instead of the NaN that dividing 0 by 0 would produce
+    """
+    min_data = data.min()
+    span = data.max() - min_data
+    if span == 0:
+        # Constant input: dividing by 1 keeps the result at 0 rather than NaN
+        span = 1
+    return (data - min_data) / span
+
+
+def normalize_rows(data: np.array) -> np.array:
+    """Min-max scale each column of a 2-D array independently into [0, 1].
+
+    Despite the name, the scaling is applied per column (``data[:, col]``):
+    each column is shifted by its own minimum and divided by its own range.
+    The input array is left untouched.
+
+    :param data: 2-D numeric array
+    :return: new floating point array with the same shape as ``data``; a
+        column whose values are all equal becomes all zeros
+    """
+    # Non-float input must be promoted: storing the scaled values in an
+    # integer array would truncate every one of them to 0 or 1.
+    dtype = data.dtype if np.issubdtype(data.dtype, np.floating) else float
+    res = data.astype(dtype)  # astype returns a copy
+    col_min = res.min(axis=0)
+    span = res.max(axis=0) - col_min
+    # Constant columns have zero range; divide by 1 to get 0 instead of NaN
+    span = np.where(span == 0, 1, span)
+    return (res - col_min) / span
+
+
+def header():
+    """Print the report title, the column headings and their underline."""
+    widths = (30, 9, 9, 9, 25)
+    titles = (
+        "Dataset",
+        "No Norm.",
+        "Normaliz.",
+        "Col.Norm.",
+        "Best score in crossval",
+    )
+    print("Processing Datasets with stree default.\n")
+    # Left-justify every heading to its column width, one space between
+    print(" ".join(title.ljust(width) for title, width in zip(titles, widths)))
+    print(" ".join("=" * width for width in widths))
+
+
+def process_dataset(X, y):
+    """Return the mean cross-validation test score of Stree on ``(X, y)``.
+
+    One shuffled 5-fold cross-validation is run for every seed in the
+    module-level ``random_seeds`` list. The same seed is applied to the
+    Python and NumPy random generators, to the fold shuffling and to the
+    classifier, which is built with no other argument.
+
+    :param X: samples handed to ``cross_validate``
+    :param y: targets handed to ``cross_validate``
+    :return: mean of all fold test scores over all seeds
+    """
+    scores = []
+    for random_state in random_seeds:
+        random.seed(random_state)
+        np.random.seed(random_state)
+        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
+        clf = Stree(random_state=random_state)
+        # Only the test scores are read, so the fitted estimators are not
+        # requested back (avoids keeping five trained models per seed)
+        res = cross_validate(clf, X, y, cv=kfold)
+        scores.append(res["test_score"])
+    return np.mean(scores)
+
+
+# Script body: for every dataset of the "tanveer" set, compare the mean
+# cross-validation score of Stree on the raw data, on globally min-max
+# scaled data and on column-wise min-max scaled data, and print one table
+# row per dataset next to the best stored result found in the database.
+start = time.time()
+# Model names passed to find_best when looking up the stored best result
+models_tree = [
+    "stree",
+    "stree_default",
+    "wodt",
+    "j48svm",
+    "oc1",
+    "cart",
+    "baseRaF",
+]
+# Seeds read by process_dataset: one cross-validation run per seed
+random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
+dbh = MySQL()
+# NOTE(review): the returned handle is not used below; presumably this call
+# opens the connection that find_best relies on -- confirm
+database = dbh.get_connection()
+try:
+    dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
+    header()
+    # Wins per variant: [no normalization, global norm., column norm.]
+    total = [0, 0, 0]
+    line = TextColor.LINE1
+    for data in dt:
+        name = data[0]
+        X, y = dt.load(name)
+        record = dbh.find_best(name, models_tree, "crossval")
+        X2 = normalize(X)
+        X3 = normalize_rows(X)
+        ac1 = process_dataset(X, y)
+        ac2 = process_dataset(X2, y)
+        ac3 = process_dataset(X3, y)
+        max_value = max(ac1, ac2, ac3)
+        # Alternate the row colour between consecutive datasets
+        line = TextColor.LINE2 if line == TextColor.LINE1 else TextColor.LINE1
+        print(line + f"{name:30s} ", end="", flush=True)
+        # On ties np.argmax credits the first variant holding the maximum
+        total[np.argmax([ac1, ac2, ac3])] += 1
+        # Highlight every score equal to the best of the three
+        color1 = TextColor.SUCCESS if ac1 == max_value else line
+        color2 = TextColor.SUCCESS if ac2 == max_value else line
+        color3 = TextColor.SUCCESS if ac3 == max_value else line
+        print(color1 + f"{ac1:9.6f} " + TextColor.ENDC, end="", flush=True)
+        print(color2 + f"{ac2:9.6f} " + TextColor.ENDC, end="", flush=True)
+        print(color3 + f"{ac3:9.6f}" + TextColor.ENDC, end="", flush=True)
+        # NOTE(review): record[5] is printed under "Best score in crossval";
+        # record[3] is presumably the model that achieved it -- confirm
+        # against the schema returned by find_best
+        print(line + f"{record[5]:9.6f} {record[3]}" + TextColor.ENDC)
+    print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d}")
+    stop = time.time()
+    hours, rem = divmod(stop - start, 3600)
+    minutes, seconds = divmod(rem, 60)
+    print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
+finally:
+    # Release the database handle even when a dataset fails midway
+    dbh.close()