"""Compare Stree cross-validation accuracy with and without min-max scaling.

For every dataset yielded by the "tanveer" collection the script prints the
mean 5-fold accuracy of a default ``Stree`` on (1) the raw features, (2) the
features scaled with one global minimum/maximum and (3) the features scaled
column by column, next to the best "crossval" record found in the database.
"""
import random
import time

import numpy as np
from sklearn.model_selection import KFold, cross_validate
from experimentation.Sets import Datasets
from stree import Stree
from experimentation.Utils import TextColor
from experimentation.Database import MySQL


def normalize(data: np.ndarray) -> np.ndarray:
    """Min-max scale *data* using its overall minimum and maximum.

    The minimum and maximum are taken over the whole array, so a 2-D input
    is scaled with a single global range rather than per column.

    :param data: numeric array to scale
    :return: array of the same shape with values in [0, 1]; all zeros when
        every element of *data* is equal; an empty float array when *data*
        is empty
    """
    if data.size == 0:
        # min()/max() raise on empty arrays; there is nothing to scale.
        return data.astype(float)
    min_data = data.min()
    value_range = data.max() - min_data
    if value_range == 0:
        # Constant input: avoid 0/0, which would fill the result with NaN.
        return np.zeros(data.shape, dtype=float)
    return (data - min_data) / value_range


def normalize_rows(data: np.ndarray) -> np.ndarray:
    """Min-max scale every column of *data* independently.

    Despite its name, the function iterates over ``data.shape[1]`` and
    scales each column (feature) with that column's own minimum and maximum.

    :param data: array with at least two dimensions; it is not modified
    :return: floating point copy of *data* with each column scaled to [0, 1]
    """
    if np.issubdtype(data.dtype, np.floating):
        res = data.copy()
    else:
        # An integer copy would truncate the scaled values on assignment.
        res = data.astype(float)
    for col in range(res.shape[1]):
        res[:, col] = normalize(res[:, col])
    return res


def header():
    """Print the report title and the column headings of the results table."""
    print("Processing Datasets with stree default.\n")
    print(
        f"{'Dataset':30s} {'No Norm.':9s} {'Normaliz.':9s} "
        f"{'Col.Norm.':9s} {'Best score in crossval':25s}"
    )
    print("=" * 30 + " " + ("=" * 9 + " ") * 3 + "=" * 25)


def process_dataset(X, y, seeds=None):
    """Return the mean test score of Stree over repeated 5-fold validation.

    One shuffled 5-fold cross-validation is run per seed; the seed is used
    for Python's and NumPy's global generators, for the fold split and for
    the classifier.

    :param X: feature matrix passed to ``cross_validate``
    :param y: target vector passed to ``cross_validate``
    :param seeds: iterable of integer seeds; defaults to the module-level
        ``random_seeds``
    :return: mean of all fold test scores across all seeds
    """
    if seeds is None:
        seeds = random_seeds
    scores = []
    for random_state in seeds:
        random.seed(random_state)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
        clf = Stree(random_state=random_state)
        # Only the scores are used, so the fitted estimators are not kept.
        res = cross_validate(clf, X, y, cv=kfold)
        scores.append(res["test_score"])
    return np.mean(scores)


start = time.time()
models_tree = [
    "stree",
    "stree_default",
    "wodt",
    "j48svm",
    "oc1",
    "cart",
    "baseRaF",
]
dbh = MySQL()
database = dbh.get_connection()
random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
try:
    dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
    header()
    # Number of datasets on which each variant achieved the top score
    # (index 0: no scaling, 1: global scaling, 2: per-column scaling).
    total = [0, 0, 0]
    line = TextColor.LINE1
    for data in dt:
        name = data[0]
        X, y = dt.load(name)
        # NOTE(review): the record layout comes from MySQL.find_best;
        # confirm that index 5 is the score and that a record always exists.
        record = dbh.find_best(name, models_tree, "crossval")
        X2 = normalize(X)
        X3 = normalize_rows(X)
        ac1 = process_dataset(X, y)
        ac2 = process_dataset(X2, y)
        ac3 = process_dataset(X3, y)
        max_value = max(ac1, ac2, ac3)
        # Alternate the row colour between consecutive datasets.
        line = TextColor.LINE2 if line == TextColor.LINE1 else TextColor.LINE1
        print(line + f"{name:30s} ", end="", flush=True)
        total[np.argmax([ac1, ac2, ac3])] += 1
        color1 = TextColor.SUCCESS if ac1 == max_value else line
        color2 = TextColor.SUCCESS if ac2 == max_value else line
        color3 = TextColor.SUCCESS if ac3 == max_value else line
        print(color1 + f"{ac1:9.6f} " + TextColor.ENDC, end="", flush=True)
        print(color2 + f"{ac2:9.6f} " + TextColor.ENDC, end="", flush=True)
        # The trailing space keeps the last column aligned with the header.
        print(color3 + f"{ac3:9.6f} " + TextColor.ENDC, end="", flush=True)
        print(line + f"{record[5]:9.6f} {record[3]}" + TextColor.ENDC)
    print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d}")
    stop = time.time()
    hours, rem = divmod(stop - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
finally:
    # Release the database handle even if a dataset fails to process.
    dbh.close()