mirror of
https://github.com/Doctorado-ML/Stree_datasets.git
synced 2025-08-15 23:46:03 +00:00
Add normalization test of datasets
This commit is contained in:
89
checknormalize.py
Executable file
89
checknormalize.py
Executable file
@@ -0,0 +1,89 @@
|
||||
import random
|
||||
import time
|
||||
import numpy as np
|
||||
from sklearn.model_selection import KFold, cross_validate
|
||||
from experimentation.Sets import Datasets
|
||||
from stree import Stree
|
||||
from experimentation.Utils import TextColor
|
||||
from experimentation.Database import MySQL
|
||||
|
||||
|
||||
def normalize(data: np.ndarray) -> np.ndarray:
    """Min-max scale a whole array using its global minimum and maximum.

    The smallest element maps to 0 and the largest to 1. The minimum and
    maximum are taken over every element, not per column.

    :param data: numeric array of any shape
    :return: new array with the same shape as ``data``; a constant array
        maps to all zeros instead of NaN
    """
    min_data = data.min()
    value_range = data.max() - min_data
    if value_range == 0:
        # Every element is identical: dividing by the range would be 0 / 0.
        # Dividing by 1 keeps the (all zero) numerator and the result dtype.
        value_range = 1
    return (data - min_data) / value_range
|
||||
|
||||
|
||||
def normalize_rows(data: np.ndarray) -> np.ndarray:
    """Min-max scale every column of a 2-D array independently.

    Despite the name, the scaling is applied per column (axis 0): in each
    column the smallest value maps to 0 and the largest to 1.

    :param data: 2-D numeric array; it is not modified
    :return: new floating point array with the same shape as ``data``;
        columns whose values are all equal map to zeros instead of NaN
    """
    # Integer input must be promoted to float, otherwise the scaled values
    # would be truncated to 0 or 1; floating input keeps its own precision.
    if np.issubdtype(data.dtype, np.floating):
        dtype = data.dtype
    else:
        dtype = np.float64
    values = data.astype(dtype)  # astype copies, the caller's array is untouched
    col_min = values.min(axis=0)
    col_range = values.max(axis=0) - col_min
    # Constant columns have a zero range; divide them by 1 to avoid 0 / 0
    col_range[col_range == 0] = 1
    return (values - col_min) / col_range
|
||||
|
||||
|
||||
def header():
    """Print the report title, the column captions and a ruler line."""
    title = "Processing Datasets with stree default.\n"
    captions = (
        f"{'Dataset':30s} {'No Norm.':9s} {'Normaliz.':9s} "
        f"{'Col.Norm.':9s} {'Best score in crossval':25s}"
    )
    # One run of "=" per column, matching the widths used in the captions
    ruler = "=" * 30 + " " + ("=" * 9 + " ") * 3 + "=" * 25
    for text in (title, captions, ruler):
        print(text)
|
||||
|
||||
|
||||
def process_dataset(X, y, seeds=None):
    """Return the mean cross-validation test score of a default Stree.

    One shuffled 5-fold cross-validation is run per seed. The seed is used
    for the fold shuffling, for the classifier and for the global ``random``
    and ``numpy.random`` generators. The result is the mean over every fold
    of every seed.

    :param X: samples passed to ``cross_validate``
    :param y: targets passed to ``cross_validate``
    :param seeds: iterable of integer seeds; when omitted the module-level
        ``random_seeds`` list is used
    :return: mean of all the collected test scores
    """
    if seeds is None:
        seeds = random_seeds
    scores = []
    for random_state in seeds:
        random.seed(random_state)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
        clf = Stree(random_state=random_state)
        # Only the test scores are read, so the fitted estimators are not
        # requested back from cross_validate
        res = cross_validate(clf, X, y, cv=kfold)
        scores.append(res["test_score"])
    return np.mean(scores)
|
||||
|
||||
|
||||
# Script body: for every dataset compare the Stree cross-validation score
# obtained without normalization, with global min-max normalization and with
# per-column min-max normalization, next to the best stored result.
start = time.time()
models_tree = [
    "stree",
    "stree_default",
    "wodt",
    "j48svm",
    "oc1",
    "cart",
    "baseRaF",
]
# Read as a global by process_dataset
random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
dbh = MySQL()
# NOTE(review): the returned handle is never used here; the call is kept
# because it presumably opens the connection used by find_best - confirm
database = dbh.get_connection()
try:
    dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
    header()
    # Number of datasets won by: no normalization, global, per column
    total = [0, 0, 0]
    line = TextColor.LINE1
    for data in dt:
        name = data[0]
        X, y = dt.load(name)
        record = dbh.find_best(name, models_tree, "crossval")
        X2 = normalize(X)
        X3 = normalize_rows(X)
        ac1 = process_dataset(X, y)
        ac2 = process_dataset(X2, y)
        ac3 = process_dataset(X3, y)
        max_value = max(ac1, ac2, ac3)
        # Alternate the row color between consecutive datasets
        line = TextColor.LINE2 if line == TextColor.LINE1 else TextColor.LINE1
        print(line + f"{name:30s} ", end="", flush=True)
        # On ties only the first best variant is counted as the winner
        total[np.argmax([ac1, ac2, ac3])] += 1
        color1 = TextColor.SUCCESS if ac1 == max_value else line
        color2 = TextColor.SUCCESS if ac2 == max_value else line
        color3 = TextColor.SUCCESS if ac3 == max_value else line
        print(color1 + f"{ac1:9.6f} " + TextColor.ENDC, end="", flush=True)
        print(color2 + f"{ac2:9.6f} " + TextColor.ENDC, end="", flush=True)
        # Trailing space keeps the next column aligned with the header
        print(color3 + f"{ac3:9.6f} " + TextColor.ENDC, end="", flush=True)
        print(line + f"{record[5]:9.6f} {record[3]}" + TextColor.ENDC)
    print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d}")
    stop = time.time()
    hours, rem = divmod(stop - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
finally:
    # Release the database handle even when a dataset fails to process
    dbh.close()
|
Reference in New Issue
Block a user