Fix normalization & standardization column-wise

2021-04-15 00:30:16 +02:00
parent 59f59e6c4f
commit 84795b4c43
4 changed files with 109 additions and 22 deletions

View File

@@ -1,4 +1,5 @@
from experimentation.Sets import Datasets
import numpy as np
dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
print("Checking normalized datasets: ")
@@ -7,6 +8,8 @@ for data in dt:
    X, y = dt.load(name)
    min_value = X.min()
    max_value = X.max()
    media = np.mean(X)
    desv = np.std(X)
    resultado = (
        "Normalizado"
        if min_value <= 1
@@ -15,4 +18,9 @@ for data in dt:
        and max_value >= 0
        else "No Normalizado"
    )
    print(f"{name:30s}: {resultado} ")
    resultado2 = (
        "Standardizado"
        if round(media) == 0 and round(desv) == 1
        else "No Standardizado"
    )
    print(f"{name:30s}: {resultado} {resultado2}")

View File

@@ -24,21 +24,20 @@ def header():
print("Processing Datasets with stree default.\n")
print(
f"{'Dataset':30s} {'No Norm.':9s} {'Normaliz.':9s} "
f"{'Col.Norm.':9s} {'Best score in crossval':25s}"
f"{'Col.Norm.':9s} {'Context B':9s} {'Best score in crossval':25s}"
)
print("=" * 30 + " " + ("=" * 9 + " ") * 3 + "=" * 25)
print("=" * 30 + " " + ("=" * 9 + " ") * 4 + "=" * 25)
def process_dataset(X, y):
def process_dataset(X, y, normalize):
    scores = []
    # return random.uniform(0, 1)
    # Get the optimized parameters
    for random_state in random_seeds:
        random.seed(random_state)
        clf_test = Stree(random_state=random_state, normalize=normalize)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
        clf = Stree(random_state=random_state)
        res = cross_validate(clf, X, y, cv=kfold, return_estimator=True)
        res = cross_validate(clf_test, X, y, cv=kfold, return_estimator=True)
        scores.append(res["test_score"])
    return np.mean(scores)
@@ -58,7 +57,7 @@ database = dbh.get_connection()
random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer")
header()
total = [0, 0, 0]
total = [0, 0, 0, 0]
line = TextColor.LINE1
for data in dt:
    name = data[0]
@@ -66,22 +65,31 @@ for data in dt:
    record = dbh.find_best(name, models_tree, "crossval")
    X2 = normalize(X)
    X3 = normalize_rows(X)
    clf = Stree(random_state=1)
    ac1 = process_dataset(X, y)
    ac2 = process_dataset(X2, y)
    ac3 = process_dataset(X3, y)
    max_value = max(ac1, ac2, ac3)
    ac1 = process_dataset(X, y, False)
    ac2 = process_dataset(X2, y, False)
    ac3 = process_dataset(X3, y, False)
    ac4 = process_dataset(X, y, True)
    max_value = round(max(ac1, ac2, ac3, ac4), 6)
    line = TextColor.LINE2 if line == TextColor.LINE1 else TextColor.LINE1
    print(line + f"{name:30s} ", end="", flush=True)
    total[np.argmax([ac1, ac2, ac3])] += 1
    total[np.argmax([ac1, ac2, ac3, ac4])] += 1
    color1 = TextColor.SUCCESS if ac1 == max_value else line
    color2 = TextColor.SUCCESS if ac2 == max_value else line
    color3 = TextColor.SUCCESS if ac3 == max_value else line
    color4 = TextColor.SUCCESS if ac4 == max_value else line
    print(color1 + f"{ac1:9.6f} " + TextColor.ENDC, end="", flush=True)
    print(color2 + f"{ac2:9.6f} " + TextColor.ENDC, end="", flush=True)
    print(color3 + f"{ac3:9.6f}" + TextColor.ENDC, end="", flush=True)
    print(line + f"{record[5]:9.6f} {record[3]}" + TextColor.ENDC)
print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d}")
    print(color3 + f"{ac3:9.6f} " + TextColor.ENDC, end="", flush=True)
    print(color4 + f"{ac4:9.6f}" + TextColor.ENDC, end="", flush=True)
    best_accuracy = round(record[5], 6)
    best_color = TextColor.UNDERLINE if best_accuracy >= max_value else ""
    print(
        line
        + best_color
        + f"{best_accuracy:9.6f} {record[3]}"
        + TextColor.ENDC
    )
print(f"{'Total':30s} {total[0]:9d} {total[1]:9d} {total[2]:9d} {total[3]:9d}")
stop = time.time()
hours, rem = divmod(stop - start, 3600)
minutes, seconds = divmod(rem, 60)
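
The benchmark above now scores four variants per dataset: the raw data ("No Norm."), globally normalized data ("Normaliz."), column-wise normalized data ("Col.Norm."), and the raw data with Stree's own normalize=True ("Context B"). A condensed sketch of that comparison on a single dataset; it assumes scikit-learn and the stree package (whose Stree accepts random_state and normalize, as the diff shows), and uses load_iris only as a stand-in for the tanveer datasets:

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, cross_validate
from stree import Stree

def normalize(data):
    # Global min-max scaling over the whole matrix.
    min_data = data.min()
    return (data - min_data) / (data.max() - min_data)

def normalize_rows(data):
    # Min-max scaling applied column by column, as in Dataset_Base.
    res = data.copy()
    for col in range(res.shape[1]):
        res[:, col] = normalize(res[:, col])
    return res

X, y = load_iris(return_X_y=True)
kfold = KFold(shuffle=True, random_state=57, n_splits=5)
for label, data, norm in [
    ("No Norm.", X, False),
    ("Normaliz.", normalize(X), False),
    ("Col.Norm.", normalize_rows(X), False),
    ("Context B", X, True),  # Stree's built-in normalization
]:
    clf = Stree(random_state=57, normalize=norm)
    res = cross_validate(clf, data, y, cv=kfold)
    print(f"{label:9s} {np.mean(res['test_score']):9.6f}")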

View File

@@ -35,23 +35,39 @@ class Dataset_Base:
"""
pass
def normalize(self, data: np.array) -> np.array:
@staticmethod
def normalize(data: np.array) -> np.array:
min_data = data.min()
return (data - min_data) / (data.max() - min_data)
def standardize(self, data: np.array) -> np.array:
@staticmethod
def normalize_rows(data: np.array) -> np.array:
res = data.copy()
for col in range(res.shape[1]):
res[:, col] = Dataset_Base.normalize(res[:, col])
return res
@staticmethod
def standardize(data: np.array) -> np.array:
return (data - data.mean()) / data.std()
@staticmethod
def standardize_rows(data: np.array) -> np.array:
res = data.copy()
for col in range(res.shape[1]):
res[:, col] = Dataset_Base.standardize(res[:, col])
return res
def get_params(self) -> str:
return f"normalize={self._normalize}, standardize={self._standardize}"
def post_process(self, X: np.array, y: np.array) -> tdataset:
if self._standardize and self._normalize:
X = self.standardize(self.normalize(X))
X = self.standardize_rows(self.normalize_rows(X))
elif self._standardize:
X = self.standardize(X)
X = self.standardize_rows(X)
elif self._normalize:
X = self.normalize(X)
X = self.normalize_rows(X)
return X, y
def __iter__(self) -> Diterator:
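
The new normalize_rows/standardize_rows helpers above (which, despite the _rows suffix, operate per column) loop over columns and reuse the scalar versions. An equivalent vectorized form using NumPy's axis=0 reductions, as a sketch: the _columns names are only for this example, and it assumes float input with no constant columns (a constant column would divide by zero):

import numpy as np

def normalize_columns(data: np.ndarray) -> np.ndarray:
    # Min-max scale each column to [0, 1].
    min_data = data.min(axis=0)
    return (data - min_data) / (data.max(axis=0) - min_data)

def standardize_columns(data: np.ndarray) -> np.ndarray:
    # Shift each column to mean 0 and scale it to standard deviation 1.
    return (data - data.mean(axis=0)) / data.std(axis=0)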

View File

@@ -0,0 +1,55 @@
* Process all datasets set with stree_default: tanveer norm: False std: False store in: stree_default
5 Fold Cross Validation with 10 random seeds [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
Dataset Samp Var Cls Nodes Leaves Depth Accuracy Time Parameters
============================== ===== === === ======= ======= ======= =============== =============== ==========
balance-scale 625 4 3 27.52 14.26 8.06 0.909600±0.0265 0.028588±0.0084 {}
balloons 16 4 2 3.32 2.16 2.08 0.666667±0.2544 0.001797±0.0006 {}
breast-cancer-wisc-diag 569 30 2 5.40 3.20 2.90 0.968364±0.0176 0.011921±0.0026 {}
breast-cancer-wisc-prog 198 33 2 7.20 4.10 3.22 0.807038±0.0519 0.030704±0.0080 {}
breast-cancer-wisc 699 9 2 8.64 4.82 4.08 0.966805±0.0136 0.012557±0.0025 {}
breast-cancer 286 9 2 21.52 11.26 5.92 0.731428±0.0463 0.037908±0.0124 {}
cardiotocography-10clases 2126 21 10 176.20 88.60 23.48 0.795250±0.0203 3.384487±0.3374 {}
cardiotocography-3clases 2126 21 3 51.04 26.02 9.46 0.900566±0.0145 1.187683±0.1610 {}
conn-bench-sonar-mines-rocks 208 60 2 6.04 3.52 2.84 0.748351±0.0692 0.013475±0.0036 {}
cylinder-bands 512 35 2 26.32 13.66 6.84 0.708785±0.0412 0.323182±0.1296 {}
dermatology 366 34 6 11.08 6.04 6.04 0.964720±0.0219 0.016816±0.0010 {}
echocardiogram 131 10 2 9.60 5.30 4.04 0.802593±0.0769 0.009271±0.0038 {}
fertility 100 9 2 2.68 1.84 1.78 0.868000±0.0606 0.003031±0.0014 {}
haberman-survival 306 3 2 26.32 13.66 6.26 0.736610±0.0456 0.024594±0.0054 {}
heart-hungarian 294 12 2 13.80 7.40 4.48 0.819719±0.0534 0.023521±0.0042 {}
hepatitis 155 19 2 11.32 6.16 4.52 0.789677±0.0745 0.011099±0.0035 {}
ilpd-indian-liver 583 9 2 17.28 9.14 5.54 0.717137±0.0385 0.037171±0.0176 {}
ionosphere 351 33 2 7.24 4.12 3.60 0.872040±0.0411 0.023299±0.0057 {}
iris 150 4 3 5.32 3.16 3.06 0.964667±0.0323 0.004864±0.0008 {}
led-display 1000 7 10 47.36 24.18 17.74 0.703900±0.0294 0.345882±0.0343 {}
libras 360 90 15 54.64 27.82 23.90 0.744444±0.0485 5.653415±0.9524 {}
low-res-spect 531 100 9 23.80 12.40 10.14 0.854031±0.0356 0.786059±0.0930 {}
lymphography 148 18 4 14.80 7.90 5.42 0.778437±0.0793 0.016700±0.0037 {}
mammographic 961 5 2 8.76 4.88 4.42 0.819148±0.0227 0.048142±0.0106 {}
molec-biol-promoter 106 57 2 3.00 2.00 2.00 0.765238±0.0835 0.001846±0.0001 {}
musk-1 476 166 2 6.72 3.86 3.00 0.843890±0.0322 0.259591±0.0426 {}
oocytes_merluccius_nucleus_4d 1022 41 2 15.36 8.18 4.78 0.812123±0.0248 1.502560±0.2593 {}
oocytes_merluccius_states_2f 1022 25 3 19.96 10.48 5.50 0.915366±0.0192 0.244292±0.0545 {}
oocytes_trisopterus_nucleus_2f 912 25 2 30.20 15.60 7.54 0.800437±0.0237 0.775162±0.1727 {}
oocytes_trisopterus_states_5b 912 32 3 11.40 6.20 4.50 0.916876±0.0191 0.670656±0.2637 {}
parkinsons 195 22 2 8.24 4.62 3.64 0.883077±0.0456 0.010391±0.0025 {}
pima 768 8 2 17.20 9.10 5.66 0.765999±0.0311 0.076647±0.0243 {}
pittsburg-bridges-MATERIAL 106 7 3 10.68 5.84 4.46 0.800736±0.0635 0.009084±0.0019 {}
pittsburg-bridges-REL-L 103 7 3 17.20 9.10 6.28 0.628429±0.1009 0.017778±0.0043 {}
pittsburg-bridges-SPAN 92 7 3 13.72 7.36 5.12 0.629181±0.1008 0.011592±0.0026 {}
pittsburg-bridges-T-OR-D 102 7 2 5.16 3.08 2.90 0.860571±0.0679 0.004102±0.0009 {}
planning 182 12 2 4.00 2.50 2.28 0.703468±0.0751 0.011304±0.0055 {}
post-operative 90 8 3 3.68 2.34 2.26 0.675556±0.0919 0.006637±0.0026 {}
seeds 210 7 3 9.40 5.20 4.32 0.948571±0.0345 0.008227±0.0011 {}
statlog-australian-credit 690 14 2 10.88 5.94 4.48 0.667391±0.0371 0.257995±0.1398 {}
statlog-german-credit 1000 24 2 21.88 11.44 6.44 0.762200±0.0264 0.307171±0.0651 {}
statlog-heart 270 13 2 13.52 7.26 4.72 0.821481±0.0433 0.018700±0.0046 {}
statlog-image 2310 18 7 28.96 14.98 10.14 0.958225±0.0081 1.604884±0.4019 {}
statlog-vehicle 846 18 4 21.28 11.14 6.66 0.783314±0.0397 0.335944±0.0666 {}
synthetic-control 600 60 6 12.64 6.82 6.30 0.941167±0.0317 0.413927±0.0837 {}
tic-tac-toe 958 9 2 3.00 2.00 2.00 0.983296±0.0084 0.015215±0.0018 {}
vertebral-column-2clases 310 6 2 9.32 5.16 4.48 0.850323±0.0414 0.010402±0.0018 {}
wine 178 13 3 5.00 3.00 3.00 0.975254±0.0243 0.002708±0.0001 {}
zoo 101 16 7 13.00 7.00 7.00 0.946619±0.0453 0.009794±0.0004 {}
Time: 0h 15m 36s