Mirror of https://github.com/Doctorado-ML/Stree_datasets.git, synced 2025-08-24 03:46:08 +00:00

Commit: Fix normalize error

Changed file: report_score.py
@@ -4,10 +4,12 @@ import time
 from datetime import datetime
 import json
 import numpy as np
+from sklearn.tree import DecisionTreeClassifier
 from stree import Stree
 from sklearn.model_selection import KFold, cross_validate
 from experimentation.Sets import Datasets
 from experimentation.Database import MySQL
+from wodt import TreeClassifier


 def parse_arguments():
@@ -25,8 +27,8 @@ def parse_arguments():
         "--model",
         type=str,
         required=False,
-        default="stree",
-        help="model name, default stree",
+        default="stree_default",
+        help="model name, default stree_default",
     )
     ap.add_argument(
         "-d",
@@ -43,49 +45,28 @@ def parse_arguments():
         required=False,
         help="generate report_score.sql",
     )
-    ap.add_argument(
-        "-p",
-        "--param",
-        default=False,
-        type=bool,
-        required=False,
-        help="Auto generate params",
-    )
     args = ap.parse_args()
-    return (args.set_of_files, args.model, args.dataset, args.sql, args.param)
+    return (args.set_of_files, args.model, args.dataset, args.sql)


-def compute_auto_hyperparams(X, y):
-    """Proposed automatic hyperparameter configuration
-    max_it = 10e4
-    (1 value)
-    split = impurity if classes == 2 and split = max_samples if classes > 2
-    (1 value)
-    kernel = linear or polynomial
-    (2 values)
-    C = 0.1, 0.5 and 1.0
-    (3 values)
-    Case 1: C=1, max_iter=1e4 + conditional split_max, linear kernel
-    Case 2: C=0.5, max_iter=1e4 + conditional split_max, linear kernel
-    Case 3: C=0.1, max_iter=1e4 + conditional split_max, linear kernel
-    Case 4: C=1, max_iter=1e4 + conditional split_max, poly kernel
-    Case 5: C=0.5, max_iter=1e4 + conditional split_max, poly kernel
-    Case 6: C=0.1, max_iter=1e4 + conditional split_max, poly kernel
-    Case 7: C=1, max_iter=1e4 + conditional + rbf kernel
-    Case 8: rbf kernel
-    """
-    # params = {"max_iter": 1e4, "kernel": "rbf"}
-    # classes = len(np.unique(y))
-    # if classes > 2:
-    #     params["split_criteria"] = "max_samples"
-    params = {"kernel": "rbf"}
-    return params
+def get_classifier(model, random_state, hyperparameters):
+    if model == "stree" or model == "stree_default":
+        clf = Stree(random_state=random_state)
+        clf.set_params(**hyperparameters)
+    if model == "wodt":
+        clf = TreeClassifier(random_state=random_state)
+    if model == "cart":
+        clf = DecisionTreeClassifier(random_state=random_state)
+    return clf


-def process_dataset(dataset, verbose, model, auto_params):
+def process_dataset(dataset, verbose, model):
     X, y = dt.load(dataset)
     scores = []
     times = []
     nodes = []
     leaves = []
     depths = []
     if verbose:
         print(
             f"* Processing dataset [{dataset}] from Set: {set_of_files} with "
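The new get_classifier helper replaces the hard-coded Stree construction with a dispatch on the model-name string. A minimal runnable sketch of the same pattern, assuming only scikit-learn is installed (get_classifier_sketch and its "svc" entry are illustrative stand-ins for the project-specific Stree and wodt classifiers, not part of the commit):

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

def get_classifier_sketch(model, random_state, hyperparameters):
    # Each branch builds the estimator matching the requested model name;
    # an unknown name would leave clf unbound, as in the original helper.
    if model == "svc":
        clf = SVC(random_state=random_state)
        clf.set_params(**hyperparameters)
    if model == "cart":
        clf = DecisionTreeClassifier(random_state=random_state)
    return clf

clf = get_classifier_sketch("cart", random_state=57, hyperparameters={})
print(clf)  # DecisionTreeClassifier(random_state=57)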
@@ -94,21 +75,31 @@ def process_dataset(dataset, verbose, model, auto_params):
         print(f"X.shape: {X.shape}")
         print(f"{X[:4]}")
         print(f"Random seeds: {random_seeds}")
-    if auto_params:
-        hyperparameters = compute_auto_hyperparams(X, y)
-    else:
-        hyperparameters = {}
+    hyperparameters = json.loads("{}")
+    if model == "stree":
+        # Get the optimized parameters
+        record = dbh.find_best(dataset, model, "gridsearch")
+        hyperparameters = json.loads(record[8] if record[8] != "" else "{}")
+        hyperparameters.pop("random_state", None)
     for random_state in random_seeds:
         random.seed(random_state)
         np.random.seed(random_state)
         kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
-        clf = Stree(random_state=random_state)
-        clf.set_params(**hyperparameters)
+        clf = get_classifier(model, random_state, hyperparameters)
         res = cross_validate(clf, X, y, cv=kfold, return_estimator=True)
-        nodes, leaves = res["estimator"][0].nodes_leaves()
-        depth = res["estimator"][0].depth_
         scores.append(res["test_score"])
         times.append(res["fit_time"])
+        for result_item in res["estimator"]:
+            if model == "cart":
+                nodes_item = result_item.tree_.node_count
+                depth_item = result_item.tree_.max_depth
+                leaves_item = result_item.get_n_leaves()
+            else:
+                nodes_item, leaves_item = result_item.nodes_leaves()
+                depth_item = result_item.depth_
+            nodes.append(nodes_item)
+            leaves.append(leaves_item)
+            depths.append(depth_item)
         if verbose:
             print(
                 f"Random seed: {random_state:5d} Accuracy: "
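This hunk also fixes the complexity bookkeeping: with return_estimator=True, cross_validate returns one fitted estimator per fold, and nodes/leaves/depth are now appended for every fold instead of being read from fold 0 only. A self-contained sketch of that pattern using the CART branch on a stock scikit-learn dataset (dataset and seed chosen for illustration):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
kfold = KFold(shuffle=True, random_state=57, n_splits=5)
res = cross_validate(
    DecisionTreeClassifier(random_state=57), X, y, cv=kfold, return_estimator=True
)
# One fitted tree per fold; average the sizes instead of reading fold 0.
nodes = [est.tree_.node_count for est in res["estimator"]]
leaves = [est.get_n_leaves() for est in res["estimator"]]
depths = [est.tree_.max_depth for est in res["estimator"]]
print(np.mean(nodes), np.mean(leaves), np.mean(depths))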
@@ -116,7 +107,7 @@ def process_dataset(dataset, verbose, model, auto_params):
                 f"{res['test_score'].std():6.4f} "
                 f"{res['fit_time'].mean():5.3f}s"
             )
-    return scores, times, json.dumps(hyperparameters), nodes, leaves, depth
+    return scores, times, json.dumps(hyperparameters), nodes, leaves, depths


 def store_string(
@@ -174,82 +165,60 @@ def store_string(
 random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 normalize = True
 standardize = False
-(set_of_files, model, dataset, sql, auto_params) = parse_arguments()
+(set_of_files, model, dataset, sql) = parse_arguments()
 dbh = MySQL()
 if sql:
-    sql_output = open("report_score.sql", "w")
+    sql_output = open(f"{model}.sql", "w")
 database = dbh.get_connection()
 dt = Datasets(normalize, standardize, set_of_files)
 start = time.time()
 if dataset == "all":
     print(
         f"* Process all datasets set with {model}: {set_of_files} "
-        f"norm: {normalize} std: {standardize}"
+        f"norm: {normalize} std: {standardize} store in: {model}"
     )
     print(f"5 Fold Cross Validation with 10 random seeds {random_seeds}\n")
-    print(
-        "{0:30s} {5:4s} {6:3s} {7:2s} {8:2s} {9:2s} {10:2s} {1:13s} {2:13s} "
-        "{3:8s} {4:90s}".format(
-            "Dataset",
-            "Acc. computed",
-            "Best Accuracy",
-            "Diff.",
-            "Best accuracy hyperparameters",
-            "Samp",
-            "Var",
-            "Cls",
-            "N",
-            "L",
-            "D",
-        )
-    )
-    print("=" * 30, end=" ")
-    print("=" * 4, end=" ")
-    print("=" * 3, end=" ")
-    print("=" * 3, end=" ")
-    print("=" * 2, end=" ")
-    print("=" * 2, end=" ")
-    print("=" * 2, end=" ")
-    print("=" * 13, end=" ")
-    print("=" * 13, end=" ")
-    print("=" * 8, end=" ")
-    print("=" * 90)
+    header_cols = [
+        "Dataset",
+        "Samp",
+        "Var",
+        "Cls",
+        "Nodes",
+        "Leaves",
+        "Depth",
+        "Accuracy",
+        "Time",
+        "Parameters",
+    ]
+    header_lengths = [30, 5, 3, 3, 7, 7, 7, 15, 15, 10]
+    line_col = ""
+    for field, underscore in zip(header_cols, header_lengths):
+        print(f"{field:{underscore}s} ", end="")
+        line_col += "=" * underscore + " "
+    print(f"\n{line_col}")
     for dataset in dt:
         X, y = dt.load(dataset[0])  # type: ignore
         samples, features = X.shape
         classes = len(np.unique(y))
         print(
-            f"{dataset[0]:30s} {samples:4d} {features:3d} {classes:3d} ",
+            f"{dataset[0]:30s} {samples:5d} {features:3d} {classes:3d} ",
             end="",
         )
         scores, times, hyperparameters, nodes, leaves, depth = process_dataset(
-            dataset[0], verbose=False, model=model, auto_params=auto_params
+            dataset[0], verbose=False, model=model
         )
-        complexity = dict(nodes=nodes, leaves=leaves, depth=depth)
+        complexity = dict(
+            nodes=float(np.mean(nodes)),
+            leaves=float(np.mean(leaves)),
+            depth=float(np.mean(depth)),
+        )
+        nodes_item, leaves_item, depth_item = complexity.values()
         print(
-            f"{nodes:2d} {leaves:2d} {depth:2d} ",
+            f"{nodes_item:7.2f} {leaves_item:7.2f} {depth_item:7.2f} ",
             end="",
         )
-        record = dbh.find_best(dataset[0], model, "crossval")
-        if record is not None:
-            parameters = json.loads(record[8] if record[8] != "" else "{}")
-            parameters.pop("random_state", None)
-            accuracy_best = record[5]
-            acc_best_std = record[11]
-        else:
-            parameters = json.loads("{}")
-            accuracy_best = 0.0
-            acc_best_std = 0.0
-        if auto_params:
-            # show parameters computed
-            parameters = json.loads(hyperparameters)
-        accuracy_computed = np.mean(scores)
-        diff = accuracy_best - accuracy_computed
-        print(
-            f"{accuracy_computed:6.4f}±{np.std(scores):6.4f} "
-            f"{accuracy_best:6.4f}±{acc_best_std:6.4f} {diff:8.5f} "
-            f"{json.dumps(parameters):40s}"
-        )
+        print(f"{np.mean(scores):8.6f}±{np.std(scores):6.4f} ", end="")
+        print(f"{np.mean(times):8.6f}±{np.std(times):6.4f} {hyperparameters}")
         if sql:
             command = store_string(
                 dataset[0], model, scores, times, hyperparameters, complexity
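The eleven hand-written print("=" * n) calls above are replaced by a single loop that derives both the header row and the "=" rule from one list of column widths. A standalone sketch with the first four columns from the diff:

header_cols = ["Dataset", "Samp", "Var", "Cls"]
header_lengths = [30, 5, 3, 3]
line_col = ""
for field, underscore in zip(header_cols, header_lengths):
    # Pad each title to its column width, and grow a matching "=" rule.
    print(f"{field:{underscore}s} ", end="")
    line_col += "=" * underscore + " "
print(f"\n{line_col}")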
@@ -257,21 +226,25 @@ if dataset == "all":
             print(command, file=sql_output)
 else:
     scores, times, hyperparameters, nodes, leaves, depth = process_dataset(
-        dataset, verbose=True, model=model, auto_params=auto_params
+        dataset, verbose=True, model=model
     )
     record = dbh.find_best(dataset, model, "crossval")
     accuracy = np.mean(scores)
     accuracy_best = record[5] if record is not None else 0.0
     acc_best_std = record[11] if record is not None else 0.0
     print(f"* Normalize/Standard.: {normalize} / {standardize}")
     print(
         f"* Accuracy Computed .: {accuracy:6.4f}±{np.std(scores):6.4f} "
         f"{np.mean(times):5.3f}s"
     )
     print(f"* Accuracy Best .....: {accuracy_best:6.4f}±{acc_best_std:6.4f}")
     print(f"* Difference ........: {accuracy_best - accuracy:6.4f}")
-    print(f"* Nodes/Leaves/Depth : {nodes:2d} {leaves:2d} {depth:2d} ")
+    print(
+        f"* Nodes/Leaves/Depth : {np.mean(nodes):.2f} {np.mean(leaves):.2f} "
+        f"{np.mean(depth):.2f} "
+    )
+    print(f"- Hyperparameters ...: {hyperparameters}")
 stop = time.time()
-print(f"- Auto Hyperparams ..: {hyperparameters}")
 hours, rem = divmod(stop - start, 3600)
 minutes, seconds = divmod(rem, 60)
 print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")