Add report score for stree

update param_analysis for stree only
2021-03-23 00:50:04 +01:00
parent 08fb237001
commit 47078208bc
4 changed files with 291 additions and 49 deletions

report_score.py (new file, 236 lines)

@@ -0,0 +1,236 @@
import argparse
import random
import time
from datetime import datetime
import json
import numpy as np
from stree import Stree
from sklearn.model_selection import KFold, cross_validate
from experimentation.Sets import Datasets
from experimentation.Database import MySQL
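

# Example invocation (illustrative): process every dataset in the selected
# set with auto hyperparameters and write report_score.sql:
#   python report_score.py -d all -p -s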
def parse_arguments():
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-S",
        "--set-of-files",
        type=str,
        choices=["aaai", "tanveer"],
        required=False,
        default="tanveer",
    )
    ap.add_argument(
        "-m",
        "--model",
        type=str,
        required=False,
        default="stree",
        help="model name, default stree",
    )
    ap.add_argument(
        "-d",
        "--dataset",
        type=str,
        required=True,
        help="dataset to process, or 'all' to process every dataset",
    )
    ap.add_argument(
        "-s",
        "--sql",
        action="store_true",  # type=bool treats any non-empty string as True
        required=False,
        help="generate report_score.sql",
    )
    ap.add_argument(
        "-p",
        "--param",
        action="store_true",  # same fix as --sql: a proper boolean flag
        required=False,
        help="auto generate hyperparameters",
    )
    args = ap.parse_args()
    return (args.set_of_files, args.model, args.dataset, args.sql, args.param)
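

# Heuristic hyperparameters for Stree: cap the solver iterations, soften C,
# and switch the split criterion to max_samples on multiclass problems.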
def compute_auto_hyperparams(X, y):
    params = {"max_iter": 10_000, "C": 0.1}  # int, not 1e4: max_iter is a count
    classes = len(np.unique(y))
    if classes > 2:
        params["split_criteria"] = "max_samples"
    return params
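

# 5-fold cross-validation repeated over every random seed; relies on the
# module-level globals dt, set_of_files and random_seeds defined below.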
def process_dataset(dataset, verbose, model, auto_params):
    X, y = dt.load(dataset)
    scores = []
    times = []
    if verbose:
        print(
            f"* Processing dataset [{dataset}] from Set: {set_of_files} with "
            f"{model}"
        )
        print(f"X.shape: {X.shape}")
        print(f"{X[:4]}")
        print(f"Random seeds: {random_seeds}")
    if auto_params:
        hyperparameters = compute_auto_hyperparams(X, y)
    else:
        hyperparameters = {}
    for random_state in random_seeds:
        random.seed(random_state)
        np.random.seed(random_state)
        kfold = KFold(shuffle=True, random_state=random_state, n_splits=5)
        clf = Stree(random_state=random_state)
        clf.set_params(**hyperparameters)
        res = cross_validate(clf, X, y, cv=kfold)
        scores.append(res["test_score"])
        times.append(res["fit_time"])
        if verbose:
            print(
                f"Random seed: {random_state:5d} Accuracy: "
                f"{res['test_score'].mean():6.4f}±"
                f"{res['test_score'].std():6.4f} "
                f"{res['fit_time'].mean():5.3f}s"
            )
    return scores, times, json.dumps(hyperparameters)
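

# Build a REPLACE INTO statement for the results table; the norm/stand columns
# record the normalize and standardize flags used to load the datasets.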
def store_string(dataset, model, accuracy, time_spent, hyperparameters):
    attributes = [
        "date",
        "time",
        "type",
        "accuracy",
        "accuracy_std",
        "dataset",
        "classifier",
        "norm",
        "stand",
        "time_spent",
        "time_spent_std",
        "parameters",
    ]
    command_insert = (
        "replace into results ("
        + ",".join(attributes)
        + ") values("
        + ("'%s'," * len(attributes))[:-1]
        + ");"
    )
    now = datetime.now()
    date = now.strftime("%Y-%m-%d")
    time_ = now.strftime("%H:%M:%S")  # avoid shadowing the time module
    values = (
        date,
        time_,
        "crossval",
        np.mean(accuracy),
        np.std(accuracy),
        dataset,
        model,
        normalize,  # was hard-coded True; keep in sync with the loader flags
        standardize,  # was hard-coded False
        np.mean(time_spent),
        np.std(time_spent),
        hyperparameters,
    )
    result = command_insert % values
    return result
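

# --- script body ---
# Fixed seeds keep the 10 cross-validation runs reproducible.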
random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
normalize = True
standardize = False
(set_of_files, model, dataset, sql, auto_params) = parse_arguments()
dbh = MySQL()
if sql:
    sql_output = open("report_score.sql", "w")
database = dbh.get_connection()
dt = Datasets(normalize, standardize, set_of_files)
start = time.time()
if dataset == "all":
    print(
        f"* Process all datasets set with {model}: {set_of_files} "
        f"norm: {normalize} std: {standardize}"
    )
    print(f"5 Fold Cross Validation with 10 random seeds {random_seeds}\n")
    print(
        "{0:30s} {5:4s} {6:3s} {7:2s} {1:13s} {2:13s} {3:8s} {4:90s}".format(
            "Dataset",
            "Acc. computed",
            "Best Accuracy",
            "Diff.",
            "Best accuracy hyperparameters",
            "Samp",
            "Var",
            "Cls",
        )
    )
    print("=" * 30, end=" ")
    print("=" * 4, end=" ")
    print("=" * 3, end=" ")
    print("=" * 3, end=" ")
    print("=" * 13, end=" ")
    print("=" * 13, end=" ")
    print("=" * 8, end=" ")
    print("=" * 90)
    for dataset in dt:
        X, y = dt.load(dataset[0])  # type: ignore
        samples, features = X.shape
        classes = len(np.unique(y))
        print(
            f"{dataset[0]:30s} {samples:4d} {features:3d} {classes:3d} ",
            end="",
        )
        scores, times, hyperparameters = process_dataset(
            dataset[0], verbose=False, model=model, auto_params=auto_params
        )
        record = dbh.find_best(dataset[0], model, "crossval")
        if record is not None:
            parameters = json.loads(record[8] if record[8] != "" else "{}")
            parameters.pop("random_state", None)
            accuracy_best = record[5]
            acc_best_std = record[11]
        else:
            parameters = {}
            accuracy_best = 0.0
            acc_best_std = 0.0
        accuracy_computed = np.mean(scores)
        diff = accuracy_best - accuracy_computed
        print(
            f"{accuracy_computed:6.4f}±{np.std(scores):6.4f} "
            f"{accuracy_best:6.4f}±{acc_best_std:6.4f} {diff:8.5f} "
            f"{json.dumps(parameters):40s}"
        )
        if sql:
            command = store_string(
                dataset[0], model, scores, times, hyperparameters
            )
            print(command, file=sql_output)
else:
    scores, times, hyperparameters = process_dataset(
        dataset, verbose=True, model=model, auto_params=auto_params
    )
    record = dbh.find_best(dataset, model, "crossval")
    accuracy = np.mean(scores)
    accuracy_best = record[5] if record is not None else 0.0
    acc_best_std = record[11] if record is not None else 0.0
    print(
        f"* Accuracy Computed : {accuracy:6.4f}±{np.std(scores):6.4f} "
        f"{np.mean(times):5.3f}s"
    )
    print(f"* Accuracy Best ....: {accuracy_best:6.4f}±{acc_best_std:6.4f}")
    print(f"* Difference .......: {accuracy_best - accuracy:6.4f}")
stop = time.time()
print(f"- Auto Hyperparams .: {hyperparameters}")
hours, rem = divmod(stop - start, 3600)
minutes, seconds = divmod(rem, 60)
print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
if sql:
    sql_output.close()
dbh.close()