"""Cross-validate an Stree classifier on one dataset or on a whole dataset set.

The script runs a k-fold cross validation once per random seed, prints the
resulting accuracy next to the best result found through the project's
``MySQL`` helper and, optionally, writes ``replace into results`` statements
to ``report_score.sql``.
"""
import argparse
import json
import random
import time
from datetime import datetime

import numpy as np
from sklearn.model_selection import KFold, cross_validate
from stree import Stree

from experimentation.Database import MySQL
from experimentation.Sets import Datasets

# Spellings accepted as "false" for the boolean command line options.
_FALSE_STRINGS = frozenset({"", "0", "false", "f", "no", "n", "off"})


def _str2bool(value):
    """Convert a command line string into a boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because any non-empty string is truthy. Here the usual "false"
    spellings (case-insensitive) map to ``False`` and every other value keeps
    mapping to ``True``, so invocations such as ``-s 1`` or ``-s True`` behave
    exactly as before.
    """
    return str(value).strip().lower() not in _FALSE_STRINGS


def parse_arguments():
    """Parse the command line.

    Returns:
        A tuple ``(set_of_files, model, dataset, sql, param)`` holding the
        values of the ``-S``, ``-m``, ``-d``, ``-s`` and ``-p`` options.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-S",
        "--set-of-files",
        type=str,
        choices=["aaai", "tanveer"],
        required=False,
        default="tanveer",
    )
    ap.add_argument(
        "-m",
        "--model",
        type=str,
        required=False,
        default="stree",
        help="model name, default stree",
    )
    ap.add_argument(
        "-d",
        "--dataset",
        type=str,
        required=True,
        help="dataset to process, all for everyone",
    )
    ap.add_argument(
        "-s",
        "--sql",
        default=False,
        type=_str2bool,
        required=False,
        help="generate report_score.sql",
    )
    ap.add_argument(
        "-p",
        "--param",
        default=False,
        type=_str2bool,
        required=False,
        help="Auto generate params",
    )
    args = ap.parse_args()
    return (args.set_of_files, args.model, args.dataset, args.sql, args.param)


def compute_auto_hyperparams(X, y):
    """Build the automatic hyperparameter dictionary for a dataset.

    Args:
        X: feature matrix (not used by the current rules).
        y: label vector; only its number of distinct values is inspected.

    Returns:
        A dict with ``max_iter`` and ``C`` and, when ``y`` holds more than two
        distinct labels, ``split_criteria`` set to ``"max_samples"``.
    """
    params = {"max_iter": 1e4, "C": 0.1}
    classes = len(np.unique(y))
    if classes > 2:
        params["split_criteria"] = "max_samples"
    return params


def process_dataset(dataset, verbose, model, auto_params):
    """Cross-validate Stree on ``dataset`` once per seed in ``random_seeds``.

    Relies on the module-level ``dt``, ``random_seeds``, ``n_folds`` and
    ``set_of_files`` defined by the driver code below.

    Args:
        dataset: name passed to ``dt.load``.
        verbose: when true, print the dataset shape, its first rows, the
            seeds and one result line per seed.
        model: model name, used only in the verbose banner.
            NOTE(review): the classifier built here is always ``Stree``,
            whatever ``model`` says.
        auto_params: when true, hyperparameters come from
            ``compute_auto_hyperparams``; otherwise Stree defaults are used.

    Returns:
        A tuple ``(scores, times, hyperparameters)``: two lists holding, for
        each seed, the ``test_score`` and ``fit_time`` arrays returned by
        ``cross_validate``, and the hyperparameters serialized as JSON.
    """
    X, y = dt.load(dataset)
    scores = []
    times = []
    if verbose:
        print(
            f"* Processing dataset [{dataset}] from Set: {set_of_files} with "
            f"{model}"
        )
        print(f"X.shape: {X.shape}")
        print(f"{X[:4]}")
        print(f"Random seeds: {random_seeds}")
    if auto_params:
        hyperparameters = compute_auto_hyperparams(X, y)
    else:
        hyperparameters = {}
    for random_state in random_seeds:
        # Seed every random source so each run is reproducible.
        random.seed(random_state)
        np.random.seed(random_state)
        kfold = KFold(
            shuffle=True, random_state=random_state, n_splits=n_folds
        )
        clf = Stree(random_state=random_state)
        clf.set_params(**hyperparameters)
        res = cross_validate(clf, X, y, cv=kfold)
        scores.append(res["test_score"])
        times.append(res["fit_time"])
        if verbose:
            print(
                f"Random seed: {random_state:5d} Accuracy: "
                f"{res['test_score'].mean():6.4f}±"
                f"{res['test_score'].std():6.4f} "
                f"{res['fit_time'].mean():5.3f}s"
            )
    return scores, times, json.dumps(hyperparameters)


def _sql_quote(value):
    """Render ``value`` as text safe to place between single quotes in SQL.

    Embedded single quotes are doubled, which is the standard SQL escape, so
    a quote inside a dataset name or a hyperparameter string can no longer
    terminate the literal early.
    """
    return str(value).replace("'", "''")


def store_string(dataset, model, accuracy, time_spent, hyperparameters):
    """Build the ``replace into results`` statement for one experiment.

    Args:
        dataset: dataset name stored in the ``dataset`` column.
        model: classifier name stored in the ``classifier`` column.
        accuracy: scores whose mean and standard deviation are stored.
        time_spent: fit times whose mean and standard deviation are stored.
        hyperparameters: JSON text stored in the ``parameters`` column.

    Returns:
        The SQL statement as a string, with every value written as a
        single-quoted literal and stamped with the current date and time.
    """
    attributes = [
        "date",
        "time",
        "type",
        "accuracy",
        "accuracy_std",
        "dataset",
        "classifier",
        "norm",
        "stand",
        "time_spent",
        "time_spent_std",
        "parameters",
    ]
    now = datetime.now()
    values = (
        now.strftime("%Y-%m-%d"),
        now.strftime("%H:%M:%S"),
        "crossval",
        np.mean(accuracy),
        np.std(accuracy),
        dataset,
        model,
        # norm / stand: same values as the module-level normalize and
        # standardize settings used to load the data.
        True,
        False,
        np.mean(time_spent),
        np.std(time_spent),
        hyperparameters,
    )
    columns = ",".join(attributes)
    literals = ",".join(f"'{_sql_quote(value)}'" for value in values)
    return f"replace into results ({columns}) values({literals});"


random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
n_folds = 5
normalize = True
standardize = False
(set_of_files, model, dataset, sql, auto_params) = parse_arguments()
dbh = MySQL()
sql_output = None
try:
    if sql:
        sql_output = open("report_score.sql", "w")
    database = dbh.get_connection()
    dt = Datasets(normalize, standardize, set_of_files)
    start = time.time()
    # Pre-set so the summary line below works even if no dataset is processed.
    hyperparameters = "{}"
    if dataset == "all":
        print(
            f"* Process all datasets set with {model}: {set_of_files} "
            f"norm: {normalize} std: {standardize}"
        )
        print(
            f"{n_folds} Fold Cross Validation with {len(random_seeds)} "
            f"random seeds {random_seeds}\n"
        )
        print(
            "{0:30s} {5:4s} {6:3s} {7:2s} {1:13s} {2:13s} {3:8s} {4:90s}".format(
                "Dataset",
                "Acc. computed",
                "Best Accuracy",
                "Diff.",
                "Best accuracy hyperparameters",
                "Samp",
                "Var",
                "Cls",
            )
        )
        print("=" * 30, end=" ")
        print("=" * 4, end=" ")
        print("=" * 3, end=" ")
        print("=" * 3, end=" ")
        print("=" * 13, end=" ")
        print("=" * 13, end=" ")
        print("=" * 8, end=" ")
        print("=" * 90)
        # A separate loop variable keeps the command line ``dataset`` intact.
        for entry in dt:
            name = entry[0]
            X, y = dt.load(name)  # type: ignore
            samples, features = X.shape
            classes = len(np.unique(y))
            print(
                f"{name:30s} {samples:4d} {features:3d} " f"{classes:3d} ",
                end="",
            )
            scores, times, hyperparameters = process_dataset(
                name, verbose=False, model=model, auto_params=auto_params
            )
            record = dbh.find_best(name, model, "crossval")
            if record is not None:
                # Fields 8, 5 and 11 of the record are read as the stored
                # parameters (JSON text), accuracy and accuracy deviation.
                parameters = json.loads(record[8] if record[8] != "" else "{}")
                parameters.pop("random_state", None)
                accuracy_best = record[5]
                acc_best_std = record[11]
            else:
                parameters = {}
                accuracy_best = 0.0
                acc_best_std = 0.0
            accuracy_computed = np.mean(scores)
            diff = accuracy_best - accuracy_computed
            print(
                f"{accuracy_computed:6.4f}±{np.std(scores):6.4f} "
                f"{accuracy_best:6.4f}±{acc_best_std:6.4f} {diff:8.5f} "
                f"{json.dumps(parameters):40s}"
            )
            if sql:
                command = store_string(
                    name, model, scores, times, hyperparameters
                )
                print(command, file=sql_output)
    else:
        scores, times, hyperparameters = process_dataset(
            dataset, verbose=True, model=model, auto_params=auto_params
        )
        record = dbh.find_best(dataset, model, "crossval")
        accuracy = np.mean(scores)
        accuracy_best = record[5] if record is not None else 0.0
        acc_best_std = record[11] if record is not None else 0.0
        print(
            f"* Accuracy Computed : {accuracy:6.4f}±{np.std(scores):6.4f} "
            f"{np.mean(times):5.3f}s"
        )
        print(f"* Accuracy Best ....: {accuracy_best:6.4f}±{acc_best_std:6.4f}")
        print(f"* Difference .......: {accuracy_best - accuracy:6.4f}")
    stop = time.time()
    print(f"- Auto Hyperparams .: {hyperparameters}")
    hours, rem = divmod(stop - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print(f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s")
finally:
    # Release the report file and the database handle even when a run fails.
    if sql_output is not None:
        sql_output.close()
    dbh.close()