diff --git a/src/Experiments.py b/src/Experiments.py
index 8470771..029ddcb 100644
--- a/src/Experiments.py
+++ b/src/Experiments.py
@@ -7,7 +7,12 @@
 from datetime import datetime
 from tqdm import tqdm
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
+from sklearn.model_selection import (
+    StratifiedKFold,
+    KFold,
+    GridSearchCV,
+    cross_validate,
+)
 from Utils import Folders, Files
 from Models import Models
@@ -288,3 +293,119 @@ class Experiment:
         self._output_results()
         if self.progress_bar:
             print(f"Results in {self.output_file}")
+
+
+class GridSearch:
+    def __init__(
+        self,
+        score_name,
+        model_name,
+        stratified,
+        datasets,
+        platform,
+        progress_bar=True,
+        folds=5,
+    ):
+        today = datetime.now()
+        self.time = today.strftime("%H:%M:%S")
+        self.date = today.strftime("%Y-%m-%d")
+        self.output_file = os.path.join(
+            Folders.results,
+            Files.grid_output(
+                score_name,
+                model_name,
+            ),
+        )
+        self.score_name = score_name
+        self.model_name = model_name
+        self.stratified = stratified == "1"
+        self.stratified_class = StratifiedKFold if self.stratified else KFold
+        self.datasets = datasets
+        self.progress_bar = progress_bar
+        self.folds = folds
+        self.platform = platform
+        self.random_seeds = Randomized.seeds
+        self.grid_file = os.path.join(
+            Folders.results, Files.grid_input(score_name, model_name)
+        )
+        with open(self.grid_file) as f:
+            self.grid = json.load(f)
+        self.duration = 0
+        self._init_data()
+
+    def _init_data(self):
+        # initialize the result file if it does not exist yet
+        try:
+            with open(self.output_file, "r") as f:
+                self.results = json.load(f)
+        except FileNotFoundError:
+            # build an empty entry for every dataset
+            output = {}
+            data = Datasets()
+            for item in data:
+                output[item] = [0.0, {}, ""]
+            with open(self.output_file, "w") as f:
+                json.dump(output, f)
+            self.results = output
+
+    def _save_results(self):
+        with open(self.output_file, "r") as f:
+            data = json.load(f)
+        for item in self.datasets:
+            data[item] = self.results[item]
+        with open(self.output_file, "w") as f:
+            json.dump(data, f)
+
+    def _store_result(self, name, grid, duration):
+        d_message = f"{duration:.3f} s"
+        if duration > 3600:
+            d_message = f"{duration / 3600:.3f} h"
+        elif duration > 60:
+            d_message = f"{duration / 60:.3f} min"
+        message = (
+            f"v. {self.version}, Computed on {self.platform} on "
+            f"{self.date} at {self.time} "
+            f"took {d_message}"
+        )
+        score = grid.best_score_
+        hyperparameters = grid.best_params_
+        self.results[name] = [score, hyperparameters, message]
+
+    def do_gridsearch(self):
+        now = time.time()
+        loop = tqdm(
+            list(self.datasets),
+            position=0,
+            disable=not self.progress_bar,
+        )
+        for name in loop:
+            loop.set_description(f"{name:30s}")
+            X, y = self.datasets.load(name)
+            result = self._n_fold_gridsearch(X, y)
+            self._store_result(name, result, time.time() - now)
+        self._save_results()
+
+    def _n_fold_gridsearch(self, X, y):
+        kfold = self.stratified_class(
+            shuffle=True,
+            random_state=self.random_seeds[0],
+            n_splits=self.folds,
+        )
+        clf = Models.get_model(self.model_name)
+        self.version = clf.version() if hasattr(clf, "version") else "-"
+        self._num_warnings = 0
+        warnings.warn = self._warn
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            grid = GridSearchCV(
+                estimator=clf,
+                cv=kfold,
+                param_grid=self.grid,
+                scoring=self.score_name,
+                n_jobs=-1,
+            )
+            grid.fit(X, y)
+        return grid
+
+    def _warn(self, *args, **kwargs) -> None:
+        self._num_warnings += 1
diff --git a/src/Models.py b/src/Models.py
index 281d01e..5b12155 100644
--- a/src/Models.py
+++ b/src/Models.py
@@ -21,7 +21,7 @@ class Models:
         if name == "SVC":
             return SVC()
         if name == "ODTE":
-            return Odte()
+            return Odte(base_estimator=Stree())
         if name == "BaggingStree":
             clf = Stree(random_state=random_state)
             return BaggingClassifier(base_estimator=clf)
diff --git a/src/Utils.py b/src/Utils.py
index 5cda853..1adad1c 100644
--- a/src/Utils.py
+++ b/src/Utils.py
@@ -49,6 +49,18 @@ class Files:
             f"{time}_{stratified}.json"
         )
 
+    @staticmethod
+    def grid_input(score, model):
+        return Files.grid("input", score, model)
+
+    @staticmethod
+    def grid_output(score, model):
+        return Files.grid("output", score, model)
+
+    @staticmethod
+    def grid(kind, score, model):
+        return f"grid_{kind}_{score.replace('_','-')}_{model}.json"
+
     def split_file_name(self, name):
         _, score, model, platform, date, time, stratified = name.split("_")
         stratified = stratified.replace(self.report_ext, "")
diff --git a/src/build_grid.py b/src/build_grid.py
new file mode 100755
index 0000000..c0fc166
--- /dev/null
+++ b/src/build_grid.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+import os
+import json
+from Utils import Files, Folders
+
+data = [
+    '{"C": 1e4, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 7, "gamma": 0.14, "kernel": "rbf"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.95, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf"}',
+    '{"kernel": "rbf"}',
+    '{"C": 1.05, "gamma": "auto","kernel": "rbf"}',
+    '{"splitter": "random", "max_features": "auto"}',
+    '{"C": 0.05, "max_features": "auto", "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf", "C": 0.05}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 0.25, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.08, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.001, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 2.8, "kernel": "rbf", "gamma": "auto"}', + '{"kernel": "rbf"}', + '{"C": 0.05, "gamma": 0.1, "kernel": "poly"}', + '{"C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}', + '{"kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C":57, "kernel": "rbf"}', + '{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": "ovr"}', + '{"C": 5, "kernel": "rbf", "gamma": "auto"}', + '{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": "ovr"}', + '{"kernel": "rbf", "gamma": 0.001}', + '{"C": 1e4, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 7, "kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 2.83, "kernel": "rbf", "gamma": "auto"}', + '{"C": 0.2, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}', + '{"kernel": "liblinear", "multiclass_strategy": "ovr"}', + '{"C": 2, "gamma": "auto", "kernel": "rbf"}', + '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}', +] + +results = {} +output = [] +hyper = ["C", "gamma", "kernel", "multiclass_strategy"] +kernels = ["linear", "liblinear", "rbf", "poly"] + +# initialize results +for kernel in kernels: + results[kernel] = {} + for item in hyper: + results[kernel][item] = [] +# load data +for item in data: + line = json.loads(item) + if "kernel" not in line: + line["kernel"] = "linear" + kernel = line["kernel"] + for item in hyper: + if item in line: + results[kernel][item].append(line[item]) if line[ + item + ] not in results[kernel][item] else None + +# Add default values and remove inconsistent values +results["linear"]["multiclass_strategy"] = ["ovo"] +del results["linear"]["gamma"] +del results["liblinear"]["gamma"] +results["rbf"]["gamma"].append("scale") +results["poly"]["multiclass_strategy"].append("ovo") +for kernel in kernels: + results[kernel]["C"].append(1.0) + +for item in results: + results_tmp = {} + for key, value in results[item].items(): + new_key = f"base_estimator__{key}" + try: + results_tmp[new_key] = sorted(value) + except: + t1 = sorted( + [ + x + for x in value + if isinstance(x, int) or isinstance(x, float) + ] + ) + t2 = sorted([x for x in value if isinstance(x, str)]) + results_tmp[new_key] = t1 + t2 + output.append(results_tmp) + +# save results +file_name = Files.grid_input("accuracy", "ODTE") +file_output = os.path.join(Folders.results, file_name) +with open(file_output, "w") as f: + json.dump(output, f) +print(f"Grid values saved to {file_output}") diff --git a/src/grid.py b/src/grid.py new file mode 100755 index 0000000..0c693ae --- /dev/null +++ b/src/grid.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +import argparse +from Experiments import GridSearch, Datasets +from Utils import EnvDefault + +"""Do experiment and build result file, optionally print report with results +""" + + +def parse_arguments(): + ap = argparse.ArgumentParser() + ap.add_argument( + "-s", + "--score", + action=EnvDefault, + envvar="score", + type=str, + required=True, + help="score name {accuracy, f1_macro, ...}", + ) + ap.add_argument( + "-P", + "--platform", + action=EnvDefault, + envvar="platform", + type=str, + required=True, + help="Platform where the test is run", + ) + ap.add_argument( + 
"-m", + "--model", + type=str, + required=True, + help="model name", + ) + ap.add_argument( + "-n", + "--n_folds", + action=EnvDefault, + envvar="n_folds", + type=int, + required=True, + help="number of folds", + ) + ap.add_argument( + "-q", + "--quiet", + type=bool, + default=False, + required=False, + help="Wether to show progress bar or not", + ) + ap.add_argument( + "-t", + "--stratified", + action=EnvDefault, + envvar="stratified", + type=str, + required=True, + help="Stratified", + ) + ap.add_argument( + "-d", + "--dataset", + type=str, + required=True, + default=None, + help="Gridsearch on this dataset", + ) + args = ap.parse_args() + return ( + args.stratified, + args.score, + args.model, + args.n_folds, + args.platform, + args.quiet, + args.dataset, + ) + + +( + stratified, + score, + model, + folds, + platform, + quiet, + dataset, +) = parse_arguments() +job = GridSearch( + score_name=score, + model_name=model, + stratified=stratified, + datasets=Datasets(dataset=dataset), + progress_bar=not quiet, + platform=platform, + folds=folds, +) +job.do_gridsearch()