mirror of https://github.com/Doctorado-ML/benchmark.git

Add gridsearch experiment
src/Experiments.py
@@ -7,7 +7,12 @@ from datetime import datetime
 from tqdm import tqdm
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
+from sklearn.model_selection import (
+    StratifiedKFold,
+    KFold,
+    GridSearchCV,
+    cross_validate,
+)
 from Utils import Folders, Files
 from Models import Models
@@ -288,3 +293,119 @@ class Experiment:
         self._output_results()
         if self.progress_bar:
             print(f"Results in {self.output_file}")
+
+
+class GridSearch:
+    def __init__(
+        self,
+        score_name,
+        model_name,
+        stratified,
+        datasets,
+        platform,
+        progress_bar=True,
+        folds=5,
+    ):
+        today = datetime.now()
+        self.time = today.strftime("%H:%M:%S")
+        self.date = today.strftime("%Y-%m-%d")
+        self.output_file = os.path.join(
+            Folders.results,
+            Files.grid_output(
+                score_name,
+                model_name,
+            ),
+        )
+        self.score_name = score_name
+        self.model_name = model_name
+        self.stratified = stratified == "1"
+        self.stratified_class = StratifiedKFold if self.stratified else KFold
+        self.datasets = datasets
+        self.progress_bar = progress_bar
+        self.folds = folds
+        self.platform = platform
+        self.random_seeds = Randomized.seeds
+        self.grid_file = os.path.join(
+            Folders.results, Files.grid_input(score_name, model_name)
+        )
+        with open(self.grid_file) as f:
+            self.grid = json.load(f)
+        self.duration = 0
+        self._init_data()
+
+    def _init_data(self):
+        # if the result file does not exist, initialize it
+        try:
+            with open(self.output_file, "r") as f:
+                self.results = json.load(f)
+        except FileNotFoundError:
+            # init file
+            output = {}
+            data = Datasets()
+            for item in data:
+                output[item] = [0.0, {}, ""]
+            with open(self.output_file, "w") as f:
+                json.dump(output, f)
+            self.results = output
+
+    def _save_results(self):
+        with open(self.output_file, "r") as f:
+            data = json.load(f)
+        for item in self.datasets:
+            data[item] = self.results[item]
+        with open(self.output_file, "w") as f:
+            json.dump(data, f)
+
+    def _store_result(self, name, grid, duration):
+        d_message = f"{duration:.3f} s"
+        if duration > 3600:
+            d_message = f"{duration / 3600:.3f} h"
+        elif duration > 60:
+            d_message = f"{duration / 60:.3f} min"
+        message = (
+            f"v. {self.version}, Computed on {self.platform} on "
+            f"{self.date} at {self.time} "
+            f"took {d_message}"
+        )
+        score = grid.best_score_
+        hyperparameters = grid.best_params_
+        self.results[name] = [score, hyperparameters, message]
+
+    def do_gridsearch(self):
+        now = time.time()
+        loop = tqdm(
+            list(self.datasets),
+            position=0,
+            disable=not self.progress_bar,
+        )
+        for name in loop:
+            loop.set_description(f"{name:30s}")
+            X, y = self.datasets.load(name)
+            result = self._n_fold_gridsearch(X, y)
+            self._store_result(name, result, time.time() - now)
+        self._save_results()
+
+    def _n_fold_gridsearch(self, X, y):
+        kfold = self.stratified_class(
+            shuffle=True,
+            random_state=self.random_seeds[0],
+            n_splits=self.folds,
+        )
+        clf = Models.get_model(self.model_name)
+        self.version = clf.version() if hasattr(clf, "version") else "-"
+        self._num_warnings = 0
+        warnings.warn = self._warn
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore")
+            grid = GridSearchCV(
+                estimator=clf,
+                cv=kfold,
+                param_grid=self.grid,
+                scoring=self.score_name,
+                n_jobs=-1,
+            )
+            grid.fit(X, y)
+        return grid
+
+    def _warn(self, *args, **kwargs) -> None:
+        self._num_warnings += 1
src/Models.py
@@ -21,7 +21,7 @@ class Models:
         if name == "SVC":
             return SVC()
         if name == "ODTE":
-            return Odte()
+            return Odte(base_estimator=Stree())
         if name == "BaggingStree":
             clf = Stree(random_state=random_state)
             return BaggingClassifier(base_estimator=clf)
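A likely reason for passing the base estimator explicitly (not stated in the commit): the grid built by build_grid.py below prefixes every hyperparameter with base_estimator__, and scikit-learn only routes such nested parameters to the inner Stree when the ensemble exposes it through that attribute. A sketch of the routing, with illustrative values:

    # Keys with the "base_estimator__" prefix are forwarded by GridSearchCV
    # to the Stree wrapped inside Odte (values here are illustrative only):
    param_grid = {
        "base_estimator__C": [0.05, 1.0, 7],
        "base_estimator__kernel": ["liblinear", "rbf"],
    }
    # GridSearchCV(estimator=Odte(base_estimator=Stree()), param_grid=param_grid, ...)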
src/Utils.py (+12)
@@ -49,6 +49,18 @@ class Files:
             f"{time}_{stratified}.json"
         )

+    @staticmethod
+    def grid_input(score, model):
+        return Files.grid("input", score, model)
+
+    @staticmethod
+    def grid_output(score, model):
+        return Files.grid("output", score, model)
+
+    @staticmethod
+    def grid(kind, score, model):
+        return f"grid_{kind}_{score.replace('_','-')}_{model}.json"
+
     def split_file_name(self, name):
         _, score, model, platform, date, time, stratified = name.split("_")
         stratified = stratified.replace(self.report_ext, "")
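Derived from the grid helper above, the file names these methods produce look like this (a quick illustrative check, not part of the commit):

    from Utils import Files

    # underscores in the score name become dashes inside the file name
    assert Files.grid_input("accuracy", "ODTE") == "grid_input_accuracy_ODTE.json"
    assert Files.grid_output("f1_macro", "ODTE") == "grid_output_f1-macro_ODTE.json"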
src/build_grid.py (new executable file, +105)
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+import os
+import json
+from Utils import Files, Folders
+
+data = [
+    '{"C": 1e4, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 7, "gamma": 0.14, "kernel": "rbf"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.95, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf"}',
+    '{"kernel": "rbf"}',
+    '{"C": 1.05, "gamma": "auto","kernel": "rbf"}',
+    '{"splitter": "random", "max_features": "auto"}',
+    '{"C": 0.05, "max_features": "auto", "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf", "C": 0.05}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 0.25, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.08, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.001, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2.8, "kernel": "rbf", "gamma": "auto"}',
+    '{"kernel": "rbf"}',
+    '{"C": 0.05, "gamma": 0.1, "kernel": "poly"}',
+    '{"C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C":57, "kernel": "rbf"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": "ovr"}',
+    '{"C": 5, "kernel": "rbf", "gamma": "auto"}',
+    '{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf", "gamma": 0.001}',
+    '{"C": 1e4, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2.83, "kernel": "rbf", "gamma": "auto"}',
+    '{"C": 0.2, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2, "gamma": "auto", "kernel": "rbf"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+]
+
+results = {}
+output = []
+hyper = ["C", "gamma", "kernel", "multiclass_strategy"]
+kernels = ["linear", "liblinear", "rbf", "poly"]
+
+# initialize results
+for kernel in kernels:
+    results[kernel] = {}
+    for item in hyper:
+        results[kernel][item] = []
+# load data
+for item in data:
+    line = json.loads(item)
+    if "kernel" not in line:
+        line["kernel"] = "linear"
+    kernel = line["kernel"]
+    for item in hyper:
+        if item in line:
+            results[kernel][item].append(line[item]) if line[
+                item
+            ] not in results[kernel][item] else None
+
+# Add default values and remove inconsistent values
+results["linear"]["multiclass_strategy"] = ["ovo"]
+del results["linear"]["gamma"]
+del results["liblinear"]["gamma"]
+results["rbf"]["gamma"].append("scale")
+results["poly"]["multiclass_strategy"].append("ovo")
+for kernel in kernels:
+    results[kernel]["C"].append(1.0)
+
+for item in results:
+    results_tmp = {}
+    for key, value in results[item].items():
+        new_key = f"base_estimator__{key}"
+        try:
+            results_tmp[new_key] = sorted(value)
+        except:
+            t1 = sorted(
+                [
+                    x
+                    for x in value
+                    if isinstance(x, int) or isinstance(x, float)
+                ]
+            )
+            t2 = sorted([x for x in value if isinstance(x, str)])
+            results_tmp[new_key] = t1 + t2
+    output.append(results_tmp)
+
+# save results
+file_name = Files.grid_input("accuracy", "ODTE")
+file_output = os.path.join(Folders.results, file_name)
+with open(file_output, "w") as f:
+    json.dump(output, f)
+print(f"Grid values saved to {file_output}")
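The script therefore writes a list with one parameter grid per kernel (linear, liblinear, rbf, poly) to the grid_input_accuracy_ODTE.json file under Folders.results, with every key carrying the base_estimator__ prefix. A sketch of one plausible entry (abridged, illustrative values, not copied from an actual run):

    # One plausible entry of the generated list (illustrative values only):
    example_entry = {
        "base_estimator__C": [0.05, 0.2, 1.0, 7],
        "base_estimator__kernel": ["liblinear"],
        "base_estimator__multiclass_strategy": ["ovr"],
    }
    # GridSearchCV explores the Cartesian product of each dict's value lists,
    # so GridSearch above can pass the whole list straight to param_grid.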
src/grid.py (new executable file, +101)
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+import argparse
+from Experiments import GridSearch, Datasets
+from Utils import EnvDefault
+
+"""Do experiment and build result file, optionally print report with results
+"""
+
+
+def parse_arguments():
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "-s",
+        "--score",
+        action=EnvDefault,
+        envvar="score",
+        type=str,
+        required=True,
+        help="score name {accuracy, f1_macro, ...}",
+    )
+    ap.add_argument(
+        "-P",
+        "--platform",
+        action=EnvDefault,
+        envvar="platform",
+        type=str,
+        required=True,
+        help="Platform where the test is run",
+    )
+    ap.add_argument(
+        "-m",
+        "--model",
+        type=str,
+        required=True,
+        help="model name",
+    )
+    ap.add_argument(
+        "-n",
+        "--n_folds",
+        action=EnvDefault,
+        envvar="n_folds",
+        type=int,
+        required=True,
+        help="number of folds",
+    )
+    ap.add_argument(
+        "-q",
+        "--quiet",
+        type=bool,
+        default=False,
+        required=False,
+        help="Whether to show progress bar or not",
+    )
+    ap.add_argument(
+        "-t",
+        "--stratified",
+        action=EnvDefault,
+        envvar="stratified",
+        type=str,
+        required=True,
+        help="Stratified",
+    )
+    ap.add_argument(
+        "-d",
+        "--dataset",
+        type=str,
+        required=True,
+        default=None,
+        help="Gridsearch on this dataset",
+    )
+    args = ap.parse_args()
+    return (
+        args.stratified,
+        args.score,
+        args.model,
+        args.n_folds,
+        args.platform,
+        args.quiet,
+        args.dataset,
+    )
+
+
+(
+    stratified,
+    score,
+    model,
+    folds,
+    platform,
+    quiet,
+    dataset,
+) = parse_arguments()
+job = GridSearch(
+    score_name=score,
+    model_name=model,
+    stratified=stratified,
+    datasets=Datasets(dataset=dataset),
+    platform=platform,
+    progress_bar=not quiet,
+    folds=folds,
+)
+job.do_gridsearch()
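A hypothetical programmatic equivalent of running this script (the dataset name and platform string are placeholders, not values taken from the repository; through EnvDefault the score, platform, n_folds and stratified flags can also be read from environment variables of the same names):

    from Experiments import Datasets, GridSearch

    job = GridSearch(
        score_name="accuracy",
        model_name="ODTE",
        stratified="1",                     # "1" selects StratifiedKFold
        datasets=Datasets(dataset="iris"),  # placeholder dataset name
        platform="laptop",                  # placeholder platform label
        progress_bar=True,
        folds=5,
    )
    job.do_gridsearch()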