Add gridsearch experiment

2025-08-15 23:45:54 +00:00 · 2022-03-09 13:44:49 +01:00
parent 3d0ab041ee
commit c3e05f7d27
5 changed files with 341 additions and 2 deletions
--- a/src/Experiments.py
+++ b/src/Experiments.py
@@ -7,7 +7,12 @@ from datetime import datetime
 from tqdm import tqdm
 import numpy as np
 import pandas as pd
-from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
+from sklearn.model_selection import (
+    StratifiedKFold,
+    KFold,
+    GridSearchCV,
+    cross_validate,
+)
 from Utils import Folders, Files
 from Models import Models

@@ -288,3 +293,151 @@ class Experiment:
        self._output_results()
        if self.progress_bar:
            print(f"Results in {self.output_file}")
+
+
+class GridSearch:
+    """Run a cross-validated grid search of a model over several datasets.
+
+    The grid is read from the ``Files.grid_input`` JSON file and the best
+    result of every dataset is kept in the ``Files.grid_output`` JSON file
+    as ``{dataset: [score, hyperparameters, message]}``; both files live in
+    ``Folders.results``.
+    """
+
+    def __init__(
+        self,
+        score_name,
+        model_name,
+        stratified,
+        datasets,
+        platform,
+        progress_bar=True,
+        folds=5,
+    ):
+        """Load the grid to explore and the results stored so far.
+
+        :param score_name: scoring handed to GridSearchCV, e.g. "accuracy"
+        :param model_name: model name handed to ``Models.get_model``
+        :param stratified: "1" selects StratifiedKFold, otherwise KFold
+        :param datasets: iterable of dataset names offering ``load(name)``
+        :param platform: platform label written in the result message
+        :param progress_bar: show a tqdm progress bar while searching
+        :param folds: number of splits of the cross validation
+        """
+        today = datetime.now()
+        self.time = today.strftime("%H:%M:%S")
+        self.date = today.strftime("%Y-%m-%d")
+        self.output_file = os.path.join(
+            Folders.results, Files.grid_output(score_name, model_name)
+        )
+        self.score_name = score_name
+        self.model_name = model_name
+        self.stratified = stratified == "1"
+        self.stratified_class = StratifiedKFold if self.stratified else KFold
+        self.datasets = datasets
+        self.progress_bar = progress_bar
+        self.folds = folds
+        self.platform = platform
+        self.random_seeds = Randomized.seeds
+        self.grid_file = os.path.join(
+            Folders.results, Files.grid_input(score_name, model_name)
+        )
+        with open(self.grid_file) as f:
+            self.grid = json.load(f)
+        self.duration = 0
+        # replaced by the model version once a search has been run
+        self.version = "-"
+        self._init_data()
+
+    def _init_data(self):
+        """Load the stored results, creating the file on the first run."""
+        try:
+            with open(self.output_file, "r") as f:
+                self.results = json.load(f)
+        except FileNotFoundError:
+            # first run: one empty entry for each dataset of Datasets()
+            self.results = {item: [0.0, {}, ""] for item in Datasets()}
+            with open(self.output_file, "w") as f:
+                json.dump(self.results, f)
+
+    def _save_results(self):
+        """Write the results of the datasets of this run to the file.
+
+        The file is read again and only the entries of ``self.datasets``
+        are replaced, the rest of its content is written back untouched.
+        """
+        with open(self.output_file, "r") as f:
+            data = json.load(f)
+        for item in self.datasets:
+            data[item] = self.results[item]
+        with open(self.output_file, "w") as f:
+            json.dump(data, f)
+
+    def _store_result(self, name, grid, duration):
+        """Keep the best score and hyperparameters found for a dataset.
+
+        :param name: dataset name
+        :param grid: fitted search exposing best_score_ and best_params_
+        :param duration: seconds spent on the dataset
+        """
+        d_message = f"{duration:.3f} s"
+        if duration > 3600:
+            d_message = f"{duration / 3600:.3f} h"
+        elif duration > 60:
+            d_message = f"{duration / 60:.3f} min"
+        message = (
+            f"v. {self.version}, Computed on {self.platform} on "
+            f"{self.date} at {self.time} "
+            f"took {d_message}"
+        )
+        self.results[name] = [grid.best_score_, grid.best_params_, message]
+
+    def do_gridsearch(self):
+        """Run the search on every dataset and save the best results."""
+        loop = tqdm(
+            list(self.datasets),
+            position=0,
+            disable=not self.progress_bar,
+        )
+        for name in loop:
+            loop.set_description(f"{name:30s}")
+            # time each dataset on its own: a single clock started before
+            # the loop would report the time elapsed since the first one
+            start = time.time()
+            X, y = self.datasets.load(name)
+            result = self._n_fold_gridsearch(X, y)
+            self._store_result(name, result, time.time() - start)
+        self._save_results()
+
+    def _n_fold_gridsearch(self, X, y):
+        """Fit a GridSearchCV of the model on X, y and return it fitted."""
+        kfold = self.stratified_class(
+            shuffle=True,
+            random_state=self.random_seeds[0],
+            n_splits=self.folds,
+        )
+        clf = Models.get_model(self.model_name)
+        self.version = clf.version() if hasattr(clf, "version") else "-"
+        self._num_warnings = 0
+        # count the warnings instead of showing them, and give the real
+        # warnings.warn back afterwards so it does not stay patched
+        original_warn = warnings.warn
+        warnings.warn = self._warn
+        try:
+            with warnings.catch_warnings():
+                warnings.filterwarnings("ignore")
+                grid = GridSearchCV(
+                    estimator=clf,
+                    cv=kfold,
+                    param_grid=self.grid,
+                    scoring=self.score_name,
+                    n_jobs=-1,
+                )
+                grid.fit(X, y)
+        finally:
+            warnings.warn = original_warn
+        return grid
+
+    def _warn(self, *args, **kwargs) -> None:
+        """Stand-in for warnings.warn that only counts the calls."""
+        self._num_warnings += 1
--- a/src/Models.py
+++ b/src/Models.py
@@ -21,7 +21,7 @@ class Models:
        if name == "SVC":
            return SVC()
        if name == "ODTE":
-            return Odte()
+            return Odte(base_estimator=Stree())
        if name == "BaggingStree":
            clf = Stree(random_state=random_state)
            return BaggingClassifier(base_estimator=clf)
--- a/src/Utils.py
+++ b/src/Utils.py
@@ -49,6 +49,25 @@ class Files:
            f"{time}_{stratified}.json"
        )

+    @staticmethod
+    def grid_input(score, model):
+        """Return the name of the file that holds the grid to explore."""
+        return Files.grid(kind="input", score=score, model=model)
+
+    @staticmethod
+    def grid_output(score, model):
+        """Return the name of the file where grid results are stored."""
+        return Files.grid(kind="output", score=score, model=model)
+
+    @staticmethod
+    def grid(kind, score, model):
+        """Build a grid file name: grid_<kind>_<score>_<model>.json.
+
+        Underscores in ``score`` are turned into dashes.
+        """
+        dashed_score = score.replace("_", "-")
+        return f"grid_{kind}_{dashed_score}_{model}.json"
+
    def split_file_name(self, name):
        _, score, model, platform, date, time, stratified = name.split("_")
        stratified = stratified.replace(self.report_ext, "")
--- a/src/build_grid.py
+++ b/src/build_grid.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+"""Build the grid of hyperparameters explored by the ODTE grid search.
+
+The values are collected, by kernel, from the hyperparameter sets in
+``data`` and saved in the results folder as the input file of the grid
+search of ODTE with the "accuracy" score.
+"""
+import os
+import json
+from Utils import Files, Folders
+
+# hyperparameter sets (JSON encoded) the grid values are taken from
+data = [
+    '{"C": 1e4, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 7, "gamma": 0.14, "kernel": "rbf"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.95, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf"}',
+    '{"kernel": "rbf"}',
+    '{"C": 1.05, "gamma": "auto","kernel": "rbf"}',
+    '{"splitter": "random", "max_features": "auto"}',
+    '{"C": 0.05, "max_features": "auto", "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf", "C": 0.05}',
+    '{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
+    '{"C": 0.25, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.08, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 0.001, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2.8, "kernel": "rbf", "gamma": "auto"}',
+    '{"kernel": "rbf"}',
+    '{"C": 0.05, "gamma": 0.1, "kernel": "poly"}',
+    '{"C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C":57, "kernel": "rbf"}',
+    '{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": "ovr"}',
+    '{"C": 5, "kernel": "rbf", "gamma": "auto"}',
+    '{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": "ovr"}',
+    '{"kernel": "rbf", "gamma": 0.001}',
+    '{"C": 1e4, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 7, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2.83, "kernel": "rbf", "gamma": "auto"}',
+    '{"C": 0.2, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}',
+    '{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
+    '{"C": 2, "gamma": "auto", "kernel": "rbf"}',
+    '{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
+]
+
+results = {}
+output = []
+# hyperparameters taken into account, any other key of data is ignored
+hyper = ["C", "gamma", "kernel", "multiclass_strategy"]
+kernels = ["linear", "liblinear", "rbf", "poly"]
+
+# initialize results: an empty list of values per kernel and hyperparameter
+for kernel in kernels:
+    results[kernel] = {name: [] for name in hyper}
+# load data: collect the distinct values of each hyperparameter by kernel
+for item in data:
+    line = json.loads(item)
+    # the sets that name no kernel are assigned to the "linear" one
+    kernel = line.setdefault("kernel", "linear")
+    for name in hyper:
+        if name in line and line[name] not in results[kernel][name]:
+            results[kernel][name].append(line[name])
+
+# Add default values and remove inconsistent values
+results["linear"]["multiclass_strategy"] = ["ovo"]
+del results["linear"]["gamma"]
+del results["liblinear"]["gamma"]
+results["rbf"]["gamma"].append("scale")
+results["poly"]["multiclass_strategy"].append("ovo")
+for kernel in kernels:
+    results[kernel]["C"].append(1.0)
+
+# one grid per kernel, with every key addressed to the base estimator
+for item in results:
+    results_tmp = {}
+    for key, value in results[item].items():
+        new_key = f"base_estimator__{key}"
+        try:
+            results_tmp[new_key] = sorted(value)
+        except TypeError:
+            # numbers and strings can't be compared with each other:
+            # sort each kind on its own and put the numbers first
+            t1 = sorted(x for x in value if isinstance(x, (int, float)))
+            t2 = sorted(x for x in value if isinstance(x, str))
+            results_tmp[new_key] = t1 + t2
+    output.append(results_tmp)
+
+# save results
+file_name = Files.grid_input("accuracy", "ODTE")
+file_output = os.path.join(Folders.results, file_name)
+with open(file_output, "w") as f:
+    json.dump(output, f)
+print(f"Grid values saved to {file_output}")
--- a/src/grid.py
+++ b/src/grid.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""Grid search of the hyperparameters of a model on a dataset.
+
+The search is run by GridSearch with the options of the command line.
+"""
+import argparse
+from Experiments import GridSearch, Datasets
+from Utils import EnvDefault
+
+
+def str_to_bool(value):
+    """Convert the text of a command line option into a boolean.
+
+    Used in place of ``type=bool``, that gives True for any non-empty
+    text, "False" included.
+    """
+    return value.strip().lower() not in ("", "0", "false", "f", "no", "n")
+
+
+def parse_arguments():
+    """Parse the command line options.
+
+    :return: tuple with stratified, score, model, n_folds, platform, quiet
+        and dataset
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument(
+        "-s",
+        "--score",
+        action=EnvDefault,
+        envvar="score",
+        type=str,
+        required=True,
+        help="score name {accuracy, f1_macro, ...}",
+    )
+    ap.add_argument(
+        "-P",
+        "--platform",
+        action=EnvDefault,
+        envvar="platform",
+        type=str,
+        required=True,
+        help="Platform where the test is run",
+    )
+    ap.add_argument(
+        "-m",
+        "--model",
+        type=str,
+        required=True,
+        help="model name",
+    )
+    ap.add_argument(
+        "-n",
+        "--n_folds",
+        action=EnvDefault,
+        envvar="n_folds",
+        type=int,
+        required=True,
+        help="number of folds",
+    )
+    ap.add_argument(
+        "-q",
+        "--quiet",
+        type=str_to_bool,
+        default=False,
+        required=False,
+        help="Whether to show progress bar or not",
+    )
+    ap.add_argument(
+        "-t",
+        "--stratified",
+        action=EnvDefault,
+        envvar="stratified",
+        type=str,
+        required=True,
+        help="Stratified",
+    )
+    ap.add_argument(
+        "-d",
+        "--dataset",
+        type=str,
+        required=True,
+        default=None,
+        help="Gridsearch on this dataset",
+    )
+    args = ap.parse_args()
+    return (
+        args.stratified,
+        args.score,
+        args.model,
+        args.n_folds,
+        args.platform,
+        args.quiet,
+        args.dataset,
+    )
+
+
+(
+    stratified,
+    score,
+    model,
+    folds,
+    platform,
+    quiet,
+    dataset,
+) = parse_arguments()
+job = GridSearch(
+    score_name=score,
+    model_name=model,
+    stratified=stratified,
+    datasets=Datasets(dataset=dataset),
+    progress_bar=not quiet,
+    platform=platform,
+    folds=folds,
+)
+job.do_gridsearch()