Become a benchmark suite

2022-04-20 19:43:22 +02:00
parent 637bed4981
commit 3360651e8e
297 changed files with 201 additions and 67532 deletions

466
benchmark/Experiments.py Normal file

@@ -0,0 +1,466 @@
import os
import json
import random
import warnings
import time
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import (
StratifiedKFold,
KFold,
GridSearchCV,
cross_validate,
)
from .Utils import Folders, Files, EnvData
from .Models import Models
class Randomized:
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
class Diterator:
def __init__(self, data):
self._stack = data.copy()
def __next__(self):
if len(self._stack) == 0:
raise StopIteration()
return self._stack.pop(0)
class DatasetsTanveer:
@staticmethod
def dataset_names(name):
return f"{name}_R.dat"
@staticmethod
def folder():
return "data"
def load(self, name):
file_name = os.path.join(self.folder(), self.dataset_names(name))
try:
data = pd.read_csv(
file_name,
sep="\t",
index_col=0,
)
except FileNotFoundError:
print(f"Couldn't open data file {file_name}")
exit(1)
X = data.drop("clase", axis=1).to_numpy()
y = data["clase"].to_numpy()
return X, y
class DatasetsSurcov:
@staticmethod
def dataset_names(name):
return f"{name}.csv"
@staticmethod
def folder():
return "datasets"
def load(self, name):
file_name = os.path.join(self.folder(), self.dataset_names(name))
try:
data = pd.read_csv(
file_name,
index_col=0,
)
except FileNotFoundError:
print(f"Couldn't open data file {file_name}")
exit(1)
data.dropna(axis=0, how="any", inplace=True)
self.columns = data.columns
X = data.drop("class", axis=1).to_numpy()
y = data["class"].to_numpy()
return X, y
class Datasets:
def __init__(self, dataset_name=None):
envData = EnvData.load()
class_name = getattr(
__import__(__name__),
f"Datasets{envData['source_data']}",
)
self.dataset = class_name()
if dataset_name is None:
file_name = os.path.join(self.dataset.folder(), Files.index)
try:
with open(file_name) as f:
self.data_sets = f.read().splitlines()
except FileNotFoundError:
print(f"Couldn't open index file {file_name}")
exit(1)
else:
self.data_sets = [dataset_name]
def load(self, name):
return self.dataset.load(name)
def __iter__(self) -> Diterator:
return Diterator(self.data_sets)
class BestResults:
def __init__(self, score, model, datasets):
self.score_name = score
self.datasets = datasets
self.model = model
self.data = {}
def _get_file_name(self):
return os.path.join(
Folders.results, Files.best_results(self.score_name, self.model)
)
def load(self, dictionary):
self.file_name = self._get_file_name()
try:
with open(self.file_name) as f:
self.data = json.load(f)
except FileNotFoundError:
raise ValueError(f"{self.file_name} does not exist")
return self.fill(dictionary, self.data)
def fill(self, dictionary, data=None):
if data is None:
data = {}
for dataset in self.datasets:
if dataset not in data:
data[dataset] = (0.0, dictionary, "")
return data
def _process_datafile(self, results, data, file_name):
for record in data["results"]:
dataset = record["dataset"]
if dataset in results:
if record["score"] >= results[dataset]["score"]:
record["file_name"] = file_name
results[dataset] = record
else:
record["file_name"] = file_name
results[dataset] = record
def build(self):
results = {}
init_suffix, end_suffix = Files.results_suffixes(
score=self.score_name, model=self.model
)
all_files = sorted(list(os.walk(Folders.results)))
for root, _, files in tqdm(all_files, desc="files"):
for name in files:
if name.startswith(init_suffix) and name.endswith(end_suffix):
file_name = os.path.join(root, name)
with open(file_name) as fp:
data = json.load(fp)
self._process_datafile(results, data, name)
# Build best results json file
output = {}
datasets = Datasets()
for name in tqdm(list(datasets), desc="datasets"):
output[name] = (
results[name]["score"],
results[name]["hyperparameters"],
results[name]["file_name"],
)
self.data = output
with open(self._get_file_name(), "w") as fp:
json.dump(output, fp)
class Experiment:
def __init__(
self,
score_name,
model_name,
stratified,
datasets,
hyperparams_dict,
hyperparams_file,
grid_paramfile,
platform,
title,
progress_bar=True,
folds=5,
):
today = datetime.now()
self.time = today.strftime("%H:%M:%S")
self.date = today.strftime("%Y-%m-%d")
self.output_file = os.path.join(
Folders.results,
Files.results(
score_name,
model_name,
platform,
self.date,
self.time,
stratified,
),
)
self.score_name = score_name
self.model_name = model_name
self.title = title
self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets
dictionary = json.loads(hyperparams_dict)
hyper = BestResults(
score=score_name, model=model_name, datasets=datasets
)
if hyperparams_file:
self.hyperparameters_dict = hyper.load(
dictionary=dictionary,
)
elif grid_paramfile:
grid_file = os.path.join(
Folders.results, Files.grid_output(score_name, model_name)
)
with open(grid_file) as f:
self.hyperparameters_dict = json.load(f)
else:
self.hyperparameters_dict = hyper.fill(
dictionary=dictionary,
)
self.platform = platform
self.progress_bar = progress_bar
self.folds = folds
self.random_seeds = Randomized.seeds
self.results = []
self.duration = 0
self._init_experiment()
def get_output_file(self):
return self.output_file
def _build_classifier(self, random_state, hyperparameters):
self.model = Models.get_model(self.model_name, random_state)
clf = self.model
clf.set_params(**hyperparameters)
clf.set_params(random_state=random_state)
return clf
def _init_experiment(self):
self.scores = []
self.times = []
self.nodes = []
self.leaves = []
self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters):
if self.scores != []:
raise ValueError("Must init experiment before!")
loop = tqdm(
self.random_seeds,
position=1,
leave=False,
disable=not self.progress_bar,
)
for random_state in loop:
loop.set_description(f"Seed({random_state:4d})")
random.seed(random_state)
np.random.seed(random_state)
kfold = self.stratified_class(
shuffle=True, random_state=random_state, n_splits=self.folds
)
clf = self._build_classifier(random_state, hyperparameters)
self.version = clf.version() if hasattr(clf, "version") else "-"
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
res = cross_validate(
clf,
X,
y,
cv=kfold,
return_estimator=True,
scoring=self.score_name,
)
self.scores.append(res["test_score"])
self.times.append(res["fit_time"])
for result_item in res["estimator"]:
nodes_item, leaves_item, depth_item = Models.get_complexity(
self.model_name, result_item
)
self.nodes.append(nodes_item)
self.leaves.append(leaves_item)
self.depths.append(depth_item)
def _add_results(self, name, hyperparameters, samples, features, classes):
record = {}
record["dataset"] = name
record["samples"] = samples
record["features"] = features
record["classes"] = classes
record["hyperparameters"] = hyperparameters
record["nodes"] = np.mean(self.nodes)
record["leaves"] = np.mean(self.leaves)
record["depth"] = np.mean(self.depths)
record["score"] = np.mean(self.scores)
record["score_std"] = np.std(self.scores)
record["time"] = np.mean(self.times)
record["time_std"] = np.std(self.times)
self.results.append(record)
def _output_results(self):
output = {}
output["score_name"] = self.score_name
output["title"] = self.title
output["model"] = self.model_name
output["version"] = self.version
output["stratified"] = self.stratified
output["folds"] = self.folds
output["date"] = self.date
output["time"] = self.time
output["duration"] = self.duration
output["seeds"] = self.random_seeds
output["platform"] = self.platform
output["results"] = self.results
with open(self.output_file, "w") as f:
json.dump(output, f)
f.flush()
def do_experiment(self):
now = time.time()
loop = tqdm(
list(self.datasets),
position=0,
disable=not self.progress_bar,
)
self.duration = 0.0
for name in loop:
loop.set_description(f"{name:30s}")
X, y = self.datasets.load(name)
samp, feat = X.shape
n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes)
self._output_results()
self.duration = time.time() - now
self._output_results()
if self.progress_bar:
print(f"Results in {self.output_file}")
class GridSearch:
def __init__(
self,
score_name,
model_name,
stratified,
datasets,
platform,
progress_bar=True,
folds=5,
):
today = datetime.now()
self.time = today.strftime("%H:%M:%S")
self.date = today.strftime("%Y-%m-%d")
self.output_file = os.path.join(
Folders.results,
Files.grid_output(
score_name,
model_name,
),
)
self.score_name = score_name
self.model_name = model_name
self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets
self.progress_bar = progress_bar
self.folds = folds
self.platform = platform
self.random_seeds = Randomized.seeds
self.grid_file = os.path.join(
Folders.results, Files.grid_input(score_name, model_name)
)
with open(self.grid_file) as f:
self.grid = json.load(f)
self.duration = 0
self._init_data()
def _init_data(self):
# if the results file does not exist, initialize it
try:
with open(self.output_file, "r") as f:
self.results = json.load(f)
except FileNotFoundError:
# init file
output = {}
data = Datasets()
for item in data:
output[item] = [0.0, {}, ""]
with open(self.output_file, "w") as f:
json.dump(output, f)
self.results = output
def _save_results(self):
with open(self.output_file, "r") as f:
data = json.load(f)
for item in self.datasets:
data[item] = self.results[item]
with open(self.output_file, "w") as f:
json.dump(data, f, indent=4)
def _store_result(self, name, grid, duration):
d_message = f"{duration:.3f} s"
if duration > 3600:
d_message = f"{duration / 3600:.3f} h"
elif duration > 60:
d_message = f"{duration / 60:.3f} min"
message = (
f"v. {self.version}, Computed on {self.platform} on "
f"{self.date} at {self.time} "
f"took {d_message}"
)
score = grid.best_score_
hyperparameters = grid.best_params_
self.results[name] = [score, hyperparameters, message]
print(f"{name:30s} {score} {hyperparameters} {message}")
def do_gridsearch(self):
now = time.time()
loop = tqdm(
list(self.datasets),
position=0,
disable=not self.progress_bar,
)
for name in loop:
loop.set_description(f"{name:30s}")
X, y = self.datasets.load(name)
result = self._n_fold_gridsearch(X, y)
self._store_result(name, result, time.time() - now)
self._save_results()
def _n_fold_gridsearch(self, X, y):
kfold = self.stratified_class(
shuffle=True,
random_state=self.random_seeds[0],
n_splits=self.folds,
)
clf = Models.get_model(self.model_name)
self.version = clf.version() if hasattr(clf, "version") else "-"
self._num_warnings = 0
warnings.warn = self._warn
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
grid = GridSearchCV(
estimator=clf,
cv=kfold,
param_grid=self.grid,
scoring=self.score_name,
n_jobs=-1,
)
grid.fit(X, y)
return grid
def _warn(self, *args, **kwargs) -> None:
self._num_warnings += 1
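
A minimal usage sketch (not part of this commit) of the Experiment pipeline above; be_main below is the real entry point. It assumes the data folder, the all.txt index, a results/ folder and a .env file with source_data are already in place.

from benchmark.Experiments import Datasets, Experiment

datasets = Datasets()                # or Datasets(dataset_name="iris") for a single dataset
job = Experiment(
    score_name="accuracy",
    model_name="STree",
    stratified="1",                  # "1" selects StratifiedKFold, anything else KFold
    datasets=datasets,
    hyperparams_dict="{}",           # JSON string with default hyperparameters
    hyperparams_file=False,          # True would load the best_results_*.json file
    grid_paramfile=False,            # True would load the grid_output_*.json file
    platform="local",                # illustrative platform label
    title="smoke test",              # illustrative title
    progress_bar=True,
    folds=5,
)
job.do_experiment()
print("results written to", job.get_output_file())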

79
benchmark/Models.py Normal file

@@ -0,0 +1,79 @@
from statistics import mean
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (
RandomForestClassifier,
BaggingClassifier,
AdaBoostClassifier,
)
from sklearn.svm import SVC
from stree import Stree
from wodt import Wodt
from odte import Odte
class Models:
@staticmethod
def get_model(name, random_state=None):
if name == "STree":
return Stree()
if name == "Cart":
return DecisionTreeClassifier()
if name == "ExtraTree":
return ExtraTreeClassifier()
if name == "Wodt":
return Wodt()
if name == "SVC":
return SVC()
if name == "ODTE":
return Odte(base_estimator=Stree())
if name == "BaggingStree":
clf = Stree(random_state=random_state)
return BaggingClassifier(base_estimator=clf)
if name == "BaggingWodt":
clf = Wodt(random_state=random_state)
return BaggingClassifier(base_estimator=clf)
if name == "AdaBoostStree":
clf = Stree(random_state=random_state)
return AdaBoostClassifier(base_estimator=clf)
if name == "RandomForest":
return RandomForestClassifier()
msg = f"No model recognized {name}"
if name in ("Stree", "stree"):
msg += ", did you mean STree?"
elif name in ("odte", "Odte"):
msg += ", did you mean ODTE?"
raise ValueError(msg)
@staticmethod
def get_complexity(name, result):
if name == "Cart":
nodes = result.tree_.node_count
depth = result.tree_.max_depth
leaves = result.get_n_leaves()
elif name == "ExtraTree":
nodes = 0
leaves = result.get_n_leaves()
depth = 0
elif name.startswith("Bagging") or name.startswith("AdaBoost"):
if hasattr(result.base_estimator_, "nodes_leaves"):
nodes, leaves = list(
zip(*[x.nodes_leaves() for x in result.estimators_])
)
nodes, leaves = mean(nodes), mean(leaves)
depth = mean([x.depth_ for x in result.estimators_])
elif hasattr(result.base_estimator_, "tree_"):
nodes = mean([x.tree_.node_count for x in result.estimators_])
leaves = mean([x.get_n_leaves() for x in result.estimators_])
depth = mean([x.get_depth() for x in result.estimators_])
else:
nodes = leaves = depth = 0
elif name == "RandomForest":
leaves = mean([x.get_n_leaves() for x in result.estimators_])
depth = mean([x.get_depth() for x in result.estimators_])
nodes = mean([x.tree_.node_count for x in result.estimators_])
elif name == "SVC":
nodes = leaves = depth = 0
else:
nodes, leaves = result.nodes_leaves()
depth = result.depth_ if hasattr(result, "depth_") else 0
return nodes, leaves, depth
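
A short, hedged sketch of the Models helpers above: fetch a registered model by name, fit it, and read back its complexity. load_iris is only an illustration and assumes scikit-learn plus the suite's estimators (stree, wodt, odte) are installed.

from sklearn.datasets import load_iris
from benchmark.Models import Models

X, y = load_iris(return_X_y=True)
clf = Models.get_model("Cart", random_state=57)   # plain DecisionTreeClassifier
clf.fit(X, y)
nodes, leaves, depth = Models.get_complexity("Cart", clf)
print(f"nodes={nodes} leaves={leaves} depth={depth}")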

1216
benchmark/Results.py Normal file

File diff suppressed because it is too large

168
benchmark/Utils.py Normal file

@@ -0,0 +1,168 @@
import os
import subprocess
import argparse
BEST_ACCURACY_STREE = 40.282203
class Folders:
results = "results"
hidden_results = "hidden_results"
exreport = "exreport"
report = os.path.join(exreport, "exreport_output")
@staticmethod
def src():
return os.path.dirname(os.path.abspath(__file__))
class Files:
index = "all.txt"
report_ext = ".json"
cmd_open_macos = "/usr/bin/open"
cmd_open_linux = "/usr/bin/xdg-open"
exreport_pdf = "Rplots.pdf"
benchmark_r = "benchmark.r"
dot_env = ".env"
@staticmethod
def exreport_output(score):
return f"exreport_{score.replace('_','-')}.txt"
@staticmethod
def exreport_err(score):
return f"exreport_err_{score.replace('_','-')}.txt"
@staticmethod
def exreport_excel(score):
return f"exreport_{score.replace('_','-')}.xlsx"
@staticmethod
def exreport(score):
return f"exreport_{score.replace('_','-')}.csv"
@staticmethod
def best_results(score, model):
return f"best_results_{score.replace('_','-')}_{model}.json"
@staticmethod
def results(score, model, platform, date, time, stratified):
return (
f"results_{score.replace('_','-')}_{model}_{platform}_{date}_"
f"{time}_{stratified}.json"
)
@staticmethod
def grid_input(score, model):
return Files.grid("input", score, model)
@staticmethod
def grid_output(score, model):
return Files.grid("output", score, model)
@staticmethod
def grid(kind, score, model):
return f"grid_{kind}_{score.replace('_','-')}_{model}.json"
def split_file_name(self, name):
_, score, model, platform, date, time, stratified = name.split("_")
stratified = stratified.replace(self.report_ext, "")
return score, model, platform, date, time, stratified
@staticmethod
def results_suffixes(score="", model=""):
suffix = Files.report_ext
if model == "" and score == "":
return "results_", suffix
if model == "":
return f"results_{score}_", suffix
return f"results_{score}_{model}_", suffix
@staticmethod
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
@staticmethod
def open(name):
if os.path.isfile(name):
command = (
Files.cmd_open_macos
if Files.is_exe(Files.cmd_open_macos)
else Files.cmd_open_linux
)
subprocess.run([command, name])
def get_all_results(self, hidden) -> list[str]:
first_path = "."
first_try = os.path.join(
first_path, Folders.hidden_results if hidden else Folders.results
)
second_path = ".."
second_try = os.path.join(second_path, first_try)
if os.path.isdir(first_try):
files_list = os.listdir(first_try)
elif os.path.isdir(second_try):
files_list = os.listdir(second_try)
else:
raise ValueError(f"{first_try} or {second_try} does not exist")
result = []
prefix, suffix = self.results_suffixes()
for result_file in files_list:
if result_file.startswith(prefix) and result_file.endswith(suffix):
result.append(result_file)
return sorted(result)
class Symbols:
check_mark = "\N{heavy check mark}"
exclamation = "\N{heavy exclamation mark symbol}"
black_star = "\N{black star}"
equal_best = check_mark
better_best = black_star
class EnvData:
@staticmethod
def load():
args = {}
with open(Files.dot_env) as f:
for line in f.read().splitlines():
if line == "" or line.startswith("#"):
continue
key, value = line.split("=")
args[key] = value
return args
class EnvDefault(argparse.Action):
# Thanks to https://stackoverflow.com/users/445507/russell-heilling
def __init__(self, envvar, required=True, default=None, **kwargs):
self._args = EnvData.load()
if not default and envvar in self._args:
default = self._args[envvar]
if required and default:
required = False
super(EnvDefault, self).__init__(
default=default, required=required, **kwargs
)
def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
class TextColor:
BLUE = "\033[94m"
CYAN = "\033[96m"
GREEN = "\033[92m"
MAGENTA = "\033[95m"
YELLOW = "\033[93m"
RED = "\033[91m"
HEADER = MAGENTA
LINE1 = BLUE
LINE2 = CYAN
SUCCESS = GREEN
WARNING = YELLOW
FAIL = RED
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"
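
A hedged sketch of the configuration machinery above: EnvData.load() parses key=value lines from .env (blank lines and # comments are skipped), and EnvDefault lets an argparse option fall back to that value. The keys and values below are illustrative; note that the sketch overwrites ./.env.

import argparse
from benchmark.Utils import EnvDefault, Files

sample_env = """# benchmark configuration (illustrative values)
score=accuracy
model=STree
platform=local
n_folds=5
stratified=0
source_data=Tanveer
"""
with open(Files.dot_env, "w") as f:    # Files.dot_env == ".env"
    f.write(sample_env)

ap = argparse.ArgumentParser()
ap.add_argument("-s", "--score", action=EnvDefault, envvar="score",
                type=str, required=True)
args = ap.parse_args([])               # no flag given, value comes from .env
print(args.score)                      # -> accuracy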

10
benchmark/__init__.py Normal file

@@ -0,0 +1,10 @@
from .Experiments import Experiment, Datasets, DatasetsSurcov, DatasetsTanveer
from .Results import Report, Summary
from .Utils import EnvDefault
__author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
__license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es"
__all__ = ["Experiment", "Datasets", "Report", "Summary", "EnvDefault"]

1
benchmark/_version.py Normal file

@@ -0,0 +1 @@
__version__ = "0.1.1"

40
benchmark/benchmark.r Normal file

@@ -0,0 +1,40 @@
library(glue)
args = commandArgs(trailingOnly=TRUE)
if (length(args)!=1) {
stop("Only one argument must be supplied (score).\n", call.=FALSE)
}
csv_file <- glue("results/exreport_{args[1]}.csv")
destination <- "exreport/"
results <- read.csv(csv_file)
library(exreport)
experiment <- expCreate(results, method="classifier", problem="dataset", name="Ranking")
testScore <- testMultipleControl(experiment, args[1], "max")
summary(testScore)
table1 <- tabularTestSummary(testScore, columns = c("pvalue", "rank", "wtl"))
table1
plot1 <- plotExpSummary(experiment, args[1], columns = 3)
plot2 <- plotCumulativeRank(testScore)
plot3 <- plotRankDistribution(testScore)
report <- exreport("Ranking Report")
# Add the experiment object for reference:
report <- exreportAdd(report, experiment)
# Now add the test:
report <- exreportAdd(report, testScore)
# Finally you can add the different tables and plots.
report <- exreportAdd(report, list(plot1, plot2, table1, plot3))
# At this point we would like to include an additional item in the report: a detailed
# table of the experiment, to be included in an annex of a paper alongside the summaries
# already provided by the plots and tests. Fortunately, there is another built-in
# function for this.
# Some of the tabular outputs generated by exreport carry formatting properties that only
# take effect when the objects are rendered in a graphic report and have no effect on their
# representation in the R console. Here we tell the function to boldface the method that
# maximises the result in each column and to split the table into two pieces when rendering.
# We create the table:
table2 <- tabularExpSummary(experiment, args[1], digits=4, format="f", boldfaceColumns="max", rowsAsMethod=FALSE)
# And add it to the report:
report <- exreportAdd(report, table2)
# Now that we have finished adding elements to the report, it is time to render it. We want an HTML
# report, so we call the appropriate function; by default it renders and opens the report in your
# browser from a temporary file, but you can optionally specify a folder in which the report will be
# saved for future use.
# Render the report:
exreportRender(report, destination=destination, target = "html", visualize = T)
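
The script expects exactly one score argument and reads results/exreport_{score}.csv relative to the working directory. A hedged sketch of invoking it from Python (the suppressed Results.py presumably does something equivalent); it assumes Rscript is on the PATH and the CSV has already been produced.

import os
import subprocess
from benchmark.Utils import Files, Folders

# run from the project root so results/exreport_accuracy.csv is found
subprocess.run(
    ["Rscript", os.path.join(Folders.src(), Files.benchmark_r), "accuracy"],
    check=True,
)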


@@ -0,0 +1,375 @@
{
"balance-scale": [
0,
{
"C": 10000.0,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"balloons": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc-diag": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc-prog": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"breast-cancer": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"cardiotocography-10clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"cardiotocography-3clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"conn-bench-sonar-mines-rocks": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"cylinder-bands": [0, { "kernel": "rbf", "multiclass_strategy": "ovr" }, ""],
"dermatology": [
0,
{
"C": 55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"echocardiogram": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "poly",
"max_features": "auto",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"fertility": [
0,
{
"C": 0.05,
"max_features": "auto",
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"haberman-survival": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"heart-hungarian": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"hepatitis": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"ilpd-indian-liver": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"ionosphere": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"iris": [0, { "kernel": "liblinear", "multiclass_strategy": "ovr" }, ""],
"led-display": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"libras": [
0,
{
"C": 0.08,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"low-res-spect": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"lymphography": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"mammographic": [0, { "kernel": "rbf", "multiclass_strategy": "ovr" }, ""],
"molec-biol-promoter": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"musk-1": [
0,
{
"C": 0.05,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"oocytes_merluccius_nucleus_4d": [
0,
{ "C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr" },
""
],
"oocytes_merluccius_states_2f": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"oocytes_trisopterus_nucleus_2f": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"oocytes_trisopterus_states_5b": [
0,
{
"C": 0.11,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"parkinsons": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"pima": [0, { "kernel": "liblinear", "multiclass_strategy": "ovr" }, ""],
"pittsburg-bridges-MATERIAL": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"pittsburg-bridges-REL-L": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"pittsburg-bridges-SPAN": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"pittsburg-bridges-T-OR-D": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"planning": [
0,
{
"C": 7,
"gamma": 10.0,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"post-operative": [
0,
{
"C": 55,
"degree": 5,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"seeds": [
0,
{
"C": 10000.0,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-australian-credit": [
0,
{
"C": 0.05,
"max_features": "auto",
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-german-credit": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"statlog-heart": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"statlog-image": [
0,
{
"C": 7,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-vehicle": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"synthetic-control": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"tic-tac-toe": [
0,
{
"C": 0.2,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"vertebral-column-2clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"wine": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"zoo": [
0,
{
"C": 0.1,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
]
}
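
The file name for this hunk was lost in this view; by its shape (dataset name mapped to [score, hyperparameters, provenance message]) it is one of the best-results / grid-output files that BestResults and GridSearch above read and write. A hedged sketch of consuming such a file, with an illustrative path:

import json

with open("results/grid_output_accuracy_STree.json") as f:   # illustrative file name
    best = json.load(f)

score, hyperparameters, message = best["iris"]
print(score, hyperparameters, message)    # Experiment feeds index 1 to the classifier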


@@ -0,0 +1,351 @@
{
"balance-scale": [
0,
{
"C": 10000.0,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"balloons": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"breast-cancer-wisc-diag": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"breast-cancer-wisc-prog": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"breast-cancer-wisc": [0, { "kernel": "rbf" }, ""],
"breast-cancer": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"cardiotocography-10clases": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"cardiotocography-3clases": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"conn-bench-sonar-mines-rocks": [0, { "kernel": "rbf" }, ""],
"cylinder-bands": [0, { "kernel": "rbf" }, ""],
"dermatology": [
0,
{
"C": 55,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"echocardiogram": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0
},
""
],
"fertility": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"haberman-survival": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"heart-hungarian": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"hepatitis": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"ilpd-indian-liver": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"ionosphere": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"iris": [0, { "kernel": "linear", "decision_function_shape": "ovr" }, ""],
"led-display": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"libras": [
0,
{
"C": 0.08,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"low-res-spect": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"lymphography": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"mammographic": [0, { "kernel": "rbf" }, ""],
"molec-biol-promoter": [0, { "kernel": "rbf" }, ""],
"musk-1": [
0,
{
"C": 0.05,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0
},
""
],
"oocytes_merluccius_nucleus_4d": [
0,
{ "C": 8.25, "gamma": 0.1, "kernel": "poly" },
""
],
"oocytes_merluccius_states_2f": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"oocytes_trisopterus_nucleus_2f": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"oocytes_trisopterus_states_5b": [
0,
{
"C": 0.11,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"parkinsons": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"pima": [0, { "kernel": "linear", "decision_function_shape": "ovr" }, ""],
"pittsburg-bridges-MATERIAL": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"pittsburg-bridges-REL-L": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"pittsburg-bridges-SPAN": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"pittsburg-bridges-T-OR-D": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"planning": [
0,
{
"C": 7,
"gamma": 10.0,
"kernel": "rbf",
"max_iter": 10000.0
},
""
],
"post-operative": [
0,
{
"C": 55,
"degree": 5,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0
},
""
],
"seeds": [
0,
{
"C": 10000.0,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"statlog-australian-credit": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"statlog-german-credit": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"statlog-heart": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"statlog-image": [
0,
{
"C": 7,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"statlog-vehicle": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"synthetic-control": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"tic-tac-toe": [
0,
{
"C": 0.2,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0
},
""
],
"vertebral-column-2clases": [
0,
{ "kernel": "linear", "decision_function_shape": "ovr" },
""
],
"wine": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
],
"zoo": [
0,
{
"C": 0.1,
"max_iter": 10000.0,
"kernel": "linear",
"decision_function_shape": "ovr"
},
""
]
}


@@ -0,0 +1 @@
{"balance-scale": {"C": 10000.0, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "balloons": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "breast-cancer-wisc-diag": {"C": 0.2, "max_iter": 10000.0, "kernel": "liblinear"}, "breast-cancer-wisc-prog": {"C": 0.2, "max_iter": 10000.0, "kernel": "liblinear"}, "breast-cancer-wisc": {"kernel": "rbf"}, "breast-cancer": {"kernel": "liblinear"}, "cardiotocography-10clases": {"kernel": "liblinear"}, "cardiotocography-3clases": {"kernel": "liblinear"}, "conn-bench-sonar-mines-rocks": {"kernel": "rbf"}, "cylinder-bands": {"kernel": "rbf"}, "dermatology": {"C": 55, "max_iter": 10000.0, "kernel": "liblinear"}, "echocardiogram": {"C": 7, "gamma": 0.1, "kernel": "poly", "max_features": "auto", "max_iter": 10000.0}, "fertility": {"C": 0.05, "max_features": "auto", "max_iter": 10000.0, "kernel": "liblinear"}, "haberman-survival": {"kernel": "liblinear"}, "heart-hungarian": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "hepatitis": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "ilpd-indian-liver": {"kernel": "liblinear"}, "ionosphere": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "iris": {"kernel": "liblinear"}, "led-display": {"kernel": "liblinear"}, "libras": {"C": 0.08, "max_iter": 10000.0, "kernel": "liblinear"}, "low-res-spect": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "lymphography": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "mammographic": {"kernel": "rbf"}, "molec-biol-promoter": {"kernel": "rbf"}, "musk-1": {"C": 0.05, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "oocytes_merluccius_nucleus_4d": {"C": 8.25, "gamma": 0.1, "kernel": "poly"}, "oocytes_merluccius_states_2f": {"kernel": "liblinear"}, "oocytes_trisopterus_nucleus_2f": {"kernel": "liblinear"}, "oocytes_trisopterus_states_5b": {"C": 0.11, "max_iter": 10000.0, "kernel": "liblinear"}, "parkinsons": {"kernel": "liblinear"}, "pima": {"kernel": "liblinear"}, "pittsburg-bridges-MATERIAL": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "pittsburg-bridges-REL-L": {"kernel": "liblinear"}, "pittsburg-bridges-SPAN": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "pittsburg-bridges-T-OR-D": {"kernel": "liblinear"}, "planning": {"C": 7, "gamma": 10.0, "kernel": "rbf", "max_iter": 10000.0}, "post-operative": {"C": 55, "degree": 5, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "seeds": {"C": 10000.0, "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-australian-credit": {"C": 0.05, "max_features": "auto", "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-german-credit": {"kernel": "liblinear"}, "statlog-heart": {"kernel": "liblinear"}, "statlog-image": {"C": 7, "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-vehicle": {"kernel": "liblinear"}, "synthetic-control": {"C": 0.55, "max_iter": 10000.0, "kernel": "liblinear"}, "tic-tac-toe": {"C": 0.2, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "vertebral-column-2clases": {"kernel": "liblinear"}, "wine": {"C": 0.55, "max_iter": 10000.0, "kernel": "liblinear"}, "zoo": {"C": 0.1, "max_iter": 10000.0, "kernel": "liblinear"}}

37
benchmark/scripts/be_benchmark Executable file

@@ -0,0 +1,37 @@
#!/usr/bin/env python
from benchmark.Results import Benchmark
from benchmark.Utils import Files, EnvDefault
import argparse
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
ap.add_argument(
"-x",
"--excel",
type=bool,
required=False,
help="Generate Excel File",
)
args = ap.parse_args()
return (args.score, args.excel)
(score, excel) = parse_arguments()
benchmark = Benchmark(score)
benchmark.compile_results()
benchmark.save_results()
benchmark.report()
benchmark.exreport()
if excel:
benchmark.excel()
Files.open(benchmark.get_excel_file_name())

43
benchmark/scripts/be_best Executable file

@@ -0,0 +1,43 @@
#!/usr/bin/env python
import argparse
import json
from ..Results import Summary
from ..Utils import EnvDefault
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
type=str,
action=EnvDefault,
envvar="score",
required=True,
help="score name {accuracy, f1_micro, f1_macro, all}",
)
args = ap.parse_args()
return (args.score,)
(score,) = parse_arguments()
all_metrics = ["accuracy", "f1-macro", "f1-micro"]
metrics = all_metrics if score == "all" else [score]
summary = Summary()
summary.acquire()
nl = 50
num = 100
for metric in metrics:
title = f"BEST RESULTS of {metric} for datasets"
best = summary.best_results_datasets(score=metric)
for key, item in best.items():
print(f"{key:30s} {item[2]:{nl}s}")
print("-" * num)
print(f"{item[0]:30.7f} {json.dumps(item[1]):{nl}s}")
print("-" * num)
print(f"{item[3]:{nl+num}s}")
print("*" * num)

48
benchmark/scripts/be_build_best Executable file

@@ -0,0 +1,48 @@
#!/usr/bin/env python
import argparse
from benchmark.Results import ReportBest
from benchmark.Experiments import Datasets, BestResults
from benchmark.Utils import EnvDefault
"""Build a json file with the best results of a model and its hyperparameters
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
ap.add_argument(
"-m",
"--model",
action=EnvDefault,
envvar="model",
type=str,
required=True,
help="model name.",
)
ap.add_argument(
"-r",
"--report",
type=bool,
required=False,
help="Generate Report",
)
args = ap.parse_args()
return (args.score, args.model, args.report)
(score, model, report) = parse_arguments()
datasets = Datasets()
best = BestResults(score, model, datasets)
best.build()
if report:
report = ReportBest(score, model)
report.report()

107
benchmark/scripts/be_build_grid Executable file

@@ -0,0 +1,107 @@
#!/usr/bin/env python
import os
import json
from benchmark.Utils import Files, Folders
data = [
'{"C": 1e4, "gamma": 0.1, "kernel": "rbf"}',
'{"C": 7, "gamma": 0.14, "kernel": "rbf"}',
'{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 0.2, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 0.95, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"kernel": "rbf"}',
'{"kernel": "rbf"}',
'{"C": 1.05, "gamma": "auto","kernel": "rbf"}',
'{"splitter": "random", "max_features": "auto"}',
'{"C": 0.05, "max_features": "auto", "kernel": "liblinear", '
'"multiclass_strategy": "ovr"}',
'{"kernel": "rbf", "C": 0.05}',
'{"C": 0.05, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
'{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "gamma": 0.1, "kernel": "rbf"}',
'{"C": 0.25, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 0.08, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 0.001, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 2.8, "kernel": "rbf", "gamma": "auto"}',
'{"kernel": "rbf"}',
'{"C": 0.05, "gamma": 0.1, "kernel": "poly"}',
'{"C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": '
'"ovr"}',
'{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C":57, "kernel": "rbf"}',
'{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": "ovr"}',
'{"C": 5, "kernel": "rbf", "gamma": "auto"}',
'{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", '
'"multiclass_strategy": "ovr"}',
'{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": "ovr"}',
'{"kernel": "rbf", "gamma": 0.001}',
'{"C": 1e4, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 2.83, "kernel": "rbf", "gamma": "auto"}',
'{"C": 0.2, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr"}',
'{"kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 2, "gamma": "auto", "kernel": "rbf"}',
'{"C": 1.75, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
]
results = {}
output = []
hyper = ["C", "gamma", "kernel", "multiclass_strategy"]
kernels = ["linear", "liblinear", "rbf", "poly"]
# initialize results
for kernel in kernels:
results[kernel] = {}
for item in hyper:
results[kernel][item] = []
# load data
for sample in data:
line = json.loads(sample)
if "kernel" not in line:
line["kernel"] = "linear"
kernel = line["kernel"]
for item in hyper:
if item in line and line[item] not in results[kernel][item]:
results[kernel][item].append(line[item])
# Add default values and remove inconsistent values
results["linear"]["multiclass_strategy"] = ["ovo"]
del results["linear"]["gamma"]
del results["liblinear"]["gamma"]
results["rbf"]["gamma"].append("scale")
results["poly"]["gamma"].append("scale")
results["poly"]["multiclass_strategy"].append("ovo")
for kernel in kernels:
results[kernel]["C"].append(1.0)
for item in results:
results_tmp = {"n_jobs": [-1], "n_estimators": [100]}
for key, value in results[item].items():
new_key = f"base_estimator__{key}"
try:
results_tmp[new_key] = sorted(value)
except TypeError:
t1 = sorted(
[
x
for x in value
if isinstance(x, int) or isinstance(x, float)
]
)
t2 = sorted([x for x in value if isinstance(x, str)])
results_tmp[new_key] = t1 + t2
output.append(results_tmp)
# save results
file_name = Files.grid_input("accuracy", "ODTE")
file_output = os.path.join(Folders.results, file_name)
with open(file_output, "w") as f:
json.dump(output, f, indent=4)
print(f"Grid values saved to {file_output}")

101
benchmark/scripts/be_grid Executable file

@@ -0,0 +1,101 @@
#!/usr/bin/env python
import argparse
from benchmark.Experiments import GridSearch, Datasets
from benchmark.Utils import EnvDefault
"""Run a grid search for a model over the datasets and store the best hyperparameters per dataset in the grid output file
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
ap.add_argument(
"-P",
"--platform",
action=EnvDefault,
envvar="platform",
type=str,
required=True,
help="Platform where the test is run",
)
ap.add_argument(
"-m",
"--model",
type=str,
required=True,
help="model name",
)
ap.add_argument(
"-n",
"--n_folds",
action=EnvDefault,
envvar="n_folds",
type=int,
required=True,
help="number of folds",
)
ap.add_argument(
"-q",
"--quiet",
type=bool,
default=False,
required=False,
help="Whether to show the progress bar or not",
)
ap.add_argument(
"-t",
"--stratified",
action=EnvDefault,
envvar="stratified",
type=str,
required=True,
help="Stratified k-fold ('1' to enable)",
)
ap.add_argument(
"-d",
"--dataset",
type=str,
required=True,
default=None,
help="Gridsearch on this dataset",
)
args = ap.parse_args()
return (
args.stratified,
args.score,
args.model,
args.n_folds,
args.platform,
args.quiet,
args.dataset,
)
(
stratified,
score,
model,
folds,
platform,
quiet,
dataset,
) = parse_arguments()
job = GridSearch(
score_name=score,
model_name=model,
stratified=stratified,
datasets=Datasets(dataset_name=dataset),
progress_bar=not quiet,
platform=platform,
folds=folds,
)
job.do_gridsearch()

71
benchmark/scripts/be_list Executable file

@@ -0,0 +1,71 @@
#!/usr/bin/env python
import argparse
from benchmark.Results import Summary
"""List experiments of a model
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-x",
"--excel",
type=bool,
required=False,
help="Generate Excel file",
)
ap.add_argument(
"-s",
"--score",
type=str,
required=False,
help="score used in experiment",
)
ap.add_argument(
"-m",
"--model",
type=str,
required=False,
help="model used in experiment",
)
ap.add_argument(
"-k",
"--key",
type=str,
required=False,
default="date",
help="key to sort results",
)
ap.add_argument(
"--hidden",
type=str,
required=False,
default=False,
help="Show hidden results",
)
ap.add_argument(
"-n",
"--number",
type=int,
required=False,
default=0,
help="number of results to show, 0 to show all",
)
args = ap.parse_args()
return (
args.excel,
args.score,
args.model,
args.key,
args.number,
args.hidden,
)
if __name__ == "__main__":
(excel, score, model, key, number, hidden) = parse_arguments()
data = Summary(hidden=hidden)
data.acquire()
data.list_results(score=score, model=model, sort_key=key, number=number)

158
benchmark/scripts/be_main Executable file

@@ -0,0 +1,158 @@
#!/usr/bin/env python
import os
import argparse
from benchmark.Experiments import Experiment, Datasets
from benchmark.Results import Report
from benchmark.Utils import EnvDefault
"""Run the experiment and build the result file, optionally printing a report with the results
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
ap.add_argument(
"-P",
"--platform",
action=EnvDefault,
envvar="platform",
type=str,
required=True,
help="Platform where the test is run",
)
ap.add_argument(
"-m",
"--model",
type=str,
required=True,
help="model name",
)
ap.add_argument(
"-n",
"--n_folds",
action=EnvDefault,
envvar="n_folds",
type=int,
required=True,
help="number of folds",
)
ap.add_argument(
"-p", "--hyperparameters", type=str, required=False, default="{}"
)
ap.add_argument(
"-f",
"--paramfile",
type=bool,
required=False,
default=False,
help="Use best hyperparams file?",
)
ap.add_argument(
"-g",
"--grid_paramfile",
type=bool,
required=False,
default=False,
help="Use grid searched hyperparams file?",
)
ap.add_argument(
"--title", type=str, required=True, help="experiment title"
)
ap.add_argument(
"-q",
"--quiet",
type=bool,
default=False,
required=False,
help="Whether to show the progress bar or not",
)
ap.add_argument(
"-r",
"--report",
type=bool,
default=False,
required=False,
help="Report results",
)
ap.add_argument(
"-t",
"--stratified",
action=EnvDefault,
envvar="stratified",
type=str,
required=True,
help="Stratified k-fold ('1' to enable)",
)
ap.add_argument(
"-d",
"--dataset",
type=str,
required=False,
default=None,
help="Experiment with only this dataset",
)
args = ap.parse_args()
return (
args.stratified,
args.score,
args.model,
args.n_folds,
args.platform,
args.quiet,
args.hyperparameters,
args.paramfile,
args.grid_paramfile,
args.report,
args.title,
args.dataset,
)
if __name__ == "__main__":
(
stratified,
score,
model,
folds,
platform,
quiet,
hyperparameters,
paramfile,
grid_paramfile,
report,
experiment_title,
dataset,
) = parse_arguments()
report = report or dataset is not None
if grid_paramfile:
paramfile = False
job = Experiment(
score_name=score,
model_name=model,
stratified=stratified,
datasets=Datasets(dataset_name=dataset),
hyperparams_dict=hyperparameters,
hyperparams_file=paramfile,
grid_paramfile=grid_paramfile,
progress_bar=not quiet,
platform=platform,
title=experiment_title,
folds=folds,
)
job.do_experiment()
if report:
result_file = job.get_output_file()
report = Report(result_file)
report.report()
if dataset is not None:
print(f"Partial result file removed: {result_file}")
os.remove(result_file)

71
benchmark/scripts/be_pair_check Executable file

@@ -0,0 +1,71 @@
#!/usr/bin/env python
import argparse
from benchmark.Results import PairCheck
from benchmark.Utils import EnvDefault
"""Compare the best results of two models, reporting scores and win-tie-lose counts
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
ap.add_argument(
"-m1",
"--model1",
type=str,
required=True,
help="model 1 name",
)
ap.add_argument(
"-m2",
"--model2",
type=str,
required=True,
help="model 2 name",
)
ap.add_argument(
"-w",
"--win",
type=bool,
default=False,
required=False,
help="show win results",
)
ap.add_argument(
"-l",
"--lose",
type=bool,
default=False,
required=False,
help="show lose results",
)
args = ap.parse_args()
return (
args.score,
args.model1,
args.model2,
args.win,
args.lose,
)
if __name__ == "__main__":
(
score,
model1,
model2,
win_results,
lose_results,
) = parse_arguments()
pair_check = PairCheck(score, model1, model2, win_results, lose_results)
pair_check.compute()
pair_check.report()

129
benchmark/scripts/be_print_strees Executable file

@@ -0,0 +1,129 @@
#!/usr/bin/env python
import os
import subprocess
import argparse
import json
from stree import Stree
from graphviz import Source
from benchmark.Experiments import Datasets
from benchmark.Utils import Files, Folders
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-c",
"--color",
type=bool,
required=False,
default=False,
help="use colors for the tree",
)
ap.add_argument(
"-d",
"--dataset",
type=str,
required=False,
default="all",
help="dataset to print, or 'all' for every dataset",
)
ap.add_argument(
"-q",
"--quiet",
type=bool,
required=False,
default=False,
help="don't open the generated tree image(s)",
)
args = ap.parse_args()
return (args.color, args.dataset, args.quiet)
def compute_stree(X, y, random_state):
clf = Stree(random_state=random_state)
clf.fit(X, y)
return clf
def load_hyperparams(score_name, model_name):
grid_file = os.path.join(
Folders.results, Files.grid_output(score_name, model_name)
)
with open(grid_file) as f:
return json.load(f)
def hyperparam_filter(hyperparams):
res = {}
for key, value in hyperparams.items():
if key.startswith("base_estimator"):
newkey = key.split("__")[1]
res[newkey] = value
return res
def build_title(dataset, accuracy, n_samples, n_features, n_classes, nodes):
dataset_chars = f"-{dataset}- f={n_features} s={n_samples} c={n_classes}"
return (
f'<font point-size="25" color="brown">{dataset_chars}<BR/></font>'
f'<font point-size="20" color="red">accuracy: {accuracy:.6f} / '
f"{nodes} nodes</font>"
)
def add_color(source):
return (
source.replace( # Background and title font color
"fontcolor=blue", "fontcolor=white\nbgcolor=darkslateblue"
)
.replace("brown", "cyan") # subtitle font color
.replace( # Fill leaves
"style=filled", 'style="filled" fillcolor="/blues5/1:/blues5/4"'
)
.replace( # Fill nodes
"fontcolor=black",
'style=radial fillcolor="orange:white" gradientangle=60',
)
.replace("color=black", "color=white") # arrow color
.replace( # accuracy / # nodes
'color="red"', 'color="darkolivegreen1"'
)
)
def print_stree(clf, dataset, X, y, color, quiet):
output_folder = "img"
samples, features = X.shape
classes = max(y) + 1
accuracy = clf.score(X, y)
nodes, _ = clf.nodes_leaves()
title = build_title(dataset, accuracy, samples, features, classes, nodes)
dot_source = clf.graph(title)
if color:
dot_source = add_color(dot_source)
grp = Source(dot_source)
file_name = os.path.join(output_folder, f"stree_{dataset}")
grp.render(format="png", filename=f"{file_name}")
os.remove(f"{file_name}")
print(f"File {file_name}.png generated")
if not quiet:
cmd_open = "/usr/bin/open"
if os.path.isfile(cmd_open) and os.access(cmd_open, os.X_OK):
subprocess.run([cmd_open, f"{file_name}.png"])
if __name__ == "__main__":
(color, dataset_chosen, quiet) = parse_arguments()
hyperparameters = load_hyperparams("accuracy", "ODTE")
random_state = 57
dt = Datasets()
for dataset in dt:
if dataset == dataset_chosen or dataset_chosen == "all":
X, y = dt.load(dataset)
clf = Stree(random_state=random_state)
hyperparams_dataset = hyperparam_filter(
hyperparameters[dataset][1]
)
clf.set_params(**hyperparams_dataset)
clf.fit(X, y)
print_stree(clf, dataset, X, y, color, quiet)

23
benchmark/scripts/be_repara Executable file

@@ -0,0 +1,23 @@
#!/usr/bin/env python
import os
import json
from benchmark.Experiments import Files, Folders
versions = dict(SVC="-", STree="1.2.3", ODTE="0.3.2")
results = Files().get_all_results(hidden=False)
for result in results:
print(result)
file_name = os.path.join(Folders.results, result)
with open(file_name) as f:
data = json.load(f)
if "title" not in data:
print(f"Repairing title in {result}")
data["title"] = "default"
if "version" not in data:
print(f"Repairing version in {result}")
model = data["model"]
data["version"] = versions[model] if model in versions else "-"
with open(file_name, "w") as f:
json.dump(data, f, indent=4)

136
benchmark/scripts/be_report Executable file

@@ -0,0 +1,136 @@
#!/usr/bin/env python
import argparse
import numpy as np
from benchmark.Experiments import Datasets
from benchmark.Results import Report, Excel, SQL, ReportBest
from benchmark.Utils import Files, TextColor, EnvDefault
"""Build an on-screen report from a result file, optionally generating Excel and SQL
files, and optionally compare the reported results with the best results obtained by
the model. If no argument is given, display the datasets and their characteristics
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-f",
"--file",
type=str,
required=False,
help="Result file",
)
ap.add_argument(
"-x",
"--excel",
type=bool,
required=False,
help="Generate Excel file",
)
ap.add_argument(
"-q",
"--sql",
type=bool,
required=False,
help="Generate sql file",
)
ap.add_argument(
"-c",
"--compare",
type=bool,
required=False,
help="Compare accuracy with best results",
)
ap.add_argument(
"-b",
"--best",
type=str,
required=False,
help="best results of models",
)
ap.add_argument(
"-g",
"--grid",
type=str,
required=False,
help="grid results of model",
)
ap.add_argument(
"-m",
"--model",
action=EnvDefault,
envvar="model",
type=str,
required=True,
help="model name",
)
ap.add_argument(
"-s",
"--score",
action=EnvDefault,
envvar="score",
type=str,
required=True,
help="score name {accuracy, f1_macro, ...}",
)
args = ap.parse_args()
return (
args.file,
args.excel,
args.sql,
args.compare,
args.best,
args.grid,
args.score,
args.model,
)
def default_report():
sets = Datasets()
color_line = TextColor.LINE1
print(color_line, end="")
print(f"{'Dataset':30s} Samp. Feat Cls Balance")
print("=" * 30 + " ===== ==== === " + "=" * 40)
for line in sets:
X, y = sets.load(line)
color_line = (
TextColor.LINE2
if color_line == TextColor.LINE1
else TextColor.LINE1
)
values, counts = np.unique(y, return_counts=True)
comp = ""
sep = ""
for value, count in zip(values, counts):
comp += f"{sep}{count/sum(counts)*100:5.2f}%"
sep = "/ "
print(color_line, end="")
print(
f"{line:30s} {X.shape[0]:5,d} {X.shape[1]:4d} "
f"{len(np.unique(y)):3d} {comp:40s}"
)
if __name__ == "__main__":
(file, excel, sql, compare, best, grid, score, model) = parse_arguments()
if grid:
best = False
if file is None and best is None:
default_report()
else:
if best is not None or grid is not None:
report = ReportBest(score, model, best, grid)
report.report()
else:
report = Report(file, compare)
report.report()
if excel:
excel = Excel(file, compare)
excel.report()
Files.open(excel.get_file_name())
if sql:
sql = SQL(file)
sql.report()

64
benchmark/scripts/be_summary Executable file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python
import argparse
from benchmark.Results import Summary
from benchmark.Utils import EnvDefault
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-m",
"--model",
type=str,
action=EnvDefault,
envvar="model",
required=True,
help="model name",
)
ap.add_argument(
"-s",
"--score",
type=str,
action=EnvDefault,
envvar="score",
required=True,
help="score name {accuracy, f1_micro, f1_macro, all}",
)
ap.add_argument(
"-l",
"--list",
type=bool,
required=False,
default=False,
help="List all results",
)
args = ap.parse_args()
return (
args.score,
args.model,
args.list,
)
if __name__ == "__main__":
(
score,
model,
list_results,
) = parse_arguments()
all_metrics = ["accuracy", "f1-macro", "f1-micro"]
metrics = all_metrics if score == "all" else [score]
summary = Summary()
summary.acquire()
for metric in metrics:
title = f"BEST RESULT of {metric} for {model}"
best = summary.best_result(
criterion="model", value=model, score=metric
)
summary.show_result(data=best, title=title)
summary.show_result(
summary.best_result(score=metric), title=f"BEST RESULT of {metric}"
)
summary.show_top(score=metric, n=10)
if list_results:
summary.list_results()

46
benchmark/scripts/be_td Executable file

@@ -0,0 +1,46 @@
#!/usr/bin/env python
import sys
import time
from benchmark.Experiments import Datasets
from benchmark.mufs import MUFS
mufs_i = MUFS()
mufs_c = MUFS()
mufs_f = MUFS()
datasets = Datasets()
iwss_t = iwss_tl = cfs_t = cfs_tl = fcbf_t = fcbf_tl = 0
for i in datasets:
X, y = datasets.load(i)
now = time.time()
mufs_i.iwss(X, y, float(sys.argv[1]))
iwss = time.time() - now
iwss_r = len(mufs_i.get_results())
now = time.time()
mufs_c.cfs(X, y)
cfs = time.time() - now
cfs_r = len(mufs_c.get_results())
now = time.time()
mufs_f.fcbf(X, y, 1e-5)
fcbf = time.time() - now
fcbf_r = len(mufs_f.get_results())
print(
f"{i:30s} {iwss:.4f}({iwss_r:2d}) {cfs:.4f}({cfs_r:2d}) {fcbf:.4f}"
f"({fcbf_r:2d})"
)
iwss_t += iwss
iwss_tl += iwss_r
cfs_t += cfs
cfs_tl += cfs_r
fcbf_t += fcbf
fcbf_tl += fcbf_r
num = len(list(datasets))
iwss_t /= num
iwss_tl /= num
cfs_t /= num
cfs_tl /= num
fcbf_t /= num
fcbf_tl /= num
print(
f"{'Average ..: ':30s} {iwss_t:.4f}({iwss_tl:.2f}) {cfs_t:.4f}"
f"({cfs_tl:.2f}) {fcbf_t:.4f}({fcbf_tl:.2f})"
)