Create benchmark

2021-09-24 11:18:38 +02:00
parent ebe768f566
commit 2fc188adca
65 changed files with 27900 additions and 0 deletions

src/Experiments.py (new file, 267 lines)

@@ -0,0 +1,267 @@
import os
import json
import random
import warnings
import time
from datetime import datetime
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from stree import Stree
from Utils import Folders, Files
class Randomized:
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
class Models:
@staticmethod
def get_model(name):
if name == "STree":
return Stree
elif name == "Cart":
return DecisionTreeClassifier
else:
msg = f"No model recognized {name}"
if name == "Stree" or name == "stree":
msg += ", did you mean STree?"
raise ValueError(msg)
class Diterator:
def __init__(self, data):
self._stack = data.copy()
def __next__(self):
if len(self._stack) == 0:
raise StopIteration()
return self._stack.pop(0)
class Datasets:
def __init__(self):
with open(os.path.join(Folders.data, Files.index)) as f:
self.data_sets = f.read().splitlines()
def load(self, name):
data = pd.read_csv(
os.path.join(Folders.data, Files.dataset(name)),
sep="\t",
index_col=0,
)
X = data.drop("clase", axis=1).to_numpy()
y = data["clase"].to_numpy()
return X, y
def __iter__(self) -> Diterator:
return Diterator(self.data_sets)
class BestResults:
def __init__(self, model, datasets):
self.datasets = datasets
self.model = model
self.data = {}
def _get_file_name(self):
return os.path.join(Folders.results, Files.best_results(self.model))
def load(self, dictionary):
self.file_name = self._get_file_name()
try:
with open(self.file_name) as f:
self.data = json.load(f)
except FileNotFoundError:
raise ValueError(f"{self.file_name} does not exist")
return self.fill(dictionary, self.data)
def fill(self, dictionary, data=None):
if data is None:
data = {}
for dataset in self.datasets:
if dataset not in data:
data[dataset] = (0.0, dictionary, "")
return data
def _process_datafile(self, results, data, file_name):
for record in data["results"]:
dataset = record["dataset"]
if dataset in results:
if record["accuracy"] > results[dataset]["accuracy"]:
record["file_name"] = file_name
results[dataset] = record
else:
record["file_name"] = file_name
results[dataset] = record
def build(self):
results = {}
init_suffix, end_suffix = Files.results_suffixes(self.model)
all_files = list(os.walk(Folders.results))
for root, _, files in tqdm(all_files, desc="files"):
for name in files:
if name.startswith(init_suffix) and name.endswith(end_suffix):
file_name = os.path.join(root, name)
with open(file_name) as fp:
data = json.load(fp)
self._process_datafile(results, data, name)
# Build best results json file
output = {}
datasets = Datasets()
for name in tqdm(list(datasets), desc="datasets"):
output[name] = (
results[name]["accuracy"],
results[name]["hyperparameters"],
results[name]["file_name"],
)
self.data = output
with open(self._get_file_name(), "w") as fp:
json.dump(output, fp)
class Experiment:
def __init__(
self,
model_name,
datasets,
hyperparams_dict,
hyperparams_file,
platform,
progress_bar=True,
folds=5,
):
today = datetime.now()
self.time = today.strftime("%H:%M:%S")
self.date = today.strftime("%Y-%m-%d")
self.output_file = os.path.join(
Folders.results,
Files.results(model_name, platform, self.date, self.time),
)
self.model_name = model_name
self.model = Models.get_model(model_name)
self.datasets = datasets
dictionary = json.loads(hyperparams_dict)
hyper = BestResults(model=model_name, datasets=datasets)
if hyperparams_file:
self.hyperparameters_dict = hyper.load(
dictionary=dictionary,
)
else:
self.hyperparameters_dict = hyper.fill(
dictionary=dictionary,
)
self.platform = platform
self.progress_bar = progress_bar
self.folds = folds
self.random_seeds = Randomized.seeds
self.results = []
self.duration = 0
self._init_experiment()
def get_output_file(self):
return self.output_file
def _build_classifier(self, random_state, hyperparameters):
clf = self.model(random_state=random_state)
clf.set_params(**hyperparameters)
return clf
def _init_experiment(self):
self.scores = []
self.times = []
self.nodes = []
self.leaves = []
self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters):
if self.scores != []:
raise ValueError("Must init experiment before!")
loop = tqdm(
self.random_seeds,
position=1,
leave=False,
disable=not self.progress_bar,
)
for random_state in loop:
loop.set_description(f"Seed({random_state:4d})")
random.seed(random_state)
np.random.seed(random_state)
kfold = StratifiedKFold(
shuffle=True, random_state=random_state, n_splits=self.folds
)
clf = self._build_classifier(random_state, hyperparameters)
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
res = cross_validate(
clf, X, y, cv=kfold, return_estimator=True
)
self.scores.append(res["test_score"])
self.times.append(res["fit_time"])
for result_item in res["estimator"]:
if self.model_name == "Cart":
nodes_item = result_item.tree_.node_count
depth_item = result_item.tree_.max_depth
leaves_item = result_item.get_n_leaves()
else:
nodes_item, leaves_item = result_item.nodes_leaves()
depth_item = (
result_item.depth_
if hasattr(result_item, "depth_")
else 0
)
self.nodes.append(nodes_item)
self.leaves.append(leaves_item)
self.depths.append(depth_item)
def _add_results(self, name, hyperparameters, samples, features, classes):
record = {}
record["dataset"] = name
record["samples"] = samples
record["features"] = features
record["classes"] = classes
record["hyperparameters"] = hyperparameters
record["nodes"] = np.mean(self.nodes)
record["leaves"] = np.mean(self.leaves)
record["depth"] = np.mean(self.depths)
record["accuracy"] = np.mean(self.scores)
record["accuracy_std"] = np.std(self.scores)
record["time"] = np.mean(self.times)
record["time_std"] = np.std(self.times)
self.results.append(record)
def _output_results(self):
output = {}
output["model"] = self.model_name
output["folds"] = self.folds
output["date"] = self.date
output["time"] = self.time
output["duration"] = self.duration
output["seeds"] = self.random_seeds
output["platform"] = self.platform
output["results"] = self.results
with open(self.output_file, "w") as f:
json.dump(output, f)
def do_experiment(self):
now = time.time()
loop = tqdm(
list(self.datasets),
position=0,
disable=not self.progress_bar,
)
for name in loop:
loop.set_description(f"{name:30s}")
X, y = self.datasets.load(name)
samp, feat = X.shape
n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes)
self.duration = time.time() - now
self._output_results()
if self.progress_bar:
print(f"Results in {self.output_file}")

src/Results.py (new file, 396 lines)

@@ -0,0 +1,396 @@
import os
import json
import abc
import xlsxwriter
from Experiments import Datasets, BestResults
from Utils import Folders, Files, Symbols
class BaseReport(abc.ABC):
def __init__(self, file_name, best_file=False):
self.file_name = file_name
if not os.path.isfile(file_name):
raise ValueError(f"{file_name} does not exists!")
with open(file_name) as f:
self.data = json.load(f)
self.best_acc_file = best_file
self.lines = self.data if best_file else self.data["results"]
def _get_accuracy(self, item):
return self.data[item][0] if self.best_acc_file else item["accuracy"]
def report(self):
self.header()
accuracy_total = 0.0
for result in self.lines:
self.print_line(result)
accuracy_total += self._get_accuracy(result)
self.footer(accuracy_total)
def _load_best_results(self, model):
best = BestResults(model, Datasets())
self.best_results = best.load({})
def _compute_status(self, dataset, accuracy):
best = self.best_results[dataset][0]
status = " "
if accuracy == best:
status = Symbols.equal_best
elif accuracy > best:
status = Symbols.better_best
if status != " ":
if status not in self._compare_totals:
self._compare_totals[status] = 1
else:
self._compare_totals[status] += 1
return status
@staticmethod
def _status_meaning(status):
meaning = {
Symbols.equal_best: "Equal to best",
Symbols.better_best: "Better than best",
}
return meaning[status]
@abc.abstractmethod
def header(self):
pass
@abc.abstractmethod
def print_line(self, result):
pass
@abc.abstractmethod
def footer(self, accuracy):
pass
class Report(BaseReport):
header_lengths = [30, 5, 3, 3, 7, 7, 7, 15, 15, 15]
header_cols = [
"Dataset",
"Samp",
"Var",
"Cls",
"Nodes",
"Leaves",
"Depth",
"Accuracy",
"Time",
"Hyperparameters",
]
def __init__(self, file_name, compare=False):
super().__init__(file_name)
self.compare = compare
def header_line(self, text):
length = sum(self.header_lengths) + len(self.header_lengths) - 3
if text == "*":
print("*" * (length + 2))
else:
print(f"*{text:{length}s}*")
def print_line(self, result):
hl = self.header_lengths
i = 0
print(f"{result['dataset']:{hl[i]}s} ", end="")
i += 1
print(f"{result['samples']:{hl[i]},d} ", end="")
i += 1
print(f"{result['features']:{hl[i]}d} ", end="")
i += 1
print(f"{result['classes']:{hl[i]}d} ", end="")
i += 1
print(f"{result['nodes']:{hl[i]}.2f} ", end="")
i += 1
print(f"{result['leaves']:{hl[i]}.2f} ", end="")
i += 1
print(f"{result['depth']:{hl[i]}.2f} ", end="")
i += 1
if self.compare:
status = self._compute_status(
result["dataset"], result["accuracy"]
)
else:
status = " "
print(
f"{result['accuracy']:8.6f}±{result['accuracy_std']:6.4f}{status}",
end="",
)
i += 1
print(
f"{result['time']:8.6f}±{result['time_std']:6.4f} ",
end="",
)
i += 1
print(f"{str(result['hyperparameters']):{hl[i]}s} ")
def header(self):
if self.compare:
self._load_best_results(self.data["model"])
self._compare_totals = {}
self.header_line("*")
self.header_line(
f" Report {self.data['model']} with {self.data['folds']} Folds "
f"cross validation and {len(self.data['seeds'])} random seeds"
)
self.header_line(f" Random seeds: {self.data['seeds']}")
self.header_line(
f" Execution took {self.data['duration']:7.2f} seconds on an "
f"{self.data['platform']}"
)
self.header_line("*")
print("")
line_col = ""
for field, underscore in zip(self.header_cols, self.header_lengths):
print(f"{field:{underscore}s} ", end="")
line_col += "=" * underscore + " "
print(f"\n{line_col}")
def footer(self, accuracy):
self.header_line("*")
if self.compare:
for key, value in self._compare_totals.items():
self.header_line(
f" {key} {self._status_meaning(key)} .....: {value:2d}"
)
self.header_line(
f" Accuracy compared to stree_default (liblinear-ovr) .: "
f"{accuracy/40.282203:7.4f}"
)
self.header_line("*")
class ReportBest(BaseReport):
header_lengths = [30, 8, 50, 35]
header_cols = [
"Dataset",
"Accuracy",
"File",
"Hyperparameters",
]
def __init__(self, model):
file_name = os.path.join(Folders.results, Files.best_results(model))
super().__init__(file_name, best_file=True)
self.compare = False
self.model = model
def header_line(self, text):
length = sum(self.header_lengths) + len(self.header_lengths) - 3
if text == "*":
print("*" * (length + 2))
else:
print(f"*{text:{length}s}*")
def print_line(self, result):
hl = self.header_lengths
print(f"{result:{hl[0]}s} ", end="")
print(
f"{self.data[result][0]:8.6f} ",
end="",
)
print(
f"{self.data[result][2]:{hl[2]}s} ",
end="",
)
print(f"{str(self.data[result][1]):{hl[1]}s} ")
def header(self):
self.header_line("*")
self.header_line(
f" Report Best Accuracies with {self.model}" f" in any platform"
)
self.header_line("*")
print("")
line_col = ""
for field, underscore in zip(self.header_cols, self.header_lengths):
print(f"{field:{underscore}s} ", end="")
line_col += "=" * underscore + " "
print(f"\n{line_col}")
def footer(self, accuracy):
self.header_line("*")
if self.compare:
for key, value in self._compare_totals.items():
self.header_line(
f" {key} {self._status_meaning(key)} .....: {value:2d}"
)
self.header_line(
f" Accuracy compared to stree_default (liblinear-ovr) .: "
f"{accuracy/40.282203:7.4f}"
)
self.header_line("*")
class Excel(BaseReport):
row = 4
def __init__(self, file_name, compare=False):
super().__init__(file_name)
self.compare = compare
def header(self):
if self.compare:
self._load_best_results(self.data["model"])
self._compare_totals = {}
file_name = self.file_name.replace(".json", ".xlsx")
self.book = xlsxwriter.Workbook(file_name)
self.sheet = self.book.add_worksheet(self.data["model"])
header = self.book.add_format()
header.set_font_size(18)
subheader = self.book.add_format()
subheader.set_font_size(16)
self.sheet.write(
0,
0,
f" Report {self.data['model']} with {self.data['folds']} Folds "
f"cross validation and {len(self.data['seeds'])} random seeds",
header,
)
self.sheet.write(
1,
0,
f" Execution took {self.data['duration']:7.2f} seconds on an "
f"{self.data['platform']}",
subheader,
)
self.sheet.write(
1, 5, f"Random seeds: {self.data['seeds']}", subheader
)
header_cols = [
("Dataset", 30),
("Samples", 10),
("Variables", 7),
("Classes", 7),
("Nodes", 7),
("Leaves", 7),
("Depth", 7),
("Accuracy", 10),
("Acc. Std.", 10),
("Time", 10),
("Time Std.", 10),
("Parameters", 50),
]
if self.compare:
header_cols.insert(8, ("Stat", 3))
bold = self.book.add_format({"bold": True, "font_size": 14})
i = 0
for item, length in header_cols:
self.sheet.write(3, i, item, bold)
self.sheet.set_column(i, i, length)
i += 1
def print_line(self, result):
size_n = 14
decimal = self.book.add_format(
{"num_format": "0.000000", "font_size": size_n}
)
integer = self.book.add_format(
{"num_format": "#,###", "font_size": size_n}
)
normal = self.book.add_format({"font_size": size_n})
col = 0
self.sheet.write(self.row, col, result["dataset"], normal)
self.sheet.write(self.row, col + 1, result["samples"], integer)
self.sheet.write(self.row, col + 2, result["features"], normal)
self.sheet.write(self.row, col + 3, result["classes"], normal)
self.sheet.write(self.row, col + 4, result["nodes"], normal)
self.sheet.write(self.row, col + 5, result["leaves"], normal)
self.sheet.write(self.row, col + 6, result["depth"], normal)
self.sheet.write(self.row, col + 7, result["accuracy"], decimal)
if self.compare:
status = self._compute_status(
result["dataset"], result["accuracy"]
)
self.sheet.write(self.row, col + 8, status, normal)
col = 9
else:
col = 8
self.sheet.write(self.row, col, result["accuracy_std"], decimal)
self.sheet.write(self.row, col + 1, result["time"], decimal)
self.sheet.write(self.row, col + 2, result["time_std"], decimal)
self.sheet.write(
self.row, col + 3, str(result["hyperparameters"]), normal
)
self.row += 1
def footer(self, accuracy):
if self.compare:
self.row += 2
bold = self.book.add_format({"bold": True, "font_size": 16})
for key, total in self._compare_totals.items():
self.sheet.write(self.row, 1, key, bold)
self.sheet.write(self.row, 2, total, bold)
self.sheet.write(self.row, 3, self._status_meaning(key), bold)
self.row += 1
message = (
f"** Accuracy compared to stree_default (liblinear-ovr) .: "
f"{accuracy/40.282203:7.4f}"
)
bold = self.book.add_format({"bold": True, "font_size": 14})
self.sheet.write(self.row + 1, 0, message, bold)
self.book.close()
class SQL(BaseReport):
table_name = "results"
def header(self):
file_name = self.file_name.replace(".json", ".sql")
self.file = open(file_name, "w")
def print_line(self, result):
attributes = [
"date",
"time",
"type",
"accuracy",
"accuracy_std",
"dataset",
"classifier",
"norm",
"stand",
"time_spent",
"time_spent_std",
"parameters",
"nodes",
"leaves",
"depth",
"platform",
"nfolds",
"seeds",
]
command_insert = (
f"replace into {self.table_name} ("
+ ",".join(attributes)
+ ") values("
+ ("'%s'," * len(attributes))[:-1]
+ ");\n"
)
values = (
self.data["date"],
self.data["time"],
"crossval",
result["accuracy"],
result["accuracy_std"],
result["dataset"],
self.data["model"],
0,
1,
result["time"],
result["time_std"],
str(result["hyperparameters"]).replace("'", '"'),
result["nodes"],
result["leaves"],
result["depth"],
self.data["platform"],
self.data["folds"],
str(self.data["seeds"]),
)
self.file.write(command_insert % values)
def footer(self, accuracy):
self.file.close()
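
For reference, a minimal sketch of how these report classes are usually driven (see src/report.py further down); the result file name here is made up.

# Illustrative sketch; the result file name below is made up.
from Results import Report, Excel, SQL, ReportBest

result_file = "results/results_STree_my_machine_2021-09-24_11:18:38.json"

# compare=True also needs results/best_results_STree.json to exist.
Report(result_file, compare=True).report()  # console report, flags ties/improvements vs. best results
Excel(result_file, compare=True).report()   # writes the same data to a .xlsx next to the .json
SQL(result_file).report()                   # writes "replace into results (...)" statements to a .sql file
ReportBest("STree").report()                # prints results/best_results_STree.json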

src/Utils.py (new file, 40 lines)

@@ -0,0 +1,40 @@
import os
class Folders:
data = "data"
results = "results"
src = "src"
report = os.path.join("exreport", "exreport_output")
class Files:
index = "all.txt"
exreport_output = "exreport.txt"
exreport_err = "exreport_err.txt"
cmd_open = "/usr/bin/open"
exreport_pdf = "Rplots.pdf"
@staticmethod
def best_results(model):
return f"best_results_{model}.json"
@staticmethod
def results(model, platform, date, time):
return f"results_{model}_{platform}_{date}_{time}.json"
@staticmethod
def results_suffixes(model):
return f"results_{model}_", ".json"
@staticmethod
def dataset(name):
return f"{name}_R.dat"
class Symbols:
check_mark = "\N{heavy check mark}"
exclamation = "\N{heavy exclamation mark symbol}"
black_star = "\N{black star}"
equal_best = check_mark
better_best = black_star
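
A few illustrative calls to the naming helpers (the platform label and timestamp are made up):

# Illustrative only; "my_machine" and the timestamp are made up.
from Utils import Files

Files.results("STree", "my_machine", "2021-09-24", "11:18:38")
# -> "results_STree_my_machine_2021-09-24_11:18:38.json"
Files.best_results("STree")      # -> "best_results_STree.json"
Files.results_suffixes("STree")  # -> ("results_STree_", ".json")
Files.dataset("iris")            # -> "iris_R.dat"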

src/__init__.py (new file, 3 lines)

@@ -0,0 +1,3 @@
from .Experiments import Datasets, Experiment
__all__ = ["Datasets", "Experiment"]

src/benchmark.py (new file, 46 lines)

@@ -0,0 +1,46 @@
import os
import shutil
import subprocess
from Utils import Files, Folders
def end_message(message, file):
length = 100
print("*" * length)
print(message)
print("*" * length)
with open(os.path.join(Folders.results, file)) as f:
data = f.read().splitlines()
for line in data:
print(line)
def is_exe(fpath):
return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
# Remove previous results
try:
shutil.rmtree(Folders.report)
os.remove(Files.exreport_pdf)
except FileNotFoundError:
pass
except OSError as e:
print("Error: %s : %s" % (Folders.report, e.strerror))
# Compute Friedman & Holm Tests
fout = open(os.path.join(Folders.results, Files.exreport_output), "w")
ferr = open(os.path.join(Folders.results, Files.exreport_err), "w")
result = subprocess.run(
["Rscript", os.path.join(Folders.src, "benchmark.r")],
stdout=fout,
stderr=ferr,
)
fout.close()
ferr.close()
if result.returncode != 0:
end_message("Error computing benchmark", Files.exreport_err)
else:
end_message("Benchmark Ok", Files.exreport_output)
if is_exe(Files.cmd_open):
subprocess.run([Files.cmd_open, Files.exreport_pdf])

src/benchmark.r (new file, 35 lines)

@@ -0,0 +1,35 @@
csv_file <- "results/exreport.csv"
destination <- "exreport/"
results <- read.csv(csv_file)
library(exreport)
experiment <- expCreate(results, method="classifier", problem="dataset", name="Stree")
testAccuracy <- testMultipleControl(experiment, "accuracy", "max")
summary(testAccuracy)
table1 <- tabularTestSummary(testAccuracy, columns = c("pvalue", "rank", "wtl"))
table1
plot1 <- plotExpSummary(experiment, "accuracy", columns = 3)
plot2 <- plotCumulativeRank(testAccuracy)
plot3 <- plotRankDistribution(testAccuracy)
report <- exreport("Stree Report")
# Add the experiment object for reference:
report <- exreportAdd(report, experiment)
# Now add the test:
report <- exreportAdd(report, testAccuracy)
# Finally you can add the different tables and plots.
report <- exreportAdd(report, list(plot1,plot2,table1,plot3))
# At this point we would like to include an additional item in our report: a detailed table of our experiment,
# as we are preparing a scientific paper and we would like to have an overview of it included in an annex,
# despite the good summaries that we are providing with the plots and tests. Fortunately, we have another built-in
# function for this.
# We generate the table at this point of the tutorial to discuss some special formatting parameters of this function.
# Concretely, some of the tabular outputs generated by exreport have properties that are only useful when rendering
# the objects in a graphic report, and have no effect on the object representation in the R console. In this case,
# we tell the function to boldface the method that maximizes the result for each column, and to split the table
# into two pieces when rendering.
# We create the table:
table2 <- tabularExpSummary(experiment, "accuracy", digits=4, format="f", boldfaceColumns="max", tableSplit=2)
# And add it to the report:
report <- exreportAdd(report, table2)
# Now that we have finished adding elements to the report, it is time to render it. We want to generate an HTML
# report, so we call the appropriate function. By default it renders and opens the report in your browser using a
# temporary file, but you can optionally specify a folder in which the report will be saved for future use.
# Render the report:
exreportRender(report, destination=destination, target = "html", visualize = T)
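
The results/exreport.csv file this script reads is not part of this commit; from the expCreate() call above it needs at least classifier, dataset, and accuracy columns. Below is a hedged sketch of how such a file could be aggregated from the result JSON files; the helper is hypothetical and not code from this repository.

# Hypothetical helper, not part of this commit: collect accuracies from the
# results_*.json files into the classifier/dataset/accuracy CSV that
# benchmark.r reads (assuming those are the only columns it needs).
import os
import json
import csv
from Utils import Folders, Files


def build_exreport_csv(models, output="exreport.csv"):
    rows = []
    for model in models:
        init_suffix, end_suffix = Files.results_suffixes(model)
        for name in os.listdir(Folders.results):
            if name.startswith(init_suffix) and name.endswith(end_suffix):
                with open(os.path.join(Folders.results, name)) as f:
                    data = json.load(f)
                for record in data["results"]:
                    rows.append(
                        dict(
                            classifier=data["model"],
                            dataset=record["dataset"],
                            accuracy=record["accuracy"],
                        )
                    )
    with open(os.path.join(Folders.results, output), "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["classifier", "dataset", "accuracy"])
        writer.writeheader()
        writer.writerows(rows)

Its output would then be picked up by the read.csv() call at the top of benchmark.r.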


@@ -0,0 +1,375 @@
{
"balance-scale": [
0,
{
"C": 10000.0,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"balloons": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc-diag": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc-prog": [
0,
{
"C": 0.2,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"breast-cancer-wisc": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"breast-cancer": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"cardiotocography-10clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"cardiotocography-3clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"conn-bench-sonar-mines-rocks": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"cylinder-bands": [0, { "kernel": "rbf", "multiclass_strategy": "ovr" }, ""],
"dermatology": [
0,
{
"C": 55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"echocardiogram": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "poly",
"max_features": "auto",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"fertility": [
0,
{
"C": 0.05,
"max_features": "auto",
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"haberman-survival": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"heart-hungarian": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"hepatitis": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"ilpd-indian-liver": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"ionosphere": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"iris": [0, { "kernel": "liblinear", "multiclass_strategy": "ovr" }, ""],
"led-display": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"libras": [
0,
{
"C": 0.08,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"low-res-spect": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"lymphography": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"mammographic": [0, { "kernel": "rbf", "multiclass_strategy": "ovr" }, ""],
"molec-biol-promoter": [
0,
{ "kernel": "rbf", "multiclass_strategy": "ovr" },
""
],
"musk-1": [
0,
{
"C": 0.05,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"oocytes_merluccius_nucleus_4d": [
0,
{ "C": 8.25, "gamma": 0.1, "kernel": "poly", "multiclass_strategy": "ovr" },
""
],
"oocytes_merluccius_states_2f": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"oocytes_trisopterus_nucleus_2f": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"oocytes_trisopterus_states_5b": [
0,
{
"C": 0.11,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"parkinsons": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"pima": [0, { "kernel": "liblinear", "multiclass_strategy": "ovr" }, ""],
"pittsburg-bridges-MATERIAL": [
0,
{
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"pittsburg-bridges-REL-L": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"pittsburg-bridges-SPAN": [
0,
{
"C": 0.05,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"pittsburg-bridges-T-OR-D": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"planning": [
0,
{
"C": 7,
"gamma": 10.0,
"kernel": "rbf",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"post-operative": [
0,
{
"C": 55,
"degree": 5,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"seeds": [
0,
{
"C": 10000.0,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-australian-credit": [
0,
{
"C": 0.05,
"max_features": "auto",
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-german-credit": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"statlog-heart": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"statlog-image": [
0,
{
"C": 7,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"statlog-vehicle": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"synthetic-control": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"tic-tac-toe": [
0,
{
"C": 0.2,
"gamma": 0.1,
"kernel": "poly",
"max_iter": 10000.0,
"multiclass_strategy": "ovr"
},
""
],
"vertebral-column-2clases": [
0,
{ "kernel": "liblinear", "multiclass_strategy": "ovr" },
""
],
"wine": [
0,
{
"C": 0.55,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
],
"zoo": [
0,
{
"C": 0.1,
"max_iter": 10000.0,
"kernel": "liblinear",
"multiclass_strategy": "ovr"
},
""
]
}
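
Each entry in this file follows the (accuracy, hyperparameters, origin file) layout that BestResults produces; an accuracy of 0 and an empty file name indicate a pre-filled entry rather than one taken from a results file. A small sketch of how these values are read back, assuming this is the STree best-results file:

# Sketch assuming this file is results/best_results_STree.json.
from Experiments import Datasets, BestResults

best = BestResults(model="STree", datasets=Datasets())
data = best.load({})  # raises ValueError if the file does not exist
accuracy, hyperparameters, origin_file = data["wine"]
# Experiment.do_experiment() does the same lookup per dataset:
#   hyperparameters = self.hyperparameters_dict[name][1]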

src/build_best.py (new file, 36 lines)

@@ -0,0 +1,36 @@
import argparse
from Results import ReportBest
from Experiments import Datasets, BestResults
"""Build a json file with the best results of a model and its hyperparameters
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-m",
"--model",
type=str,
required=False,
default="STree",
help="model name, dfault STree",
)
ap.add_argument(
"-r",
"--report",
type=bool,
required=False,
help="Generate Report",
)
args = ap.parse_args()
return (args.model, args.report)
(model, report) = parse_arguments()
datasets = Datasets()
best = BestResults(model, datasets)
best.build()
if report:
report = ReportBest(model)
report.report()

src/main.py (new file, 90 lines)

@@ -0,0 +1,90 @@
import argparse
from Experiments import Experiment, Datasets
from Results import Report
"""Do experiment and build result file, optionally print report with results
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-P",
"--platform",
type=str,
required=True,
help="Platform where the test is run",
)
ap.add_argument(
"-m",
"--model",
type=str,
required=False,
default="STree",
help="model name, dfault STree",
)
ap.add_argument(
"-n",
"--n_folds",
type=int,
required=False,
default=5,
help="number of folds",
)
ap.add_argument(
"-p", "--hyperparameters", type=str, required=False, default="{}"
)
ap.add_argument(
"-f", "--paramfile", type=bool, required=False, default=False
)
ap.add_argument(
"-q",
"--quiet",
type=bool,
default=False,
required=False,
help="Wether to show progress bar or not",
)
ap.add_argument(
"-r",
"--report",
type=bool,
default=False,
required=False,
help="Report results",
)
args = ap.parse_args()
return (
args.model,
args.n_folds,
args.platform,
args.quiet,
args.hyperparameters,
args.paramfile,
args.report,
)
(
model,
folds,
platform,
quiet,
hyperparameters,
paramfile,
report,
) = parse_arguments()
job = Experiment(
model_name=model,
datasets=Datasets(),
hyperparams_dict=hyperparameters,
hyperparams_file=paramfile,
progress_bar=not quiet,
platform=platform,
folds=folds,
)
job.do_experiment()
if report:
result_file = job.get_output_file()
report = Report(result_file)
report.report()
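
For reference, the result file written by this run has the shape produced by Experiment._output_results() and Experiment._add_results(); the values below are made up.

# Shape of the results_<model>_<platform>_<date>_<time>.json file (made-up values).
result_file_contents = {
    "model": "STree",
    "folds": 5,
    "date": "2021-09-24",
    "time": "11:18:38",
    "duration": 123.4,
    "seeds": [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1],
    "platform": "my_machine",
    "results": [
        {
            "dataset": "iris",
            "samples": 150,
            "features": 4,
            "classes": 3,
            "hyperparameters": {"kernel": "liblinear"},
            "nodes": 3.0,
            "leaves": 2.0,
            "depth": 2.0,
            "accuracy": 0.96,
            "accuracy_std": 0.02,
            "time": 0.01,
            "time_std": 0.001,
        },
    ],
}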


@@ -0,0 +1 @@
{"balance-scale": {"C": 10000.0, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "balloons": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "breast-cancer-wisc-diag": {"C": 0.2, "max_iter": 10000.0, "kernel": "liblinear"}, "breast-cancer-wisc-prog": {"C": 0.2, "max_iter": 10000.0, "kernel": "liblinear"}, "breast-cancer-wisc": {"kernel": "rbf"}, "breast-cancer": {"kernel": "liblinear"}, "cardiotocography-10clases": {"kernel": "liblinear"}, "cardiotocography-3clases": {"kernel": "liblinear"}, "conn-bench-sonar-mines-rocks": {"kernel": "rbf"}, "cylinder-bands": {"kernel": "rbf"}, "dermatology": {"C": 55, "max_iter": 10000.0, "kernel": "liblinear"}, "echocardiogram": {"C": 7, "gamma": 0.1, "kernel": "poly", "max_features": "auto", "max_iter": 10000.0}, "fertility": {"C": 0.05, "max_features": "auto", "max_iter": 10000.0, "kernel": "liblinear"}, "haberman-survival": {"kernel": "liblinear"}, "heart-hungarian": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "hepatitis": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "ilpd-indian-liver": {"kernel": "liblinear"}, "ionosphere": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "iris": {"kernel": "liblinear"}, "led-display": {"kernel": "liblinear"}, "libras": {"C": 0.08, "max_iter": 10000.0, "kernel": "liblinear"}, "low-res-spect": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "lymphography": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "mammographic": {"kernel": "rbf"}, "molec-biol-promoter": {"kernel": "rbf"}, "musk-1": {"C": 0.05, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "oocytes_merluccius_nucleus_4d": {"C": 8.25, "gamma": 0.1, "kernel": "poly"}, "oocytes_merluccius_states_2f": {"kernel": "liblinear"}, "oocytes_trisopterus_nucleus_2f": {"kernel": "liblinear"}, "oocytes_trisopterus_states_5b": {"C": 0.11, "max_iter": 10000.0, "kernel": "liblinear"}, "parkinsons": {"kernel": "liblinear"}, "pima": {"kernel": "liblinear"}, "pittsburg-bridges-MATERIAL": {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0}, "pittsburg-bridges-REL-L": {"kernel": "liblinear"}, "pittsburg-bridges-SPAN": {"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear"}, "pittsburg-bridges-T-OR-D": {"kernel": "liblinear"}, "planning": {"C": 7, "gamma": 10.0, "kernel": "rbf", "max_iter": 10000.0}, "post-operative": {"C": 55, "degree": 5, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "seeds": {"C": 10000.0, "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-australian-credit": {"C": 0.05, "max_features": "auto", "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-german-credit": {"kernel": "liblinear"}, "statlog-heart": {"kernel": "liblinear"}, "statlog-image": {"C": 7, "max_iter": 10000.0, "kernel": "liblinear"}, "statlog-vehicle": {"kernel": "liblinear"}, "synthetic-control": {"C": 0.55, "max_iter": 10000.0, "kernel": "liblinear"}, "tic-tac-toe": {"C": 0.2, "gamma": 0.1, "kernel": "poly", "max_iter": 10000.0}, "vertebral-column-2clases": {"kernel": "liblinear"}, "wine": {"C": 0.55, "max_iter": 10000.0, "kernel": "liblinear"}, "zoo": {"C": 0.1, "max_iter": 10000.0, "kernel": "liblinear"}}

src/report.py (new file, 81 lines)

@@ -0,0 +1,81 @@
import argparse
import numpy as np
from Experiments import Datasets
from Results import Report, Excel, SQL, ReportBest
"""Build report on screen of a result file, optionally generate excel and sql
file, and can compare results of report with best results obtained by model
If no argument is set, displays the datasets and its characteristics
"""
def parse_arguments():
ap = argparse.ArgumentParser()
ap.add_argument(
"-f",
"--file",
type=str,
required=False,
help="Result file",
)
ap.add_argument(
"-x",
"--excel",
type=bool,
required=False,
help="Generate Excel file",
)
ap.add_argument(
"-q",
"--sql",
type=bool,
required=False,
help="Generate sql file",
)
ap.add_argument(
"-c",
"--compare",
type=bool,
required=False,
help="Compare accuracy with best results",
)
ap.add_argument(
"-b",
"--best",
type=str,
required=False,
help="best results of models",
)
args = ap.parse_args()
return (args.file, args.excel, args.sql, args.compare, args.best)
def default_report():
sets = Datasets()
print(f"{'Dataset':30s} Samp. Feat Cls")
print("=" * 30 + " ===== ==== ===")
for line in sets:
X, y = sets.load(line)
print(
f"{line:30s} {X.shape[0]:5,d} {X.shape[1]:4d} "
f"{len(np.unique(y)):3d}"
)
(file, excel, sql, compare, best) = parse_arguments()
if file is None and best is None:
default_report()
else:
if best is not None:
report = ReportBest(best)
report.report()
else:
report = Report(file, compare)
report.report()
if excel:
excel = Excel(file, compare)
excel.report()
if sql:
sql = SQL(file)
sql.report()

src/requirements.txt (new file, 4 lines)

@@ -0,0 +1,4 @@
pandas
stree
mufs
xlsxwriter