diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index b447d9a..98e2709 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -163,16 +163,18 @@ class Datasets: attr = SimpleNamespace() attr.dataset = name values, counts = np.unique(y, return_counts=True) - comp = "" - sep = "" - for count in counts: - comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) " - sep = "/ " - attr.balance = comp - attr.classes = len(np.unique(y)) + attr.classes = len(values) attr.samples = X.shape[0] attr.features = X.shape[1] attr.cont_features = len(self.get_continuous_features()) + attr.distribution = {} + comp = "" + sep = "" + for value, count in zip(values, counts): + comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) " + sep = "/ " + attr.distribution[value.item()] = count / sum(counts) + attr.balance = comp self.discretize = tmp return attr diff --git a/benchmark/Results.py b/benchmark/Results.py index 76b1874..fa7349f 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -1,139 +1,14 @@ +import math import os -import sys -from pathlib import Path from operator import itemgetter from types import SimpleNamespace -import math -import json -import abc -import shutil -import subprocess + import xlsxwriter -from xlsxwriter.exceptions import DuplicateWorksheetName -import numpy as np -from .Experiments import BestResults + from .Datasets import Datasets -from .Arguments import EnvData, ALL_METRICS -from .Utils import ( - Folders, - Files, - Symbols, - TextColor, - NO_RESULTS, -) -from ._version import __version__ - - -def get_input(message="", is_test=False): - return "test" if is_test else input(message) - - -class BestResultsEver: - def __init__(self): - self.data = {} - for i in ["Tanveer", "Surcov", "Arff"]: - self.data[i] = {} - for metric in ALL_METRICS: - self.data[i][metric.replace("-", "_")] = ["self", 1.0] - self.data[i][metric] = ["self", 1.0] - self.data["Tanveer"]["accuracy"] = [ - "STree_default (liblinear-ovr)", - 40.282203, - ] - self.data["Arff"]["accuracy"] = [ - "STree_default (linear-ovo)", - 22.109799, - ] - - def get_name_value(self, key, score): - return self.data[key][score] - - -class BaseReport(abc.ABC): - def __init__(self, file_name, best_file=False): - self.file_name = file_name - if not os.path.isfile(file_name): - if not os.path.isfile(os.path.join(Folders.results, file_name)): - raise FileNotFoundError(f"{file_name} does not exists!") - else: - self.file_name = os.path.join(Folders.results, file_name) - with open(self.file_name) as f: - self.data = json.load(f) - self.best_acc_file = best_file - if best_file: - self.lines = self.data - else: - self.lines = self.data["results"] - self.score_name = self.data["score_name"] - self.__compute_best_results_ever() - # Set the labels for nodes, leaves, depth - env_data = EnvData.load() - self.nodes_label = env_data["nodes"] - self.leaves_label = env_data["leaves"] - self.depth_label = env_data["depth"] - - def __compute_best_results_ever(self): - args = EnvData.load() - key = args["source_data"] - best = BestResultsEver() - self.best_score_name, self.best_score_value = best.get_name_value( - key, self.score_name - ) - - def _get_accuracy(self, item): - return self.data[item][0] if self.best_acc_file else item["score"] - - def report(self): - self.header() - accuracy_total = 0.0 - for result in self.lines: - self.print_line(result) - accuracy_total += self._get_accuracy(result) - self.footer(accuracy_total) - - def _load_best_results(self, score, model): - best = BestResults(score, model, 
Datasets()) - self.best_results = best.load({}) - - def _compute_status(self, dataset, accuracy: float): - best = self.best_results[dataset][0] - status = " " - if accuracy == best: - status = Symbols.equal_best - elif accuracy > best: - status = Symbols.better_best - if status != " ": - if status not in self._compare_totals: - self._compare_totals[status] = 1 - else: - self._compare_totals[status] += 1 - return status - - @staticmethod - def _status_meaning(status): - meaning = { - Symbols.equal_best: "Equal to best", - Symbols.better_best: "Better than best", - } - return meaning[status] - - def _get_best_accuracy(self): - return self.best_score_value - - def _get_message_best_accuracy(self): - return f"{self.score_name} compared to {self.best_score_name} .:" - - @abc.abstractmethod - def header(self) -> None: - pass - - @abc.abstractmethod - def print_line(self, result) -> None: - pass - - @abc.abstractmethod - def footer(self, accuracy: float) -> None: - pass +from .ResultsBase import BaseReport, StubReport +from .ResultsFiles import Excel +from .Utils import NO_RESULTS, Files, Folders, TextColor class Report(BaseReport): @@ -186,10 +61,7 @@ class Report(BaseReport): i += 1 print(f"{result['depth']:{hl[i]}.2f} ", end="") i += 1 - if self.compare: - status = self._compute_status(result["dataset"], result["score"]) - else: - status = " " + status = self._compute_status(result["dataset"], result["score"]) print( f"{result['score']:8.6f}±{result['score_std']:6.4f}{status}", end="", @@ -207,7 +79,7 @@ class Report(BaseReport): self._load_best_results( self.data["score_name"], self.data["model"] ) - self._compare_totals = {} + self._compare_totals = {} self.header_line("*") self.header_line( f" {self.data['model']} ver. {self.data['version']}" @@ -238,11 +110,13 @@ class Report(BaseReport): def footer(self, accuracy: float) -> None: self.header_line("*") - if self.compare: - for key, value in self._compare_totals.items(): - self.header_line( - f" {key} {self._status_meaning(key)} .....: {value:2d}" - ) + for key, value in self._compare_totals.items(): + meaning = self._status_meaning(key) + self.header_line( + f" {key} {meaning}" + + "." * (30 - len(meaning)) + + f": {value:2d}" + ) self.header_line( f" {self._get_message_best_accuracy()} " f"{accuracy/self._get_best_accuracy():7.4f}" @@ -315,1063 +189,6 @@ class ReportBest(BaseReport): self.header_line("*") -class Excel(BaseReport): - row = 6 - # alternate lines colors - color1 = "#DCE6F1" - color2 = "#FDE9D9" - color3 = "#B1A0C7" - - def __init__(self, file_name, compare=False, book=None): - super().__init__(file_name) - self.compare = compare - if self.compare: - self._load_best_results( - self.data["score_name"], self.data["model"] - ) - self._compare_totals = {} - if book is None: - self.excel_file_name = Path(self.file_name).name.replace( - Files.report_ext, ".xlsx" - ) - self.book = xlsxwriter.Workbook( - os.path.join(Folders.excel, self.excel_file_name), - {"nan_inf_to_errors": True}, - ) - self.set_book_properties() - self.close = True - else: - self.book = book - self.close = False - suffix = "" - num = 1 - while True: - try: - self.sheet = self.book.add_worksheet( - self.data["model"] + suffix - ) - break - except DuplicateWorksheetName: - num += 1 - suffix = str(num) - self.max_hyper_width = 0 - self.col_hyperparams = 0 - - @staticmethod - def set_properties(book, title): - book.set_properties( - { - "title": title, - "subject": "Machine learning results", - "author": "Ricardo Montañana Gómez", - "manager": "Dr. J. A. Gámez, Dr. 
J. M. Puerta", - "company": "UCLM", - "comments": "Created with Python and XlsxWriter", - } - ) - - def set_book_properties(self): - self.set_properties(self.book, self.get_title()) - - def get_title(self): - return ( - f" {self.data['model']} ver. {self.data['version']}" - f" {self.data['language']} ver. {self.data['language_version']}" - f" with {self.data['folds']} Folds " - f"cross validation and {len(self.data['seeds'])} random seeds. " - f"{self.data['date']} {self.data['time']}" - ) - - def get_file_name(self): - return self.excel_file_name - - def header(self): - merge_format = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 18, - "bg_color": self.color3, - } - ) - merge_format_subheader = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 16, - "bg_color": self.color1, - } - ) - merge_format_subheader_left = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "left", - "valign": "vcenter", - "font_size": 12, - "bg_color": self.color1, - } - ) - header_text = self.get_title() - self.sheet.merge_range(0, 0, 0, 11, header_text, merge_format) - self.sheet.merge_range( - 1, 0, 1, 11, f" {self.data['title']}", merge_format_subheader - ) - self.sheet.merge_range( - 2, - 0, - 3, - 0, - f" Score is {self.data['score_name']}", - merge_format_subheader, - ) - self.sheet.merge_range( - 2, - 1, - 3, - 3, - " Execution time", - merge_format_subheader, - ) - hours = self.data["duration"] / 3600 - self.sheet.merge_range( - 2, - 4, - 2, - 5, - f"{self.data['duration']:7,.2f} s", - merge_format_subheader, - ) - self.sheet.merge_range( - 3, - 4, - 3, - 5, - f" {hours:5.2f} h", - merge_format_subheader, - ) - self.sheet.merge_range( - 2, - 6, - 3, - 6, - " ", - merge_format_subheader, - ) - self.sheet.merge_range( - 2, - 7, - 3, - 7, - "Platform", - merge_format_subheader, - ) - self.sheet.merge_range( - 2, - 8, - 3, - 8, - f"{self.data['platform']}", - merge_format_subheader, - ) - self.sheet.merge_range( - 2, - 9, - 2, - 11, - f"Random seeds: {self.data['seeds']}", - merge_format_subheader_left, - ) - self.sheet.merge_range( - 3, - 9, - 3, - 10, - f"Stratified: {self.data['stratified']}", - merge_format_subheader_left, - ) - self.sheet.write( - 3, - 11, - f"Discretized: {self.data['discretized']}", - merge_format_subheader_left, - ) - header_cols = [ - ("Dataset", 30), - ("Samples", 10), - ("Features", 7), - ("Classes", 7), - (self.nodes_label, 7), - (self.leaves_label, 7), - (self.depth_label, 7), - ("Score", 12), - ("Score Std.", 12), - ("Time", 12), - ("Time Std.", 12), - ("Hyperparameters", 50), - ] - if self.compare: - header_cols.insert(8, ("Stat", 3)) - bold = self.book.add_format( - { - "bold": True, - "font_size": 14, - "bg_color": self.color3, - "border": 1, - } - ) - i = 0 - for item, length in header_cols: - self.sheet.write(5, i, item, bold) - self.sheet.set_column(i, i, length) - i += 1 - - def print_line(self, result): - size_n = 14 - decimal = self.book.add_format( - {"num_format": "0.000000", "font_size": size_n, "border": 1} - ) - integer = self.book.add_format( - {"num_format": "#,###", "font_size": size_n, "border": 1} - ) - normal = self.book.add_format({"font_size": size_n, "border": 1}) - col = 0 - if self.row % 2 == 0: - normal.set_bg_color(self.color1) - decimal.set_bg_color(self.color1) - integer.set_bg_color(self.color1) - else: - normal.set_bg_color(self.color2) - decimal.set_bg_color(self.color2) - integer.set_bg_color(self.color2) 
- self.sheet.write(self.row, col, result["dataset"], normal) - self.sheet.write(self.row, col + 1, result["samples"], integer) - self.sheet.write(self.row, col + 2, result["features"], integer) - self.sheet.write(self.row, col + 3, result["classes"], normal) - self.sheet.write(self.row, col + 4, result["nodes"], normal) - self.sheet.write(self.row, col + 5, result["leaves"], normal) - self.sheet.write(self.row, col + 6, result["depth"], normal) - self.sheet.write(self.row, col + 7, result["score"], decimal) - if self.compare: - status = self._compute_status(result["dataset"], result["score"]) - self.sheet.write(self.row, col + 8, status, normal) - col = 9 - else: - col = 8 - self.sheet.write(self.row, col, result["score_std"], decimal) - self.sheet.write(self.row, col + 1, result["time"], decimal) - self.sheet.write(self.row, col + 2, result["time_std"], decimal) - self.sheet.write( - self.row, col + 3, str(result["hyperparameters"]), normal - ) - self.col_hyperparams = col + 3 - self.max_hyper_width = max( - self.max_hyper_width, len(str(result["hyperparameters"])) - ) - self.row += 1 - - def footer(self, accuracy): - if self.compare: - self.row += 2 - bold = self.book.add_format({"bold": True, "font_size": 16}) - for key, total in self._compare_totals.items(): - self.sheet.write(self.row, 1, key, bold) - self.sheet.write(self.row, 2, total, bold) - self.sheet.write(self.row, 3, self._status_meaning(key), bold) - self.row += 1 - message = ( - f"** {self._get_message_best_accuracy()} " - f"{accuracy/self._get_best_accuracy():7.4f}" - ) - bold = self.book.add_format({"bold": True, "font_size": 14}) - # set width of the hyperparams column with the maximum width - self.sheet.set_column( - self.col_hyperparams, - self.col_hyperparams, - max(self.max_hyper_width + 1, 23), - ) - self.sheet.write(self.row + 1, 0, message, bold) - for c in range(self.row + 2): - self.sheet.set_row(c, 20) - self.sheet.set_row(0, 25) - self.sheet.freeze_panes(6, 1) - self.sheet.hide_gridlines(2) - if self.close: - self.book.close() - - -class ReportDatasets: - row = 6 - # alternate lines colors - color1 = "#DCE6F1" - color2 = "#FDE9D9" - color3 = "#B1A0C7" - - def __init__(self, excel=False, book=None): - self.excel = excel - self.env = EnvData().load() - self.close = False - self.output = True - self.header_text = f"Datasets used in benchmark ver. {__version__}" - if excel: - self.max_length = 0 - if book is None: - self.excel_file_name = os.path.join( - Folders.excel, Files.datasets_report_excel - ) - self.book = xlsxwriter.Workbook( - self.excel_file_name, {"nan_inf_to_errors": True} - ) - self.set_properties(self.get_title()) - self.close = True - else: - self.book = book - self.output = False - self.sheet = self.book.add_worksheet("Datasets") - - def set_properties(self, title): - self.book.set_properties( - { - "title": title, - "subject": "Machine learning results", - "author": "Ricardo Montañana Gómez", - "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta", - "company": "UCLM", - "comments": "Created with Python and XlsxWriter", - } - ) - - @staticmethod - def get_python_version(): - return "{}.{}".format(sys.version_info.major, sys.version_info.minor) - - def get_title(self): - return ( - f" Benchmark ver. {__version__} - " - f" Python ver. 
{self.get_python_version()}" - f" with {self.env['n_folds']} Folds cross validation " - f" Discretization: {self.env['discretize']} " - f"Stratification: {self.env['stratified']}" - ) - - def get_file_name(self): - return self.excel_file_name - - def header(self): - merge_format = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 18, - "bg_color": self.color3, - } - ) - merge_format_subheader = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 16, - "bg_color": self.color1, - } - ) - merge_format_subheader_right = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "right", - "valign": "vcenter", - "font_size": 16, - "bg_color": self.color1, - } - ) - merge_format_subheader_left = self.book.add_format( - { - "border": 1, - "bold": 1, - "align": "left", - "valign": "vcenter", - "font_size": 16, - "bg_color": self.color1, - } - ) - self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format) - self.sheet.merge_range( - 1, - 0, - 4, - 0, - f" Default score {self.env['score']}", - merge_format_subheader, - ) - self.sheet.merge_range( - 1, - 1, - 1, - 4, - "Cross validation", - merge_format_subheader_right, - ) - self.sheet.write( - 1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left - ) - self.sheet.merge_range( - 2, - 1, - 2, - 4, - "Stratified", - merge_format_subheader_right, - ) - self.sheet.write( - 2, - 5, - f"{'True' if self.env['stratified']=='1' else 'False'}", - merge_format_subheader_left, - ) - self.sheet.merge_range( - 3, - 1, - 3, - 4, - "Discretized", - merge_format_subheader_right, - ) - self.sheet.write( - 3, - 5, - f"{'True' if self.env['discretize']=='1' else 'False'}", - merge_format_subheader_left, - ) - self.sheet.merge_range( - 4, - 1, - 4, - 4, - "Seeds", - merge_format_subheader_right, - ) - self.sheet.write( - 4, 5, f"{self.env['seeds']}", merge_format_subheader_left - ) - self.update_max_length(len(self.env["seeds"]) + 1) - header_cols = [ - ("Dataset", 30), - ("Samples", 10), - ("Features", 10), - ("Continuous", 10), - ("Classes", 10), - ("Balance", 50), - ] - bold = self.book.add_format( - { - "bold": True, - "font_size": 14, - "bg_color": self.color3, - "border": 1, - } - ) - i = 0 - for item, length in header_cols: - self.sheet.write(5, i, item, bold) - self.sheet.set_column(i, i, length) - i += 1 - - def footer(self): - # set Balance column width to max length - self.sheet.set_column(5, 5, self.max_length) - self.sheet.freeze_panes(6, 1) - self.sheet.hide_gridlines(2) - if self.close: - self.book.close() - - def print_line(self, result): - size_n = 14 - integer = self.book.add_format( - {"num_format": "#,###", "font_size": size_n, "border": 1} - ) - normal = self.book.add_format({"font_size": size_n, "border": 1}) - col = 0 - if self.row % 2 == 0: - normal.set_bg_color(self.color1) - integer.set_bg_color(self.color1) - else: - normal.set_bg_color(self.color2) - integer.set_bg_color(self.color2) - self.sheet.write(self.row, col, result.dataset, normal) - self.sheet.write(self.row, col + 1, result.samples, integer) - self.sheet.write(self.row, col + 2, result.features, integer) - self.sheet.write(self.row, col + 3, result.cont_features, integer) - self.sheet.write(self.row, col + 4, result.classes, normal) - self.sheet.write(self.row, col + 5, result.balance, normal) - self.update_max_length(len(result.balance)) - self.row += 1 - - def update_max_length(self, value): - if value > self.max_length: - 
self.max_length = value - - def report(self): - data_sets = Datasets() - max_len = max( - [len(data_sets.get_attributes(data).balance) for data in data_sets] - ) - color_line = TextColor.LINE1 - if self.output: - print(color_line, end="") - print(self.header_text) - print("") - print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance") - print("=" * 30 + " ====== ===== ==== === " + "=" * max_len) - if self.excel: - self.header() - for dataset in data_sets: - attributes = data_sets.get_attributes(dataset) - if self.excel: - self.print_line(attributes) - color_line = ( - TextColor.LINE2 - if color_line == TextColor.LINE1 - else TextColor.LINE1 - ) - if self.output: - print(color_line, end="") - print( - f"{dataset:30s} {attributes.samples:6,d} " - f"{attributes.features:5,d} {attributes.cont_features:4,d}" - f" {attributes.classes:3d} {attributes.balance:40s}" - ) - if self.excel: - self.footer() - - -class SQL(BaseReport): - table_name = "results" - - def header(self): - file_name = self.file_name.replace(".json", ".sql") - self.file = open(file_name, "w") - - def print_line(self, result): - attributes = [ - "date", - "time", - "type", - "title", - "stratified", - "score_name", - "score", - "score_std", - "dataset", - "classifier", - "version", - "norm", - "stand", - "time_spent", - "time_spent_std", - "parameters", - "nodes", - "leaves", - "depth", - "platform", - "nfolds", - "seeds", - ] - command_insert = ( - f"replace into {self.table_name} (" - + ",".join(attributes) - + ") values(" - + ("'%s'," * len(attributes))[:-1] - + ");\n" - ) - values = ( - self.data["date"], - self.data["time"], - "crossval", - self.data["title"], - "1" if self.data["stratified"] else "0", - self.data["score_name"], - result["score"], - result["score_std"], - result["dataset"], - self.data["model"], - self.data["version"], - 0, - 1, - result["time"], - result["time_std"], - str(result["hyperparameters"]).replace("'", '"'), - result["nodes"], - result["leaves"], - result["depth"], - self.data["platform"], - self.data["folds"], - str(self.data["seeds"]), - ) - self.file.write(command_insert % values) - - def footer(self, accuracy): - self.file.close() - - -class Benchmark: - def __init__(self, score, visualize=True): - self._score = score - self._results = [] - self._models = [] - self._report = {} - self._datasets = set() - self.visualize = visualize - self.__compute_best_results_ever() - - def __compute_best_results_ever(self): - args = EnvData.load() - key = args["source_data"] - best = BestResultsEver() - _, self.best_score_value = best.get_name_value(key, self._score) - - def get_result_file_name(self): - return os.path.join(Folders.exreport, Files.exreport(self._score)) - - def compile_results(self): - summary = Summary() - summary.acquire(given_score=self._score) - self._models = summary.get_models() - if self._models == []: - raise ValueError(NO_RESULTS) - for model in self._models: - best = summary.best_result( - criterion="model", value=model, score=self._score - ) - file_name = os.path.join(Folders.results, best["file"]) - with open(file_name) as fi: - experiment = json.load(fi) - for result in experiment["results"]: - dataset = result["dataset"] - record = { - "model": model, - "dataset": dataset, - "score": result["score"], - "score_std": result["score_std"], - "file_name": file_name, - } - self._results.append(record) - if model not in self._report: - self._report[model] = {} - self._report[model][dataset] = record - self._datasets.add(dataset) - self._datasets = sorted(self._datasets) - - def 
save_results(self): - # build Files.exreport - result_file_name = self.get_result_file_name() - with open(result_file_name, "w") as f: - f.write( - f"classifier, dataset, {self._score.replace('-','')}, " - "stdev, file_name\n" - ) - for record in self._results: - f.write( - f"{record['model']}, {record['dataset']}, " - f"{record['score']}, {record['score_std']}, " - f"{record['file_name']}\n" - ) - - def exreport(self): - def end_message(message, file): - length = 100 - print("*" * length) - print(message) - print("*" * length) - with open(os.path.join(Folders.exreport, file)) as f: - data = f.read().splitlines() - for line in data: - print(line) - - # Remove previous results - if os.path.exists(Folders.report): - shutil.rmtree(Folders.report) - if os.path.exists(Files.exreport_pdf): - os.remove(Files.exreport_pdf) - # Compute Friedman & Holm Tests - fout = open( - os.path.join(Folders.exreport, Files.exreport_output(self._score)), - "w", - ) - ferr = open( - os.path.join(Folders.exreport, Files.exreport_err(self._score)), - "w", - ) - result = subprocess.run( - [ - "Rscript", - os.path.join(Folders.src(), Files.benchmark_r), - self._score.replace("-", ""), - os.path.join(Folders.exreport, f"exreport_{self._score}"), - "1" if self.visualize else "0", - ], - stdout=fout, - stderr=ferr, - ) - fout.close() - ferr.close() - if result.returncode != 0: - end_message( - "Error computing benchmark", Files.exreport_err(self._score) - ) - else: - end_message("Benchmark Ok", Files.exreport_output(self._score)) - Files.open(Files.exreport_pdf) - - def report(self, tex_output): - # Report Header - print(f"{'Dataset':30s} ", end="") - lines = "=" * 30 + " " - for model in self._models: - print(f"{model:^13s} ", end="") - lines += "=" * 13 + " " - print(f"\n{lines}") - if tex_output: - self.print_tex_header() - # Report Body - for num, dataset in enumerate(self._datasets): - print(f"{dataset:30s} ", end="") - scores = [] - for model in self._models: - result = self._report[model][dataset] - score = float(result["score"]) - score_std = float(result["score_std"]) - print(f"{score:.5f}±", end="") - print(f"{score_std:.3f} ", end="") - scores.append((score, score_std)) - print("") - if tex_output: - self.print_tex_line(num, dataset, scores) - if tex_output: - self.print_tex_footer() - # Summary of result files used - d_name = next(iter(self._datasets)) - print(f"\n{'Model':30s} {'File Name':75s} Score") - print("=" * 30 + " " + "=" * 75 + " ========") - for model in self._models: - file_name = self._report[model][d_name]["file_name"] - report = StubReport(file_name) - report.report() - print(f"{model:^30s} {file_name:75s} {report.score:8.5f}") - - def get_tex_file(self): - return os.path.join(Folders.exreport, Files.tex_output(self._score)) - - def print_tex_header(self): - with open(self.get_tex_file(), "w") as f: - header_data = "# & Dataset & \\#S & \\#F & \\#L & " + " & ".join( - self._models - ) - tabular = "{rlrrr" + "c" * len(self._models) + "}" - header = ( - "\\begin{sidewaystable}[ht]\n" - "\\centering\n" - "\\renewcommand{\\arraystretch}{1.2}\n" - "\\renewcommand{\\tabcolsep}{0.07cm}\n" - "\\caption{Accuracy results (mean ± std) for all the " - "algorithms and datasets}\n" - "\\label{table:datasets}\n" - "\\resizebox{0.95\\textwidth}{!}{\n" - "\\begin {tabular} {" + tabular + "}\\hline\n" - "\\" + header_data + "\\\\\n" - "\\hline\n" - ) - f.write(header) - - def print_tex_line(self, num, dataset, scores): - dt = Datasets() - with open(self.get_tex_file(), "a") as f: - X, y = dt.load(dataset) - 
samples, features = X.shape - n_classes = len(np.unique(y)) - dataset_name = dataset.replace("_", "\\_") - print_line = ( - f"{num + 1} & {dataset_name} & {samples} & {features} " - f"& {n_classes}" - ) - max_value = max(scores)[0] - for score, score_std in scores: - # Add score and score_std - value = f"{score:.4f}±{score_std:.3f}" - value_formated = ( - "\\bfseries " + value + " " - if score == max_value - else value - ) - print_line += " & " + value_formated - print_line += "\\\\" - f.write(f"{print_line}\n") - - def print_tex_footer(self): - with open(self.get_tex_file(), "a") as f: - f.write("\\hline\n\\end{tabular}}\n\\end{sidewaystable}\n") - - def get_excel_file_name(self): - return os.path.join(Folders.excel, Files.exreport_excel(self._score)) - - def excel(self): - book = xlsxwriter.Workbook( - self.get_excel_file_name(), {"nan_inf_to_errors": True} - ) - Excel.set_properties(book, "Experimentation summary") - sheet = book.add_worksheet("Benchmark") - normal = book.add_format({"font_size": 14, "border": 1}) - decimal = book.add_format( - {"num_format": "0.000000", "font_size": 14, "border": 1} - ) - decimal_total = book.add_format( - { - "num_format": "0.000000", - "font_size": 14, - "border": 1, - "bold": True, - "bg_color": Excel.color3, - } - ) - two_decimal_total = book.add_format( - { - "num_format": "0.00", - "font_size": 14, - "border": 1, - "bold": True, - "bg_color": Excel.color3, - } - ) - merge_format_header = book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 14, - "bg_color": Excel.color1, - } - ) - merge_format = book.add_format( - { - "border": 1, - "bold": 1, - "align": "center", - "valign": "vcenter", - "font_size": 14, - "bg_color": Excel.color3, - } - ) - merge_format_normal = book.add_format( - { - "border": 1, - "valign": "vcenter", - "font_size": 14, - } - ) - row = row_init = 4 - - def header(): - nonlocal row - sheet.merge_range( - 0, 0, 1, 0, "Benchmark of Models", merge_format_header - ) - sheet.merge_range( - 0, 1, 1, 2, f"Score is {self._score}", merge_format_header - ) - sheet.set_row(1, 20) - # Set columns width - sheet.set_column(0, 0, 40) - for column in range(2 * len(self._results)): - sheet.set_column(column + 1, column + 1, 15) - # Set report header - # Merge 2 rows - sheet.merge_range(row, 0, row + 1, 0, "Dataset", merge_format) - column = 1 - for model in self._models: - # Merge 3 columns - sheet.merge_range( - row, column, row, column + 2, model, merge_format - ) - column += 3 - row += 1 - column = 1 - for _ in range(len(self._models)): - sheet.write(row, column, "Score", merge_format) - sheet.write(row, column + 1, "Stdev", merge_format) - sheet.write(row, column + 2, "Rank", merge_format) - column += 3 - - def body(): - nonlocal row - for dataset in self._datasets: - row += 1 - normal = book.add_format({"font_size": 14, "border": 1}) - decimal = book.add_format( - { - "num_format": "0.000000", - "font_size": 14, - "border": 1, - } - ) - if row % 2 == 0: - normal.set_bg_color(Excel.color1) - decimal.set_bg_color(Excel.color1) - else: - normal.set_bg_color(Excel.color2) - decimal.set_bg_color(Excel.color2) - sheet.write(row, 0, f"{dataset:30s}", normal) - column = 1 - range_cells = "" - for col in range(0, len(self._models) * 3, 3): - range_cells += chr(ord("B") + col) + str(row + 1) + "," - range_cells = range_cells[:-1] - for model in self._models: - sheet.write( - row, - column, - float(self._report[model][dataset]["score"]), - decimal, - ) - column += 1 - sheet.write( - row, - 
column, - float(self._report[model][dataset]["score_std"]), - decimal, - ) - column += 1 - cell_target = chr(ord("B") + column - 3) + str(row + 1) - sheet.write_formula( - row, - column, - f"=rank({cell_target},({range_cells}))", - normal, - ) - column += 1 - - def footer(): - nonlocal row - for c in range(row_init, row + 2): - sheet.set_row(c, 20) - # Write totals - row += 1 - sheet.write(row, 0, "Total", merge_format) - for col in range(0, len(self._models) * 3, 3): - range_metric = ( - f"{chr(ord('B') + col )}7:{chr(ord('B') + col )}{row}" - ) - sheet.write_formula( - row, - col + 1, - f"=sum({range_metric})/{self.best_score_value}", - decimal_total, - ) - range_rank = ( - f"{chr(ord('B') + col + 2)}7:" - f"{chr(ord('B') + col + 2)}{row}" - ) - sheet.write_formula( - row, - col + 3, - f"=average({range_rank})", - two_decimal_total, - ) - row += 1 - - def models_files(): - nonlocal row - row += 2 - # Set report header - # Merge 2 rows - sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format) - sheet.merge_range(row, 1, row + 1, 5, "File", merge_format) - sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format) - row += 1 - d_name = next(iter(self._datasets)) - for model in self._models: - file_name = self._report[model][d_name]["file_name"] - report = StubReport(file_name) - report.report() - row += 1 - sheet.write( - row, - 0, - model, - normal, - ) - sheet.merge_range( - row, 1, row, 5, file_name, merge_format_normal - ) - sheet.write( - row, - 6, - report.score, - decimal, - ) - k = Excel(file_name=file_name, book=book) - k.report() - sheet.freeze_panes(6, 1) - sheet.hide_gridlines(2) - - def add_datasets_sheet(): - # Add datasets sheet - re = ReportDatasets(excel=True, book=book) - re.report() - - def exreport_output(): - file_name = os.path.join( - Folders.exreport, Files.exreport_output(self._score) - ) - sheet = book.add_worksheet("Exreport") - normal = book.add_format( - { - "font_size": 14, - "border": 1, - "font_color": "blue", - "font_name": "Courier", - "bold": True, - } - ) - with open(file_name) as f: - lines = f.read().splitlines() - row = 0 - for line in lines: - sheet.write(row, 0, line, normal) - row += 1 - - header() - body() - footer() - models_files() - exreport_output() - add_datasets_sheet() - book.close() - - -class StubReport(BaseReport): - def __init__(self, file_name): - super().__init__(file_name=file_name, best_file=False) - - def print_line(self, line) -> None: - pass - - def header(self) -> None: - self.title = self.data["title"] - self.duration = self.data["duration"] - - def footer(self, accuracy: float) -> None: - self.accuracy = accuracy - self.score = accuracy / self._get_best_accuracy() - - class Summary: def __init__(self, hidden=False, compare=False) -> None: self.results = Files().get_all_results(hidden=hidden) @@ -1587,8 +404,11 @@ class Summary: file_name_result = os.path.join( path, self.data_filtered[num]["file"] ) - rep = Report(file_name_result, compare=self.compare) - rep.report() + try: + rep = Report(file_name_result, compare=self.compare) + rep.report() + except ValueError as e: + print(e) case _: print("Invalid option. 
Try again!") diff --git a/benchmark/ResultsBase.py b/benchmark/ResultsBase.py new file mode 100644 index 0000000..0a77588 --- /dev/null +++ b/benchmark/ResultsBase.py @@ -0,0 +1,163 @@ +import abc +import json +import os + + +from .Arguments import ALL_METRICS, EnvData +from .Datasets import Datasets +from .Experiments import BestResults +from .Utils import Folders, Symbols + + +def get_input(message="", is_test=False): + return "test" if is_test else input(message) + + +class BestResultsEver: + def __init__(self): + self.data = {} + for i in ["Tanveer", "Surcov", "Arff"]: + self.data[i] = {} + for metric in ALL_METRICS: + self.data[i][metric.replace("-", "_")] = ["self", 1.0] + self.data[i][metric] = ["self", 1.0] + self.data["Tanveer"]["accuracy"] = [ + "STree_default (liblinear-ovr)", + 40.282203, + ] + self.data["Arff"]["accuracy"] = [ + "STree_default (linear-ovo)", + 22.109799, + ] + + def get_name_value(self, key, score): + return self.data[key][score] + + +class BaseReport(abc.ABC): + def __init__(self, file_name, best_file=False): + self.file_name = file_name + if not os.path.isfile(file_name): + if not os.path.isfile(os.path.join(Folders.results, file_name)): + raise FileNotFoundError(f"{file_name} does not exists!") + else: + self.file_name = os.path.join(Folders.results, file_name) + with open(self.file_name) as f: + self.data = json.load(f) + self.best_acc_file = best_file + if best_file: + self.lines = self.data + else: + self.lines = self.data["results"] + self.score_name = self.data["score_name"] + self.__load_env_data() + self.__compute_best_results_ever() + + def __load_env_data(self): + # Set the labels for nodes, leaves, depth + env_data = EnvData.load() + self.nodes_label = env_data["nodes"] + self.leaves_label = env_data["leaves"] + self.depth_label = env_data["depth"] + self.key = env_data["source_data"] + self.margin = float(env_data["margin"]) + + def __compute_best_results_ever(self): + best = BestResultsEver() + self.best_score_name, self.best_score_value = best.get_name_value( + self.key, self.score_name + ) + + def _get_accuracy(self, item): + return self.data[item][0] if self.best_acc_file else item["score"] + + def report(self): + self.header() + accuracy_total = 0.0 + for result in self.lines: + self.print_line(result) + accuracy_total += self._get_accuracy(result) + self.footer(accuracy_total) + + def _load_best_results(self, score, model): + best = BestResults(score, model, Datasets()) + self.best_results = best.load({}) + + def _compute_status(self, dataset, accuracy: float): + status = " " + if self.compare: + # Compare with best results + best = self.best_results[dataset][0] + if accuracy == best: + status = Symbols.equal_best + elif accuracy > best: + status = Symbols.better_best + else: + # compare with dataset label distribution only if its a binary one + # down_arrow if accuracy is less than the ZeroR + # black_star if accuracy is greater than the ZeroR + margin% + if self.score_name == "accuracy": + dt = Datasets() + attr = dt.get_attributes(dataset) + if attr.classes == 2: + max_category = max(attr.distribution.values()) + max_value = max_category * (1 + self.margin) + if max_value > 1: + max_value = 0.9995 + status = ( + Symbols.cross + if accuracy <= max_value + else Symbols.upward_arrow + if accuracy > max_value + else " " + ) + if status != " ": + if status not in self._compare_totals: + self._compare_totals[status] = 1 + else: + self._compare_totals[status] += 1 + return status + + def _status_meaning(self, status): + meaning = { + 
Symbols.equal_best: "Equal to best", + Symbols.better_best: "Better than best", + Symbols.cross: "Less than or equal to ZeroR", + Symbols.upward_arrow: f"Better than ZeroR + " + f"{self.margin*100:3.1f}%", + } + return meaning[status] + + def _get_best_accuracy(self): + return self.best_score_value + + def _get_message_best_accuracy(self): + return f"{self.score_name} compared to {self.best_score_name} .:" + + @abc.abstractmethod + def header(self) -> None: + pass + + @abc.abstractmethod + def print_line(self, result) -> None: + pass + + @abc.abstractmethod + def footer(self, accuracy: float) -> None: + pass + + +class StubReport(BaseReport): + def __init__(self, file_name): + super().__init__(file_name=file_name, best_file=False) + + def print_line(self, line) -> None: + pass + + def header(self) -> None: + self.title = self.data["title"] + self.duration = self.data["duration"] + + def footer(self, accuracy: float) -> None: + self.accuracy = accuracy + self.score = accuracy / self._get_best_accuracy() diff --git a/benchmark/ResultsFiles.py b/benchmark/ResultsFiles.py new file mode 100644 index 0000000..d75e140 --- /dev/null +++ b/benchmark/ResultsFiles.py @@ -0,0 +1,1044 @@ +import json +import os +import shutil +import subprocess +import sys +from pathlib import Path + +import numpy as np +import xlsxwriter +from xlsxwriter.exceptions import DuplicateWorksheetName + +from ._version import __version__ +from .Arguments import EnvData +from .Datasets import Datasets +from .ResultsBase import BaseReport +from .Utils import NO_RESULTS, Files, Folders, TextColor + + +class Excel(BaseReport): + row = 6 + # alternate lines colors + color1 = "#DCE6F1" + color2 = "#FDE9D9" + color3 = "#B1A0C7" + + def __init__(self, file_name, compare=False, book=None): + super().__init__(file_name) + self.compare = compare + if self.compare: + self._load_best_results( + self.data["score_name"], self.data["model"] + ) + self._compare_totals = {} + if book is None: + self.excel_file_name = Path(self.file_name).name.replace( + Files.report_ext, ".xlsx" + ) + self.book = xlsxwriter.Workbook( + os.path.join(Folders.excel, self.excel_file_name), + {"nan_inf_to_errors": True}, + ) + self.set_book_properties() + self.close = True + else: + self.book = book + self.close = False + suffix = "" + num = 1 + while True: + try: + self.sheet = self.book.add_worksheet( + self.data["model"] + suffix + ) + break + except DuplicateWorksheetName: + num += 1 + suffix = str(num) + self.max_hyper_width = 0 + self.col_hyperparams = 0 + + @staticmethod + def set_properties(book, title): + book.set_properties( + { + "title": title, + "subject": "Machine learning results", + "author": "Ricardo Montañana Gómez", + "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta", + "company": "UCLM", + "comments": "Created with Python and XlsxWriter", + } + ) + + def set_book_properties(self): + self.set_properties(self.book, self.get_title()) + + def get_title(self): + return ( + f" {self.data['model']} ver. {self.data['version']}" + f" {self.data['language']} ver. {self.data['language_version']}" + f" with {self.data['folds']} Folds " + f"cross validation and {len(self.data['seeds'])} random seeds. 
" + f"{self.data['date']} {self.data['time']}" + ) + + def get_file_name(self): + return self.excel_file_name + + def header(self): + merge_format = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 18, + "bg_color": self.color3, + } + ) + merge_format_subheader = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_left = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "left", + "valign": "vcenter", + "font_size": 12, + "bg_color": self.color1, + } + ) + header_text = self.get_title() + self.sheet.merge_range(0, 0, 0, 12, header_text, merge_format) + self.sheet.merge_range( + 1, 0, 1, 12, f" {self.data['title']}", merge_format_subheader + ) + self.sheet.merge_range( + 2, + 0, + 3, + 0, + f" Score is {self.data['score_name']}", + merge_format_subheader, + ) + self.sheet.merge_range( + 2, + 1, + 3, + 3, + " Execution time", + merge_format_subheader, + ) + hours = self.data["duration"] / 3600 + self.sheet.merge_range( + 2, + 4, + 2, + 5, + f"{self.data['duration']:7,.2f} s", + merge_format_subheader, + ) + self.sheet.merge_range( + 3, + 4, + 3, + 5, + f" {hours:5.2f} h", + merge_format_subheader, + ) + self.sheet.merge_range( + 2, + 6, + 3, + 7, + "Platform", + merge_format_subheader, + ) + self.sheet.merge_range( + 2, + 8, + 3, + 9, + f"{self.data['platform']}", + merge_format_subheader, + ) + self.sheet.merge_range( + 2, + 10, + 2, + 12, + f"Random seeds: {self.data['seeds']}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 3, + 10, + 3, + 11, + f"Stratified: {self.data['stratified']}", + merge_format_subheader_left, + ) + self.sheet.write( + 3, + 12, + f"Discretized: {self.data['discretized']}", + merge_format_subheader_left, + ) + header_cols = [ + ("Dataset", 30), + ("Samples", 10), + ("Features", 7), + ("Classes", 7), + (self.nodes_label, 7), + (self.leaves_label, 7), + (self.depth_label, 7), + ("Score", 12), + ("Score Std.", 12), + ("Time", 12), + ("Time Std.", 12), + ("Hyperparameters", 50), + ] + header_cols.insert(8, ("Stat", 3)) + bold = self.book.add_format( + { + "bold": True, + "font_size": 14, + "bg_color": self.color3, + "border": 1, + } + ) + i = 0 + for item, length in header_cols: + self.sheet.write(5, i, item, bold) + self.sheet.set_column(i, i, length) + i += 1 + + def print_line(self, result): + size_n = 14 + decimal = self.book.add_format( + {"num_format": "0.000000", "font_size": size_n, "border": 1} + ) + integer = self.book.add_format( + {"num_format": "#,###", "font_size": size_n, "border": 1} + ) + normal = self.book.add_format({"font_size": size_n, "border": 1}) + col = 0 + if self.row % 2 == 0: + normal.set_bg_color(self.color1) + decimal.set_bg_color(self.color1) + integer.set_bg_color(self.color1) + else: + normal.set_bg_color(self.color2) + decimal.set_bg_color(self.color2) + integer.set_bg_color(self.color2) + self.sheet.write(self.row, col, result["dataset"], normal) + self.sheet.write(self.row, col + 1, result["samples"], integer) + self.sheet.write(self.row, col + 2, result["features"], integer) + self.sheet.write(self.row, col + 3, result["classes"], normal) + self.sheet.write(self.row, col + 4, result["nodes"], normal) + self.sheet.write(self.row, col + 5, result["leaves"], normal) + self.sheet.write(self.row, col + 6, result["depth"], normal) + self.sheet.write(self.row, col + 7, result["score"], decimal) + status = 
self._compute_status(result["dataset"], result["score"]) + self.sheet.write(self.row, col + 8, status, normal) + col = 9 + self.sheet.write(self.row, col, result["score_std"], decimal) + self.sheet.write(self.row, col + 1, result["time"], decimal) + self.sheet.write(self.row, col + 2, result["time_std"], decimal) + self.sheet.write( + self.row, col + 3, str(result["hyperparameters"]), normal + ) + self.col_hyperparams = col + 3 + self.max_hyper_width = max( + self.max_hyper_width, len(str(result["hyperparameters"])) + ) + self.row += 1 + + def footer(self, accuracy): + self.row += 2 + bold = self.book.add_format({"bold": True, "font_size": 16}) + for key, total in self._compare_totals.items(): + self.sheet.write(self.row, 1, key, bold) + self.sheet.write(self.row, 2, total, bold) + self.sheet.write(self.row, 3, self._status_meaning(key), bold) + self.row += 1 + message = ( + f"** {self._get_message_best_accuracy()} " + f"{accuracy/self._get_best_accuracy():7.4f}" + ) + bold = self.book.add_format({"bold": True, "font_size": 14}) + # set width of the hyperparams column with the maximum width + self.sheet.set_column( + self.col_hyperparams, + self.col_hyperparams, + max(self.max_hyper_width + 1, 23), + ) + self.sheet.write(self.row + 1, 0, message, bold) + for c in range(self.row + 2): + self.sheet.set_row(c, 20) + self.sheet.set_row(0, 25) + self.sheet.freeze_panes(6, 1) + self.sheet.hide_gridlines(2) + if self.close: + self.book.close() + + +class ReportDatasets: + row = 6 + # alternate lines colors + color1 = "#DCE6F1" + color2 = "#FDE9D9" + color3 = "#B1A0C7" + + def __init__(self, excel=False, book=None): + self.excel = excel + self.env = EnvData().load() + self.close = False + self.output = True + self.header_text = f"Datasets used in benchmark ver. {__version__}" + if excel: + self.max_length = 0 + if book is None: + self.excel_file_name = os.path.join( + Folders.excel, Files.datasets_report_excel + ) + self.book = xlsxwriter.Workbook( + self.excel_file_name, {"nan_inf_to_errors": True} + ) + self.set_properties(self.get_title()) + self.close = True + else: + self.book = book + self.output = False + self.sheet = self.book.add_worksheet("Datasets") + + def set_properties(self, title): + self.book.set_properties( + { + "title": title, + "subject": "Machine learning results", + "author": "Ricardo Montañana Gómez", + "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta", + "company": "UCLM", + "comments": "Created with Python and XlsxWriter", + } + ) + + @staticmethod + def get_python_version(): + return "{}.{}".format(sys.version_info.major, sys.version_info.minor) + + def get_title(self): + return ( + f" Benchmark ver. {__version__} - " + f" Python ver. 
{self.get_python_version()}" + f" with {self.env['n_folds']} Folds cross validation " + f" Discretization: {self.env['discretize']} " + f"Stratification: {self.env['stratified']}" + ) + + def get_file_name(self): + return self.excel_file_name + + def header(self): + merge_format = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 18, + "bg_color": self.color3, + } + ) + merge_format_subheader = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_right = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "right", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_left = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "left", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format) + self.sheet.merge_range( + 1, + 0, + 4, + 0, + f" Default score {self.env['score']}", + merge_format_subheader, + ) + self.sheet.merge_range( + 1, + 1, + 1, + 4, + "Cross validation", + merge_format_subheader_right, + ) + self.sheet.write( + 1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left + ) + self.sheet.merge_range( + 2, + 1, + 2, + 4, + "Stratified", + merge_format_subheader_right, + ) + self.sheet.write( + 2, + 5, + f"{'True' if self.env['stratified']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 3, + 1, + 3, + 4, + "Discretized", + merge_format_subheader_right, + ) + self.sheet.write( + 3, + 5, + f"{'True' if self.env['discretize']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 4, + 1, + 4, + 4, + "Seeds", + merge_format_subheader_right, + ) + self.sheet.write( + 4, 5, f"{self.env['seeds']}", merge_format_subheader_left + ) + self.update_max_length(len(self.env["seeds"]) + 1) + header_cols = [ + ("Dataset", 30), + ("Samples", 10), + ("Features", 10), + ("Continuous", 10), + ("Classes", 10), + ("Balance", 50), + ] + bold = self.book.add_format( + { + "bold": True, + "font_size": 14, + "bg_color": self.color3, + "border": 1, + } + ) + i = 0 + for item, length in header_cols: + self.sheet.write(5, i, item, bold) + self.sheet.set_column(i, i, length) + i += 1 + + def footer(self): + # set Balance column width to max length + self.sheet.set_column(5, 5, self.max_length) + self.sheet.freeze_panes(6, 1) + self.sheet.hide_gridlines(2) + if self.close: + self.book.close() + + def print_line(self, result): + size_n = 14 + integer = self.book.add_format( + {"num_format": "#,###", "font_size": size_n, "border": 1} + ) + normal = self.book.add_format({"font_size": size_n, "border": 1}) + col = 0 + if self.row % 2 == 0: + normal.set_bg_color(self.color1) + integer.set_bg_color(self.color1) + else: + normal.set_bg_color(self.color2) + integer.set_bg_color(self.color2) + self.sheet.write(self.row, col, result.dataset, normal) + self.sheet.write(self.row, col + 1, result.samples, integer) + self.sheet.write(self.row, col + 2, result.features, integer) + self.sheet.write(self.row, col + 3, result.cont_features, integer) + self.sheet.write(self.row, col + 4, result.classes, normal) + self.sheet.write(self.row, col + 5, result.balance, normal) + self.update_max_length(len(result.balance)) + self.row += 1 + + def update_max_length(self, value): + if value > self.max_length: + 
self.max_length = value + + def report(self): + data_sets = Datasets() + max_len = max( + [len(data_sets.get_attributes(data).balance) for data in data_sets] + ) + color_line = TextColor.LINE1 + if self.output: + print(color_line, end="") + print(self.header_text) + print("") + print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance") + print("=" * 30 + " ====== ===== ==== === " + "=" * max_len) + if self.excel: + self.header() + for dataset in data_sets: + attributes = data_sets.get_attributes(dataset) + if self.excel: + self.print_line(attributes) + color_line = ( + TextColor.LINE2 + if color_line == TextColor.LINE1 + else TextColor.LINE1 + ) + if self.output: + print(color_line, end="") + print( + f"{dataset:30s} {attributes.samples:6,d} " + f"{attributes.features:5,d} {attributes.cont_features:4,d}" + f" {attributes.classes:3d} {attributes.balance:40s}" + ) + if self.excel: + self.footer() + + +class SQLFile(BaseReport): + table_name = "results" + + def header(self): + file_name = Path(self.file_name).name.replace(Files.report_ext, ".sql") + self.file = open(os.path.join(Folders.sql, file_name), "w") + + def print_line(self, result): + attributes = [ + "date", + "time", + "type", + "title", + "stratified", + "score_name", + "score", + "score_std", + "dataset", + "classifier", + "version", + "norm", + "stand", + "time_spent", + "time_spent_std", + "parameters", + "nodes", + "leaves", + "depth", + "platform", + "nfolds", + "seeds", + ] + command_insert = ( + f"replace into {self.table_name} (" + + ",".join(attributes) + + ") values(" + + ("'%s'," * len(attributes))[:-1] + + ");\n" + ) + values = ( + self.data["date"], + self.data["time"], + "crossval", + self.data["title"], + "1" if self.data["stratified"] else "0", + self.data["score_name"], + result["score"], + result["score_std"], + result["dataset"], + self.data["model"], + self.data["version"], + 0, + 1, + result["time"], + result["time_std"], + str(result["hyperparameters"]).replace("'", '"'), + result["nodes"], + result["leaves"], + result["depth"], + self.data["platform"], + self.data["folds"], + str(self.data["seeds"]), + ) + self.file.write(command_insert % values) + + def footer(self, accuracy): + self.file.close() + + +class Benchmark: + def __init__(self, score, visualize=True): + self._score = score + self._results = [] + self._models = [] + self._report = {} + self._datasets = set() + self.visualize = visualize + self.__compute_best_results_ever() + + def __compute_best_results_ever(self): + args = EnvData.load() + key = args["source_data"] + best = BestResultsEver() + _, self.best_score_value = best.get_name_value(key, self._score) + + def get_result_file_name(self): + return os.path.join(Folders.exreport, Files.exreport(self._score)) + + def compile_results(self): + summary = Summary() + summary.acquire(given_score=self._score) + self._models = summary.get_models() + if self._models == []: + raise ValueError(NO_RESULTS) + for model in self._models: + best = summary.best_result( + criterion="model", value=model, score=self._score + ) + file_name = os.path.join(Folders.results, best["file"]) + with open(file_name) as fi: + experiment = json.load(fi) + for result in experiment["results"]: + dataset = result["dataset"] + record = { + "model": model, + "dataset": dataset, + "score": result["score"], + "score_std": result["score_std"], + "file_name": file_name, + } + self._results.append(record) + if model not in self._report: + self._report[model] = {} + self._report[model][dataset] = record + self._datasets.add(dataset) + 
+        self._datasets = sorted(self._datasets)
+
+    def save_results(self):
+        # build Files.exreport
+        result_file_name = self.get_result_file_name()
+        with open(result_file_name, "w") as f:
+            f.write(
+                f"classifier, dataset, {self._score.replace('-','')}, "
+                "stdev, file_name\n"
+            )
+            for record in self._results:
+                f.write(
+                    f"{record['model']}, {record['dataset']}, "
+                    f"{record['score']}, {record['score_std']}, "
+                    f"{record['file_name']}\n"
+                )
+
+    def exreport(self):
+        def end_message(message, file):
+            length = 100
+            print("*" * length)
+            print(message)
+            print("*" * length)
+            with open(os.path.join(Folders.exreport, file)) as f:
+                data = f.read().splitlines()
+            for line in data:
+                print(line)
+
+        # Remove previous results
+        if os.path.exists(Folders.report):
+            shutil.rmtree(Folders.report)
+        if os.path.exists(Files.exreport_pdf):
+            os.remove(Files.exreport_pdf)
+        # Compute Friedman & Holm Tests
+        fout = open(
+            os.path.join(Folders.exreport, Files.exreport_output(self._score)),
+            "w",
+        )
+        ferr = open(
+            os.path.join(Folders.exreport, Files.exreport_err(self._score)),
+            "w",
+        )
+        result = subprocess.run(
+            [
+                "Rscript",
+                os.path.join(Folders.src(), Files.benchmark_r),
+                self._score.replace("-", ""),
+                os.path.join(Folders.exreport, f"exreport_{self._score}"),
+                "1" if self.visualize else "0",
+            ],
+            stdout=fout,
+            stderr=ferr,
+        )
+        fout.close()
+        ferr.close()
+        if result.returncode != 0:
+            end_message(
+                "Error computing benchmark", Files.exreport_err(self._score)
+            )
+        else:
+            end_message("Benchmark Ok", Files.exreport_output(self._score))
+            Files.open(Files.exreport_pdf)
+
+    def report(self, tex_output):
+        # Report Header
+        print(f"{'Dataset':30s} ", end="")
+        lines = "=" * 30 + " "
+        for model in self._models:
+            print(f"{model:^13s} ", end="")
+            lines += "=" * 13 + " "
+        print(f"\n{lines}")
+        if tex_output:
+            self.print_tex_header()
+        # Report Body
+        for num, dataset in enumerate(self._datasets):
+            print(f"{dataset:30s} ", end="")
+            scores = []
+            for model in self._models:
+                result = self._report[model][dataset]
+                score = float(result["score"])
+                score_std = float(result["score_std"])
+                print(f"{score:.5f}±", end="")
+                print(f"{score_std:.3f} ", end="")
+                scores.append((score, score_std))
+            print("")
+            if tex_output:
+                self.print_tex_line(num, dataset, scores)
+        if tex_output:
+            self.print_tex_footer()
+        # Summary of result files used
+        d_name = next(iter(self._datasets))
+        print(f"\n{'Model':30s} {'File Name':75s} Score")
+        print("=" * 30 + " " + "=" * 75 + " ========")
+        for model in self._models:
+            file_name = self._report[model][d_name]["file_name"]
+            report = StubReport(file_name)
+            report.report()
+            print(f"{model:^30s} {file_name:75s} {report.score:8.5f}")
+
+    def get_tex_file(self):
+        return os.path.join(Folders.exreport, Files.tex_output(self._score))
+
+    def print_tex_header(self):
+        with open(self.get_tex_file(), "w") as f:
+            header_data = "# & Dataset & \\#S & \\#F & \\#L & " + " & ".join(
+                self._models
+            )
+            tabular = "{rlrrr" + "c" * len(self._models) + "}"
+            header = (
+                "\\begin{sidewaystable}[ht]\n"
+                "\\centering\n"
+                "\\renewcommand{\\arraystretch}{1.2}\n"
+                "\\renewcommand{\\tabcolsep}{0.07cm}\n"
+                "\\caption{Accuracy results (mean ± std) for all the "
+                "algorithms and datasets}\n"
+                "\\label{table:datasets}\n"
+                "\\resizebox{0.95\\textwidth}{!}{\n"
+                "\\begin {tabular} {" + tabular + "}\\hline\n"
+                "\\" + header_data + "\\\\\n"
+                "\\hline\n"
+            )
+            f.write(header)
+
+    def print_tex_line(self, num, dataset, scores):
+        dt = Datasets()
+        with open(self.get_tex_file(), "a") as f:
+            X, y = dt.load(dataset)
+            samples, features = X.shape
+            n_classes = len(np.unique(y))
+            dataset_name = dataset.replace("_", "\\_")
+            print_line = (
+                f"{num + 1} & {dataset_name} & {samples} & {features} "
+                f"& {n_classes}"
+            )
+            max_value = max(scores)[0]
+            for score, score_std in scores:
+                # Add score and score_std
+                value = f"{score:.4f}±{score_std:.3f}"
+                value_formated = (
+                    "\\bfseries " + value + " "
+                    if score == max_value
+                    else value
+                )
+                print_line += " & " + value_formated
+            print_line += "\\\\"
+            f.write(f"{print_line}\n")
+
+    def print_tex_footer(self):
+        with open(self.get_tex_file(), "a") as f:
+            f.write("\\hline\n\\end{tabular}}\n\\end{sidewaystable}\n")
+
+    def get_excel_file_name(self):
+        return os.path.join(Folders.excel, Files.exreport_excel(self._score))
+
+    def excel(self):
+        book = xlsxwriter.Workbook(
+            self.get_excel_file_name(), {"nan_inf_to_errors": True}
+        )
+        Excel.set_properties(book, "Experimentation summary")
+        sheet = book.add_worksheet("Benchmark")
+        normal = book.add_format({"font_size": 14, "border": 1})
+        decimal = book.add_format(
+            {"num_format": "0.000000", "font_size": 14, "border": 1}
+        )
+        decimal_total = book.add_format(
+            {
+                "num_format": "0.000000",
+                "font_size": 14,
+                "border": 1,
+                "bold": True,
+                "bg_color": Excel.color3,
+            }
+        )
+        two_decimal_total = book.add_format(
+            {
+                "num_format": "0.00",
+                "font_size": 14,
+                "border": 1,
+                "bold": True,
+                "bg_color": Excel.color3,
+            }
+        )
+        merge_format_header = book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "center",
+                "valign": "vcenter",
+                "font_size": 14,
+                "bg_color": Excel.color1,
+            }
+        )
+        merge_format = book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "center",
+                "valign": "vcenter",
+                "font_size": 14,
+                "bg_color": Excel.color3,
+            }
+        )
+        merge_format_normal = book.add_format(
+            {
+                "border": 1,
+                "valign": "vcenter",
+                "font_size": 14,
+            }
+        )
+        row = row_init = 4
+
+        def header():
+            nonlocal row
+            sheet.merge_range(
+                0, 0, 1, 0, "Benchmark of Models", merge_format_header
+            )
+            sheet.merge_range(
+                0, 1, 1, 2, f"Score is {self._score}", merge_format_header
+            )
+            sheet.set_row(1, 20)
+            # Set columns width
+            sheet.set_column(0, 0, 40)
+            for column in range(2 * len(self._results)):
+                sheet.set_column(column + 1, column + 1, 15)
+            # Set report header
+            # Merge 2 rows
+            sheet.merge_range(row, 0, row + 1, 0, "Dataset", merge_format)
+            column = 1
+            for model in self._models:
+                # Merge 3 columns
+                sheet.merge_range(
+                    row, column, row, column + 2, model, merge_format
+                )
+                column += 3
+            row += 1
+            column = 1
+            for _ in range(len(self._models)):
+                sheet.write(row, column, "Score", merge_format)
+                sheet.write(row, column + 1, "Stdev", merge_format)
+                sheet.write(row, column + 2, "Rank", merge_format)
+                column += 3
+
+        def body():
+            nonlocal row
+            for dataset in self._datasets:
+                row += 1
+                normal = book.add_format({"font_size": 14, "border": 1})
+                decimal = book.add_format(
+                    {
+                        "num_format": "0.000000",
+                        "font_size": 14,
+                        "border": 1,
+                    }
+                )
+                if row % 2 == 0:
+                    normal.set_bg_color(Excel.color1)
+                    decimal.set_bg_color(Excel.color1)
+                else:
+                    normal.set_bg_color(Excel.color2)
+                    decimal.set_bg_color(Excel.color2)
+                sheet.write(row, 0, f"{dataset:30s}", normal)
+                column = 1
+                range_cells = ""
+                for col in range(0, len(self._models) * 3, 3):
+                    range_cells += chr(ord("B") + col) + str(row + 1) + ","
+                range_cells = range_cells[:-1]
+                for model in self._models:
+                    sheet.write(
+                        row,
+                        column,
+                        float(self._report[model][dataset]["score"]),
+                        decimal,
+                    )
+                    column += 1
+                    sheet.write(
+                        row,
+                        column,
+                        float(self._report[model][dataset]["score_std"]),
+                        decimal,
+                    )
+                    column += 1
+                    cell_target = chr(ord("B") + column - 3) + str(row + 1)
+                    sheet.write_formula(
+                        row,
+                        column,
+                        f"=rank({cell_target},({range_cells}))",
+                        normal,
+                    )
+                    column += 1
+
+        def footer():
+            nonlocal row
+            for c in range(row_init, row + 2):
+                sheet.set_row(c, 20)
+            # Write totals
+            row += 1
+            sheet.write(row, 0, "Total", merge_format)
+            for col in range(0, len(self._models) * 3, 3):
+                range_metric = (
+                    f"{chr(ord('B') + col )}7:{chr(ord('B') + col )}{row}"
+                )
+                sheet.write_formula(
+                    row,
+                    col + 1,
+                    f"=sum({range_metric})/{self.best_score_value}",
+                    decimal_total,
+                )
+                range_rank = (
+                    f"{chr(ord('B') + col + 2)}7:"
+                    f"{chr(ord('B') + col + 2)}{row}"
+                )
+                sheet.write_formula(
+                    row,
+                    col + 3,
+                    f"=average({range_rank})",
+                    two_decimal_total,
+                )
+            row += 1
+
+        def models_files():
+            nonlocal row
+            row += 2
+            # Set report header
+            # Merge 2 rows
+            sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format)
+            sheet.merge_range(row, 1, row + 1, 5, "File", merge_format)
+            sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format)
+            row += 1
+            d_name = next(iter(self._datasets))
+            for model in self._models:
+                file_name = self._report[model][d_name]["file_name"]
+                report = StubReport(file_name)
+                report.report()
+                row += 1
+                sheet.write(
+                    row,
+                    0,
+                    model,
+                    normal,
+                )
+                sheet.merge_range(
+                    row, 1, row, 5, file_name, merge_format_normal
+                )
+                sheet.write(
+                    row,
+                    6,
+                    report.score,
+                    decimal,
+                )
+                k = Excel(file_name=file_name, book=book)
+                k.report()
+            sheet.freeze_panes(6, 1)
+            sheet.hide_gridlines(2)
+
+        def add_datasets_sheet():
+            # Add datasets sheet
+            re = ReportDatasets(excel=True, book=book)
+            re.report()
+
+        def exreport_output():
+            file_name = os.path.join(
+                Folders.exreport, Files.exreport_output(self._score)
+            )
+            sheet = book.add_worksheet("Exreport")
+            normal = book.add_format(
+                {
+                    "font_size": 14,
+                    "border": 1,
+                    "font_color": "blue",
+                    "font_name": "Courier",
+                    "bold": True,
+                }
+            )
+            with open(file_name) as f:
+                lines = f.read().splitlines()
+            row = 0
+            for line in lines:
+                sheet.write(row, 0, line, normal)
+                row += 1
+
+        header()
+        body()
+        footer()
+        models_files()
+        exreport_output()
+        add_datasets_sheet()
+        book.close()
diff --git a/benchmark/Utils.py b/benchmark/Utils.py
index 61cc664..177b49f 100644
--- a/benchmark/Utils.py
+++ b/benchmark/Utils.py
@@ -14,6 +14,7 @@ class Folders:
     report = os.path.join(exreport, "exreport_output")
     img = "img"
     excel = "excel"
+    sql = "sql"

     @staticmethod
     def src():
@@ -127,6 +128,9 @@ class Symbols:
     check_mark = "\N{heavy check mark}"
     exclamation = "\N{heavy exclamation mark symbol}"
     black_star = "\N{black star}"
+    cross = "\N{Ballot X}"
+    upward_arrow = "\N{Black-feathered north east arrow}"
+    down_arrow = "\N{downwards black arrow}"
     equal_best = check_mark
     better_best = black_star

diff --git a/benchmark/scripts/be_init_project.py b/benchmark/scripts/be_init_project.py
index 737ce8c..034dbf6 100755
--- a/benchmark/scripts/be_init_project.py
+++ b/benchmark/scripts/be_init_project.py
@@ -16,6 +16,8 @@ def main(args_test=None):
     folders.append(os.path.join(args.project_name, Folders.report))
     folders.append(os.path.join(args.project_name, Folders.img))
     folders.append(os.path.join(args.project_name, Folders.excel))
+    folders.append(os.path.join(args.project_name, Folders.sql))
+
     try:
         for folder in folders:
             print(f"Creating folder {folder}")
diff --git a/benchmark/scripts/be_report.py b/benchmark/scripts/be_report.py
index 30a5f88..f23803d 100755
--- a/benchmark/scripts/be_report.py
+++ b/benchmark/scripts/be_report.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Results import Report, Excel, SQL, ReportBest, ReportDatasets
+from benchmark.Results import Report, ReportBest
+from benchmark.ResultsFiles import Excel, SQLFile, ReportDatasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
 from pathlib import Path
@@ -67,7 +68,7 @@ def main(args_test=None):
         print(e)
         return
     if args.sql:
-        sql = SQL(args.file_name)
+        sql = SQLFile(args.file_name)
         sql.report()
     if args.excel:
         excel = Excel(
diff --git a/benchmark/tests/.env b/benchmark/tests/.env
index 10a5766..6ad03f5 100644
--- a/benchmark/tests/.env
+++ b/benchmark/tests/.env
@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
-fit_features=0
\ No newline at end of file
+fit_features=0
+margin=0.1
\ No newline at end of file
diff --git a/benchmark/tests/.env.arff b/benchmark/tests/.env.arff
index 33df877..0926483 100644
--- a/benchmark/tests/.env.arff
+++ b/benchmark/tests/.env.arff
@@ -9,4 +9,5 @@ discretize=1
 nodes=Nodes
 leaves=Leaves
 depth=Depth
-fit_features=1
\ No newline at end of file
+fit_features=1
+margin=0.1
\ No newline at end of file
diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist
index 10a5766..6ad03f5 100644
--- a/benchmark/tests/.env.dist
+++ b/benchmark/tests/.env.dist
@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
-fit_features=0
\ No newline at end of file
+fit_features=0
+margin=0.1
\ No newline at end of file
diff --git a/benchmark/tests/.env.surcov b/benchmark/tests/.env.surcov
index 8bb1959..bfd0c9e 100644
--- a/benchmark/tests/.env.surcov
+++ b/benchmark/tests/.env.surcov
@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
-fit_features=0
\ No newline at end of file
+fit_features=0
+margin=0.1
\ No newline at end of file
diff --git a/benchmark/tests/Benchmark_test.py b/benchmark/tests/Benchmark_test.py
index 0ed2e22..eb207d3 100644
--- a/benchmark/tests/Benchmark_test.py
+++ b/benchmark/tests/Benchmark_test.py
@@ -4,7 +4,7 @@ from unittest.mock import patch
 from openpyxl import load_workbook
 from .TestBase import TestBase
 from ..Utils import Folders, Files, NO_RESULTS
-from ..Results import Benchmark
+from ..ResultsFiles import Benchmark
 from .._version import __version__


diff --git a/benchmark/tests/Excel_test.py b/benchmark/tests/Excel_test.py
index 1241776..7c4de30 100644
--- a/benchmark/tests/Excel_test.py
+++ b/benchmark/tests/Excel_test.py
@@ -2,7 +2,7 @@ import os
 from openpyxl import load_workbook
 from xlsxwriter import Workbook
 from .TestBase import TestBase
-from ..Results import Excel
+from ..ResultsFiles import Excel
 from ..Utils import Folders


diff --git a/benchmark/tests/Report_test.py b/benchmark/tests/Report_test.py
index ff384ca..9731e17 100644
--- a/benchmark/tests/Report_test.py
+++ b/benchmark/tests/Report_test.py
@@ -2,7 +2,9 @@ import os
 from io import StringIO
 from unittest.mock import patch
 from .TestBase import TestBase
-from ..Results import Report, BaseReport, ReportBest, ReportDatasets, get_input
+from ..Results import Report, ReportBest
+from ..ResultsFiles import ReportDatasets
+from ..ResultsBase import BaseReport, get_input
 from ..Utils import Symbols


diff --git a/benchmark/tests/SQL_test.py b/benchmark/tests/SQL_test.py
index 7011d68..aad2fd0 100644
--- a/benchmark/tests/SQL_test.py
+++ b/benchmark/tests/SQL_test.py
@@ -1,7 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Results import SQL
-from ..Utils import Folders
+from ..ResultsFiles import SQLFile
+from ..Utils import Folders, Files


 class SQLTest(TestBase):
@@ -9,14 +9,14 @@ class SQLTest(TestBase):
         files = [
             "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.sql",
         ]
-        self.remove_files(files, Folders.results)
+        self.remove_files(files, Folders.sql)
         return super().tearDown()

     def test_report_SQL(self):
         file_name = "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.json"
-        report = SQL(file_name)
+        report = SQLFile(file_name)
         report.report()
         file_name = os.path.join(
-            Folders.results, file_name.replace(".json", ".sql")
+            Folders.sql, file_name.replace(Files.report_ext, ".sql")
         )
         self.check_file_file(file_name, "sql")
diff --git a/benchmark/tests/Util_test.py b/benchmark/tests/Util_test.py
index 51c7027..3cca34d 100644
--- a/benchmark/tests/Util_test.py
+++ b/benchmark/tests/Util_test.py
@@ -186,6 +186,7 @@ class UtilTest(TestBase):
             "leaves": "Leaves",
             "depth": "Depth",
             "fit_features": "0",
+            "margin": "0.1",
         }
         computed = EnvData().load()
         self.assertDictEqual(computed, expected)
diff --git a/benchmark/tests/test_files/be_init_project.test b/benchmark/tests/test_files/be_init_project.test
index 2826682..daf5d94 100644
--- a/benchmark/tests/test_files/be_init_project.test
+++ b/benchmark/tests/test_files/be_init_project.test
@@ -5,6 +5,7 @@ Creating folder test_project/exreport
 Creating folder test_project/exreport/exreport_output
 Creating folder test_project/img
 Creating folder test_project/excel
+Creating folder test_project/sql
 Done!
 Please, edit .env file with your settings and add a datasets folder with
 an all.txt file with the datasets you want to use.