Add an Excel "Datasets" sheet to the datasets report and expose __version__

What this patch does
  * Datasets.get_attributes(name): returns an object carrying samples,
    features, classes and the per-class balance string of a dataset.
  * ReportDatasets: turned from a single static report() into an instance
    that prints the table and/or writes a "Datasets" worksheet, either to
    its own ReportDatasets.xlsx or to a workbook passed in by the caller.
  * Benchmark: adds the datasets worksheet to its workbook.
  * be_report: builds ReportDatasets(args.excel) and hands the generated
    file to Files.open when Excel output was requested.
  * Package: imports __version__ from ._version and lists it in __all__.

Changes made while restoring this patch
  * __all__ listed the value of __version__ ("0.7.1") instead of the name
    "__version__", which makes "from benchmark import *" fail.
  * get_attributes no longer re-sums the counts on every iteration nor
    calls np.unique twice; the resulting attributes are unchanged.
  * The local variable named "re" in Benchmark was renamed to
    datasets_report so it cannot be mistaken for the stdlib module.

Review notes
  * NOTE(review): line breaks and indentation were lost in the copy this
    was restored from; indentation of context lines and runs of spaces
    inside string literals are a best reconstruction. Confirm against the
    tree, or apply with whitespace tolerance.
  * NOTE(review): "from ._version import __version__" needs an importable
    benchmark/_version.py; this patch creates benchmark/_version (no
    extension). Confirm the intended file name.
  * NOTE(review): the Benchmark hunk drops "row += 1" after a header merged
    over rows row..row+1; confirm the first model row does not land inside
    the merged range.
  * NOTE(review): ReportDatasets.get_file_name() reads excel_file_name,
    which is only set when excel is true and no book is passed.

diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py
index 0691350..8afcaa1 100644
--- a/benchmark/Datasets.py
+++ b/benchmark/Datasets.py
@@ -1,5 +1,6 @@
 import os
 import pandas as pd
+import numpy as np
 from scipy.io import arff
 from .Utils import Files
 from .Arguments import EnvData
@@ -40,9 +41,6 @@ class DatasetsArff:
 
 
 class DatasetsTanveer:
-    def __init__(self, discretized):
-        self.discretized = discretized
-
     @staticmethod
     def dataset_names(name):
         return f"{name}_R.dat"
@@ -127,6 +125,24 @@ class Datasets:
         self.data_sets = result
         self.class_names = class_names
 
+    def get_attributes(self, name):
+        """Return samples, features, classes and class balance of *name*."""
+
+        class Attributes:
+            pass
+
+        X, y = self.load_continuous(name)
+        _, counts = np.unique(y, return_counts=True)
+        total = counts.sum()
+        attr = Attributes()
+        attr.balance = "/ ".join(
+            f"{count / total * 100:5.2f}%" for count in counts
+        )
+        attr.classes = len(counts)
+        attr.samples = X.shape[0]
+        attr.features = X.shape[1]
+        return attr
+
     def get_features(self):
         return self.dataset.features
 
diff --git a/benchmark/Results.py b/benchmark/Results.py
index 7827e9c..011b3a0 100644
--- a/benchmark/Results.py
+++ b/benchmark/Results.py
@@ -1,4 +1,5 @@
 import os
+import sys
 from operator import itemgetter
 import math
 import json
@@ -17,6 +18,7 @@ from .Utils import (
     TextColor,
     NO_RESULTS,
 )
+from ._version import __version__
 
 
 class BestResultsEver:
@@ -566,37 +568,247 @@ class Excel(BaseReport):
             self.sheet.set_row(c, 20)
         self.sheet.set_row(0, 25)
         self.sheet.freeze_panes(6, 1)
-        self.sheet.hide_gridlines()
+        self.sheet.hide_gridlines(2)
         if self.close:
             self.book.close()
 
 
 class ReportDatasets:
+    row = 6
+    # alternate lines colors
+    color1 = "#DCE6F1"
+    color2 = "#FDE9D9"
+    color3 = "#B1A0C7"
+
+    def __init__(self, excel, book=None):
+        self.excel = excel
+        self.env = EnvData().load()
+        self.close = False
+        self.output = True
+        self.header_text = f"Datasets used in benchmark ver. {__version__}"
+        if excel:
+            self.max_length = 0
+            if book is None:
+                self.excel_file_name = "ReportDatasets.xlsx"
+                self.book = xlsxwriter.Workbook(
+                    self.excel_file_name, {"nan_inf_to_errors": True}
+                )
+                self.set_properties(self.get_title())
+                self.close = True
+            else:
+                self.book = book
+                self.output = False
+            self.sheet = self.book.add_worksheet("Datasets")
+
+    def set_properties(self, title):
+        self.book.set_properties(
+            {
+                "title": title,
+                "subject": "Machine learning results",
+                "author": "Ricardo Montañana Gómez",
+                "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta",
+                "company": "UCLM",
+                "comments": "Created with Python and XlsxWriter",
+            }
+        )
+
     @staticmethod
-    def report():
+    def get_python_version():
+        return "{}.{}".format(sys.version_info.major, sys.version_info.minor)
+
+    def get_title(self):
+        return (
+            f" Benchmark ver. {__version__} - "
+            f" Python ver. {self.get_python_version()}"
+            f" with {self.env['n_folds']} Folds cross validation "
+            f" Discretization: {self.env['discretize']} "
+            f"Stratification: {self.env['stratified']}"
+        )
+
+    def get_file_name(self):
+        return self.excel_file_name
+
+    def header(self):
+        merge_format = self.book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "center",
+                "valign": "vcenter",
+                "font_size": 18,
+                "bg_color": self.color3,
+            }
+        )
+        merge_format_subheader = self.book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "center",
+                "valign": "vcenter",
+                "font_size": 16,
+                "bg_color": self.color1,
+            }
+        )
+        merge_format_subheader_right = self.book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "right",
+                "valign": "vcenter",
+                "font_size": 16,
+                "bg_color": self.color1,
+            }
+        )
+        merge_format_subheader_left = self.book.add_format(
+            {
+                "border": 1,
+                "bold": 1,
+                "align": "left",
+                "valign": "vcenter",
+                "font_size": 16,
+                "bg_color": self.color1,
+            }
+        )
+        self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
+        self.sheet.merge_range(
+            1,
+            0,
+            4,
+            0,
+            f" Default score {self.env['score']}",
+            merge_format_subheader,
+        )
+        self.sheet.merge_range(
+            1,
+            1,
+            1,
+            3,
+            "Cross validation",
+            merge_format_subheader_right,
+        )
+        self.sheet.write(
+            1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
+        )
+        self.sheet.merge_range(
+            2,
+            1,
+            2,
+            3,
+            "Stratified",
+            merge_format_subheader_right,
+        )
+        self.sheet.write(
+            2,
+            4,
+            f"{'True' if self.env['stratified']=='1' else 'False'}",
+            merge_format_subheader_left,
+        )
+        self.sheet.merge_range(
+            3,
+            1,
+            3,
+            3,
+            "Discretized",
+            merge_format_subheader_right,
+        )
+        self.sheet.write(
+            3,
+            4,
+            f"{'True' if self.env['discretize']=='1' else 'False'}",
+            merge_format_subheader_left,
+        )
+        self.sheet.merge_range(
+            4,
+            1,
+            4,
+            3,
+            "Seeds",
+            merge_format_subheader_right,
+        )
+        self.sheet.write(
+            4, 4, f"{self.env['seeds']}", merge_format_subheader_left
+        )
+        header_cols = [
+            ("Dataset", 30),
+            ("Samples", 10),
+            ("Features", 10),
+            ("Classes", 10),
+            ("Balance", 50),
+        ]
+        bold = self.book.add_format(
+            {
+                "bold": True,
+                "font_size": 14,
+                "bg_color": self.color3,
+                "border": 1,
+            }
+        )
+        i = 0
+        for item, length in header_cols:
+            self.sheet.write(5, i, item, bold)
+            self.sheet.set_column(i, i, length)
+            i += 1
+
+    def footer(self):
+        # set Balance column width to max length
+        self.sheet.set_column(4, 4, self.max_length)
+        self.sheet.freeze_panes(6, 1)
+        self.sheet.hide_gridlines(2)
+        if self.close:
+            self.book.close()
+
+    def print_line(self, result):
+        size_n = 14
+        integer = self.book.add_format(
+            {"num_format": "#,###", "font_size": size_n, "border": 1}
+        )
+        normal = self.book.add_format({"font_size": size_n, "border": 1})
+        col = 0
+        if self.row % 2 == 0:
+            normal.set_bg_color(self.color1)
+            integer.set_bg_color(self.color1)
+        else:
+            normal.set_bg_color(self.color2)
+            integer.set_bg_color(self.color2)
+        self.sheet.write(self.row, col, result.dataset, normal)
+        self.sheet.write(self.row, col + 1, result.samples, integer)
+        self.sheet.write(self.row, col + 2, result.features, integer)
+        self.sheet.write(self.row, col + 3, result.classes, normal)
+        self.sheet.write(self.row, col + 4, result.balance, normal)
+        if len(result.balance) > self.max_length:
+            self.max_length = len(result.balance)
+        self.row += 1
+
+    def report(self):
         data_sets = Datasets()
         color_line = TextColor.LINE1
-        print(color_line, end="")
-        print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
-        print("=" * 30 + " ===== ====== === " + "=" * 40)
+        if self.excel:
+            self.header()
+        if self.output:
+            print(color_line, end="")
+            print(self.header_text)
+            print("")
+            print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
+            print("=" * 30 + " ===== ====== === " + "=" * 60)
         for dataset in data_sets:
-            X, y = data_sets.load(dataset)
+            attributes = data_sets.get_attributes(dataset)
+            attributes.dataset = dataset
+            if self.excel:
+                self.print_line(attributes)
             color_line = (
                 TextColor.LINE2
                 if color_line == TextColor.LINE1
                 else TextColor.LINE1
             )
-            values, counts = np.unique(y, return_counts=True)
-            comp = ""
-            sep = ""
-            for count in counts:
-                comp += f"{sep}{count/sum(counts)*100:5.2f}%"
-                sep = "/ "
-            print(color_line, end="")
-            print(
-                f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} "
-                f"{len(np.unique(y)):3d} {comp:40s}"
-            )
+            if self.output:
+                print(color_line, end="")
+                print(
+                    f"{dataset:30s} {attributes.samples:6,d} "
+                    f"{attributes.features:5,d} {attributes.classes:3d} "
+                    f"{attributes.balance:40s}"
+                )
+        if self.excel:
+            self.footer()
 
 
 class SQL(BaseReport):
@@ -1043,7 +1255,8 @@ class Benchmark:
             sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format)
             sheet.merge_range(row, 1, row + 1, 5, "File", merge_format)
             sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format)
-            row += 1
+            sheet.freeze_panes(6, 1)
+            sheet.hide_gridlines(2)
             d_name = next(iter(self._datasets))
             for model in self._models:
                 file_name = self._report[model][d_name]["file_name"]
@@ -1067,8 +1280,10 @@ class Benchmark:
                     )
                     k = Excel(file_name=file_name, book=book)
                     k.report()
-            sheet.freeze_panes(6, 1)
-            sheet.hide_gridlines()
+
+            # Add datasets sheet
+            datasets_report = ReportDatasets(excel=True, book=book)
+            datasets_report.report()
 
         def exreport_output():
             file_name = os.path.join(
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
index 4d65a6b..eea7c15 100644
--- a/benchmark/__init__.py
+++ b/benchmark/__init__.py
@@ -6,10 +6,11 @@ from .Datasets import (
 )
 from .Experiments import Experiment
 from .Results import Report, Summary
+from ._version import __version__
 
 __author__ = "Ricardo Montañana Gómez"
-__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
+__copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez"
 __license__ = "MIT License"
 __author_email__ = "ricardo.montanana@alu.uclm.es"
 
-__all__ = ["Experiment", "Datasets", "Report", "Summary"]
+__all__ = ["Experiment", "Datasets", "Report", "Summary", "__version__"]
diff --git a/benchmark/_version b/benchmark/_version
new file mode 100644
index 0000000..78a9143
--- /dev/null
+++ b/benchmark/_version
@@ -0,0 +1 @@
+__version__ = "0.7.1"
\ No newline at end of file
diff --git a/benchmark/scripts/be_report.py b/benchmark/scripts/be_report.py
index 6abdf40..8ee44f0 100755
--- a/benchmark/scripts/be_report.py
+++ b/benchmark/scripts/be_report.py
@@ -21,7 +21,11 @@ def main(args_test=None):
     if args.grid:
         args.best = None
     if args.file is None and args.best is None and args.grid is None:
-        ReportDatasets.report()
+        report = ReportDatasets(args.excel)
+        report.report()
+        if args.excel:
+            is_test = args_test is not None
+            Files.open(report.get_file_name(), is_test)
     else:
         if args.best is not None or args.grid is not None:
             report = ReportBest(args.score, args.model, args.best, args.grid)