Add excel to report dataset

This commit is contained in:
2022-11-13 14:46:41 +01:00
parent 2e6f49de8e
commit f1b9dc1fef
5 changed files with 263 additions and 26 deletions

View File

@@ -1,5 +1,6 @@
import os import os
import pandas as pd import pandas as pd
import numpy as np
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
@@ -40,9 +41,6 @@ class DatasetsArff:
class DatasetsTanveer: class DatasetsTanveer:
def __init__(self, discretized):
self.discretized = discretized
@staticmethod @staticmethod
def dataset_names(name): def dataset_names(name):
return f"{name}_R.dat" return f"{name}_R.dat"
@@ -127,6 +125,24 @@ class Datasets:
self.data_sets = result self.data_sets = result
self.class_names = class_names self.class_names = class_names
def get_attributes(self, name):
    """Summarise the size and class balance of one dataset.

    Loads the dataset through ``self.load_continuous(name)`` and returns a
    plain attribute container with:

    * ``samples``  -- ``X.shape[0]``
    * ``features`` -- ``X.shape[1]``
    * ``classes``  -- number of distinct values in ``y``
    * ``balance``  -- percentage of samples per class, in the order
      returned by ``np.unique`` (sorted), formatted ``"xx.xx%"`` and
      joined with ``"/ "``.

    Callers may attach further attributes to the returned object.
    """

    class Attributes:
        """Empty namespace; fields are assigned below."""

    X, y = self.load_continuous(name)
    # One pass over y gives both the distinct labels and their frequencies.
    values, counts = np.unique(y, return_counts=True)
    # Hoisted: the total does not change while formatting each class share.
    total = counts.sum()
    attr = Attributes()
    attr.balance = "/ ".join(
        f"{count / total * 100:5.2f}%" for count in counts
    )
    attr.classes = len(values)
    attr.samples = X.shape[0]
    attr.features = X.shape[1]
    return attr
def get_features(self): def get_features(self):
return self.dataset.features return self.dataset.features

View File

@@ -1,4 +1,5 @@
import os import os
import sys
from operator import itemgetter from operator import itemgetter
import math import math
import json import json
@@ -17,6 +18,7 @@ from .Utils import (
TextColor, TextColor,
NO_RESULTS, NO_RESULTS,
) )
from ._version import __version__
class BestResultsEver: class BestResultsEver:
@@ -566,37 +568,247 @@ class Excel(BaseReport):
self.sheet.set_row(c, 20) self.sheet.set_row(c, 20)
self.sheet.set_row(0, 25) self.sheet.set_row(0, 25)
self.sheet.freeze_panes(6, 1) self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines() self.sheet.hide_gridlines(2)
if self.close: if self.close:
self.book.close() self.book.close()
class ReportDatasets: class ReportDatasets:
row = 6
# alternate lines colors
color1 = "#DCE6F1"
color2 = "#FDE9D9"
color3 = "#B1A0C7"
def __init__(self, excel, book=None):
    """Prepare the datasets report.

    Parameters
    ----------
    excel : truthy/falsy
        When truthy an Excel worksheet named "Datasets" is produced.
    book : workbook or None
        Existing workbook to add the worksheet to. When omitted (and
        ``excel`` is truthy) a new "ReportDatasets.xlsx" workbook is
        created and will be closed by this object; when supplied, console
        output is switched off and the caller keeps ownership of the book.
    """
    self.excel = excel
    self.env = EnvData().load()
    self.close = False
    self.output = True
    self.header_text = f"Datasets used in benchmark ver. {__version__}"
    # Define every attribute on every path so later reads (for example
    # get_file_name) never fail with AttributeError.
    self.max_length = 0
    self.excel_file_name = None
    self.book = None
    self.sheet = None
    if not excel:
        return
    if book is None:
        self.excel_file_name = "ReportDatasets.xlsx"
        self.book = xlsxwriter.Workbook(
            self.excel_file_name, {"nan_inf_to_errors": True}
        )
        self.set_properties(self.get_title())
        # We created the workbook, so we are responsible for closing it.
        self.close = True
    else:
        self.book = book
        self.output = False
    self.sheet = self.book.add_worksheet("Datasets")
def set_properties(self, title):
    """Store the document properties of the workbook.

    ``title`` becomes the workbook title; the remaining properties are
    fixed values.
    """
    properties = {
        "title": title,
        "subject": "Machine learning results",
        "author": "Ricardo Montañana Gómez",
        "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta",
        "company": "UCLM",
        "comments": "Created with Python and XlsxWriter",
    }
    self.book.set_properties(properties)
@staticmethod @staticmethod
def report(): def get_python_version():
return "{}.{}".format(sys.version_info.major, sys.version_info.minor)
def get_title(self):
    """Build the workbook title from the package version, the Python
    version and the fold / discretization / stratification settings read
    from ``self.env``.
    """
    env = self.env
    fragments = (
        f" Benchmark ver. {__version__} - ",
        f" Python ver. {self.get_python_version()}",
        f" with {env['n_folds']} Folds cross validation ",
        f" Discretization: {env['discretize']} ",
        f"Stratification: {env['stratified']}",
    )
    return "".join(fragments)
def get_file_name(self):
    """Return the name of the Excel file stored in ``excel_file_name``."""
    file_name = self.excel_file_name
    return file_name
def header(self):
    """Write the title banner, the settings summary and the column captions.

    Layout (0-based rows/columns):

    * row 0, columns 0-4: ``self.header_text``
    * rows 1-4, column 0: the default score taken from ``self.env``
    * rows 1-4, columns 1-3 / column 4: setting label / setting value for
      cross validation folds, stratification, discretization and seeds
    * row 5: one caption per report column; each column also gets its
      initial width
    """

    def boxed(font_size, bg_color, align="center"):
        # All banner cells share border/bold/vertical centring; only the
        # size, background and horizontal alignment vary.
        return self.book.add_format(
            {
                "border": 1,
                "bold": 1,
                "align": align,
                "valign": "vcenter",
                "font_size": font_size,
                "bg_color": bg_color,
            }
        )

    title_format = boxed(18, self.color3)
    score_format = boxed(16, self.color1)
    label_format = boxed(16, self.color1, align="right")
    value_format = boxed(16, self.color1, align="left")

    env = self.env
    self.sheet.merge_range(0, 0, 0, 4, self.header_text, title_format)
    self.sheet.merge_range(
        1, 0, 4, 0, f" Default score {env['score']}", score_format
    )

    # (label, value) pairs, written one per row starting at row 1.
    # Boolean settings are stored as the strings "1"/"0" in the environment.
    settings = [
        ("Cross validation", f"{env['n_folds']} Folds"),
        ("Stratified", "True" if env["stratified"] == "1" else "False"),
        ("Discretized", "True" if env["discretize"] == "1" else "False"),
        ("Seeds", f"{env['seeds']}"),
    ]
    for row, (label, value) in enumerate(settings, start=1):
        self.sheet.merge_range(row, 1, row, 3, label, label_format)
        self.sheet.write(row, 4, value, value_format)

    header_cols = [
        ("Dataset", 30),
        ("Samples", 10),
        ("Features", 10),
        ("Classes", 10),
        ("Balance", 50),
    ]
    bold = self.book.add_format(
        {
            "bold": True,
            "font_size": 14,
            "bg_color": self.color3,
            "border": 1,
        }
    )
    for col, (caption, width) in enumerate(header_cols):
        self.sheet.write(5, col, caption, bold)
        self.sheet.set_column(col, col, width)
def footer(self):
    """Finish the worksheet once every dataset row has been written.

    Resizes the Balance column to the longest balance string seen, freezes
    the panes at (6, 1), hides the gridlines (option 2) and closes the
    workbook when ``self.close`` is set.
    """
    sheet = self.sheet
    balance_col = 4
    # set Balance column width to max length
    sheet.set_column(balance_col, balance_col, self.max_length)
    sheet.freeze_panes(6, 1)
    sheet.hide_gridlines(2)
    if not self.close:
        return
    self.book.close()
def print_line(self, result):
    """Write one dataset row to the worksheet and advance ``self.row``.

    ``result`` must expose ``dataset``, ``samples``, ``features``,
    ``classes`` and ``balance``. Rows alternate between ``color1`` (even
    row index) and ``color2`` (odd row index). The longest balance string
    seen so far is tracked in ``self.max_length``.
    """
    font_size = 14
    integer = self.book.add_format(
        {"num_format": "#,###", "font_size": font_size, "border": 1}
    )
    normal = self.book.add_format({"font_size": font_size, "border": 1})
    background = self.color1 if self.row % 2 == 0 else self.color2
    normal.set_bg_color(background)
    integer.set_bg_color(background)
    # Cell values in column order, each paired with the format it uses.
    cells = (
        (result.dataset, normal),
        (result.samples, integer),
        (result.features, integer),
        (result.classes, normal),
        (result.balance, normal),
    )
    for offset, (value, cell_format) in enumerate(cells):
        self.sheet.write(self.row, offset, value, cell_format)
    self.max_length = max(self.max_length, len(result.balance))
    self.row += 1
def report(self):
data_sets = Datasets() data_sets = Datasets()
color_line = TextColor.LINE1 color_line = TextColor.LINE1
print(color_line, end="") if self.excel:
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") self.header()
print("=" * 30 + " ===== ====== === " + "=" * 40) if self.output:
print(color_line, end="")
print(self.header_text)
print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
print("=" * 30 + " ===== ====== === " + "=" * 60)
for dataset in data_sets: for dataset in data_sets:
X, y = data_sets.load(dataset) attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel:
self.print_line(attributes)
color_line = ( color_line = (
TextColor.LINE2 TextColor.LINE2
if color_line == TextColor.LINE1 if color_line == TextColor.LINE1
else TextColor.LINE1 else TextColor.LINE1
) )
values, counts = np.unique(y, return_counts=True) if self.output:
comp = "" print(color_line, end="")
sep = "" print(
for count in counts: f"{dataset:30s} {attributes.samples:6,d} "
comp += f"{sep}{count/sum(counts)*100:5.2f}%" f"{attributes.features:5,d} {attributes.classes:3d} "
sep = "/ " f"{attributes.balance:40s}"
print(color_line, end="") )
print( if self.excel:
f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} " self.footer()
f"{len(np.unique(y)):3d} {comp:40s}"
)
class SQL(BaseReport): class SQL(BaseReport):
@@ -1043,7 +1255,8 @@ class Benchmark:
sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format) sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format)
sheet.merge_range(row, 1, row + 1, 5, "File", merge_format) sheet.merge_range(row, 1, row + 1, 5, "File", merge_format)
sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format) sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format)
row += 1 sheet.freeze_panes(6, 1)
sheet.hide_gridlines(2)
d_name = next(iter(self._datasets)) d_name = next(iter(self._datasets))
for model in self._models: for model in self._models:
file_name = self._report[model][d_name]["file_name"] file_name = self._report[model][d_name]["file_name"]
@@ -1067,8 +1280,10 @@ class Benchmark:
) )
k = Excel(file_name=file_name, book=book) k = Excel(file_name=file_name, book=book)
k.report() k.report()
sheet.freeze_panes(6, 1)
sheet.hide_gridlines() # Add datasets sheet
re = ReportDatasets(excel=True, book=book)
re.report()
def exreport_output(): def exreport_output():
file_name = os.path.join( file_name = os.path.join(

View File

@@ -6,10 +6,11 @@ from .Datasets import (
) )
from .Experiments import Experiment from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
from ._version import __version__
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es" __author_email__ = "ricardo.montanana@alu.uclm.es"
# __all__ must contain the *names* of the public attributes as strings.
# Listing the bare __version__ value (e.g. "0.7.1") would make
# `from <package> import *` fail with AttributeError.
__all__ = ["Experiment", "Datasets", "Report", "Summary", "__version__"]

1
benchmark/_version Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.7.1"

View File

@@ -21,7 +21,11 @@ def main(args_test=None):
if args.grid: if args.grid:
args.best = None args.best = None
if args.file is None and args.best is None and args.grid is None: if args.file is None and args.best is None and args.grid is None:
ReportDatasets.report() report = ReportDatasets(args.excel)
report.report()
if args.excel:
is_test = args_test is not None
Files.open(report.get_file_name(), is_test)
else: else:
if args.best is not None or args.grid is not None: if args.best is not None or args.grid is not None:
report = ReportBest(args.score, args.model, args.best, args.grid) report = ReportBest(args.score, args.model, args.best, args.grid)