From feaf85d0b863ee97c31b655ecc9a65bf46ce645e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Fri, 4 Nov 2022 18:40:50 +0100 Subject: [PATCH 01/10] Add Dataset load return a pandas dataframe --- benchmark/Datasets.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index e3735e7..cf06740 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -24,14 +24,16 @@ class DatasetsArff: def folder(): return "datasets" - def load(self, name, class_name): + def load(self, name, class_name, dataframe): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = arff.loadarff(file_name) df = pd.DataFrame(data[0]) df = df.dropna() - X = df.drop(class_name, axis=1).to_numpy() + X = df.drop(class_name, axis=1) + self.features = X.columns + self.class_name = class_name y, _ = pd.factorize(df[class_name]) - return X, y + return df if dataframe else (X.to_numpy(), y) class DatasetsTanveer: @@ -43,7 +45,7 @@ class DatasetsTanveer: def folder(): return "data" - def load(self, name, _): + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( file_name, @@ -64,7 +66,7 @@ class DatasetsSurcov: def folder(): return "datasets" - def load(self, name, _): + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( file_name, @@ -115,10 +117,10 @@ class Datasets: self.data_sets = result self.class_names = class_names - def load(self, name): + def load(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] - return self.dataset.load(name, class_name) + return self.dataset.load(name, class_name, dataframe) except (ValueError, FileNotFoundError): raise ValueError(f"Unknown dataset: {name}") From 4b442a46f28f0fee8fe08cf6a1e6a792bfd9be9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Thu, 10 Nov 2022 11:47:01 +0100 Subject: [PATCH 02/10] Add Discretizer to Datasets --- benchmark/Datasets.py | 51 ++++++++++++++++++++++++++++++++++++++----- benchmark/__init__.py | 10 +++++++-- requirements.txt | 1 + 3 files changed, 55 insertions(+), 7 deletions(-) diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index cf06740..10a8935 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -3,6 +3,7 @@ import pandas as pd from scipy.io import arff from .Utils import Files from .Arguments import EnvData +from mdlp import MDLP class Diterator: @@ -28,15 +29,20 @@ class DatasetsArff: file_name = os.path.join(self.folder(), self.dataset_names(name)) data = arff.loadarff(file_name) df = pd.DataFrame(data[0]) - df = df.dropna() + df.dropna(axis=0, how="any", inplace=True) X = df.drop(class_name, axis=1) self.features = X.columns self.class_name = class_name y, _ = pd.factorize(df[class_name]) - return df if dataframe else (X.to_numpy(), y) + df[class_name] = y + X = X.to_numpy() + return df if dataframe else (X, y) class DatasetsTanveer: + def __init__(self, discretized): + self.discretized = discretized + @staticmethod def dataset_names(name): return f"{name}_R.dat" @@ -82,7 +88,6 @@ class DatasetsSurcov: class Datasets: def __init__(self, dataset_name=None): - envData = EnvData.load() class_name = getattr( __import__(__name__), @@ -90,7 +95,7 @@ class Datasets: ) self.dataset = class_name() self.class_names = [] - self.load_names() + self._load_names() if dataset_name is not None: try: class_name = self.class_names[ @@ -101,7 +106,7 @@ class Datasets: raise ValueError(f"Unknown dataset: {dataset_name}") self.data_sets = [dataset_name] - def load_names(self): + def _load_names(self): file_name = os.path.join(self.dataset.folder(), Files.index) default_class = "class" with open(file_name) as f: @@ -117,6 +122,12 @@ class Datasets: self.data_sets = result self.class_names = class_names + def get_features(self): + return self.dataset.features + + def get_class_name(self): + return self.dataset.class_name + def load(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] @@ -126,3 +137,33 @@ class Datasets: def __iter__(self) -> Diterator: return Diterator(self.data_sets) + + +class Discretizer(Datasets): + def __init__(self, dataset_name=None): + super().__init__(dataset_name) + + def load(self, name, dataframe=False): + X, y = super().load(name) + X, y = self.discretize(X, y) + dataset = pd.DataFrame(X, columns=self.get_features()) + dataset[self.get_class_name()] = y + return dataset if dataframe else X, y + + def discretize(self, X, y): + """Supervised discretization with Fayyad and Irani's MDLP algorithm. + + Parameters + ---------- + X : np.ndarray + array (n_samples, n_features) of features + y : np.ndarray + array (n_samples,) of labels + + Returns + ------- + tuple (X, y) of numpy.ndarray + """ + discretiz = MDLP() + Xdisc = discretiz.fit_transform(X, y) + return Xdisc.astype(int), y.astype(int) diff --git a/benchmark/__init__.py b/benchmark/__init__.py index e26a7cf..c8c4821 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -1,4 +1,10 @@ -from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff +from .Datasets import ( + Datasets, + DatasetsSurcov, + DatasetsTanveer, + DatasetsArff, + Discretizer, +) from .Experiments import Experiment from .Results import Report, Summary @@ -7,4 +13,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary"] +__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"] diff --git a/requirements.txt b/requirements.txt index 02446a6..b9cf80f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ pandas scikit-learn scipy odte +mdlp mufs xlsxwriter openpyxl From 2d61cd11c244cb7a5494f7189a036a8e02dacb59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 12 Nov 2022 19:37:46 +0100 Subject: [PATCH 03/10] refactor Discretization in datasets --- benchmark/Datasets.py | 34 +++++++++++++++++----------------- benchmark/Models.py | 2 ++ benchmark/__init__.py | 3 +-- benchmark/tests/.env.arff | 3 ++- benchmark/tests/.env.dist | 1 + benchmark/tests/.env.surcov | 3 ++- requirements.txt | 2 +- 7 files changed, 26 insertions(+), 22 deletions(-) diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index 10a8935..0691350 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -3,7 +3,7 @@ import pandas as pd from scipy.io import arff from .Utils import Files from .Arguments import EnvData -from mdlp import MDLP +from mdlp.discretization import MDLP class Diterator: @@ -93,6 +93,11 @@ class Datasets: __import__(__name__), f"Datasets{envData['source_data']}", ) + self.load = ( + self.load_discretized + if envData["discretize"] == "1" + else self.load_continuous + ) self.dataset = class_name() self.class_names = [] self._load_names() @@ -128,28 +133,13 @@ class Datasets: def get_class_name(self): return self.dataset.class_name - def load(self, name, dataframe=False): + def load_continuous(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] return self.dataset.load(name, class_name, dataframe) except (ValueError, FileNotFoundError): raise ValueError(f"Unknown dataset: {name}") - def __iter__(self) -> Diterator: - return Diterator(self.data_sets) - - -class Discretizer(Datasets): - def __init__(self, dataset_name=None): - super().__init__(dataset_name) - - def load(self, name, dataframe=False): - X, y = super().load(name) - X, y = self.discretize(X, y) - dataset = pd.DataFrame(X, columns=self.get_features()) - dataset[self.get_class_name()] = y - return dataset if dataframe else X, y - def discretize(self, X, y): """Supervised discretization with Fayyad and Irani's MDLP algorithm. @@ -167,3 +157,13 @@ class Discretizer(Datasets): discretiz = MDLP() Xdisc = discretiz.fit_transform(X, y) return Xdisc.astype(int), y.astype(int) + + def load_discretized(self, name, dataframe=False): + X, y = self.load_continuous(name) + X, y = self.discretize(X, y) + dataset = pd.DataFrame(X, columns=self.get_features()) + dataset[self.get_class_name()] = y + return dataset if dataframe else X, y + + def __iter__(self) -> Diterator: + return Diterator(self.data_sets) diff --git a/benchmark/Models.py b/benchmark/Models.py index 03d31eb..a69a312 100644 --- a/benchmark/Models.py +++ b/benchmark/Models.py @@ -8,6 +8,7 @@ from sklearn.ensemble import ( ) from sklearn.svm import SVC from stree import Stree +from bayesclass import TAN from wodt import Wodt from odte import Odte from xgboost import XGBClassifier @@ -20,6 +21,7 @@ class Models: def define_models(random_state): return { "STree": Stree(random_state=random_state), + "TAN": TAN(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state), "Wodt": Wodt(random_state=random_state), diff --git a/benchmark/__init__.py b/benchmark/__init__.py index c8c4821..4d65a6b 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -3,7 +3,6 @@ from .Datasets import ( DatasetsSurcov, DatasetsTanveer, DatasetsArff, - Discretizer, ) from .Experiments import Experiment from .Results import Report, Summary @@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"] +__all__ = ["Experiment", "Datasets", "Report", "Summary"] diff --git a/benchmark/tests/.env.arff b/benchmark/tests/.env.arff index ab8956d..7f196d9 100644 --- a/benchmark/tests/.env.arff +++ b/benchmark/tests/.env.arff @@ -4,4 +4,5 @@ n_folds=5 model=ODTE stratified=0 source_data=Arff -seeds=[271, 314, 171] \ No newline at end of file +seeds=[271, 314, 171] +discretize=1 \ No newline at end of file diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist index 31a99ab..9641efa 100644 --- a/benchmark/tests/.env.dist +++ b/benchmark/tests/.env.dist @@ -6,3 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/benchmark/tests/.env.surcov b/benchmark/tests/.env.surcov index 63cc579..805ec7b 100644 --- a/benchmark/tests/.env.surcov +++ b/benchmark/tests/.env.surcov @@ -5,4 +5,5 @@ model=ODTE stratified=0 # Source of data Tanveer/Surcov source_data=Surcov -seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] \ No newline at end of file +seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b9cf80f..37821a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ pandas scikit-learn scipy odte -mdlp +mdlp-discretization mufs xlsxwriter openpyxl From 2e6f49de8e6668cb007ce3cd1717ac9442bfc56b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 12 Nov 2022 19:38:14 +0100 Subject: [PATCH 04/10] Add discretize key to .env.dist --- .env.dist | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.dist b/.env.dist index a540dfe..1a1267b 100644 --- a/.env.dist +++ b/.env.dist @@ -5,3 +5,4 @@ model=ODTE stratified=0 source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 From f1b9dc1fefd654d679da0e98d4ce09efd4932e4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 14:46:41 +0100 Subject: [PATCH 05/10] Add excel to report dataset --- benchmark/Datasets.py | 22 ++- benchmark/Results.py | 255 ++++++++++++++++++++++++++++++--- benchmark/__init__.py | 5 +- benchmark/_version | 1 + benchmark/scripts/be_report.py | 6 +- 5 files changed, 263 insertions(+), 26 deletions(-) create mode 100644 benchmark/_version diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index 0691350..8afcaa1 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -1,5 +1,6 @@ import os import pandas as pd +import numpy as np from scipy.io import arff from .Utils import Files from .Arguments import EnvData @@ -40,9 +41,6 @@ class DatasetsArff: class DatasetsTanveer: - def __init__(self, discretized): - self.discretized = discretized - @staticmethod def dataset_names(name): return f"{name}_R.dat" @@ -127,6 +125,24 @@ class Datasets: self.data_sets = result self.class_names = class_names + def get_attributes(self, name): + class Attributes: + pass + + X, y = self.load_continuous(name) + attr = Attributes() + values, counts = np.unique(y, return_counts=True) + comp = "" + sep = "" + for count in counts: + comp += f"{sep}{count/sum(counts)*100:5.2f}%" + sep = "/ " + attr.balance = comp + attr.classes = len(np.unique(y)) + attr.samples = X.shape[0] + attr.features = X.shape[1] + return attr + def get_features(self): return self.dataset.features diff --git a/benchmark/Results.py b/benchmark/Results.py index 7827e9c..011b3a0 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -1,4 +1,5 @@ import os +import sys from operator import itemgetter import math import json @@ -17,6 +18,7 @@ from .Utils import ( TextColor, NO_RESULTS, ) +from ._version import __version__ class BestResultsEver: @@ -566,37 +568,247 @@ class Excel(BaseReport): self.sheet.set_row(c, 20) self.sheet.set_row(0, 25) self.sheet.freeze_panes(6, 1) - self.sheet.hide_gridlines() + self.sheet.hide_gridlines(2) if self.close: self.book.close() class ReportDatasets: + row = 6 + # alternate lines colors + color1 = "#DCE6F1" + color2 = "#FDE9D9" + color3 = "#B1A0C7" + + def __init__(self, excel, book=None): + self.excel = excel + self.env = EnvData().load() + self.close = False + self.output = True + self.header_text = f"Datasets used in benchmark ver. {__version__}" + if excel: + self.max_length = 0 + if book is None: + self.excel_file_name = "ReportDatasets.xlsx" + self.book = xlsxwriter.Workbook( + self.excel_file_name, {"nan_inf_to_errors": True} + ) + self.set_properties(self.get_title()) + self.close = True + else: + self.book = book + self.output = False + self.sheet = self.book.add_worksheet("Datasets") + + def set_properties(self, title): + self.book.set_properties( + { + "title": title, + "subject": "Machine learning results", + "author": "Ricardo Montañana Gómez", + "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta", + "company": "UCLM", + "comments": "Created with Python and XlsxWriter", + } + ) + @staticmethod - def report(): + def get_python_version(): + return "{}.{}".format(sys.version_info.major, sys.version_info.minor) + + def get_title(self): + return ( + f" Benchmark ver. {__version__} - " + f" Python ver. {self.get_python_version()}" + f" with {self.env['n_folds']} Folds cross validation " + f" Discretization: {self.env['discretize']} " + f"Stratification: {self.env['stratified']}" + ) + + def get_file_name(self): + return self.excel_file_name + + def header(self): + merge_format = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 18, + "bg_color": self.color3, + } + ) + merge_format_subheader = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_right = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "right", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_left = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "left", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format) + self.sheet.merge_range( + 1, + 0, + 4, + 0, + f" Default score {self.env['score']}", + merge_format_subheader, + ) + self.sheet.merge_range( + 1, + 1, + 1, + 3, + "Cross validation", + merge_format_subheader_right, + ) + self.sheet.write( + 1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left + ) + self.sheet.merge_range( + 2, + 1, + 2, + 3, + "Stratified", + merge_format_subheader_right, + ) + self.sheet.write( + 2, + 4, + f"{'True' if self.env['stratified']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 3, + 1, + 3, + 3, + "Discretized", + merge_format_subheader_right, + ) + self.sheet.write( + 3, + 4, + f"{'True' if self.env['discretize']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 4, + 1, + 4, + 3, + "Seeds", + merge_format_subheader_right, + ) + self.sheet.write( + 4, 4, f"{self.env['seeds']}", merge_format_subheader_left + ) + header_cols = [ + ("Dataset", 30), + ("Samples", 10), + ("Features", 10), + ("Classes", 10), + ("Balance", 50), + ] + bold = self.book.add_format( + { + "bold": True, + "font_size": 14, + "bg_color": self.color3, + "border": 1, + } + ) + i = 0 + for item, length in header_cols: + self.sheet.write(5, i, item, bold) + self.sheet.set_column(i, i, length) + i += 1 + + def footer(self): + # set Balance column width to max length + self.sheet.set_column(4, 4, self.max_length) + self.sheet.freeze_panes(6, 1) + self.sheet.hide_gridlines(2) + if self.close: + self.book.close() + + def print_line(self, result): + size_n = 14 + integer = self.book.add_format( + {"num_format": "#,###", "font_size": size_n, "border": 1} + ) + normal = self.book.add_format({"font_size": size_n, "border": 1}) + col = 0 + if self.row % 2 == 0: + normal.set_bg_color(self.color1) + integer.set_bg_color(self.color1) + else: + normal.set_bg_color(self.color2) + integer.set_bg_color(self.color2) + self.sheet.write(self.row, col, result.dataset, normal) + self.sheet.write(self.row, col + 1, result.samples, integer) + self.sheet.write(self.row, col + 2, result.features, integer) + self.sheet.write(self.row, col + 3, result.classes, normal) + self.sheet.write(self.row, col + 4, result.balance, normal) + if len(result.balance) > self.max_length: + self.max_length = len(result.balance) + self.row += 1 + + def report(self): data_sets = Datasets() color_line = TextColor.LINE1 - print(color_line, end="") - print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") - print("=" * 30 + " ===== ====== === " + "=" * 40) + if self.excel: + self.header() + if self.output: + print(color_line, end="") + print(self.header_text) + print("") + print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") + print("=" * 30 + " ===== ====== === " + "=" * 60) for dataset in data_sets: - X, y = data_sets.load(dataset) + attributes = data_sets.get_attributes(dataset) + attributes.dataset = dataset + if self.excel: + self.print_line(attributes) color_line = ( TextColor.LINE2 if color_line == TextColor.LINE1 else TextColor.LINE1 ) - values, counts = np.unique(y, return_counts=True) - comp = "" - sep = "" - for count in counts: - comp += f"{sep}{count/sum(counts)*100:5.2f}%" - sep = "/ " - print(color_line, end="") - print( - f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} " - f"{len(np.unique(y)):3d} {comp:40s}" - ) + if self.output: + print(color_line, end="") + print( + f"{dataset:30s} {attributes.samples:6,d} " + f"{attributes.features:5,d} {attributes.classes:3d} " + f"{attributes.balance:40s}" + ) + if self.excel: + self.footer() class SQL(BaseReport): @@ -1043,7 +1255,8 @@ class Benchmark: sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format) sheet.merge_range(row, 1, row + 1, 5, "File", merge_format) sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format) - row += 1 + sheet.freeze_panes(6, 1) + sheet.hide_gridlines(2) d_name = next(iter(self._datasets)) for model in self._models: file_name = self._report[model][d_name]["file_name"] @@ -1067,8 +1280,10 @@ class Benchmark: ) k = Excel(file_name=file_name, book=book) k.report() - sheet.freeze_panes(6, 1) - sheet.hide_gridlines() + + # Add datasets sheet + re = ReportDatasets(excel=True, book=book) + re.report() def exreport_output(): file_name = os.path.join( diff --git a/benchmark/__init__.py b/benchmark/__init__.py index 4d65a6b..eea7c15 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -6,10 +6,11 @@ from .Datasets import ( ) from .Experiments import Experiment from .Results import Report, Summary +from ._version import __version__ __author__ = "Ricardo Montañana Gómez" -__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" +__copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary"] +__all__ = ["Experiment", "Datasets", "Report", "Summary", __version__] diff --git a/benchmark/_version b/benchmark/_version new file mode 100644 index 0000000..78a9143 --- /dev/null +++ b/benchmark/_version @@ -0,0 +1 @@ +__version__ = "0.7.1" \ No newline at end of file diff --git a/benchmark/scripts/be_report.py b/benchmark/scripts/be_report.py index 6abdf40..8ee44f0 100755 --- a/benchmark/scripts/be_report.py +++ b/benchmark/scripts/be_report.py @@ -21,7 +21,11 @@ def main(args_test=None): if args.grid: args.best = None if args.file is None and args.best is None and args.grid is None: - ReportDatasets.report() + report = ReportDatasets(args.excel) + report.report() + if args.excel: + is_test = args_test is not None + Files.open(report.get_file_name(), is_test) else: if args.best is not None or args.grid is not None: report = ReportBest(args.score, args.model, args.best, args.grid) From 6aec5b2a9726304c635f3efd33ae7731eb3000a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 17:44:45 +0100 Subject: [PATCH 06/10] Add tests to excel in report datasets --- benchmark/Results.py | 21 ++++++++++------ benchmark/Utils.py | 1 + benchmark/tests/.env | 1 + benchmark/tests/.gitignore | 1 + benchmark/tests/Benchmark_test.py | 12 ++++++--- benchmark/tests/Util_test.py | 1 + benchmark/tests/scripts/Be_Report_test.py | 11 +++++++- .../test_files/exreport_excel_Datasets.test | 25 +++++++++++++++++++ .../tests/test_files/report_datasets.test | 6 +++-- 9 files changed, 66 insertions(+), 13 deletions(-) create mode 100644 benchmark/tests/.gitignore create mode 100644 benchmark/tests/test_files/exreport_excel_Datasets.test diff --git a/benchmark/Results.py b/benchmark/Results.py index 011b3a0..e53a0f6 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -580,7 +580,7 @@ class ReportDatasets: color2 = "#FDE9D9" color3 = "#B1A0C7" - def __init__(self, excel, book=None): + def __init__(self, excel=False, book=None): self.excel = excel self.env = EnvData().load() self.close = False @@ -589,7 +589,7 @@ class ReportDatasets: if excel: self.max_length = 0 if book is None: - self.excel_file_name = "ReportDatasets.xlsx" + self.excel_file_name = Files.datasets_report_excel self.book = xlsxwriter.Workbook( self.excel_file_name, {"nan_inf_to_errors": True} ) @@ -728,6 +728,7 @@ class ReportDatasets: self.sheet.write( 4, 4, f"{self.env['seeds']}", merge_format_subheader_left ) + self.update_max_length(len(self.env["seeds"]) + 1) header_cols = [ ("Dataset", 30), ("Samples", 10), @@ -775,10 +776,13 @@ class ReportDatasets: self.sheet.write(self.row, col + 2, result.features, integer) self.sheet.write(self.row, col + 3, result.classes, normal) self.sheet.write(self.row, col + 4, result.balance, normal) - if len(result.balance) > self.max_length: - self.max_length = len(result.balance) + self.update_max_length(len(result.balance)) self.row += 1 + def update_max_length(self, value): + if value > self.max_length: + self.max_length = value + def report(self): data_sets = Datasets() color_line = TextColor.LINE1 @@ -789,7 +793,7 @@ class ReportDatasets: print(self.header_text) print("") print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") - print("=" * 30 + " ===== ====== === " + "=" * 60) + print("=" * 30 + " ====== ===== === " + "=" * 60) for dataset in data_sets: attributes = data_sets.get_attributes(dataset) attributes.dataset = dataset @@ -1255,8 +1259,7 @@ class Benchmark: sheet.merge_range(row, 0, row + 1, 0, "Model", merge_format) sheet.merge_range(row, 1, row + 1, 5, "File", merge_format) sheet.merge_range(row, 6, row + 1, 6, "Score", merge_format) - sheet.freeze_panes(6, 1) - sheet.hide_gridlines(2) + row += 1 d_name = next(iter(self._datasets)) for model in self._models: file_name = self._report[model][d_name]["file_name"] @@ -1280,7 +1283,10 @@ class Benchmark: ) k = Excel(file_name=file_name, book=book) k.report() + sheet.freeze_panes(6, 1) + sheet.hide_gridlines(2) + def add_datasets_sheet(): # Add datasets sheet re = ReportDatasets(excel=True, book=book) re.report() @@ -1311,6 +1317,7 @@ class Benchmark: footer() models_files() exreport_output() + add_datasets_sheet() book.close() diff --git a/benchmark/Utils.py b/benchmark/Utils.py index b6b5797..9663086 100644 --- a/benchmark/Utils.py +++ b/benchmark/Utils.py @@ -27,6 +27,7 @@ class Files: exreport_pdf = "Rplots.pdf" benchmark_r = "benchmark.r" dot_env = ".env" + datasets_report_excel = "ReportDatasets.xlsx" @staticmethod def exreport_output(score): diff --git a/benchmark/tests/.env b/benchmark/tests/.env index 31a99ab..9641efa 100644 --- a/benchmark/tests/.env +++ b/benchmark/tests/.env @@ -6,3 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/benchmark/tests/.gitignore b/benchmark/tests/.gitignore new file mode 100644 index 0000000..c56bbf8 --- /dev/null +++ b/benchmark/tests/.gitignore @@ -0,0 +1 @@ +ReportDatasets.xlsx diff --git a/benchmark/tests/Benchmark_test.py b/benchmark/tests/Benchmark_test.py index 71eea04..0b4abc2 100644 --- a/benchmark/tests/Benchmark_test.py +++ b/benchmark/tests/Benchmark_test.py @@ -89,6 +89,15 @@ class BenchmarkTest(TestBase): self.assertTrue(os.path.exists(benchmark.get_tex_file())) self.check_file_file(benchmark.get_tex_file(), "exreport_tex") + @staticmethod + def generate_excel_sheet(test, sheet, file_name): + with open(os.path.join("test_files", file_name), "w") as f: + for row in range(1, sheet.max_row + 1): + for col in range(1, sheet.max_column + 1): + value = sheet.cell(row=row, column=col).value + if value is not None: + print(f'{row};{col};"{value}"', file=f) + def test_excel_output(self): benchmark = Benchmark("accuracy", visualize=False) benchmark.compile_results() @@ -101,6 +110,3 @@ class BenchmarkTest(TestBase): for sheet_name in book.sheetnames: sheet = book[sheet_name] self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}") - # ExcelTest.generate_excel_sheet( - # self, sheet, f"exreport_excel_{sheet_name}" - # ) diff --git a/benchmark/tests/Util_test.py b/benchmark/tests/Util_test.py index 1020a5e..8ca7b33 100644 --- a/benchmark/tests/Util_test.py +++ b/benchmark/tests/Util_test.py @@ -179,6 +179,7 @@ class UtilTest(TestBase): "stratified": "0", "source_data": "Tanveer", "seeds": "[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]", + "discretize": "0", } computed = EnvData().load() self.assertDictEqual(computed, expected) diff --git a/benchmark/tests/scripts/Be_Report_test.py b/benchmark/tests/scripts/Be_Report_test.py index 14a51f8..073d4e6 100644 --- a/benchmark/tests/scripts/Be_Report_test.py +++ b/benchmark/tests/scripts/Be_Report_test.py @@ -1,6 +1,6 @@ import os from openpyxl import load_workbook -from ...Utils import Folders +from ...Utils import Folders, Files from ..TestBase import TestBase @@ -43,6 +43,15 @@ class BeReportTest(TestBase): self.assertEqual(stderr.getvalue(), "") self.check_output_file(stdout, "report_datasets") + def test_be_report_datasets_excel(self): + stdout, stderr = self.execute_script("be_report", ["-x", "1"]) + self.assertEqual(stderr.getvalue(), "") + self.check_output_file(stdout, "report_datasets") + file_name = os.path.join(os.getcwd(), Files.datasets_report_excel) + book = load_workbook(file_name) + sheet = book["Datasets"] + self.check_excel_sheet(sheet, "exreport_excel_datasets") + def test_be_report_best(self): stdout, stderr = self.execute_script( "be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"] diff --git a/benchmark/tests/test_files/exreport_excel_Datasets.test b/benchmark/tests/test_files/exreport_excel_Datasets.test new file mode 100644 index 0000000..5c2f35a --- /dev/null +++ b/benchmark/tests/test_files/exreport_excel_Datasets.test @@ -0,0 +1,25 @@ +1;1;"Datasets used in benchmark ver. 0.2.0" +2;1;" Default score accuracy" +2;2;"Cross validation" +2;5;"5 Folds" +3;2;"Stratified" +3;5;"False" +4;2;"Discretized" +4;5;"False" +5;2;"Seeds" +5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]" +6;1;"Dataset" +6;2;"Samples" +6;3;"Features" +6;4;"Classes" +6;5;"Balance" +7;1;"balance-scale" +7;2;"625" +7;3;"4" +7;4;"3" +7;5;" 7.84%/ 46.08%/ 46.08%" +8;1;"balloons" +8;2;"16" +8;3;"4" +8;4;"2" +8;5;"56.25%/ 43.75%" diff --git a/benchmark/tests/test_files/report_datasets.test b/benchmark/tests/test_files/report_datasets.test index 8f5b0f6..16c7bd7 100644 --- a/benchmark/tests/test_files/report_datasets.test +++ b/benchmark/tests/test_files/report_datasets.test @@ -1,4 +1,6 @@ -Dataset Sampl. Feat. Cls Balance -============================== ===== ====== === ======================================== +Datasets used in benchmark ver. 0.2.0 + +Dataset Sampl. Feat. Cls Balance +============================== ====== ===== === ============================================================ balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% balloons 16 4 2 56.25%/ 43.75% From cd2d803ff54d751a9f34c85351124978ea7f7e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 18:10:42 +0100 Subject: [PATCH 07/10] Update requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 37821a3..a3a8e9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ pandas scikit-learn scipy odte +cython mdlp-discretization mufs xlsxwriter From 6ebcc31c362edbf006ddef36de9a58f6b6adc891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 18:34:54 +0100 Subject: [PATCH 08/10] Add bayesclass to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index a3a8e9b..cba834e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ odte cython mdlp-discretization mufs +bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git xlsxwriter openpyxl tqdm From 5b5d385b4ca41d9f2905aa80aaab751beb2bacae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 20:04:26 +0100 Subject: [PATCH 09/10] Fix uppercase mistake in filename --- benchmark/tests/scripts/Be_Report_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/tests/scripts/Be_Report_test.py b/benchmark/tests/scripts/Be_Report_test.py index 073d4e6..7e7d49e 100644 --- a/benchmark/tests/scripts/Be_Report_test.py +++ b/benchmark/tests/scripts/Be_Report_test.py @@ -50,7 +50,7 @@ class BeReportTest(TestBase): file_name = os.path.join(os.getcwd(), Files.datasets_report_excel) book = load_workbook(file_name) sheet = book["Datasets"] - self.check_excel_sheet(sheet, "exreport_excel_datasets") + self.check_excel_sheet(sheet, "exreport_excel_Datasets") def test_be_report_best(self): stdout, stderr = self.execute_script( From 9039a634cf810c8736c1e47f7d0c9891bf91f859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sun, 13 Nov 2022 22:14:01 +0100 Subject: [PATCH 10/10] Exclude macos-latest with python 3.11 (no torch) --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f4e9584..bf594e0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,6 +14,9 @@ jobs: matrix: os: [macos-latest, ubuntu-latest] python: ["3.10", "3.11"] + exclude: + - os: macos-latest + python: "3.11" steps: - uses: actions/checkout@v3