diff --git a/.env.dist b/.env.dist index a540dfe..1a1267b 100644 --- a/.env.dist +++ b/.env.dist @@ -5,3 +5,4 @@ model=ODTE stratified=0 source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f4e9584..bf594e0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -14,6 +14,9 @@ jobs: matrix: os: [macos-latest, ubuntu-latest] python: ["3.10", "3.11"] + exclude: + - os: macos-latest + python: "3.11" steps: - uses: actions/checkout@v3 diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index e3735e7..8afcaa1 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -1,8 +1,10 @@ import os import pandas as pd +import numpy as np from scipy.io import arff from .Utils import Files from .Arguments import EnvData +from mdlp.discretization import MDLP class Diterator: @@ -24,14 +26,18 @@ class DatasetsArff: def folder(): return "datasets" - def load(self, name, class_name): + def load(self, name, class_name, dataframe): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = arff.loadarff(file_name) df = pd.DataFrame(data[0]) - df = df.dropna() - X = df.drop(class_name, axis=1).to_numpy() + df.dropna(axis=0, how="any", inplace=True) + X = df.drop(class_name, axis=1) + self.features = X.columns + self.class_name = class_name y, _ = pd.factorize(df[class_name]) - return X, y + df[class_name] = y + X = X.to_numpy() + return df if dataframe else (X, y) class DatasetsTanveer: @@ -43,7 +49,7 @@ class DatasetsTanveer: def folder(): return "data" - def load(self, name, _): + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( file_name, @@ -64,7 +70,7 @@ class DatasetsSurcov: def folder(): return "datasets" - def load(self, name, _): + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( file_name, @@ -80,15 
+86,20 @@ class DatasetsSurcov: class Datasets: def __init__(self, dataset_name=None): envData = EnvData.load() class_name = getattr( __import__(__name__), f"Datasets{envData['source_data']}", ) + self.load = ( + self.load_discretized + if envData["discretize"] == "1" + else self.load_continuous + ) self.dataset = class_name() self.class_names = [] - self.load_names() + self._load_names() if dataset_name is not None: try: class_name = self.class_names[ @@ -99,7 +109,7 @@ class Datasets: raise ValueError(f"Unknown dataset: {dataset_name}") self.data_sets = [dataset_name] - def load_names(self): + def _load_names(self): file_name = os.path.join(self.dataset.folder(), Files.index) default_class = "class" with open(file_name) as f: @@ -115,12 +125,61 @@ class Datasets: self.data_sets = result self.class_names = class_names - def load(self, name): + def get_attributes(self, name): + class Attributes: + pass + + X, y = self.load_continuous(name) + attr = Attributes() + values, counts = np.unique(y, return_counts=True) + comp = "" + sep = "" + for count in counts: + comp += f"{sep}{count/sum(counts)*100:5.2f}%" + sep = "/ " + attr.balance = comp + attr.classes = len(np.unique(y)) + attr.samples = X.shape[0] + attr.features = X.shape[1] + return attr + + def get_features(self): + return self.dataset.features + + def get_class_name(self): + return self.dataset.class_name + + def load_continuous(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] - return self.dataset.load(name, class_name) + return self.dataset.load(name, class_name, dataframe) except (ValueError, FileNotFoundError): raise ValueError(f"Unknown dataset: {name}") + def discretize(self, X, y): + """Supervised discretization with Fayyad and Irani's MDLP algorithm.
+ + Parameters + ---------- + X : np.ndarray + array (n_samples, n_features) of features + y : np.ndarray + array (n_samples,) of labels + + Returns + ------- + tuple (X, y) of numpy.ndarray + """ + discretiz = MDLP() + Xdisc = discretiz.fit_transform(X, y) + return Xdisc.astype(int), y.astype(int) + + def load_discretized(self, name, dataframe=False): + X, y = self.load_continuous(name) + X, y = self.discretize(X, y) + dataset = pd.DataFrame(X, columns=self.get_features()) + dataset[self.get_class_name()] = y + return dataset if dataframe else (X, y) + def __iter__(self) -> Diterator: return Diterator(self.data_sets) diff --git a/benchmark/Models.py b/benchmark/Models.py index 03d31eb..a69a312 100644 --- a/benchmark/Models.py +++ b/benchmark/Models.py @@ -8,6 +8,7 @@ from sklearn.ensemble import ( ) from sklearn.svm import SVC from stree import Stree +from bayesclass import TAN from wodt import Wodt from odte import Odte from xgboost import XGBClassifier @@ -20,6 +21,7 @@ class Models: def define_models(random_state): return { "STree": Stree(random_state=random_state), + "TAN": TAN(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state), "Wodt": Wodt(random_state=random_state), diff --git a/benchmark/Results.py b/benchmark/Results.py index 7827e9c..e53a0f6 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -1,4 +1,5 @@ import os +import sys from operator import itemgetter import math import json @@ -17,6 +18,7 @@ from .Utils import ( TextColor, NO_RESULTS, ) +from ._version import __version__ class BestResultsEver: @@ -566,37 +568,251 @@ class Excel(BaseReport): self.sheet.set_row(c, 20) self.sheet.set_row(0, 25) self.sheet.freeze_panes(6, 1) - self.sheet.hide_gridlines() + self.sheet.hide_gridlines(2) if self.close: self.book.close() class ReportDatasets: + row = 6 + # alternate lines colors + color1 = "#DCE6F1" + color2 = "#FDE9D9" + color3 = "#B1A0C7" + + def 
__init__(self, excel=False, book=None): + self.excel = excel + self.env = EnvData().load() + self.close = False + self.output = True + self.header_text = f"Datasets used in benchmark ver. {__version__}" + if excel: + self.max_length = 0 + if book is None: + self.excel_file_name = Files.datasets_report_excel + self.book = xlsxwriter.Workbook( + self.excel_file_name, {"nan_inf_to_errors": True} + ) + self.set_properties(self.get_title()) + self.close = True + else: + self.book = book + self.output = False + self.sheet = self.book.add_worksheet("Datasets") + + def set_properties(self, title): + self.book.set_properties( + { + "title": title, + "subject": "Machine learning results", + "author": "Ricardo Montañana Gómez", + "manager": "Dr. J. A. Gámez, Dr. J. M. Puerta", + "company": "UCLM", + "comments": "Created with Python and XlsxWriter", + } + ) + @staticmethod - def report(): + def get_python_version(): + return "{}.{}".format(sys.version_info.major, sys.version_info.minor) + + def get_title(self): + return ( + f" Benchmark ver. {__version__} - " + f" Python ver. 
{self.get_python_version()}" + f" with {self.env['n_folds']} Folds cross validation " + f" Discretization: {self.env['discretize']} " + f"Stratification: {self.env['stratified']}" + ) + + def get_file_name(self): + return self.excel_file_name + + def header(self): + merge_format = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 18, + "bg_color": self.color3, + } + ) + merge_format_subheader = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "center", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_right = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "right", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + merge_format_subheader_left = self.book.add_format( + { + "border": 1, + "bold": 1, + "align": "left", + "valign": "vcenter", + "font_size": 16, + "bg_color": self.color1, + } + ) + self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format) + self.sheet.merge_range( + 1, + 0, + 4, + 0, + f" Default score {self.env['score']}", + merge_format_subheader, + ) + self.sheet.merge_range( + 1, + 1, + 1, + 3, + "Cross validation", + merge_format_subheader_right, + ) + self.sheet.write( + 1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left + ) + self.sheet.merge_range( + 2, + 1, + 2, + 3, + "Stratified", + merge_format_subheader_right, + ) + self.sheet.write( + 2, + 4, + f"{'True' if self.env['stratified']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 3, + 1, + 3, + 3, + "Discretized", + merge_format_subheader_right, + ) + self.sheet.write( + 3, + 4, + f"{'True' if self.env['discretize']=='1' else 'False'}", + merge_format_subheader_left, + ) + self.sheet.merge_range( + 4, + 1, + 4, + 3, + "Seeds", + merge_format_subheader_right, + ) + self.sheet.write( + 4, 4, f"{self.env['seeds']}", merge_format_subheader_left + ) + 
self.update_max_length(len(self.env["seeds"]) + 1) + header_cols = [ + ("Dataset", 30), + ("Samples", 10), + ("Features", 10), + ("Classes", 10), + ("Balance", 50), + ] + bold = self.book.add_format( + { + "bold": True, + "font_size": 14, + "bg_color": self.color3, + "border": 1, + } + ) + i = 0 + for item, length in header_cols: + self.sheet.write(5, i, item, bold) + self.sheet.set_column(i, i, length) + i += 1 + + def footer(self): + # set Balance column width to max length + self.sheet.set_column(4, 4, self.max_length) + self.sheet.freeze_panes(6, 1) + self.sheet.hide_gridlines(2) + if self.close: + self.book.close() + + def print_line(self, result): + size_n = 14 + integer = self.book.add_format( + {"num_format": "#,###", "font_size": size_n, "border": 1} + ) + normal = self.book.add_format({"font_size": size_n, "border": 1}) + col = 0 + if self.row % 2 == 0: + normal.set_bg_color(self.color1) + integer.set_bg_color(self.color1) + else: + normal.set_bg_color(self.color2) + integer.set_bg_color(self.color2) + self.sheet.write(self.row, col, result.dataset, normal) + self.sheet.write(self.row, col + 1, result.samples, integer) + self.sheet.write(self.row, col + 2, result.features, integer) + self.sheet.write(self.row, col + 3, result.classes, normal) + self.sheet.write(self.row, col + 4, result.balance, normal) + self.update_max_length(len(result.balance)) + self.row += 1 + + def update_max_length(self, value): + if value > self.max_length: + self.max_length = value + + def report(self): data_sets = Datasets() color_line = TextColor.LINE1 - print(color_line, end="") - print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") - print("=" * 30 + " ===== ====== === " + "=" * 40) + if self.excel: + self.header() + if self.output: + print(color_line, end="") + print(self.header_text) + print("") + print(f"{'Dataset':30s} Sampl. Feat. 
Cls Balance") + print("=" * 30 + " ====== ===== === " + "=" * 60) for dataset in data_sets: - X, y = data_sets.load(dataset) + attributes = data_sets.get_attributes(dataset) + attributes.dataset = dataset + if self.excel: + self.print_line(attributes) color_line = ( TextColor.LINE2 if color_line == TextColor.LINE1 else TextColor.LINE1 ) - values, counts = np.unique(y, return_counts=True) - comp = "" - sep = "" - for count in counts: - comp += f"{sep}{count/sum(counts)*100:5.2f}%" - sep = "/ " - print(color_line, end="") - print( - f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} " - f"{len(np.unique(y)):3d} {comp:40s}" - ) + if self.output: + print(color_line, end="") + print( + f"{dataset:30s} {attributes.samples:6,d} " + f"{attributes.features:5,d} {attributes.classes:3d} " + f"{attributes.balance:40s}" + ) + if self.excel: + self.footer() class SQL(BaseReport): @@ -1068,7 +1284,12 @@ class Benchmark: k = Excel(file_name=file_name, book=book) k.report() sheet.freeze_panes(6, 1) - sheet.hide_gridlines() + sheet.hide_gridlines(2) + + def add_datasets_sheet(): + # Add datasets sheet + re = ReportDatasets(excel=True, book=book) + re.report() def exreport_output(): file_name = os.path.join( @@ -1096,6 +1317,7 @@ class Benchmark: footer() models_files() exreport_output() + add_datasets_sheet() book.close() diff --git a/benchmark/Utils.py b/benchmark/Utils.py index b6b5797..9663086 100644 --- a/benchmark/Utils.py +++ b/benchmark/Utils.py @@ -27,6 +27,7 @@ class Files: exreport_pdf = "Rplots.pdf" benchmark_r = "benchmark.r" dot_env = ".env" + datasets_report_excel = "ReportDatasets.xlsx" @staticmethod def exreport_output(score): diff --git a/benchmark/__init__.py b/benchmark/__init__.py index e26a7cf..eea7c15 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -1,10 +1,16 @@ -from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff +from .Datasets import ( + Datasets, + DatasetsSurcov, + DatasetsTanveer, + DatasetsArff, +) from 
.Experiments import Experiment from .Results import Report, Summary +from ._version import __version__ __author__ = "Ricardo Montañana Gómez" -__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" +__copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary"] +__all__ = ["Experiment", "Datasets", "Report", "Summary", "__version__"] diff --git a/benchmark/_version.py b/benchmark/_version.py new file mode 100644 index 0000000..78a9143 --- /dev/null +++ b/benchmark/_version.py @@ -0,0 +1 @@ +__version__ = "0.7.1" \ No newline at end of file diff --git a/benchmark/scripts/be_report.py b/benchmark/scripts/be_report.py index 6abdf40..8ee44f0 100755 --- a/benchmark/scripts/be_report.py +++ b/benchmark/scripts/be_report.py @@ -21,7 +21,11 @@ def main(args_test=None): if args.grid: args.best = None if args.file is None and args.best is None and args.grid is None: - ReportDatasets.report() + report = ReportDatasets(args.excel) + report.report() + if args.excel: + is_test = args_test is not None + Files.open(report.get_file_name(), is_test) else: if args.best is not None or args.grid is not None: report = ReportBest(args.score, args.model, args.best, args.grid) diff --git a/benchmark/tests/.env b/benchmark/tests/.env index 31a99ab..9641efa 100644 --- a/benchmark/tests/.env +++ b/benchmark/tests/.env @@ -6,3 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/benchmark/tests/.env.arff b/benchmark/tests/.env.arff index ab8956d..7f196d9 100644 --- a/benchmark/tests/.env.arff +++ b/benchmark/tests/.env.arff @@ -4,4 +4,5 @@ n_folds=5 model=ODTE stratified=0 source_data=Arff -seeds=[271, 314, 171] \ No newline at end of file +seeds=[271, 314, 171] +discretize=1 \ No newline at end of file diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist 
index 31a99ab..9641efa 100644 --- a/benchmark/tests/.env.dist +++ b/benchmark/tests/.env.dist @@ -6,3 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/benchmark/tests/.env.surcov b/benchmark/tests/.env.surcov index 63cc579..805ec7b 100644 --- a/benchmark/tests/.env.surcov +++ b/benchmark/tests/.env.surcov @@ -5,4 +5,5 @@ model=ODTE stratified=0 # Source of data Tanveer/Surcov source_data=Surcov -seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] \ No newline at end of file +seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 \ No newline at end of file diff --git a/benchmark/tests/.gitignore b/benchmark/tests/.gitignore new file mode 100644 index 0000000..c56bbf8 --- /dev/null +++ b/benchmark/tests/.gitignore @@ -0,0 +1 @@ +ReportDatasets.xlsx diff --git a/benchmark/tests/Benchmark_test.py b/benchmark/tests/Benchmark_test.py index 71eea04..0b4abc2 100644 --- a/benchmark/tests/Benchmark_test.py +++ b/benchmark/tests/Benchmark_test.py @@ -89,6 +89,15 @@ class BenchmarkTest(TestBase): self.assertTrue(os.path.exists(benchmark.get_tex_file())) self.check_file_file(benchmark.get_tex_file(), "exreport_tex") + @staticmethod + def generate_excel_sheet(test, sheet, file_name): + with open(os.path.join("test_files", file_name), "w") as f: + for row in range(1, sheet.max_row + 1): + for col in range(1, sheet.max_column + 1): + value = sheet.cell(row=row, column=col).value + if value is not None: + print(f'{row};{col};"{value}"', file=f) + def test_excel_output(self): benchmark = Benchmark("accuracy", visualize=False) benchmark.compile_results() @@ -101,6 +110,3 @@ class BenchmarkTest(TestBase): for sheet_name in book.sheetnames: sheet = book[sheet_name] self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}") - # ExcelTest.generate_excel_sheet( - # self, sheet, f"exreport_excel_{sheet_name}" - # ) diff --git a/benchmark/tests/Util_test.py b/benchmark/tests/Util_test.py 
index 1020a5e..8ca7b33 100644 --- a/benchmark/tests/Util_test.py +++ b/benchmark/tests/Util_test.py @@ -179,6 +179,7 @@ class UtilTest(TestBase): "stratified": "0", "source_data": "Tanveer", "seeds": "[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]", + "discretize": "0", } computed = EnvData().load() self.assertDictEqual(computed, expected) diff --git a/benchmark/tests/scripts/Be_Report_test.py b/benchmark/tests/scripts/Be_Report_test.py index 14a51f8..7e7d49e 100644 --- a/benchmark/tests/scripts/Be_Report_test.py +++ b/benchmark/tests/scripts/Be_Report_test.py @@ -1,6 +1,6 @@ import os from openpyxl import load_workbook -from ...Utils import Folders +from ...Utils import Folders, Files from ..TestBase import TestBase @@ -43,6 +43,15 @@ class BeReportTest(TestBase): self.assertEqual(stderr.getvalue(), "") self.check_output_file(stdout, "report_datasets") + def test_be_report_datasets_excel(self): + stdout, stderr = self.execute_script("be_report", ["-x", "1"]) + self.assertEqual(stderr.getvalue(), "") + self.check_output_file(stdout, "report_datasets") + file_name = os.path.join(os.getcwd(), Files.datasets_report_excel) + book = load_workbook(file_name) + sheet = book["Datasets"] + self.check_excel_sheet(sheet, "exreport_excel_Datasets") + def test_be_report_best(self): stdout, stderr = self.execute_script( "be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"] diff --git a/benchmark/tests/test_files/exreport_excel_Datasets.test b/benchmark/tests/test_files/exreport_excel_Datasets.test new file mode 100644 index 0000000..5c2f35a --- /dev/null +++ b/benchmark/tests/test_files/exreport_excel_Datasets.test @@ -0,0 +1,25 @@ +1;1;"Datasets used in benchmark ver. 
0.2.0" +2;1;" Default score accuracy" +2;2;"Cross validation" +2;5;"5 Folds" +3;2;"Stratified" +3;5;"False" +4;2;"Discretized" +4;5;"False" +5;2;"Seeds" +5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]" +6;1;"Dataset" +6;2;"Samples" +6;3;"Features" +6;4;"Classes" +6;5;"Balance" +7;1;"balance-scale" +7;2;"625" +7;3;"4" +7;4;"3" +7;5;" 7.84%/ 46.08%/ 46.08%" +8;1;"balloons" +8;2;"16" +8;3;"4" +8;4;"2" +8;5;"56.25%/ 43.75%" diff --git a/benchmark/tests/test_files/report_datasets.test b/benchmark/tests/test_files/report_datasets.test index 8f5b0f6..16c7bd7 100644 --- a/benchmark/tests/test_files/report_datasets.test +++ b/benchmark/tests/test_files/report_datasets.test @@ -1,4 +1,6 @@ -Dataset Sampl. Feat. Cls Balance -============================== ===== ====== === ======================================== +Datasets used in benchmark ver. 0.2.0 + +Dataset Sampl. Feat. Cls Balance +============================== ====== ===== === ============================================================ balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% balloons 16 4 2 56.25%/ 43.75% diff --git a/requirements.txt b/requirements.txt index 02446a6..cba834e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,10 @@ pandas scikit-learn scipy odte +cython +mdlp-discretization mufs +bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git xlsxwriter openpyxl tqdm