Merge pull request #6 from Doctorado-ML/language_version

Add Discretizer to Datasets
Add excel to report datasets
Add report datasets sheet to benchmark excel
This commit is contained in:
Ricardo Montañana Gómez
2022-11-13 22:51:50 +01:00
committed by GitHub
20 changed files with 391 additions and 41 deletions

View File

@@ -5,3 +5,4 @@ model=ODTE
stratified=0 stratified=0
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -14,6 +14,9 @@ jobs:
matrix: matrix:
os: [macos-latest, ubuntu-latest] os: [macos-latest, ubuntu-latest]
python: ["3.10", "3.11"] python: ["3.10", "3.11"]
exclude:
- os: macos-latest
python: "3.11"
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3

View File

@@ -1,8 +1,10 @@
import os import os
import pandas as pd import pandas as pd
import numpy as np
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
from mdlp.discretization import MDLP
class Diterator: class Diterator:
@@ -24,14 +26,18 @@ class DatasetsArff:
def folder(): def folder():
return "datasets" return "datasets"
def load(self, name, class_name): def load(self, name, class_name, dataframe):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = arff.loadarff(file_name) data = arff.loadarff(file_name)
df = pd.DataFrame(data[0]) df = pd.DataFrame(data[0])
df = df.dropna() df.dropna(axis=0, how="any", inplace=True)
X = df.drop(class_name, axis=1).to_numpy() X = df.drop(class_name, axis=1)
self.features = X.columns
self.class_name = class_name
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
return X, y df[class_name] = y
X = X.to_numpy()
return df if dataframe else (X, y)
class DatasetsTanveer: class DatasetsTanveer:
@@ -43,7 +49,7 @@ class DatasetsTanveer:
def folder(): def folder():
return "data" return "data"
def load(self, name, _): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
file_name, file_name,
@@ -64,7 +70,7 @@ class DatasetsSurcov:
def folder(): def folder():
return "datasets" return "datasets"
def load(self, name, _): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
file_name, file_name,
@@ -80,15 +86,19 @@ class DatasetsSurcov:
class Datasets: class Datasets:
def __init__(self, dataset_name=None): def __init__(self, dataset_name=None):
envData = EnvData.load() envData = EnvData.load()
class_name = getattr( class_name = getattr(
__import__(__name__), __import__(__name__),
f"Datasets{envData['source_data']}", f"Datasets{envData['source_data']}",
) )
self.load = (
self.load_discretized
if envData["discretize"] == "1"
else self.load_continuous
)
self.dataset = class_name() self.dataset = class_name()
self.class_names = [] self.class_names = []
self.load_names() self._load_names()
if dataset_name is not None: if dataset_name is not None:
try: try:
class_name = self.class_names[ class_name = self.class_names[
@@ -99,7 +109,7 @@ class Datasets:
raise ValueError(f"Unknown dataset: {dataset_name}") raise ValueError(f"Unknown dataset: {dataset_name}")
self.data_sets = [dataset_name] self.data_sets = [dataset_name]
def load_names(self): def _load_names(self):
file_name = os.path.join(self.dataset.folder(), Files.index) file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class" default_class = "class"
with open(file_name) as f: with open(file_name) as f:
@@ -115,12 +125,61 @@ class Datasets:
self.data_sets = result self.data_sets = result
self.class_names = class_names self.class_names = class_names
def get_attributes(self, name):
    """Summarize the size and class balance of a dataset.

    The data is always read through ``load_continuous`` so the summary
    describes the original (non discretized) dataset.

    Parameters
    ----------
    name : str
        name of the dataset to describe

    Returns
    -------
    Attributes
        plain object with the fields ``balance`` (percentage of samples
        per class, joined with "/ "), ``classes``, ``samples`` and
        ``features``
    """

    class Attributes:
        pass

    X, y = self.load_continuous(name)
    # One pass over the labels gives both the number of classes and the
    # number of samples in each of them.
    _, counts = np.unique(y, return_counts=True)
    total = counts.sum()
    attr = Attributes()
    attr.balance = "/ ".join(
        f"{count / total * 100:5.2f}%" for count in counts
    )
    attr.classes = len(counts)
    attr.samples = X.shape[0]
    attr.features = X.shape[1]
    return attr
def get_features(self):
    """Return the ``features`` attribute stored on the dataset loader."""
    loader = self.dataset
    return loader.features
def get_class_name(self):
    """Return the ``class_name`` attribute stored on the dataset loader."""
    loader = self.dataset
    return loader.class_name
def load_continuous(self, name, dataframe=False):
try: try:
class_name = self.class_names[self.data_sets.index(name)] class_name = self.class_names[self.data_sets.index(name)]
return self.dataset.load(name, class_name) return self.dataset.load(name, class_name, dataframe)
except (ValueError, FileNotFoundError): except (ValueError, FileNotFoundError):
raise ValueError(f"Unknown dataset: {name}") raise ValueError(f"Unknown dataset: {name}")
def discretize(self, X, y):
    """Discretize the features with Fayyad and Irani's supervised MDLP.

    Parameters
    ----------
    X : np.ndarray
        array (n_samples, n_features) of features
    y : np.ndarray
        array (n_samples,) of labels

    Returns
    -------
    tuple (X, y) of numpy.ndarray
        the transformed features and the labels, both cast to int
    """
    mdlp = MDLP()
    transformed = mdlp.fit_transform(X, y)
    features = transformed.astype(int)
    labels = y.astype(int)
    return features, labels
def load_discretized(self, name, dataframe=False):
    """Load a dataset and discretize its features.

    Parameters
    ----------
    name : str
        name of the dataset to load
    dataframe : bool, default False
        if True return a single pandas DataFrame holding the discretized
        features plus the class column, otherwise return the arrays

    Returns
    -------
    pandas.DataFrame or tuple (X, y) of numpy.ndarray
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    # Fix: the former ``return dataset if dataframe else X, y`` parsed as
    # ``((dataset if dataframe else X), y)`` and handed back a
    # (DataFrame, y) tuple when a DataFrame was requested.
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    return dataset
def __iter__(self) -> Diterator: def __iter__(self) -> Diterator:
return Diterator(self.data_sets) return Diterator(self.data_sets)

View File

@@ -8,6 +8,7 @@ from sklearn.ensemble import (
) )
from sklearn.svm import SVC from sklearn.svm import SVC
from stree import Stree from stree import Stree
from bayesclass import TAN
from wodt import Wodt from wodt import Wodt
from odte import Odte from odte import Odte
from xgboost import XGBClassifier from xgboost import XGBClassifier
@@ -20,6 +21,7 @@ class Models:
def define_models(random_state): def define_models(random_state):
return { return {
"STree": Stree(random_state=random_state), "STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state), "Wodt": Wodt(random_state=random_state),

View File

@@ -1,4 +1,5 @@
import os import os
import sys
from operator import itemgetter from operator import itemgetter
import math import math
import json import json
@@ -17,6 +18,7 @@ from .Utils import (
TextColor, TextColor,
NO_RESULTS, NO_RESULTS,
) )
from ._version import __version__
class BestResultsEver: class BestResultsEver:
@@ -566,37 +568,251 @@ class Excel(BaseReport):
self.sheet.set_row(c, 20) self.sheet.set_row(c, 20)
self.sheet.set_row(0, 25) self.sheet.set_row(0, 25)
self.sheet.freeze_panes(6, 1) self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines() self.sheet.hide_gridlines(2)
if self.close: if self.close:
self.book.close() self.book.close()
class ReportDatasets: class ReportDatasets:
row = 6
# alternate lines colors
color1 = "#DCE6F1"
color2 = "#FDE9D9"
color3 = "#B1A0C7"
def __init__(self, excel=False, book=None):
    """Prepare the datasets report.

    Parameters
    ----------
    excel : bool, default False
        if True a "Datasets" worksheet is produced
    book : workbook, optional
        workbook to add the worksheet to; when omitted (and ``excel`` is
        True) a new workbook named ``Files.datasets_report_excel`` is
        created and closed by this report
    """
    self.excel = excel
    self.env = EnvData().load()
    self.close = False
    self.output = True
    self.header_text = f"Datasets used in benchmark ver. {__version__}"
    if not excel:
        return
    self.max_length = 0
    if book is not None:
        # Sheet goes into somebody else's workbook: no console output and
        # the workbook is left open for its owner.
        self.book = book
        self.output = False
    else:
        self.excel_file_name = Files.datasets_report_excel
        self.book = xlsxwriter.Workbook(
            self.excel_file_name, {"nan_inf_to_errors": True}
        )
        self.set_properties(self.get_title())
        self.close = True
    self.sheet = self.book.add_worksheet("Datasets")
def set_properties(self, title):
    """Store the document properties of the workbook.

    Parameters
    ----------
    title : str
        text used as the workbook title
    """
    properties = dict(
        title=title,
        subject="Machine learning results",
        author="Ricardo Montañana Gómez",
        manager="Dr. J. A. Gámez, Dr. J. M. Puerta",
        company="UCLM",
        comments="Created with Python and XlsxWriter",
    )
    self.book.set_properties(properties)
@staticmethod @staticmethod
def report(): def get_python_version():
return "{}.{}".format(sys.version_info.major, sys.version_info.minor)
def get_title(self):
    """Build the workbook title from the version and environment settings.

    Returns
    -------
    str
        benchmark version, Python version, number of folds and the
        discretization / stratification flags read from ``self.env``
    """
    env = self.env
    pieces = [
        f" Benchmark ver. {__version__} - ",
        f" Python ver. {self.get_python_version()}",
        f" with {env['n_folds']} Folds cross validation ",
        f" Discretization: {env['discretize']} ",
        f"Stratification: {env['stratified']}",
    ]
    return "".join(pieces)
def get_file_name(self):
    """Return the name of the Excel file created by this report."""
    excel_file = self.excel_file_name
    return excel_file
def header(self):
    """Write the title, the settings summary and the column headings.

    Row 0 holds the report title, rows 1-4 show the default score next to
    one line per setting (folds, stratification, discretization, seeds)
    and row 5 holds the headings of the datasets table.
    """
    book = self.book
    sheet = self.sheet
    env = self.env

    def banner(align, font_size, bg_color):
        # All the banner cells share everything but these three settings.
        return book.add_format(
            {
                "border": 1,
                "bold": 1,
                "align": align,
                "valign": "vcenter",
                "font_size": font_size,
                "bg_color": bg_color,
            }
        )

    title_format = banner("center", 18, self.color3)
    score_format = banner("center", 16, self.color1)
    label_format = banner("right", 16, self.color1)
    value_format = banner("left", 16, self.color1)
    sheet.merge_range(0, 0, 0, 4, self.header_text, title_format)
    sheet.merge_range(
        1, 0, 4, 0, f" Default score {env['score']}", score_format
    )
    settings = [
        ("Cross validation", f"{env['n_folds']} Folds"),
        ("Stratified", "True" if env["stratified"] == "1" else "False"),
        ("Discretized", "True" if env["discretize"] == "1" else "False"),
        ("Seeds", f"{env['seeds']}"),
    ]
    # Label spans columns 1-3, the value sits in column 4 of the same row.
    for row, (label, value) in enumerate(settings, start=1):
        sheet.merge_range(row, 1, row, 3, label, label_format)
        sheet.write(row, 4, value, value_format)
    self.update_max_length(len(env["seeds"]) + 1)
    heading_format = book.add_format(
        {
            "bold": True,
            "font_size": 14,
            "bg_color": self.color3,
            "border": 1,
        }
    )
    columns = (
        ("Dataset", 30),
        ("Samples", 10),
        ("Features", 10),
        ("Classes", 10),
        ("Balance", 50),
    )
    for position, (caption, width) in enumerate(columns):
        sheet.write(5, position, caption, heading_format)
        sheet.set_column(position, position, width)
def footer(self):
    """Apply the final sheet layout and close the workbook if owned."""
    sheet = self.sheet
    # Balance column (4) is as wide as the longest text recorded so far.
    sheet.set_column(4, 4, self.max_length)
    sheet.freeze_panes(6, 1)
    sheet.hide_gridlines(2)
    if not self.close:
        return
    self.book.close()
def print_line(self, result):
    """Write one dataset row to the sheet and advance the row counter.

    Parameters
    ----------
    result : object
        must expose ``dataset``, ``samples``, ``features``, ``classes``
        and ``balance``
    """
    font_size = 14
    number_format = self.book.add_format(
        {"num_format": "#,###", "font_size": font_size, "border": 1}
    )
    text_format = self.book.add_format(
        {"font_size": font_size, "border": 1}
    )
    # Alternate the background of consecutive rows.
    shade = self.color1 if self.row % 2 == 0 else self.color2
    for cell_format in (text_format, number_format):
        cell_format.set_bg_color(shade)
    cells = (
        (result.dataset, text_format),
        (result.samples, number_format),
        (result.features, number_format),
        (result.classes, text_format),
        (result.balance, text_format),
    )
    for offset, (value, cell_format) in enumerate(cells):
        self.sheet.write(self.row, offset, value, cell_format)
    self.update_max_length(len(result.balance))
    self.row += 1
def update_max_length(self, value):
    """Keep ``self.max_length`` at the largest value seen so far."""
    self.max_length = max(self.max_length, value)
def report(self):
data_sets = Datasets() data_sets = Datasets()
color_line = TextColor.LINE1 color_line = TextColor.LINE1
print(color_line, end="") if self.excel:
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") self.header()
print("=" * 30 + " ===== ====== === " + "=" * 40) if self.output:
print(color_line, end="")
print(self.header_text)
print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
print("=" * 30 + " ====== ===== === " + "=" * 60)
for dataset in data_sets: for dataset in data_sets:
X, y = data_sets.load(dataset) attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel:
self.print_line(attributes)
color_line = ( color_line = (
TextColor.LINE2 TextColor.LINE2
if color_line == TextColor.LINE1 if color_line == TextColor.LINE1
else TextColor.LINE1 else TextColor.LINE1
) )
values, counts = np.unique(y, return_counts=True) if self.output:
comp = "" print(color_line, end="")
sep = "" print(
for count in counts: f"{dataset:30s} {attributes.samples:6,d} "
comp += f"{sep}{count/sum(counts)*100:5.2f}%" f"{attributes.features:5,d} {attributes.classes:3d} "
sep = "/ " f"{attributes.balance:40s}"
print(color_line, end="") )
print( if self.excel:
f"{dataset:30s} {X.shape[0]:6,d} {X.shape[1]:5,d} " self.footer()
f"{len(np.unique(y)):3d} {comp:40s}"
)
class SQL(BaseReport): class SQL(BaseReport):
@@ -1068,7 +1284,12 @@ class Benchmark:
k = Excel(file_name=file_name, book=book) k = Excel(file_name=file_name, book=book)
k.report() k.report()
sheet.freeze_panes(6, 1) sheet.freeze_panes(6, 1)
sheet.hide_gridlines() sheet.hide_gridlines(2)
def add_datasets_sheet():
    """Append the "Datasets" worksheet to the benchmark workbook."""
    # The report is handed the enclosing ``book``; in that mode it neither
    # prints to the console nor closes the workbook.
    datasets_report = ReportDatasets(excel=True, book=book)
    datasets_report.report()
def exreport_output(): def exreport_output():
file_name = os.path.join( file_name = os.path.join(
@@ -1096,6 +1317,7 @@ class Benchmark:
footer() footer()
models_files() models_files()
exreport_output() exreport_output()
add_datasets_sheet()
book.close() book.close()

View File

@@ -27,6 +27,7 @@ class Files:
exreport_pdf = "Rplots.pdf" exreport_pdf = "Rplots.pdf"
benchmark_r = "benchmark.r" benchmark_r = "benchmark.r"
dot_env = ".env" dot_env = ".env"
datasets_report_excel = "ReportDatasets.xlsx"
@staticmethod @staticmethod
def exreport_output(score): def exreport_output(score):

View File

@@ -1,10 +1,16 @@
from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff from .Datasets import (
Datasets,
DatasetsSurcov,
DatasetsTanveer,
DatasetsArff,
)
from .Experiments import Experiment from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
from ._version import __version__
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"
__copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __copyright__ = "Copyright 2020-2023, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es" __author_email__ = "ricardo.montanana@alu.uclm.es"
# ``__all__`` lists exported *names*, so the version is referenced by its
# identifier. Listing the value of ``__version__`` (e.g. "0.7.1") makes
# ``from benchmark import *`` fail looking up an attribute of that name.
__all__ = ["Experiment", "Datasets", "Report", "Summary", "__version__"]

1
benchmark/_version Normal file
View File

@@ -0,0 +1 @@
__version__ = "0.7.1"

View File

@@ -21,7 +21,11 @@ def main(args_test=None):
if args.grid: if args.grid:
args.best = None args.best = None
if args.file is None and args.best is None and args.grid is None: if args.file is None and args.best is None and args.grid is None:
ReportDatasets.report() report = ReportDatasets(args.excel)
report.report()
if args.excel:
is_test = args_test is not None
Files.open(report.get_file_name(), is_test)
else: else:
if args.best is not None or args.grid is not None: if args.best is not None or args.grid is not None:
report = ReportBest(args.score, args.model, args.best, args.grid) report = ReportBest(args.score, args.model, args.best, args.grid)

View File

@@ -6,3 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -4,4 +4,5 @@ n_folds=5
model=ODTE model=ODTE
stratified=0 stratified=0
source_data=Arff source_data=Arff
seeds=[271, 314, 171] seeds=[271, 314, 171]
discretize=1

View File

@@ -6,3 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -5,4 +5,5 @@ model=ODTE
stratified=0 stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Surcov source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

1
benchmark/tests/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
ReportDatasets.xlsx

View File

@@ -89,6 +89,15 @@ class BenchmarkTest(TestBase):
self.assertTrue(os.path.exists(benchmark.get_tex_file())) self.assertTrue(os.path.exists(benchmark.get_tex_file()))
self.check_file_file(benchmark.get_tex_file(), "exreport_tex") self.check_file_file(benchmark.get_tex_file(), "exreport_tex")
@staticmethod
def generate_excel_sheet(test, sheet, file_name):
    """Dump the non-empty cells of a worksheet to a fixture file.

    One line per cell is written to ``test_files/<file_name>`` using the
    layout ``row;column;"value"`` (rows and columns are 1-based).

    Parameters
    ----------
    test : object
        not used
    sheet : worksheet
        must expose ``max_row``, ``max_column`` and ``cell(row, column)``
    file_name : str
        name of the file to create inside ``test_files``
    """
    target = os.path.join("test_files", file_name)
    with open(target, "w") as f:
        for row in range(1, sheet.max_row + 1):
            for col in range(1, sheet.max_column + 1):
                value = sheet.cell(row=row, column=col).value
                if value is None:
                    # empty cells are not part of the fixture
                    continue
                print(f'{row};{col};"{value}"', file=f)
def test_excel_output(self): def test_excel_output(self):
benchmark = Benchmark("accuracy", visualize=False) benchmark = Benchmark("accuracy", visualize=False)
benchmark.compile_results() benchmark.compile_results()
@@ -101,6 +110,3 @@ class BenchmarkTest(TestBase):
for sheet_name in book.sheetnames: for sheet_name in book.sheetnames:
sheet = book[sheet_name] sheet = book[sheet_name]
self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}") self.check_excel_sheet(sheet, f"exreport_excel_{sheet_name}")
# ExcelTest.generate_excel_sheet(
# self, sheet, f"exreport_excel_{sheet_name}"
# )

View File

@@ -179,6 +179,7 @@ class UtilTest(TestBase):
"stratified": "0", "stratified": "0",
"source_data": "Tanveer", "source_data": "Tanveer",
"seeds": "[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]", "seeds": "[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]",
"discretize": "0",
} }
computed = EnvData().load() computed = EnvData().load()
self.assertDictEqual(computed, expected) self.assertDictEqual(computed, expected)

View File

@@ -1,6 +1,6 @@
import os import os
from openpyxl import load_workbook from openpyxl import load_workbook
from ...Utils import Folders from ...Utils import Folders, Files
from ..TestBase import TestBase from ..TestBase import TestBase
@@ -43,6 +43,15 @@ class BeReportTest(TestBase):
self.assertEqual(stderr.getvalue(), "") self.assertEqual(stderr.getvalue(), "")
self.check_output_file(stdout, "report_datasets") self.check_output_file(stdout, "report_datasets")
def test_be_report_datasets_excel(self):
    """Check console output and the "Datasets" sheet of ``be_report -x 1``."""
    stdout, stderr = self.execute_script("be_report", ["-x", "1"])
    self.assertEqual(stderr.getvalue(), "")
    self.check_output_file(stdout, "report_datasets")
    # The workbook is looked up in the current working directory.
    workbook_path = os.path.join(os.getcwd(), Files.datasets_report_excel)
    datasets_sheet = load_workbook(workbook_path)["Datasets"]
    self.check_excel_sheet(datasets_sheet, "exreport_excel_Datasets")
def test_be_report_best(self): def test_be_report_best(self):
stdout, stderr = self.execute_script( stdout, stderr = self.execute_script(
"be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"] "be_report", ["-s", "accuracy", "-m", "STree", "-b", "1"]

View File

@@ -0,0 +1,25 @@
1;1;"Datasets used in benchmark ver. 0.2.0"
2;1;" Default score accuracy"
2;2;"Cross validation"
2;5;"5 Folds"
3;2;"Stratified"
3;5;"False"
4;2;"Discretized"
4;5;"False"
5;2;"Seeds"
5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
6;1;"Dataset"
6;2;"Samples"
6;3;"Features"
6;4;"Classes"
6;5;"Balance"
7;1;"balance-scale"
7;2;"625"
7;3;"4"
7;4;"3"
7;5;" 7.84%/ 46.08%/ 46.08%"
8;1;"balloons"
8;2;"16"
8;3;"4"
8;4;"2"
8;5;"56.25%/ 43.75%"

View File

@@ -1,4 +1,6 @@
Dataset Sampl. Feat. Cls Balance Datasets used in benchmark ver. 0.2.0
============================== ===== ====== === ========================================
Dataset Sampl. Feat. Cls Balance
============================== ====== ===== === ============================================================
balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% balance-scale 625 4 3 7.84%/ 46.08%/ 46.08%
balloons 16 4 2 56.25%/ 43.75% balloons 16 4 2 56.25%/ 43.75%

View File

@@ -2,7 +2,10 @@ pandas
scikit-learn scikit-learn
scipy scipy
odte odte
cython
mdlp-discretization
mufs mufs
bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git
xlsxwriter xlsxwriter
openpyxl openpyxl
tqdm tqdm