From 8e035ef196c37d688bd2e404ffb141f77ad12b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Sat, 17 Dec 2022 19:24:37 +0100 Subject: [PATCH] feat: :sparkles: Add continuous features for datasets in Arff Files Makes it possible to leave some already discrete variables untouched when discretize is on in the .env file --- benchmark/Datasets.py | 24 +++++++++++++++++++++--- benchmark/Results.py | 36 +++++++++++++++++++----------------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index 2a966e7..ad78b58 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -2,10 +2,11 @@ import os from types import SimpleNamespace import pandas as pd import numpy as np +import json from scipy.io import arff from .Utils import Files from .Arguments import EnvData -from mdlp.discretization import MDLP +from fimdlp.mdlp import FImdlp class Diterator: @@ -112,6 +113,7 @@ class Datasets: def _init_names(self, dataset_name): file_name = os.path.join(self.dataset.folder(), Files.index) default_class = "class" + self.continuous_features = {} with open(file_name) as f: sets = f.read().splitlines() class_names = [default_class] * len(sets) @@ -119,10 +121,14 @@ class Datasets: result = [] class_names = [] for data in sets: - name, class_name = data.split(",") + name, class_name, features = data.split(",", 2) result.append(name) class_names.append(class_name) + self.continuous_features[name] = features sets = result + else: + for name in sets: + self.continuous_features[name] = None # Set as dataset list the dataset passed as argument if dataset_name is None: return class_names, sets @@ -137,6 +143,7 @@ class Datasets: self.discretize = False X, y = self.load(name) attr = SimpleNamespace() + attr.dataset = name values, counts = np.unique(y, return_counts=True) comp = "" sep = "" @@ -147,12 +154,16 @@ class Datasets: attr.classes = len(np.unique(y)) attr.samples = X.shape[0] attr.features = X.shape[1] + 
attr.cont_features = len(self.get_continuous_features()) self.discretize = tmp return attr def get_features(self): return self.dataset.features + def get_continuous_features(self): + return self.continuous_features_dataset + def get_class_name(self): return self.dataset.class_name @@ -160,9 +171,16 @@ class Datasets: return self.dataset.dataset def load(self, name, dataframe=False): + def get_range_features(X, name): + c_features = self.continuous_features[name] + if c_features.strip() == "all": + return list(range(X.shape[1])) + return json.loads(c_features) + try: class_name = self.class_names[self.data_sets.index(name)] X, y = self.dataset.load(name, class_name) + self.continuous_features_dataset = get_range_features(X, name) if self.discretize: X = self.discretize_dataset(X, y) dataset = pd.DataFrame(X, columns=self.get_features()) @@ -188,7 +206,7 @@ class Datasets: ------- tuple (X, y) of numpy.ndarray """ - discretiz = MDLP(random_state=17, dtype=np.int32) + discretiz = FImdlp(proposal=False) Xdisc = discretiz.fit_transform(X, y) return Xdisc diff --git a/benchmark/Results.py b/benchmark/Results.py index 28376b0..2188ea3 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -684,7 +684,7 @@ class ReportDatasets: "bg_color": self.color1, } ) - self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format) + self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format) self.sheet.merge_range( 1, 0, @@ -697,24 +697,24 @@ class ReportDatasets: 1, 1, 1, - 3, + 4, "Cross validation", merge_format_subheader_right, ) self.sheet.write( - 1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left + 1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left ) self.sheet.merge_range( 2, 1, 2, - 3, + 4, "Stratified", merge_format_subheader_right, ) self.sheet.write( 2, - 4, + 5, f"{'True' if self.env['stratified']=='1' else 'False'}", merge_format_subheader_left, ) @@ -722,13 +722,13 @@ class ReportDatasets: 3, 1, 3, - 3, + 4, 
"Discretized", merge_format_subheader_right, ) self.sheet.write( 3, - 4, + 5, f"{'True' if self.env['discretize']=='1' else 'False'}", merge_format_subheader_left, ) @@ -736,18 +736,19 @@ class ReportDatasets: 4, 1, 4, - 3, + 4, "Seeds", merge_format_subheader_right, ) self.sheet.write( - 4, 4, f"{self.env['seeds']}", merge_format_subheader_left + 4, 5, f"{self.env['seeds']}", merge_format_subheader_left ) self.update_max_length(len(self.env["seeds"]) + 1) header_cols = [ ("Dataset", 30), ("Samples", 10), ("Features", 10), + ("Continuous", 10), ("Classes", 10), ("Balance", 50), ] @@ -767,7 +768,7 @@ class ReportDatasets: def footer(self): # set Balance column width to max length - self.sheet.set_column(4, 4, self.max_length) + self.sheet.set_column(5, 5, self.max_length) self.sheet.freeze_panes(6, 1) self.sheet.hide_gridlines(2) if self.close: @@ -789,8 +790,9 @@ class ReportDatasets: self.sheet.write(self.row, col, result.dataset, normal) self.sheet.write(self.row, col + 1, result.samples, integer) self.sheet.write(self.row, col + 2, result.features, integer) - self.sheet.write(self.row, col + 3, result.classes, normal) - self.sheet.write(self.row, col + 4, result.balance, normal) + self.sheet.write(self.row, col + 3, result.cont_features, integer) + self.sheet.write(self.row, col + 4, result.classes, normal) + self.sheet.write(self.row, col + 5, result.balance, normal) self.update_max_length(len(result.balance)) self.row += 1 @@ -807,11 +809,11 @@ class ReportDatasets: print(color_line, end="") print(self.header_text) print("") - print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") - print("=" * 30 + " ====== ===== === " + "=" * 60) + print(f"{'Dataset':30s} Sampl. Feat. 
Cont Cls Balance") + print("=" * 30 + " ====== ===== ==== === " + "=" * 60) for dataset in data_sets: attributes = data_sets.get_attributes(dataset) - attributes.dataset = dataset + if self.excel: self.print_line(attributes) color_line = ( @@ -823,8 +825,8 @@ class ReportDatasets: print(color_line, end="") print( f"{dataset:30s} {attributes.samples:6,d} " - f"{attributes.features:5,d} {attributes.classes:3d} " - f"{attributes.balance:40s}" + f"{attributes.features:5,d} {attributes.cont_features:4,d}" + f" {attributes.classes:3d} {attributes.balance:40s}" ) if self.excel: self.footer()