mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-15 23:45:54 +00:00
feat: ✨ Add continuous features for datasets in Arff Files
Makes it possible to leave already-discrete variables untouched when discretize is on in the .env file
This commit is contained in:
@@ -2,10 +2,11 @@ import os
|
||||
from types import SimpleNamespace
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import json
|
||||
from scipy.io import arff
|
||||
from .Utils import Files
|
||||
from .Arguments import EnvData
|
||||
from mdlp.discretization import MDLP
|
||||
from fimdlp.mdlp import FImdlp
|
||||
|
||||
|
||||
class Diterator:
|
||||
@@ -112,6 +113,7 @@ class Datasets:
|
||||
def _init_names(self, dataset_name):
|
||||
file_name = os.path.join(self.dataset.folder(), Files.index)
|
||||
default_class = "class"
|
||||
self.continuous_features = {}
|
||||
with open(file_name) as f:
|
||||
sets = f.read().splitlines()
|
||||
class_names = [default_class] * len(sets)
|
||||
@@ -119,10 +121,14 @@ class Datasets:
|
||||
result = []
|
||||
class_names = []
|
||||
for data in sets:
|
||||
name, class_name = data.split(",")
|
||||
name, class_name, features = data.split(",", 2)
|
||||
result.append(name)
|
||||
class_names.append(class_name)
|
||||
self.continuous_features[name] = features
|
||||
sets = result
|
||||
else:
|
||||
for name in sets:
|
||||
self.continuous_features[name] = None
|
||||
# Set as dataset list the dataset passed as argument
|
||||
if dataset_name is None:
|
||||
return class_names, sets
|
||||
@@ -137,6 +143,7 @@ class Datasets:
|
||||
self.discretize = False
|
||||
X, y = self.load(name)
|
||||
attr = SimpleNamespace()
|
||||
attr.dataset = name
|
||||
values, counts = np.unique(y, return_counts=True)
|
||||
comp = ""
|
||||
sep = ""
|
||||
@@ -147,12 +154,16 @@ class Datasets:
|
||||
attr.classes = len(np.unique(y))
|
||||
attr.samples = X.shape[0]
|
||||
attr.features = X.shape[1]
|
||||
attr.cont_features = len(self.get_continuous_features())
|
||||
self.discretize = tmp
|
||||
return attr
|
||||
|
||||
def get_features(self):
    """Return the ``features`` attribute of the wrapped dataset loader.

    Returns
    -------
    The value of ``self.dataset.features``, unchanged.
    """
    loader = self.dataset
    return loader.features
|
||||
|
||||
def get_continuous_features(self):
    """Return the continuous-feature list of the last loaded dataset.

    The value is whatever was stored in
    ``self.continuous_features_dataset``; it is returned as is.

    Returns
    -------
    The current value of ``self.continuous_features_dataset``.
    """
    current = self.continuous_features_dataset
    return current
|
||||
|
||||
def get_class_name(self):
    """Return the ``class_name`` attribute of the wrapped dataset loader.

    Returns
    -------
    The value of ``self.dataset.class_name``, unchanged.
    """
    loader = self.dataset
    return loader.class_name
|
||||
|
||||
@@ -160,9 +171,16 @@ class Datasets:
|
||||
return self.dataset.dataset
|
||||
|
||||
def load(self, name, dataframe=False):
|
||||
def get_range_features(X, name):
    """Return the continuous-feature indices configured for a dataset.

    Parameters
    ----------
    X : object exposing ``shape``
        Data matrix of the dataset; only ``X.shape[1]`` is read.
    name : str
        Key used to look up the spec in ``self.continuous_features``.

    Returns
    -------
    list
        ``list(range(X.shape[1]))`` when the stored spec is ``None`` or
        the literal ``"all"`` (surrounding whitespace ignored);
        otherwise the JSON-decoded spec.

    Raises
    ------
    KeyError
        If ``name`` has no entry in ``self.continuous_features``.
    json.JSONDecodeError
        If the spec is neither ``"all"`` nor valid JSON.
    """
    # ``self`` is taken from the enclosing method's scope.
    c_features = self.continuous_features[name]
    # _init_names stores None in its fallback branch; calling .strip()
    # on it raised AttributeError. A missing spec is treated like "all".
    # NOTE(review): confirm that "all" is the intended default here.
    if c_features is None or c_features.strip() == "all":
        return list(range(X.shape[1]))
    return json.loads(c_features)
|
||||
|
||||
try:
|
||||
class_name = self.class_names[self.data_sets.index(name)]
|
||||
X, y = self.dataset.load(name, class_name)
|
||||
self.continuous_features_dataset = get_range_features(X, name)
|
||||
if self.discretize:
|
||||
X = self.discretize_dataset(X, y)
|
||||
dataset = pd.DataFrame(X, columns=self.get_features())
|
||||
@@ -188,7 +206,7 @@ class Datasets:
|
||||
-------
|
||||
tuple (X, y) of numpy.ndarray
|
||||
"""
|
||||
discretiz = MDLP(random_state=17, dtype=np.int32)
|
||||
discretiz = FImdlp(proposal=False)
|
||||
Xdisc = discretiz.fit_transform(X, y)
|
||||
return Xdisc
|
||||
|
||||
|
@@ -684,7 +684,7 @@ class ReportDatasets:
|
||||
"bg_color": self.color1,
|
||||
}
|
||||
)
|
||||
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
|
||||
self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
|
||||
self.sheet.merge_range(
|
||||
1,
|
||||
0,
|
||||
@@ -697,24 +697,24 @@ class ReportDatasets:
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
3,
|
||||
4,
|
||||
"Cross validation",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
||||
1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
||||
)
|
||||
self.sheet.merge_range(
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
"Stratified",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
2,
|
||||
4,
|
||||
5,
|
||||
f"{'True' if self.env['stratified']=='1' else 'False'}",
|
||||
merge_format_subheader_left,
|
||||
)
|
||||
@@ -722,13 +722,13 @@ class ReportDatasets:
|
||||
3,
|
||||
1,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
"Discretized",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
f"{'True' if self.env['discretize']=='1' else 'False'}",
|
||||
merge_format_subheader_left,
|
||||
)
|
||||
@@ -736,18 +736,19 @@ class ReportDatasets:
|
||||
4,
|
||||
1,
|
||||
4,
|
||||
3,
|
||||
4,
|
||||
"Seeds",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
4, 4, f"{self.env['seeds']}", merge_format_subheader_left
|
||||
4, 5, f"{self.env['seeds']}", merge_format_subheader_left
|
||||
)
|
||||
self.update_max_length(len(self.env["seeds"]) + 1)
|
||||
header_cols = [
|
||||
("Dataset", 30),
|
||||
("Samples", 10),
|
||||
("Features", 10),
|
||||
("Continuous", 10),
|
||||
("Classes", 10),
|
||||
("Balance", 50),
|
||||
]
|
||||
@@ -767,7 +768,7 @@ class ReportDatasets:
|
||||
|
||||
def footer(self):
|
||||
# set Balance column width to max length
|
||||
self.sheet.set_column(4, 4, self.max_length)
|
||||
self.sheet.set_column(5, 5, self.max_length)
|
||||
self.sheet.freeze_panes(6, 1)
|
||||
self.sheet.hide_gridlines(2)
|
||||
if self.close:
|
||||
@@ -789,8 +790,9 @@ class ReportDatasets:
|
||||
self.sheet.write(self.row, col, result.dataset, normal)
|
||||
self.sheet.write(self.row, col + 1, result.samples, integer)
|
||||
self.sheet.write(self.row, col + 2, result.features, integer)
|
||||
self.sheet.write(self.row, col + 3, result.classes, normal)
|
||||
self.sheet.write(self.row, col + 4, result.balance, normal)
|
||||
self.sheet.write(self.row, col + 3, result.cont_features, integer)
|
||||
self.sheet.write(self.row, col + 4, result.classes, normal)
|
||||
self.sheet.write(self.row, col + 5, result.balance, normal)
|
||||
self.update_max_length(len(result.balance))
|
||||
self.row += 1
|
||||
|
||||
@@ -807,11 +809,11 @@ class ReportDatasets:
|
||||
print(color_line, end="")
|
||||
print(self.header_text)
|
||||
print("")
|
||||
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
|
||||
print("=" * 30 + " ====== ===== === " + "=" * 60)
|
||||
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
|
||||
print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
|
||||
for dataset in data_sets:
|
||||
attributes = data_sets.get_attributes(dataset)
|
||||
attributes.dataset = dataset
|
||||
|
||||
if self.excel:
|
||||
self.print_line(attributes)
|
||||
color_line = (
|
||||
@@ -823,8 +825,8 @@ class ReportDatasets:
|
||||
print(color_line, end="")
|
||||
print(
|
||||
f"{dataset:30s} {attributes.samples:6,d} "
|
||||
f"{attributes.features:5,d} {attributes.classes:3d} "
|
||||
f"{attributes.balance:40s}"
|
||||
f"{attributes.features:5,d} {attributes.cont_features:4,d}"
|
||||
f" {attributes.classes:3d} {attributes.balance:40s}"
|
||||
)
|
||||
if self.excel:
|
||||
self.footer()
|
||||
|
Reference in New Issue
Block a user