mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-16 16:05:54 +00:00
feat: ✨ Add continuous features for datasets in Arff Files
Makes it possible to leave some already-discrete variables untouched when discretize is on in the .env file
This commit is contained in:
@@ -2,10 +2,11 @@ import os
|
|||||||
from types import SimpleNamespace
|
from types import SimpleNamespace
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import json
|
||||||
from scipy.io import arff
|
from scipy.io import arff
|
||||||
from .Utils import Files
|
from .Utils import Files
|
||||||
from .Arguments import EnvData
|
from .Arguments import EnvData
|
||||||
from mdlp.discretization import MDLP
|
from fimdlp.mdlp import FImdlp
|
||||||
|
|
||||||
|
|
||||||
class Diterator:
|
class Diterator:
|
||||||
@@ -112,6 +113,7 @@ class Datasets:
|
|||||||
def _init_names(self, dataset_name):
|
def _init_names(self, dataset_name):
|
||||||
file_name = os.path.join(self.dataset.folder(), Files.index)
|
file_name = os.path.join(self.dataset.folder(), Files.index)
|
||||||
default_class = "class"
|
default_class = "class"
|
||||||
|
self.continuous_features = {}
|
||||||
with open(file_name) as f:
|
with open(file_name) as f:
|
||||||
sets = f.read().splitlines()
|
sets = f.read().splitlines()
|
||||||
class_names = [default_class] * len(sets)
|
class_names = [default_class] * len(sets)
|
||||||
@@ -119,10 +121,14 @@ class Datasets:
|
|||||||
result = []
|
result = []
|
||||||
class_names = []
|
class_names = []
|
||||||
for data in sets:
|
for data in sets:
|
||||||
name, class_name = data.split(",")
|
name, class_name, features = data.split(",", 2)
|
||||||
result.append(name)
|
result.append(name)
|
||||||
class_names.append(class_name)
|
class_names.append(class_name)
|
||||||
|
self.continuous_features[name] = features
|
||||||
sets = result
|
sets = result
|
||||||
|
else:
|
||||||
|
for name in sets:
|
||||||
|
self.continuous_features[name] = None
|
||||||
# Set as dataset list the dataset passed as argument
|
# Set as dataset list the dataset passed as argument
|
||||||
if dataset_name is None:
|
if dataset_name is None:
|
||||||
return class_names, sets
|
return class_names, sets
|
||||||
@@ -137,6 +143,7 @@ class Datasets:
|
|||||||
self.discretize = False
|
self.discretize = False
|
||||||
X, y = self.load(name)
|
X, y = self.load(name)
|
||||||
attr = SimpleNamespace()
|
attr = SimpleNamespace()
|
||||||
|
attr.dataset = name
|
||||||
values, counts = np.unique(y, return_counts=True)
|
values, counts = np.unique(y, return_counts=True)
|
||||||
comp = ""
|
comp = ""
|
||||||
sep = ""
|
sep = ""
|
||||||
@@ -147,12 +154,16 @@ class Datasets:
|
|||||||
attr.classes = len(np.unique(y))
|
attr.classes = len(np.unique(y))
|
||||||
attr.samples = X.shape[0]
|
attr.samples = X.shape[0]
|
||||||
attr.features = X.shape[1]
|
attr.features = X.shape[1]
|
||||||
|
attr.cont_features = len(self.get_continuous_features())
|
||||||
self.discretize = tmp
|
self.discretize = tmp
|
||||||
return attr
|
return attr
|
||||||
|
|
||||||
def get_features(self):
|
def get_features(self):
|
||||||
return self.dataset.features
|
return self.dataset.features
|
||||||
|
|
||||||
|
def get_continuous_features(self):
|
||||||
|
return self.continuous_features_dataset
|
||||||
|
|
||||||
def get_class_name(self):
|
def get_class_name(self):
|
||||||
return self.dataset.class_name
|
return self.dataset.class_name
|
||||||
|
|
||||||
@@ -160,9 +171,16 @@ class Datasets:
|
|||||||
return self.dataset.dataset
|
return self.dataset.dataset
|
||||||
|
|
||||||
def load(self, name, dataframe=False):
|
def load(self, name, dataframe=False):
|
||||||
|
def get_range_features(X, name):
|
||||||
|
c_features = self.continuous_features[name]
|
||||||
|
if c_features.strip() == "all":
|
||||||
|
return list(range(X.shape[1]))
|
||||||
|
return json.loads(c_features)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
class_name = self.class_names[self.data_sets.index(name)]
|
class_name = self.class_names[self.data_sets.index(name)]
|
||||||
X, y = self.dataset.load(name, class_name)
|
X, y = self.dataset.load(name, class_name)
|
||||||
|
self.continuous_features_dataset = get_range_features(X, name)
|
||||||
if self.discretize:
|
if self.discretize:
|
||||||
X = self.discretize_dataset(X, y)
|
X = self.discretize_dataset(X, y)
|
||||||
dataset = pd.DataFrame(X, columns=self.get_features())
|
dataset = pd.DataFrame(X, columns=self.get_features())
|
||||||
@@ -188,7 +206,7 @@ class Datasets:
|
|||||||
-------
|
-------
|
||||||
tuple (X, y) of numpy.ndarray
|
tuple (X, y) of numpy.ndarray
|
||||||
"""
|
"""
|
||||||
discretiz = MDLP(random_state=17, dtype=np.int32)
|
discretiz = FImdlp(proposal=False)
|
||||||
Xdisc = discretiz.fit_transform(X, y)
|
Xdisc = discretiz.fit_transform(X, y)
|
||||||
return Xdisc
|
return Xdisc
|
||||||
|
|
||||||
|
@@ -684,7 +684,7 @@ class ReportDatasets:
|
|||||||
"bg_color": self.color1,
|
"bg_color": self.color1,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
|
self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
|
||||||
self.sheet.merge_range(
|
self.sheet.merge_range(
|
||||||
1,
|
1,
|
||||||
0,
|
0,
|
||||||
@@ -697,24 +697,24 @@ class ReportDatasets:
|
|||||||
1,
|
1,
|
||||||
1,
|
1,
|
||||||
1,
|
1,
|
||||||
3,
|
4,
|
||||||
"Cross validation",
|
"Cross validation",
|
||||||
merge_format_subheader_right,
|
merge_format_subheader_right,
|
||||||
)
|
)
|
||||||
self.sheet.write(
|
self.sheet.write(
|
||||||
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
||||||
)
|
)
|
||||||
self.sheet.merge_range(
|
self.sheet.merge_range(
|
||||||
2,
|
2,
|
||||||
1,
|
1,
|
||||||
2,
|
2,
|
||||||
3,
|
4,
|
||||||
"Stratified",
|
"Stratified",
|
||||||
merge_format_subheader_right,
|
merge_format_subheader_right,
|
||||||
)
|
)
|
||||||
self.sheet.write(
|
self.sheet.write(
|
||||||
2,
|
2,
|
||||||
4,
|
5,
|
||||||
f"{'True' if self.env['stratified']=='1' else 'False'}",
|
f"{'True' if self.env['stratified']=='1' else 'False'}",
|
||||||
merge_format_subheader_left,
|
merge_format_subheader_left,
|
||||||
)
|
)
|
||||||
@@ -722,13 +722,13 @@ class ReportDatasets:
|
|||||||
3,
|
3,
|
||||||
1,
|
1,
|
||||||
3,
|
3,
|
||||||
3,
|
4,
|
||||||
"Discretized",
|
"Discretized",
|
||||||
merge_format_subheader_right,
|
merge_format_subheader_right,
|
||||||
)
|
)
|
||||||
self.sheet.write(
|
self.sheet.write(
|
||||||
3,
|
3,
|
||||||
4,
|
5,
|
||||||
f"{'True' if self.env['discretize']=='1' else 'False'}",
|
f"{'True' if self.env['discretize']=='1' else 'False'}",
|
||||||
merge_format_subheader_left,
|
merge_format_subheader_left,
|
||||||
)
|
)
|
||||||
@@ -736,18 +736,19 @@ class ReportDatasets:
|
|||||||
4,
|
4,
|
||||||
1,
|
1,
|
||||||
4,
|
4,
|
||||||
3,
|
4,
|
||||||
"Seeds",
|
"Seeds",
|
||||||
merge_format_subheader_right,
|
merge_format_subheader_right,
|
||||||
)
|
)
|
||||||
self.sheet.write(
|
self.sheet.write(
|
||||||
4, 4, f"{self.env['seeds']}", merge_format_subheader_left
|
4, 5, f"{self.env['seeds']}", merge_format_subheader_left
|
||||||
)
|
)
|
||||||
self.update_max_length(len(self.env["seeds"]) + 1)
|
self.update_max_length(len(self.env["seeds"]) + 1)
|
||||||
header_cols = [
|
header_cols = [
|
||||||
("Dataset", 30),
|
("Dataset", 30),
|
||||||
("Samples", 10),
|
("Samples", 10),
|
||||||
("Features", 10),
|
("Features", 10),
|
||||||
|
("Continuous", 10),
|
||||||
("Classes", 10),
|
("Classes", 10),
|
||||||
("Balance", 50),
|
("Balance", 50),
|
||||||
]
|
]
|
||||||
@@ -767,7 +768,7 @@ class ReportDatasets:
|
|||||||
|
|
||||||
def footer(self):
|
def footer(self):
|
||||||
# set Balance column width to max length
|
# set Balance column width to max length
|
||||||
self.sheet.set_column(4, 4, self.max_length)
|
self.sheet.set_column(5, 5, self.max_length)
|
||||||
self.sheet.freeze_panes(6, 1)
|
self.sheet.freeze_panes(6, 1)
|
||||||
self.sheet.hide_gridlines(2)
|
self.sheet.hide_gridlines(2)
|
||||||
if self.close:
|
if self.close:
|
||||||
@@ -789,8 +790,9 @@ class ReportDatasets:
|
|||||||
self.sheet.write(self.row, col, result.dataset, normal)
|
self.sheet.write(self.row, col, result.dataset, normal)
|
||||||
self.sheet.write(self.row, col + 1, result.samples, integer)
|
self.sheet.write(self.row, col + 1, result.samples, integer)
|
||||||
self.sheet.write(self.row, col + 2, result.features, integer)
|
self.sheet.write(self.row, col + 2, result.features, integer)
|
||||||
self.sheet.write(self.row, col + 3, result.classes, normal)
|
self.sheet.write(self.row, col + 3, result.cont_features, integer)
|
||||||
self.sheet.write(self.row, col + 4, result.balance, normal)
|
self.sheet.write(self.row, col + 4, result.classes, normal)
|
||||||
|
self.sheet.write(self.row, col + 5, result.balance, normal)
|
||||||
self.update_max_length(len(result.balance))
|
self.update_max_length(len(result.balance))
|
||||||
self.row += 1
|
self.row += 1
|
||||||
|
|
||||||
@@ -807,11 +809,11 @@ class ReportDatasets:
|
|||||||
print(color_line, end="")
|
print(color_line, end="")
|
||||||
print(self.header_text)
|
print(self.header_text)
|
||||||
print("")
|
print("")
|
||||||
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
|
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
|
||||||
print("=" * 30 + " ====== ===== === " + "=" * 60)
|
print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
|
||||||
for dataset in data_sets:
|
for dataset in data_sets:
|
||||||
attributes = data_sets.get_attributes(dataset)
|
attributes = data_sets.get_attributes(dataset)
|
||||||
attributes.dataset = dataset
|
|
||||||
if self.excel:
|
if self.excel:
|
||||||
self.print_line(attributes)
|
self.print_line(attributes)
|
||||||
color_line = (
|
color_line = (
|
||||||
@@ -823,8 +825,8 @@ class ReportDatasets:
|
|||||||
print(color_line, end="")
|
print(color_line, end="")
|
||||||
print(
|
print(
|
||||||
f"{dataset:30s} {attributes.samples:6,d} "
|
f"{dataset:30s} {attributes.samples:6,d} "
|
||||||
f"{attributes.features:5,d} {attributes.classes:3d} "
|
f"{attributes.features:5,d} {attributes.cont_features:4,d}"
|
||||||
f"{attributes.balance:40s}"
|
f" {attributes.classes:3d} {attributes.balance:40s}"
|
||||||
)
|
)
|
||||||
if self.excel:
|
if self.excel:
|
||||||
self.footer()
|
self.footer()
|
||||||
|
Reference in New Issue
Block a user