feat: Add continuous features for datasets in Arff Files

Makes it possible to leave some already-discrete variables untouched when discretize is enabled in the .env file
This commit is contained in:
2022-12-17 19:24:37 +01:00
parent 9bff48832b
commit 8e035ef196
2 changed files with 40 additions and 20 deletions

View File

@@ -2,10 +2,11 @@ import os
from types import SimpleNamespace
import pandas as pd
import numpy as np
import json
from scipy.io import arff
from .Utils import Files
from .Arguments import EnvData
from mdlp.discretization import MDLP
from fimdlp.mdlp import FImdlp
class Diterator:
@@ -112,6 +113,7 @@ class Datasets:
def _init_names(self, dataset_name):
file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class"
self.continuous_features = {}
with open(file_name) as f:
sets = f.read().splitlines()
class_names = [default_class] * len(sets)
@@ -119,10 +121,14 @@ class Datasets:
result = []
class_names = []
for data in sets:
name, class_name = data.split(",")
name, class_name, features = data.split(",", 2)
result.append(name)
class_names.append(class_name)
self.continuous_features[name] = features
sets = result
else:
for name in sets:
self.continuous_features[name] = None
# Set as dataset list the dataset passed as argument
if dataset_name is None:
return class_names, sets
@@ -137,6 +143,7 @@ class Datasets:
self.discretize = False
X, y = self.load(name)
attr = SimpleNamespace()
attr.dataset = name
values, counts = np.unique(y, return_counts=True)
comp = ""
sep = ""
@@ -147,12 +154,16 @@ class Datasets:
attr.classes = len(np.unique(y))
attr.samples = X.shape[0]
attr.features = X.shape[1]
attr.cont_features = len(self.get_continuous_features())
self.discretize = tmp
return attr
# Return the `features` attribute of the wrapped dataset object
# (`self.dataset`). `load` passes this value as the column names of the
# DataFrame it builds.
def get_features(self):
return self.dataset.features
# Return the continuous-feature indices computed for the most recently
# loaded dataset. `self.continuous_features_dataset` is assigned inside
# `load`, so this accessor is only meaningful after `load` has run.
# NOTE(review): calling it before any `load` presumably raises
# AttributeError unless the attribute is initialised elsewhere - confirm.
def get_continuous_features(self):
return self.continuous_features_dataset
# Return the `class_name` attribute of the wrapped dataset object
# (`self.dataset`).
def get_class_name(self):
return self.dataset.class_name
@@ -160,9 +171,16 @@ class Datasets:
return self.dataset.dataset
def load(self, name, dataframe=False):
def get_range_features(X, name):
    """Return the column indices of the continuous features of ``name``.

    The specification is read from ``self.continuous_features[name]``:

    * ``None`` or the string ``"all"`` (surrounding whitespace ignored):
      every column index of ``X`` is returned.
    * any other string is parsed as JSON and returned as-is
      (presumably a list of column indices such as ``"[0, 2, 5]"``).

    Parameters
    ----------
    X : object exposing ``shape`` (only ``shape[1]`` is read)
    name : key of the dataset in ``self.continuous_features``
    """
    c_features = self.continuous_features[name]
    # `_init_names` can store None for a dataset; calling .strip() on it
    # raised AttributeError.
    # NOTE(review): None is treated like "all" here - confirm this is the
    # intended default for datasets without a feature specification.
    if c_features is None or c_features.strip() == "all":
        return list(range(X.shape[1]))
    return json.loads(c_features)
try:
class_name = self.class_names[self.data_sets.index(name)]
X, y = self.dataset.load(name, class_name)
self.continuous_features_dataset = get_range_features(X, name)
if self.discretize:
X = self.discretize_dataset(X, y)
dataset = pd.DataFrame(X, columns=self.get_features())
@@ -188,7 +206,7 @@ class Datasets:
-------
tuple (X, y) of numpy.ndarray
"""
discretiz = MDLP(random_state=17, dtype=np.int32)
discretiz = FImdlp(proposal=False)
Xdisc = discretiz.fit_transform(X, y)
return Xdisc

View File

@@ -684,7 +684,7 @@ class ReportDatasets:
"bg_color": self.color1,
}
)
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
self.sheet.merge_range(
1,
0,
@@ -697,24 +697,24 @@ class ReportDatasets:
1,
1,
1,
3,
4,
"Cross validation",
merge_format_subheader_right,
)
self.sheet.write(
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
)
self.sheet.merge_range(
2,
1,
2,
3,
4,
"Stratified",
merge_format_subheader_right,
)
self.sheet.write(
2,
4,
5,
f"{'True' if self.env['stratified']=='1' else 'False'}",
merge_format_subheader_left,
)
@@ -722,13 +722,13 @@ class ReportDatasets:
3,
1,
3,
3,
4,
"Discretized",
merge_format_subheader_right,
)
self.sheet.write(
3,
4,
5,
f"{'True' if self.env['discretize']=='1' else 'False'}",
merge_format_subheader_left,
)
@@ -736,18 +736,19 @@ class ReportDatasets:
4,
1,
4,
3,
4,
"Seeds",
merge_format_subheader_right,
)
self.sheet.write(
4, 4, f"{self.env['seeds']}", merge_format_subheader_left
4, 5, f"{self.env['seeds']}", merge_format_subheader_left
)
self.update_max_length(len(self.env["seeds"]) + 1)
header_cols = [
("Dataset", 30),
("Samples", 10),
("Features", 10),
("Continuous", 10),
("Classes", 10),
("Balance", 50),
]
@@ -767,7 +768,7 @@ class ReportDatasets:
def footer(self):
# set Balance column width to max length
self.sheet.set_column(4, 4, self.max_length)
self.sheet.set_column(5, 5, self.max_length)
self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines(2)
if self.close:
@@ -789,8 +790,9 @@ class ReportDatasets:
self.sheet.write(self.row, col, result.dataset, normal)
self.sheet.write(self.row, col + 1, result.samples, integer)
self.sheet.write(self.row, col + 2, result.features, integer)
self.sheet.write(self.row, col + 3, result.classes, normal)
self.sheet.write(self.row, col + 4, result.balance, normal)
self.sheet.write(self.row, col + 3, result.cont_features, integer)
self.sheet.write(self.row, col + 4, result.classes, normal)
self.sheet.write(self.row, col + 5, result.balance, normal)
self.update_max_length(len(result.balance))
self.row += 1
@@ -807,11 +809,11 @@ class ReportDatasets:
print(color_line, end="")
print(self.header_text)
print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
print("=" * 30 + " ====== ===== === " + "=" * 60)
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
for dataset in data_sets:
attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel:
self.print_line(attributes)
color_line = (
@@ -823,8 +825,8 @@ class ReportDatasets:
print(color_line, end="")
print(
f"{dataset:30s} {attributes.samples:6,d} "
f"{attributes.features:5,d} {attributes.classes:3d} "
f"{attributes.balance:40s}"
f"{attributes.features:5,d} {attributes.cont_features:4,d}"
f" {attributes.classes:3d} {attributes.balance:40s}"
)
if self.excel:
self.footer()