Add Discretizer to Datasets

This commit is contained in:
2022-11-10 11:47:01 +01:00
parent feaf85d0b8
commit 4b442a46f2
3 changed files with 55 additions and 7 deletions

View File

@@ -3,6 +3,7 @@ import pandas as pd
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
from mdlp import MDLP
class Diterator: class Diterator:
@@ -28,15 +29,20 @@ class DatasetsArff:
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = arff.loadarff(file_name) data = arff.loadarff(file_name)
df = pd.DataFrame(data[0]) df = pd.DataFrame(data[0])
df = df.dropna() df.dropna(axis=0, how="any", inplace=True)
X = df.drop(class_name, axis=1) X = df.drop(class_name, axis=1)
self.features = X.columns self.features = X.columns
self.class_name = class_name self.class_name = class_name
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
return df if dataframe else (X.to_numpy(), y) df[class_name] = y
X = X.to_numpy()
return df if dataframe else (X, y)
class DatasetsTanveer: class DatasetsTanveer:
def __init__(self, discretized):
self.discretized = discretized
@staticmethod @staticmethod
def dataset_names(name): def dataset_names(name):
return f"{name}_R.dat" return f"{name}_R.dat"
@@ -82,7 +88,6 @@ class DatasetsSurcov:
class Datasets: class Datasets:
def __init__(self, dataset_name=None): def __init__(self, dataset_name=None):
envData = EnvData.load() envData = EnvData.load()
class_name = getattr( class_name = getattr(
__import__(__name__), __import__(__name__),
@@ -90,7 +95,7 @@ class Datasets:
) )
self.dataset = class_name() self.dataset = class_name()
self.class_names = [] self.class_names = []
self.load_names() self._load_names()
if dataset_name is not None: if dataset_name is not None:
try: try:
class_name = self.class_names[ class_name = self.class_names[
@@ -101,7 +106,7 @@ class Datasets:
raise ValueError(f"Unknown dataset: {dataset_name}") raise ValueError(f"Unknown dataset: {dataset_name}")
self.data_sets = [dataset_name] self.data_sets = [dataset_name]
def load_names(self): def _load_names(self):
file_name = os.path.join(self.dataset.folder(), Files.index) file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class" default_class = "class"
with open(file_name) as f: with open(file_name) as f:
@@ -117,6 +122,12 @@ class Datasets:
self.data_sets = result self.data_sets = result
self.class_names = class_names self.class_names = class_names
def get_features(self):
    """Return the ``features`` attribute of the wrapped dataset object."""
    wrapped = self.dataset
    return wrapped.features
def get_class_name(self):
    """Return the ``class_name`` attribute of the wrapped dataset object."""
    wrapped = self.dataset
    return wrapped.class_name
def load(self, name, dataframe=False): def load(self, name, dataframe=False):
try: try:
class_name = self.class_names[self.data_sets.index(name)] class_name = self.class_names[self.data_sets.index(name)]
@@ -126,3 +137,33 @@ class Datasets:
def __iter__(self) -> Diterator: def __iter__(self) -> Diterator:
return Diterator(self.data_sets) return Diterator(self.data_sets)
class Discretizer(Datasets):
    """Datasets variant whose ``load`` returns MDLP-discretized features."""

    def __init__(self, dataset_name=None):
        super().__init__(dataset_name)

    def load(self, name, dataframe=False):
        """Load dataset *name* and discretize its features.

        Parameters
        ----------
        name : str
            name of the dataset to load
        dataframe : bool
            if True return a single pandas.DataFrame, otherwise a tuple

        Returns
        -------
        pandas.DataFrame holding the discretized features plus the class
        column when ``dataframe`` is True, else tuple (X, y) of
        numpy.ndarray
        """
        # The parent is always asked for the array form because
        # discretize() works on (X, y), not on a DataFrame.
        X, y = super().load(name)
        X, y = self.discretize(X, y)
        if not dataframe:
            return X, y
        dataset = pd.DataFrame(X, columns=self.get_features())
        dataset[self.get_class_name()] = y
        # Return the frame alone. The previous
        # ``return dataset if dataframe else X, y`` parsed as
        # ``(dataset if dataframe else X), y`` and always produced a tuple.
        return dataset

    def discretize(self, X, y):
        """Supervised discretization with Fayyad and Irani's MDLP algorithm.

        Parameters
        ----------
        X : np.ndarray
            array (n_samples, n_features) of features
        y : np.ndarray
            array (n_samples,) of labels

        Returns
        -------
        tuple (X, y) of numpy.ndarray, both cast to int
        """
        discretiz = MDLP()
        Xdisc = discretiz.fit_transform(X, y)
        return Xdisc.astype(int), y.astype(int)

View File

@@ -1,4 +1,10 @@
from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff from .Datasets import (
Datasets,
DatasetsSurcov,
DatasetsTanveer,
DatasetsArff,
Discretizer,
)
from .Experiments import Experiment from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
@@ -7,4 +13,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es" __author_email__ = "ricardo.montanana@alu.uclm.es"
__all__ = ["Experiment", "Datasets", "Report", "Summary"] __all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"]

View File

@@ -2,6 +2,7 @@ pandas
scikit-learn scikit-learn
scipy scipy
odte odte
mdlp
mufs mufs
xlsxwriter xlsxwriter
openpyxl openpyxl