diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index cf06740..10a8935 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -3,6 +3,7 @@ import pandas as pd from scipy.io import arff from .Utils import Files from .Arguments import EnvData +from mdlp import MDLP class Diterator: @@ -28,15 +29,20 @@ class DatasetsArff: file_name = os.path.join(self.folder(), self.dataset_names(name)) data = arff.loadarff(file_name) df = pd.DataFrame(data[0]) - df = df.dropna() + df.dropna(axis=0, how="any", inplace=True) X = df.drop(class_name, axis=1) self.features = X.columns self.class_name = class_name y, _ = pd.factorize(df[class_name]) - return df if dataframe else (X.to_numpy(), y) + df[class_name] = y + X = X.to_numpy() + return df if dataframe else (X, y) class DatasetsTanveer: + def __init__(self, discretized): + self.discretized = discretized + @staticmethod def dataset_names(name): return f"{name}_R.dat" @@ -82,7 +88,6 @@ class DatasetsSurcov: class Datasets: def __init__(self, dataset_name=None): - envData = EnvData.load() class_name = getattr( __import__(__name__), @@ -90,7 +95,7 @@ class Datasets: ) self.dataset = class_name() self.class_names = [] - self.load_names() + self._load_names() if dataset_name is not None: try: class_name = self.class_names[ @@ -101,7 +106,7 @@ class Datasets: raise ValueError(f"Unknown dataset: {dataset_name}") self.data_sets = [dataset_name] - def load_names(self): + def _load_names(self): file_name = os.path.join(self.dataset.folder(), Files.index) default_class = "class" with open(file_name) as f: @@ -117,6 +122,12 @@ class Datasets: self.data_sets = result self.class_names = class_names + def get_features(self): + return self.dataset.features + + def get_class_name(self): + return self.dataset.class_name + def load(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] @@ -126,3 +137,33 @@ class Datasets: def __iter__(self) -> Diterator: return Diterator(self.data_sets) + + +class Discretizer(Datasets): + def __init__(self, dataset_name=None): + super().__init__(dataset_name) + + def load(self, name, dataframe=False): + X, y = super().load(name) + X, y = self.discretize(X, y) + dataset = pd.DataFrame(X, columns=self.get_features()) + dataset[self.get_class_name()] = y + return dataset if dataframe else X, y + + def discretize(self, X, y): + """Supervised discretization with Fayyad and Irani's MDLP algorithm. + + Parameters + ---------- + X : np.ndarray + array (n_samples, n_features) of features + y : np.ndarray + array (n_samples,) of labels + + Returns + ------- + tuple (X, y) of numpy.ndarray + """ + discretiz = MDLP() + Xdisc = discretiz.fit_transform(X, y) + return Xdisc.astype(int), y.astype(int) diff --git a/benchmark/__init__.py b/benchmark/__init__.py index e26a7cf..c8c4821 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -1,4 +1,10 @@ -from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer, DatasetsArff +from .Datasets import ( + Datasets, + DatasetsSurcov, + DatasetsTanveer, + DatasetsArff, + Discretizer, +) from .Experiments import Experiment from .Results import Report, Summary @@ -7,4 +13,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary"] +__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"] diff --git a/requirements.txt b/requirements.txt index 02446a6..b9cf80f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ pandas scikit-learn scipy odte +mdlp mufs xlsxwriter openpyxl