diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index 10a8935..0691350 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -3,7 +3,7 @@ import pandas as pd from scipy.io import arff from .Utils import Files from .Arguments import EnvData -from mdlp import MDLP +from mdlp.discretization import MDLP class Diterator: @@ -93,6 +93,11 @@ class Datasets: __import__(__name__), f"Datasets{envData['source_data']}", ) + self.load = ( + self.load_discretized + if envData["discretize"] == "1" + else self.load_continuous + ) self.dataset = class_name() self.class_names = [] self._load_names() @@ -128,28 +133,13 @@ class Datasets: def get_class_name(self): return self.dataset.class_name - def load(self, name, dataframe=False): + def load_continuous(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] return self.dataset.load(name, class_name, dataframe) except (ValueError, FileNotFoundError): raise ValueError(f"Unknown dataset: {name}") - def __iter__(self) -> Diterator: - return Diterator(self.data_sets) - - -class Discretizer(Datasets): - def __init__(self, dataset_name=None): - super().__init__(dataset_name) - - def load(self, name, dataframe=False): - X, y = super().load(name) - X, y = self.discretize(X, y) - dataset = pd.DataFrame(X, columns=self.get_features()) - dataset[self.get_class_name()] = y - return dataset if dataframe else X, y - def discretize(self, X, y): """Supervised discretization with Fayyad and Irani's MDLP algorithm. @@ -167,3 +157,13 @@ class Discretizer(Datasets): discretiz = MDLP() Xdisc = discretiz.fit_transform(X, y) return Xdisc.astype(int), y.astype(int) + + def load_discretized(self, name, dataframe=False): + X, y = self.load_continuous(name) + X, y = self.discretize(X, y) + dataset = pd.DataFrame(X, columns=self.get_features()) + dataset[self.get_class_name()] = y + return dataset if dataframe else X, y + + def __iter__(self) -> Diterator: + return Diterator(self.data_sets) diff --git a/benchmark/Models.py b/benchmark/Models.py index 03d31eb..a69a312 100644 --- a/benchmark/Models.py +++ b/benchmark/Models.py @@ -8,6 +8,7 @@ from sklearn.ensemble import ( ) from sklearn.svm import SVC from stree import Stree +from bayesclass import TAN from wodt import Wodt from odte import Odte from xgboost import XGBClassifier @@ -20,6 +21,7 @@ class Models: def define_models(random_state): return { "STree": Stree(random_state=random_state), + "TAN": TAN(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state), "Wodt": Wodt(random_state=random_state), diff --git a/benchmark/__init__.py b/benchmark/__init__.py index c8c4821..4d65a6b 100644 --- a/benchmark/__init__.py +++ b/benchmark/__init__.py @@ -3,7 +3,6 @@ from .Datasets import ( DatasetsSurcov, DatasetsTanveer, DatasetsArff, - Discretizer, ) from .Experiments import Experiment from .Results import Report, Summary @@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez" __license__ = "MIT License" __author_email__ = "ricardo.montanana@alu.uclm.es" -__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"] +__all__ = ["Experiment", "Datasets", "Report", "Summary"] diff --git a/benchmark/tests/.env.arff b/benchmark/tests/.env.arff index ab8956d..7f196d9 100644 --- a/benchmark/tests/.env.arff +++ b/benchmark/tests/.env.arff @@ -4,4 +4,5 @@ n_folds=5 model=ODTE stratified=0 source_data=Arff -seeds=[271, 314, 171] \ No newline at end of file +seeds=[271, 314, 171] +discretize=1 \ No newline at end of file diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist index 31a99ab..9641efa 100644 --- a/benchmark/tests/.env.dist +++ b/benchmark/tests/.env.dist @@ -6,3 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 diff --git a/benchmark/tests/.env.surcov b/benchmark/tests/.env.surcov index 63cc579..805ec7b 100644 --- a/benchmark/tests/.env.surcov +++ b/benchmark/tests/.env.surcov @@ -5,4 +5,5 @@ model=ODTE stratified=0 # Source of data Tanveer/Surcov source_data=Surcov -seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] \ No newline at end of file +seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +discretize=0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b9cf80f..37821a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ pandas scikit-learn scipy odte -mdlp +mdlp-discretization mufs xlsxwriter openpyxl