refactor Discretization in datasets

This commit is contained in:
2022-11-12 19:37:46 +01:00
parent 4b442a46f2
commit 2d61cd11c2
7 changed files with 26 additions and 22 deletions

View File

@@ -3,7 +3,7 @@ import pandas as pd
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
from mdlp import MDLP from mdlp.discretization import MDLP
class Diterator: class Diterator:
@@ -93,6 +93,11 @@ class Datasets:
__import__(__name__), __import__(__name__),
f"Datasets{envData['source_data']}", f"Datasets{envData['source_data']}",
) )
self.load = (
self.load_discretized
if envData["discretize"] == "1"
else self.load_continuous
)
self.dataset = class_name() self.dataset = class_name()
self.class_names = [] self.class_names = []
self._load_names() self._load_names()
@@ -128,28 +133,13 @@ class Datasets:
def get_class_name(self): def get_class_name(self):
return self.dataset.class_name return self.dataset.class_name
def load(self, name, dataframe=False): def load_continuous(self, name, dataframe=False):
try: try:
class_name = self.class_names[self.data_sets.index(name)] class_name = self.class_names[self.data_sets.index(name)]
return self.dataset.load(name, class_name, dataframe) return self.dataset.load(name, class_name, dataframe)
except (ValueError, FileNotFoundError): except (ValueError, FileNotFoundError):
raise ValueError(f"Unknown dataset: {name}") raise ValueError(f"Unknown dataset: {name}")
def __iter__(self) -> Diterator:
return Diterator(self.data_sets)
class Discretizer(Datasets):
def __init__(self, dataset_name=None):
super().__init__(dataset_name)
def load(self, name, dataframe=False):
X, y = super().load(name)
X, y = self.discretize(X, y)
dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y
return dataset if dataframe else X, y
def discretize(self, X, y): def discretize(self, X, y):
"""Supervised discretization with Fayyad and Irani's MDLP algorithm. """Supervised discretization with Fayyad and Irani's MDLP algorithm.
@@ -167,3 +157,13 @@ class Discretizer(Datasets):
discretiz = MDLP() discretiz = MDLP()
Xdisc = discretiz.fit_transform(X, y) Xdisc = discretiz.fit_transform(X, y)
return Xdisc.astype(int), y.astype(int) return Xdisc.astype(int), y.astype(int)
def load_discretized(self, name, dataframe=False):
    """Load dataset ``name`` and return it with discretized features.

    The data is read with ``self.load_continuous(name)`` and then passed
    through ``self.discretize``.

    Args:
        name: dataset name, forwarded to ``self.load_continuous``.
        dataframe: when true, return one ``pandas.DataFrame`` whose
            columns are ``self.get_features()`` plus a final column
            named ``self.get_class_name()`` holding the labels.
            Otherwise return the ``(X, y)`` pair.

    Returns:
        ``(X, y)`` as produced by ``self.discretize``, or the DataFrame
        described above when ``dataframe`` is true.
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    # The previous one-liner `return dataset if dataframe else X, y` was
    # parsed as `(dataset if dataframe else X), y`, so asking for a
    # dataframe produced a (DataFrame, y) tuple instead of the DataFrame.
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    return dataset
def __iter__(self) -> Diterator:
    """Return a new ``Diterator`` built from ``self.data_sets``."""
    names = self.data_sets
    return Diterator(names)

View File

@@ -8,6 +8,7 @@ from sklearn.ensemble import (
) )
from sklearn.svm import SVC from sklearn.svm import SVC
from stree import Stree from stree import Stree
from bayesclass import TAN
from wodt import Wodt from wodt import Wodt
from odte import Odte from odte import Odte
from xgboost import XGBClassifier from xgboost import XGBClassifier
@@ -20,6 +21,7 @@ class Models:
def define_models(random_state): def define_models(random_state):
return { return {
"STree": Stree(random_state=random_state), "STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state), "Wodt": Wodt(random_state=random_state),

View File

@@ -3,7 +3,6 @@ from .Datasets import (
DatasetsSurcov, DatasetsSurcov,
DatasetsTanveer, DatasetsTanveer,
DatasetsArff, DatasetsArff,
Discretizer,
) )
from .Experiments import Experiment from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
@@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
__license__ = "MIT License" __license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es" __author_email__ = "ricardo.montanana@alu.uclm.es"
__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"] __all__ = ["Experiment", "Datasets", "Report", "Summary"]

View File

@@ -4,4 +4,5 @@ n_folds=5
model=ODTE model=ODTE
stratified=0 stratified=0
source_data=Arff source_data=Arff
seeds=[271, 314, 171] seeds=[271, 314, 171]
discretize=1

View File

@@ -6,3 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -5,4 +5,5 @@ model=ODTE
stratified=0 stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Surcov source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -2,7 +2,7 @@ pandas
scikit-learn scikit-learn
scipy scipy
odte odte
mdlp mdlp-discretization
mufs mufs
xlsxwriter xlsxwriter
openpyxl openpyxl