refactor Discretization in datasets

This commit is contained in:
2022-11-12 19:37:46 +01:00
parent 4b442a46f2
commit 2d61cd11c2
7 changed files with 26 additions and 22 deletions

View File

@@ -3,7 +3,7 @@ import pandas as pd
from scipy.io import arff
from .Utils import Files
from .Arguments import EnvData
from mdlp import MDLP
from mdlp.discretization import MDLP
class Diterator:
@@ -93,6 +93,11 @@ class Datasets:
__import__(__name__),
f"Datasets{envData['source_data']}",
)
self.load = (
self.load_discretized
if envData["discretize"] == "1"
else self.load_continuous
)
self.dataset = class_name()
self.class_names = []
self._load_names()
@@ -128,28 +133,13 @@ class Datasets:
def get_class_name(self):
return self.dataset.class_name
def load(self, name, dataframe=False):
def load_continuous(self, name, dataframe=False):
    """Load dataset ``name`` without discretizing it.

    The class column is looked up in ``self.class_names`` at the same
    position that ``name`` occupies in ``self.data_sets``; the actual
    loading is delegated to ``self.dataset.load``.

    :param name: name of the dataset to load
    :param dataframe: forwarded unchanged to ``self.dataset.load``
    :return: whatever ``self.dataset.load`` returns for these arguments
    :raises ValueError: if ``name`` is not in ``self.data_sets`` or the
        loader raises ``ValueError``/``FileNotFoundError``
    """
    try:
        class_name = self.class_names[self.data_sets.index(name)]
        return self.dataset.load(name, class_name, dataframe)
    except (ValueError, FileNotFoundError) as err:
        # Chain explicitly so the traceback shows the original failure
        # as the direct cause of the "unknown dataset" error.
        raise ValueError(f"Unknown dataset: {name}") from err
def __iter__(self) -> Diterator:
return Diterator(self.data_sets)
class Discretizer(Datasets):
def __init__(self, dataset_name=None):
super().__init__(dataset_name)
def load(self, name, dataframe=False):
X, y = super().load(name)
X, y = self.discretize(X, y)
dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y
return dataset if dataframe else X, y
def discretize(self, X, y):
"""Supervised discretization with Fayyad and Irani's MDLP algorithm.
@@ -167,3 +157,13 @@ class Discretizer(Datasets):
discretiz = MDLP()
Xdisc = discretiz.fit_transform(X, y)
return Xdisc.astype(int), y.astype(int)
def load_discretized(self, name, dataframe=False):
    """Load dataset ``name`` and return it after discretization.

    The data is read with ``load_continuous`` and then passed through
    ``discretize``.

    :param name: name of the dataset to load
    :param dataframe: when true, return a single ``pandas.DataFrame``
        whose columns are ``self.get_features()`` followed by the class
        column ``self.get_class_name()``; otherwise return ``(X, y)``
    :return: the DataFrame, or the ``(X, y)`` tuple of discretized
        features and labels
    :raises ValueError: propagated from ``load_continuous`` when the
        dataset cannot be loaded
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    # The original ``return dataset if dataframe else X, y`` parsed as
    # ``((dataset if dataframe else X), y)`` and therefore returned a
    # ``(DataFrame, y)`` tuple; return the DataFrame on its own instead.
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    return dataset
def __iter__(self) -> Diterator:
    """Return a ``Diterator`` built over ``self.data_sets``."""
    names = self.data_sets
    return Diterator(names)

View File

@@ -8,6 +8,7 @@ from sklearn.ensemble import (
)
from sklearn.svm import SVC
from stree import Stree
from bayesclass import TAN
from wodt import Wodt
from odte import Odte
from xgboost import XGBClassifier
@@ -20,6 +21,7 @@ class Models:
def define_models(random_state):
return {
"STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state),

View File

@@ -3,7 +3,6 @@ from .Datasets import (
DatasetsSurcov,
DatasetsTanveer,
DatasetsArff,
Discretizer,
)
from .Experiments import Experiment
from .Results import Report, Summary
@@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
__license__ = "MIT License"
__author_email__ = "ricardo.montanana@alu.uclm.es"
__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"]
__all__ = ["Experiment", "Datasets", "Report", "Summary"]

View File

@@ -4,4 +4,5 @@ n_folds=5
model=ODTE
stratified=0
source_data=Arff
seeds=[271, 314, 171]
seeds=[271, 314, 171]
discretize=1

View File

@@ -6,3 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov
source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -5,4 +5,5 @@ model=ODTE
stratified=0
# Source of data Tanveer/Surcov
source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0

View File

@@ -2,7 +2,7 @@ pandas
scikit-learn
scipy
odte
mdlp
mdlp-discretization
mufs
xlsxwriter
openpyxl