mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-15 07:25:53 +00:00
refactor Discretization in datasets
This commit is contained in:
@@ -3,7 +3,7 @@ import pandas as pd
|
||||
from scipy.io import arff
|
||||
from .Utils import Files
|
||||
from .Arguments import EnvData
|
||||
from mdlp import MDLP
|
||||
from mdlp.discretization import MDLP
|
||||
|
||||
|
||||
class Diterator:
|
||||
@@ -93,6 +93,11 @@ class Datasets:
|
||||
__import__(__name__),
|
||||
f"Datasets{envData['source_data']}",
|
||||
)
|
||||
self.load = (
|
||||
self.load_discretized
|
||||
if envData["discretize"] == "1"
|
||||
else self.load_continuous
|
||||
)
|
||||
self.dataset = class_name()
|
||||
self.class_names = []
|
||||
self._load_names()
|
||||
@@ -128,28 +133,13 @@ class Datasets:
|
||||
def get_class_name(self):
    """Return the ``class_name`` attribute of the wrapped dataset loader.

    The value is used elsewhere in this class as the name of the label
    column appended to a discretized DataFrame.
    """
    wrapped = self.dataset
    return wrapped.class_name
|
||||
|
||||
def load(self, name, dataframe=False):
|
||||
def load_continuous(self, name, dataframe=False):
    """Load dataset *name* through the configured loader, without discretizing.

    The class (label) column name is looked up in ``self.class_names`` at the
    position *name* occupies in ``self.data_sets`` and forwarded, together
    with *dataframe*, to ``self.dataset.load``.

    Args:
        name: dataset name; must be present in ``self.data_sets``.
        dataframe: forwarded unchanged to ``self.dataset.load``.

    Returns:
        Whatever ``self.dataset.load(name, class_name, dataframe)`` returns.

    Raises:
        ValueError: if *name* is not in ``self.data_sets`` or the loader
            raises ``ValueError``/``FileNotFoundError``. The original
            exception is kept as ``__cause__`` for easier debugging.
    """
    try:
        class_name = self.class_names[self.data_sets.index(name)]
        return self.dataset.load(name, class_name, dataframe)
    except (ValueError, FileNotFoundError) as err:
        # Chain explicitly so the root cause (bad name vs. missing file)
        # is not mistaken for an error raised while handling another one.
        raise ValueError(f"Unknown dataset: {name}") from err
|
||||
|
||||
def __iter__(self) -> Diterator:
|
||||
return Diterator(self.data_sets)
|
||||
|
||||
|
||||
class Discretizer(Datasets):
|
||||
def __init__(self, dataset_name=None):
|
||||
super().__init__(dataset_name)
|
||||
|
||||
def load(self, name, dataframe=False):
|
||||
X, y = super().load(name)
|
||||
X, y = self.discretize(X, y)
|
||||
dataset = pd.DataFrame(X, columns=self.get_features())
|
||||
dataset[self.get_class_name()] = y
|
||||
return dataset if dataframe else X, y
|
||||
|
||||
def discretize(self, X, y):
|
||||
"""Supervised discretization with Fayyad and Irani's MDLP algorithm.
|
||||
|
||||
@@ -167,3 +157,13 @@ class Discretizer(Datasets):
|
||||
discretiz = MDLP()
|
||||
Xdisc = discretiz.fit_transform(X, y)
|
||||
return Xdisc.astype(int), y.astype(int)
|
||||
|
||||
def load_discretized(self, name, dataframe=False):
    """Load dataset *name* and discretize its features.

    The data is loaded with ``load_continuous`` (always in its default,
    non-DataFrame mode) and passed through ``self.discretize``.

    Args:
        name: dataset name, as accepted by ``load_continuous``.
        dataframe: when true, return a single ``pandas.DataFrame`` whose
            columns are ``self.get_features()`` followed by the class
            column named ``self.get_class_name()``.

    Returns:
        The DataFrame described above when *dataframe* is true, otherwise
        the ``(X, y)`` pair produced by ``self.discretize``.
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    # The previous one-liner `return dataset if dataframe else X, y` parsed
    # as `((dataset if dataframe else X), y)` and leaked a tuple in the
    # DataFrame case; branching explicitly avoids that precedence trap.
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    return dataset
|
||||
|
||||
def __iter__(self) -> Diterator:
    """Return a new ``Diterator`` built from ``self.data_sets``."""
    names = self.data_sets
    return Diterator(names)
|
||||
|
@@ -8,6 +8,7 @@ from sklearn.ensemble import (
|
||||
)
|
||||
from sklearn.svm import SVC
|
||||
from stree import Stree
|
||||
from bayesclass import TAN
|
||||
from wodt import Wodt
|
||||
from odte import Odte
|
||||
from xgboost import XGBClassifier
|
||||
@@ -20,6 +21,7 @@ class Models:
|
||||
def define_models(random_state):
|
||||
return {
|
||||
"STree": Stree(random_state=random_state),
|
||||
"TAN": TAN(random_state=random_state),
|
||||
"Cart": DecisionTreeClassifier(random_state=random_state),
|
||||
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
|
||||
"Wodt": Wodt(random_state=random_state),
|
||||
|
@@ -3,7 +3,6 @@ from .Datasets import (
|
||||
DatasetsSurcov,
|
||||
DatasetsTanveer,
|
||||
DatasetsArff,
|
||||
Discretizer,
|
||||
)
|
||||
from .Experiments import Experiment
|
||||
from .Results import Report, Summary
|
||||
@@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
|
||||
__license__ = "MIT License"
|
||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||
|
||||
__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"]
|
||||
__all__ = ["Experiment", "Datasets", "Report", "Summary"]
|
||||
|
@@ -4,4 +4,5 @@ n_folds=5
|
||||
model=ODTE
|
||||
stratified=0
|
||||
source_data=Arff
|
||||
seeds=[271, 314, 171]
|
||||
seeds=[271, 314, 171]
|
||||
discretize=1
|
@@ -6,3 +6,4 @@ stratified=0
|
||||
# Source of data Tanveer/Surcov
|
||||
source_data=Tanveer
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
|
@@ -5,4 +5,5 @@ model=ODTE
|
||||
stratified=0
|
||||
# Source of data Tanveer/Surcov
|
||||
source_data=Surcov
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
@@ -2,7 +2,7 @@ pandas
|
||||
scikit-learn
|
||||
scipy
|
||||
odte
|
||||
mdlp
|
||||
mdlp-discretization
|
||||
mufs
|
||||
xlsxwriter
|
||||
openpyxl
|
||||
|
Reference in New Issue
Block a user