mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-15 15:35:52 +00:00
refactor Discretization in datasets
This commit is contained in:
@@ -3,7 +3,7 @@ import pandas as pd
|
|||||||
from scipy.io import arff
|
from scipy.io import arff
|
||||||
from .Utils import Files
|
from .Utils import Files
|
||||||
from .Arguments import EnvData
|
from .Arguments import EnvData
|
||||||
from mdlp import MDLP
|
from mdlp.discretization import MDLP
|
||||||
|
|
||||||
|
|
||||||
class Diterator:
|
class Diterator:
|
||||||
@@ -93,6 +93,11 @@ class Datasets:
|
|||||||
__import__(__name__),
|
__import__(__name__),
|
||||||
f"Datasets{envData['source_data']}",
|
f"Datasets{envData['source_data']}",
|
||||||
)
|
)
|
||||||
|
self.load = (
|
||||||
|
self.load_discretized
|
||||||
|
if envData["discretize"] == "1"
|
||||||
|
else self.load_continuous
|
||||||
|
)
|
||||||
self.dataset = class_name()
|
self.dataset = class_name()
|
||||||
self.class_names = []
|
self.class_names = []
|
||||||
self._load_names()
|
self._load_names()
|
||||||
@@ -128,28 +133,13 @@ class Datasets:
|
|||||||
def get_class_name(self):
|
def get_class_name(self):
|
||||||
return self.dataset.class_name
|
return self.dataset.class_name
|
||||||
|
|
||||||
def load(self, name, dataframe=False):
|
def load_continuous(self, name, dataframe=False):
|
||||||
try:
|
try:
|
||||||
class_name = self.class_names[self.data_sets.index(name)]
|
class_name = self.class_names[self.data_sets.index(name)]
|
||||||
return self.dataset.load(name, class_name, dataframe)
|
return self.dataset.load(name, class_name, dataframe)
|
||||||
except (ValueError, FileNotFoundError):
|
except (ValueError, FileNotFoundError):
|
||||||
raise ValueError(f"Unknown dataset: {name}")
|
raise ValueError(f"Unknown dataset: {name}")
|
||||||
|
|
||||||
def __iter__(self) -> Diterator:
    """Return a new ``Diterator`` built from ``self.data_sets``."""
    names = self.data_sets
    return Diterator(names)
|
|
||||||
|
|
||||||
|
|
||||||
class Discretizer(Datasets):
|
|
||||||
def __init__(self, dataset_name=None):
|
|
||||||
super().__init__(dataset_name)
|
|
||||||
|
|
||||||
def load(self, name, dataframe=False):
    """Load dataset *name* through the parent class and discretize it.

    Parameters
    ----------
    name
        Name of the dataset, forwarded unchanged to the parent ``load``.
    dataframe : bool, default False
        When true, return one ``pandas.DataFrame`` holding the discretized
        features plus the class column; otherwise return ``(X, y)``.

    Returns
    -------
    pandas.DataFrame or tuple
        The DataFrame when ``dataframe`` is true, else the ``(X, y)`` pair
        produced by ``discretize``.
    """
    X, y = super().load(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    # The previous ``return dataset if dataframe else X, y`` was parsed as
    # ``(dataset if dataframe else X), y`` and therefore always returned a
    # tuple, even when a DataFrame was requested.
    return dataset
|
|
||||||
|
|
||||||
def discretize(self, X, y):
|
def discretize(self, X, y):
|
||||||
"""Supervised discretization with Fayyad and Irani's MDLP algorithm.
|
"""Supervised discretization with Fayyad and Irani's MDLP algorithm.
|
||||||
|
|
||||||
@@ -167,3 +157,13 @@ class Discretizer(Datasets):
|
|||||||
discretiz = MDLP()
|
discretiz = MDLP()
|
||||||
Xdisc = discretiz.fit_transform(X, y)
|
Xdisc = discretiz.fit_transform(X, y)
|
||||||
return Xdisc.astype(int), y.astype(int)
|
return Xdisc.astype(int), y.astype(int)
|
||||||
|
|
||||||
|
def load_discretized(self, name, dataframe=False):
    """Load dataset *name* and return it with discretized features.

    The data is read with ``load_continuous`` and then passed through
    ``discretize``.

    Parameters
    ----------
    name
        Name of the dataset, forwarded unchanged to ``load_continuous``.
    dataframe : bool, default False
        When true, return one ``pandas.DataFrame`` holding the discretized
        features plus the class column; otherwise return ``(X, y)``.

    Returns
    -------
    pandas.DataFrame or tuple
        The DataFrame when ``dataframe`` is true, else the ``(X, y)`` pair
        produced by ``discretize``.

    Raises
    ------
    ValueError
        Propagated from ``load_continuous`` when the dataset is unknown.
    """
    X, y = self.load_continuous(name)
    X, y = self.discretize(X, y)
    if not dataframe:
        return X, y
    # Only build the frame when the caller actually asked for it.
    dataset = pd.DataFrame(X, columns=self.get_features())
    dataset[self.get_class_name()] = y
    # The previous ``return dataset if dataframe else X, y`` was parsed as
    # ``(dataset if dataframe else X), y`` and therefore always returned a
    # tuple, even when a DataFrame was requested.
    return dataset
|
||||||
|
|
||||||
|
def __iter__(self) -> Diterator:
    """Return a new ``Diterator`` built from ``self.data_sets``."""
    names = self.data_sets
    return Diterator(names)
|
||||||
|
@@ -8,6 +8,7 @@ from sklearn.ensemble import (
|
|||||||
)
|
)
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from stree import Stree
|
from stree import Stree
|
||||||
|
from bayesclass import TAN
|
||||||
from wodt import Wodt
|
from wodt import Wodt
|
||||||
from odte import Odte
|
from odte import Odte
|
||||||
from xgboost import XGBClassifier
|
from xgboost import XGBClassifier
|
||||||
@@ -20,6 +21,7 @@ class Models:
|
|||||||
def define_models(random_state):
|
def define_models(random_state):
|
||||||
return {
|
return {
|
||||||
"STree": Stree(random_state=random_state),
|
"STree": Stree(random_state=random_state),
|
||||||
|
"TAN": TAN(random_state=random_state),
|
||||||
"Cart": DecisionTreeClassifier(random_state=random_state),
|
"Cart": DecisionTreeClassifier(random_state=random_state),
|
||||||
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
|
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
|
||||||
"Wodt": Wodt(random_state=random_state),
|
"Wodt": Wodt(random_state=random_state),
|
||||||
|
@@ -3,7 +3,6 @@ from .Datasets import (
|
|||||||
DatasetsSurcov,
|
DatasetsSurcov,
|
||||||
DatasetsTanveer,
|
DatasetsTanveer,
|
||||||
DatasetsArff,
|
DatasetsArff,
|
||||||
Discretizer,
|
|
||||||
)
|
)
|
||||||
from .Experiments import Experiment
|
from .Experiments import Experiment
|
||||||
from .Results import Report, Summary
|
from .Results import Report, Summary
|
||||||
@@ -13,4 +12,4 @@ __copyright__ = "Copyright 2020-2022, Ricardo Montañana Gómez"
|
|||||||
__license__ = "MIT License"
|
__license__ = "MIT License"
|
||||||
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
__author_email__ = "ricardo.montanana@alu.uclm.es"
|
||||||
|
|
||||||
__all__ = ["Experiment", "Datasets", "Report", "Summary", "Discretizer"]
|
__all__ = ["Experiment", "Datasets", "Report", "Summary"]
|
||||||
|
@@ -4,4 +4,5 @@ n_folds=5
|
|||||||
model=ODTE
|
model=ODTE
|
||||||
stratified=0
|
stratified=0
|
||||||
source_data=Arff
|
source_data=Arff
|
||||||
seeds=[271, 314, 171]
|
seeds=[271, 314, 171]
|
||||||
|
discretize=1
|
@@ -6,3 +6,4 @@ stratified=0
|
|||||||
# Source of data Tanveer/Surcov
|
# Source of data Tanveer/Surcov
|
||||||
source_data=Tanveer
|
source_data=Tanveer
|
||||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||||
|
discretize=0
|
||||||
|
@@ -5,4 +5,5 @@ model=ODTE
|
|||||||
stratified=0
|
stratified=0
|
||||||
# Source of data Tanveer/Surcov
|
# Source of data Tanveer/Surcov
|
||||||
source_data=Surcov
|
source_data=Surcov
|
||||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||||
|
discretize=0
|
@@ -2,7 +2,7 @@ pandas
|
|||||||
scikit-learn
|
scikit-learn
|
||||||
scipy
|
scipy
|
||||||
odte
|
odte
|
||||||
mdlp
|
mdlp-discretization
|
||||||
mufs
|
mufs
|
||||||
xlsxwriter
|
xlsxwriter
|
||||||
openpyxl
|
openpyxl
|
||||||
|
Reference in New Issue
Block a user