Mirror of https://github.com/Doctorado-ML/benchmark.git, synced 2025-08-15 15:35:52 +00:00
Split Datasets class from Experiments
@@ -1,6 +1,6 @@
 import sys
 import argparse
-from .Experiments import Models
+from .Models import Models
 from .Utils import Files, NO_ENV
 
 ALL_METRICS = (
103  benchmark/Datasets.py  Normal file
@@ -0,0 +1,103 @@
import os
import pandas as pd
from scipy.io import arff
from .Utils import Files
from .Arguments import EnvData


class Diterator:
    def __init__(self, data):
        self._stack = data.copy()

    def __next__(self):
        if len(self._stack) == 0:
            raise StopIteration()
        return self._stack.pop(0)


class DatasetsArff:
    @staticmethod
    def dataset_names(name):
        return f"{name}.arff"

    @staticmethod
    def folder():
        return "datasets"

    def load(self, name, class_name="class"):
        file_name = os.path.join(self.folder(), self.dataset_names(name))
        data = arff.loadarff(file_name)
        df = pd.DataFrame(data[0])
        X = df.drop(class_name, axis=1).to_numpy()
        y = df[class_name].to_numpy()
        return X, y


class DatasetsTanveer:
    @staticmethod
    def dataset_names(name):
        return f"{name}_R.dat"

    @staticmethod
    def folder():
        return "data"

    def load(self, name):
        file_name = os.path.join(self.folder(), self.dataset_names(name))
        data = pd.read_csv(
            file_name,
            sep="\t",
            index_col=0,
        )
        X = data.drop("clase", axis=1).to_numpy()
        y = data["clase"].to_numpy()
        return X, y


class DatasetsSurcov:
    @staticmethod
    def dataset_names(name):
        return f"{name}.csv"

    @staticmethod
    def folder():
        return "datasets"

    def load(self, name):
        file_name = os.path.join(self.folder(), self.dataset_names(name))
        data = pd.read_csv(
            file_name,
            index_col=0,
        )
        data.dropna(axis=0, how="any", inplace=True)
        self.columns = data.columns
        col_list = ["class"]
        X = data.drop(col_list, axis=1).to_numpy()
        y = data["class"].to_numpy()
        return X, y


class Datasets:
    def __init__(self, dataset_name=None):
        envData = EnvData.load()
        class_name = getattr(
            __import__(__name__),
            f"Datasets{envData['source_data']}",
        )
        self.dataset = class_name()
        if dataset_name is None:
            file_name = os.path.join(self.dataset.folder(), Files.index)
            with open(file_name) as f:
                self.data_sets = f.read().splitlines()
        else:
            self.data_sets = [dataset_name]

    def load(self, name):
        try:
            return self.dataset.load(name)
        except FileNotFoundError:
            raise ValueError(f"Unknown dataset: {name}")

    def __iter__(self) -> Diterator:
        return Diterator(self.data_sets)
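A minimal usage sketch of the new Datasets facade (illustrative, not part of the commit): it assumes a valid .env whose source_data entry names one of the loaders above, and an index file in that loader's folder listing the dataset names.

# Illustrative only; assumes the package is importable as `benchmark`
# and that the configured index file exists.
from benchmark.Datasets import Datasets

datasets = Datasets()            # picks the Datasets* loader from source_data
for name in datasets:            # __iter__ returns a Diterator over the names
    X, y = datasets.load(name)   # delegates to the selected loader's load()
    print(name, X.shape, y.shape)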
@@ -6,7 +6,6 @@ import time
 from datetime import datetime
 from tqdm import tqdm
 import numpy as np
-import pandas as pd
 from sklearn.model_selection import (
     StratifiedKFold,
     KFold,
@@ -14,93 +13,14 @@ from sklearn.model_selection import (
     cross_validate,
 )
 from .Utils import Folders, Files, NO_RESULTS
+from .Datasets import Datasets
 from .Models import Models
 from .Arguments import EnvData
 
 
 class Randomized:
     seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 
 
-class Diterator:
-    def __init__(self, data):
-        self._stack = data.copy()
-
-    def __next__(self):
-        if len(self._stack) == 0:
-            raise StopIteration()
-        return self._stack.pop(0)
-
-
-class DatasetsTanveer:
-    @staticmethod
-    def dataset_names(name):
-        return f"{name}_R.dat"
-
-    @staticmethod
-    def folder():
-        return "data"
-
-    def load(self, name):
-        file_name = os.path.join(self.folder(), self.dataset_names(name))
-        data = pd.read_csv(
-            file_name,
-            sep="\t",
-            index_col=0,
-        )
-        X = data.drop("clase", axis=1).to_numpy()
-        y = data["clase"].to_numpy()
-        return X, y
-
-
-class DatasetsSurcov:
-    @staticmethod
-    def dataset_names(name):
-        return f"{name}.csv"
-
-    @staticmethod
-    def folder():
-        return "datasets"
-
-    def load(self, name):
-        file_name = os.path.join(self.folder(), self.dataset_names(name))
-        data = pd.read_csv(
-            file_name,
-            index_col=0,
-        )
-        data.dropna(axis=0, how="any", inplace=True)
-        self.columns = data.columns
-        col_list = ["class"]
-        X = data.drop(col_list, axis=1).to_numpy()
-        y = data["class"].to_numpy()
-        return X, y
-
-
-class Datasets:
-    def __init__(self, dataset_name=None):
-        envData = EnvData.load()
-        class_name = getattr(
-            __import__(__name__),
-            f"Datasets{envData['source_data']}",
-        )
-        self.dataset = class_name()
-        if dataset_name is None:
-            file_name = os.path.join(self.dataset.folder(), Files.index)
-            with open(file_name) as f:
-                self.data_sets = f.read().splitlines()
-        else:
-            self.data_sets = [dataset_name]
-
-    def load(self, name):
-        try:
-            return self.dataset.load(name)
-        except FileNotFoundError:
-            raise ValueError(f"Unknown dataset: {name}")
-
-    def __iter__(self) -> Diterator:
-        return Diterator(self.data_sets)
 
 
 class BestResults:
     def __init__(self, score, model, datasets, quiet=False):
         self.score_name = score
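A brief sketch of how caller code on the Experiments side can keep its cross-validation loop unchanged after the split: only the import moves, while X and y still come from Datasets.load(). The estimator and scoring below are placeholders, not the repository's Experiment class.

# Illustrative placeholder, not code from this commit.
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.tree import DecisionTreeClassifier

from benchmark.Datasets import Datasets

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=57)  # 57 = first Randomized seed
datasets = Datasets()
for name in datasets:
    X, y = datasets.load(name)
    result = cross_validate(DecisionTreeClassifier(), X, y, cv=skf, scoring="accuracy")
    print(name, result["test_score"].mean())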
@@ -7,7 +7,8 @@ import shutil
 import subprocess
 import xlsxwriter
 import numpy as np
-from .Experiments import Datasets, BestResults
+from .Experiments import BestResults
+from .Datasets import Datasets
 from .Utils import (
     Folders,
     Files,
@@ -1,4 +1,5 @@
-from .Experiments import Experiment, Datasets, DatasetsSurcov, DatasetsTanveer
+from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer
+from .Experiments import Experiment
 from .Results import Report, Summary
 
 __author__ = "Ricardo Montañana Gómez"
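Because __init__.py now re-exports the moved classes, both import paths should resolve to the same objects after this change; a tiny illustrative check (not from the commit):

# Illustrative check of the re-export; not part of the commit.
from benchmark import Datasets, DatasetsSurcov, DatasetsTanveer
from benchmark.Datasets import Datasets as DatasetsFromModule

assert Datasets is DatasetsFromModule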
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from benchmark.Results import ReportBest
-from benchmark.Experiments import Datasets, BestResults
+from benchmark.Experiments import BestResults
+from benchmark.Datasets import Datasets
 from benchmark.Arguments import Arguments
 
 """Build a json file with the best results of a model and its hyperparameters
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
-from benchmark.Experiments import GridSearch, Datasets
+from benchmark.Experiments import GridSearch
+from benchmark.Datasets import Datasets
 from benchmark.Arguments import Arguments
 
 """Do experiment and build result file, optionally print report with results
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Experiments import Experiment, Datasets
+from benchmark.Experiments import Experiment
+from benchmark.Datasets import Datasets
 from benchmark.Results import Report
 from benchmark.Arguments import Arguments
@@ -3,7 +3,7 @@ import os
 import json
 from stree import Stree
 from graphviz import Source
-from benchmark.Experiments import Datasets
+from benchmark.Datasets import Datasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
@@ -1,6 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Experiments import BestResults, Datasets
+from ..Experiments import BestResults
+from ..Datasets import Datasets
 
 
 class BestResultTest(TestBase):
@@ -1,6 +1,7 @@
 import shutil
 from .TestBase import TestBase
-from ..Experiments import Randomized, Datasets
+from ..Experiments import Randomized
+from ..Datasets import Datasets
 
 
 class DatasetTest(TestBase):
@@ -1,6 +1,7 @@
 import json
 from .TestBase import TestBase
-from ..Experiments import Experiment, Datasets
+from ..Experiments import Experiment
+from ..Datasets import Datasets
 
 
 class ExperimentTest(TestBase):
@@ -1,6 +1,7 @@
 import json
 from .TestBase import TestBase
-from ..Experiments import GridSearch, Datasets
+from ..Experiments import GridSearch
+from ..Datasets import Datasets
 
 
 class GridSearchTest(TestBase):
@@ -6,7 +6,7 @@
         "kernel": "liblinear",
         "multiclass_strategy": "ovr"
       },
-      "v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
+      "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
     ],
     "balloons": [
       0.625,
@@ -15,6 +15,6 @@
         "kernel": "linear",
         "multiclass_strategy": "ovr"
       },
-      "v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
+      "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
     ]
 }
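Each entry in this fixture maps a dataset name to a list of score, hyperparameters, and an annotation string with the library version. A short illustrative sketch of reading one entry; the file name and top-level layout are assumptions, not taken from the commit:

import json

# Hypothetical path; the real fixture lives with the repository's test data.
with open("exp_best.json") as f:
    best = json.load(f)

score, hyperparams, note = best["balloons"]  # e.g. 0.625, {"kernel": "linear", ...}, "v. 1.3.0, ..."
print(score, hyperparams["kernel"], note)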
@@ -1,5 +1,6 @@
 pandas
 scikit-learn
+scipy
 odte
 mufs
 xlsxwriter