Split Datasets class from Experiments

This commit is contained in:
2022-10-24 18:21:08 +02:00
parent 12024df4d8
commit e15ab3dcab
15 changed files with 127 additions and 94 deletions

View File

@@ -1,6 +1,6 @@
import sys import sys
import argparse import argparse
from .Experiments import Models from .Models import Models
from .Utils import Files, NO_ENV from .Utils import Files, NO_ENV
ALL_METRICS = ( ALL_METRICS = (

103
benchmark/Datasets.py Normal file
View File

@@ -0,0 +1,103 @@
import os
import pandas as pd
from scipy.io import arff
from .Utils import Files
from .Arguments import EnvData
class Diterator:
    """Single-pass iterator that hands out the items of a list in order.

    The input is copied on construction (via its ``copy()`` method), so
    consuming the iterator never removes items from the caller's object.
    """

    def __init__(self, data):
        """Keep a private copy of *data* to consume from."""
        self._stack = data.copy()

    def __iter__(self):
        """Return self, as required by the iterator protocol.

        Without this method the object works as the result of another
        object's ``__iter__`` in a ``for`` loop, but ``iter(obj)``,
        ``list(obj)`` and similar calls raise ``TypeError``.
        """
        return self

    def __next__(self):
        """Return the next pending item.

        Raises:
            StopIteration: when every item has already been handed out.
        """
        if len(self._stack) == 0:
            raise StopIteration()
        # Items are served from the front so the original order is kept.
        return self._stack.pop(0)
class DatasetsArff:
    """Reader for ``<name>.arff`` files stored under ``datasets``."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for dataset *name*."""
        return f"{name}.arff"

    @staticmethod
    def folder():
        """Return the directory the dataset files are read from."""
        return "datasets"

    def load(self, name, class_name="class"):
        """Read dataset *name* and return ``(X, y)`` as NumPy arrays.

        Args:
            name: dataset name, without folder or extension.
            class_name: name of the attribute holding the target labels.

        Returns:
            A tuple ``(X, y)`` where ``y`` is the *class_name* column and
            ``X`` holds every remaining column.
        """
        file_name = os.path.join(self.folder(), self.dataset_names(name))
        # loadarff returns a (records, metadata) tuple; only the records
        # are needed here.
        data = arff.loadarff(file_name)
        df = pd.DataFrame(data[0])
        # Split on the DataFrame, not on the raw tuple returned by
        # loadarff, which has neither ``drop`` nor column indexing.
        X = df.drop(class_name, axis=1).to_numpy()
        y = df[class_name].to_numpy()
        return X, y
class DatasetsTanveer:
    """Reader for tab-separated ``<name>_R.dat`` files stored under ``data``."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for dataset *name*."""
        return f"{name}_R.dat"

    @staticmethod
    def folder():
        """Return the directory the dataset files are read from."""
        return "data"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as NumPy arrays.

        The first column of the file is used as the row index. ``y`` is the
        ``clase`` column and ``X`` holds every remaining column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        frame = pd.read_csv(path, sep="\t", index_col=0)
        features = frame.drop(columns="clase").to_numpy()
        target = frame["clase"].to_numpy()
        return features, target
class DatasetsSurcov:
    """Reader for comma-separated ``<name>.csv`` files under ``datasets``."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for dataset *name*."""
        return f"{name}.csv"

    @staticmethod
    def folder():
        """Return the directory the dataset files are read from."""
        return "datasets"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as NumPy arrays.

        Rows containing any missing value are discarded. The column labels
        of the cleaned table are kept in ``self.columns``. ``y`` is the
        ``class`` column and ``X`` holds every remaining column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        frame = pd.read_csv(path, index_col=0)
        frame = frame.dropna(axis=0, how="any")
        self.columns = frame.columns
        target_columns = ["class"]
        features = frame.drop(columns=target_columns).to_numpy()
        labels = frame["class"].to_numpy()
        return features, labels
class Datasets:
    """Front end over the dataset reader selected by the environment data.

    The reader class is ``Datasets<source_data>``, where ``source_data`` is
    read from ``EnvData.load()``. The instance can be iterated to obtain the
    dataset names it knows about.
    """

    def __init__(self, dataset_name=None):
        """Pick the reader and build the list of dataset names.

        Args:
            dataset_name: when given, the only dataset name exposed;
                otherwise the names are read, one per line, from the
                ``Files.index`` file inside the reader's folder.

        Raises:
            AttributeError: if no reader class matches ``source_data``.
        """
        import sys

        envData = EnvData.load()
        # Look the reader up in this very module. ``__import__(__name__)``
        # returns the top-level package when the module name is dotted, so
        # it only found readers re-exported by the package ``__init__``.
        class_name = getattr(
            sys.modules[__name__],
            f"Datasets{envData['source_data']}",
        )
        self.dataset = class_name()
        if dataset_name is None:
            file_name = os.path.join(self.dataset.folder(), Files.index)
            with open(file_name) as f:
                self.data_sets = f.read().splitlines()
        else:
            self.data_sets = [dataset_name]

    def load(self, name):
        """Return whatever the selected reader's ``load`` returns for *name*.

        Raises:
            ValueError: if the reader cannot find the dataset file.
        """
        try:
            return self.dataset.load(name)
        except FileNotFoundError as err:
            raise ValueError(f"Unknown dataset: {name}") from err

    def __iter__(self) -> Diterator:
        """Return a fresh iterator over the known dataset names."""
        return Diterator(self.data_sets)

View File

@@ -6,7 +6,6 @@ import time
from datetime import datetime from datetime import datetime
from tqdm import tqdm from tqdm import tqdm
import numpy as np import numpy as np
import pandas as pd
from sklearn.model_selection import ( from sklearn.model_selection import (
StratifiedKFold, StratifiedKFold,
KFold, KFold,
@@ -14,93 +13,14 @@ from sklearn.model_selection import (
cross_validate, cross_validate,
) )
from .Utils import Folders, Files, NO_RESULTS from .Utils import Folders, Files, NO_RESULTS
from .Datasets import Datasets
from .Models import Models from .Models import Models
from .Arguments import EnvData
class Randomized: class Randomized:
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
class Diterator:
    """Single-pass iterator that hands out the items of a list in order.

    The input is copied on construction (via its ``copy()`` method), so
    consuming the iterator never removes items from the caller's object.
    """

    def __init__(self, data):
        """Keep a private copy of *data* to consume from."""
        self._stack = data.copy()

    def __iter__(self):
        """Return self, as required by the iterator protocol.

        Without this method ``iter(obj)``, ``list(obj)`` and similar calls
        raise ``TypeError``.
        """
        return self

    def __next__(self):
        """Return the next pending item.

        Raises:
            StopIteration: when every item has already been handed out.
        """
        if len(self._stack) == 0:
            raise StopIteration()
        # Items are served from the front so the original order is kept.
        return self._stack.pop(0)
class DatasetsTanveer:
    """Reader for tab-separated ``<name>_R.dat`` files stored under ``data``."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for dataset *name*."""
        return f"{name}_R.dat"

    @staticmethod
    def folder():
        """Return the directory the dataset files are read from."""
        return "data"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as NumPy arrays.

        The first column of the file is used as the row index. ``y`` is the
        ``clase`` column and ``X`` holds every remaining column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        table = pd.read_csv(path, sep="\t", index_col=0)
        inputs = table.drop(columns="clase").to_numpy()
        labels = table["clase"].to_numpy()
        return inputs, labels
class DatasetsSurcov:
    """Reader for comma-separated ``<name>.csv`` files under ``datasets``."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for dataset *name*."""
        return f"{name}.csv"

    @staticmethod
    def folder():
        """Return the directory the dataset files are read from."""
        return "datasets"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as NumPy arrays.

        Rows containing any missing value are discarded. The column labels
        of the cleaned table are kept in ``self.columns``. ``y`` is the
        ``class`` column and ``X`` holds every remaining column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        table = pd.read_csv(path, index_col=0)
        table = table.dropna(axis=0, how="any")
        self.columns = table.columns
        label_columns = ["class"]
        inputs = table.drop(columns=label_columns).to_numpy()
        labels = table["class"].to_numpy()
        return inputs, labels
class Datasets:
    """Front end over the dataset reader selected by the environment data.

    The reader class is ``Datasets<source_data>``, where ``source_data`` is
    read from ``EnvData.load()``. The instance can be iterated to obtain the
    dataset names it knows about.
    """

    def __init__(self, dataset_name=None):
        """Pick the reader and build the list of dataset names.

        Args:
            dataset_name: when given, the only dataset name exposed;
                otherwise the names are read, one per line, from the
                ``Files.index`` file inside the reader's folder.

        Raises:
            AttributeError: if no reader class matches ``source_data``.
        """
        import sys

        envData = EnvData.load()
        # Look the reader up in this very module. ``__import__(__name__)``
        # returns the top-level package when the module name is dotted, so
        # it only found readers re-exported by the package ``__init__``.
        class_name = getattr(
            sys.modules[__name__],
            f"Datasets{envData['source_data']}",
        )
        self.dataset = class_name()
        if dataset_name is None:
            file_name = os.path.join(self.dataset.folder(), Files.index)
            with open(file_name) as f:
                self.data_sets = f.read().splitlines()
        else:
            self.data_sets = [dataset_name]

    def load(self, name):
        """Return whatever the selected reader's ``load`` returns for *name*.

        Raises:
            ValueError: if the reader cannot find the dataset file.
        """
        try:
            return self.dataset.load(name)
        except FileNotFoundError as err:
            raise ValueError(f"Unknown dataset: {name}") from err

    def __iter__(self) -> Diterator:
        """Return a fresh iterator over the known dataset names."""
        return Diterator(self.data_sets)
class BestResults: class BestResults:
def __init__(self, score, model, datasets, quiet=False): def __init__(self, score, model, datasets, quiet=False):
self.score_name = score self.score_name = score

View File

@@ -7,7 +7,8 @@ import shutil
import subprocess import subprocess
import xlsxwriter import xlsxwriter
import numpy as np import numpy as np
from .Experiments import Datasets, BestResults from .Experiments import BestResults
from .Datasets import Datasets
from .Utils import ( from .Utils import (
Folders, Folders,
Files, Files,

View File

@@ -1,4 +1,5 @@
from .Experiments import Experiment, Datasets, DatasetsSurcov, DatasetsTanveer from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer
from .Experiments import Experiment
from .Results import Report, Summary from .Results import Report, Summary
__author__ = "Ricardo Montañana Gómez" __author__ = "Ricardo Montañana Gómez"

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
from benchmark.Results import ReportBest from benchmark.Results import ReportBest
from benchmark.Experiments import Datasets, BestResults from benchmark.Experiments import BestResults
from benchmark.Datasets import Datasets
from benchmark.Arguments import Arguments from benchmark.Arguments import Arguments
"""Build a json file with the best results of a model and its hyperparameters """Build a json file with the best results of a model and its hyperparameters

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
from benchmark.Experiments import GridSearch, Datasets from benchmark.Experiments import GridSearch
from benchmark.Datasets import Datasets
from benchmark.Arguments import Arguments from benchmark.Arguments import Arguments
"""Do experiment and build result file, optionally print report with results """Do experiment and build result file, optionally print report with results

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python #!/usr/bin/env python
import os import os
from benchmark.Experiments import Experiment, Datasets from benchmark.Experiments import Experiment
from benchmark.Datasets import Datasets
from benchmark.Results import Report from benchmark.Results import Report
from benchmark.Arguments import Arguments from benchmark.Arguments import Arguments

View File

@@ -3,7 +3,7 @@ import os
import json import json
from stree import Stree from stree import Stree
from graphviz import Source from graphviz import Source
from benchmark.Experiments import Datasets from benchmark.Datasets import Datasets
from benchmark.Utils import Files, Folders from benchmark.Utils import Files, Folders
from benchmark.Arguments import Arguments from benchmark.Arguments import Arguments

View File

@@ -1,6 +1,7 @@
import os import os
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import BestResults, Datasets from ..Experiments import BestResults
from ..Datasets import Datasets
class BestResultTest(TestBase): class BestResultTest(TestBase):

View File

@@ -1,6 +1,7 @@
import shutil import shutil
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import Randomized, Datasets from ..Experiments import Randomized
from ..Datasets import Datasets
class DatasetTest(TestBase): class DatasetTest(TestBase):

View File

@@ -1,6 +1,7 @@
import json import json
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import Experiment, Datasets from ..Experiments import Experiment
from ..Datasets import Datasets
class ExperimentTest(TestBase): class ExperimentTest(TestBase):

View File

@@ -1,6 +1,7 @@
import json import json
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import GridSearch, Datasets from ..Experiments import GridSearch
from ..Datasets import Datasets
class GridSearchTest(TestBase): class GridSearchTest(TestBase):

View File

@@ -6,7 +6,7 @@
"kernel": "liblinear", "kernel": "liblinear",
"multiclass_strategy": "ovr" "multiclass_strategy": "ovr"
}, },
"v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s" "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
], ],
"balloons": [ "balloons": [
0.625, 0.625,
@@ -15,6 +15,6 @@
"kernel": "linear", "kernel": "linear",
"multiclass_strategy": "ovr" "multiclass_strategy": "ovr"
}, },
"v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s" "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
] ]
} }

View File

@@ -1,5 +1,6 @@
pandas pandas
scikit-learn scikit-learn
scipy
odte odte
mufs mufs
xlsxwriter xlsxwriter