mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-16 16:05:54 +00:00
Split Datasets class from Experiments
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
from .Experiments import Models
|
from .Models import Models
|
||||||
from .Utils import Files, NO_ENV
|
from .Utils import Files, NO_ENV
|
||||||
|
|
||||||
ALL_METRICS = (
|
ALL_METRICS = (
|
||||||
|
103
benchmark/Datasets.py
Normal file
103
benchmark/Datasets.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from scipy.io import arff
|
||||||
|
from .Utils import Files
|
||||||
|
from .Arguments import EnvData
|
||||||
|
|
||||||
|
|
||||||
|
class Diterator:
    """Single-pass iterator over a private snapshot of a collection.

    The items are copied on construction, so consuming the iterator never
    mutates the collection that was passed in.
    """

    def __init__(self, data):
        """Take a snapshot of *data* (any iterable) to iterate over."""
        # list() copies like data.copy() did, but also accepts tuples and
        # other iterables that have no .copy() method.
        self._stack = list(data)

    def __iter__(self):
        """Return self, as the iterator protocol requires of iterators."""
        return self

    def __next__(self):
        """Return the next pending item, in original order.

        Raises:
            StopIteration: when every item has been consumed.
        """
        if not self._stack:
            raise StopIteration()
        return self._stack.pop(0)
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetsArff:
    """Loader for datasets stored as ARFF files."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for the dataset called *name*."""
        return f"{name}.arff"

    @staticmethod
    def folder():
        """Return the folder (relative path) that holds the ARFF files."""
        return "datasets"

    def load(self, name, class_name="class"):
        """Load dataset *name* and split it into features and labels.

        Args:
            name: dataset name, without folder or extension.
            class_name: name of the attribute holding the labels.

        Returns:
            A tuple ``(X, y)`` of numpy arrays: every attribute except
            *class_name*, and the *class_name* attribute.

        Raises:
            FileNotFoundError: if the ARFF file does not exist.
            KeyError: if *class_name* is not an attribute of the dataset.
        """
        file_name = os.path.join(self.folder(), self.dataset_names(name))
        # loadarff returns a (records, metadata) pair; the split below has to
        # be done on the DataFrame built from the records, not on that tuple.
        records, _ = arff.loadarff(file_name)
        df = pd.DataFrame(records)
        X = df.drop(class_name, axis=1).to_numpy()
        y = df[class_name].to_numpy()
        return X, y
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetsTanveer:
    """Loader for tab-separated ``<name>_R.dat`` dataset files."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for the dataset called *name*."""
        return f"{name}_R.dat"

    @staticmethod
    def folder():
        """Return the folder (relative path) that holds the data files."""
        return "data"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as numpy arrays.

        The first column of the file is used as the index; ``y`` is the
        "clase" column and ``X`` is every other column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        frame = pd.read_csv(path, sep="\t", index_col=0)
        features = frame.drop("clase", axis=1)
        labels = frame["clase"]
        return features.to_numpy(), labels.to_numpy()
|
||||||
|
|
||||||
|
|
||||||
|
class DatasetsSurcov:
    """Loader for comma-separated ``<name>.csv`` dataset files."""

    @staticmethod
    def dataset_names(name):
        """Return the file name used for the dataset called *name*."""
        return f"{name}.csv"

    @staticmethod
    def folder():
        """Return the folder (relative path) that holds the csv files."""
        return "datasets"

    def load(self, name):
        """Read dataset *name* and return ``(X, y)`` as numpy arrays.

        Rows containing any missing value are discarded. The column index of
        the loaded frame (label column included) is kept in ``self.columns``.
        ``y`` is the "class" column and ``X`` is every other column.
        """
        path = os.path.join(self.folder(), self.dataset_names(name))
        frame = pd.read_csv(path, index_col=0)
        # Drop incomplete rows before splitting so X and y stay aligned
        frame.dropna(axis=0, how="any", inplace=True)
        self.columns = frame.columns
        features = frame.drop(["class"], axis=1)
        labels = frame["class"]
        return features.to_numpy(), labels.to_numpy()
|
||||||
|
|
||||||
|
|
||||||
|
class Datasets:
    """Facade over the dataset loader selected by the environment data.

    The loader class is ``Datasets<source_data>``, where ``source_data`` is
    read from ``EnvData.load()``. Iterating an instance yields dataset names.
    """

    def __init__(self, dataset_name=None):
        """Select the loader and collect the dataset names to work with.

        Args:
            dataset_name: when given, the only dataset to iterate over;
                otherwise the names are read, one per line, from the file
                ``Files.index`` inside the loader's folder.
        """
        envData = EnvData.load()
        loader_name = f"Datasets{envData['source_data']}"
        try:
            # Look the loader up in this module: __import__(__name__) returns
            # the top-level package for a dotted module name, so loaders not
            # re-exported by the package would never be found that way.
            class_name = globals()[loader_name]
        except KeyError:
            # Fall back to the previous package-level lookup so loaders that
            # are only reachable through the package keep working.
            class_name = getattr(__import__(__name__), loader_name)
        self.dataset = class_name()
        if dataset_name is None:
            file_name = os.path.join(self.dataset.folder(), Files.index)
            with open(file_name) as f:
                self.data_sets = f.read().splitlines()
        else:
            self.data_sets = [dataset_name]

    def load(self, name):
        """Return what the selected loader's ``load(name)`` returns.

        Raises:
            ValueError: if the loader cannot find the dataset file.
        """
        try:
            return self.dataset.load(name)
        except FileNotFoundError as err:
            # Keep the missing path available as the cause of the error
            raise ValueError(f"Unknown dataset: {name}") from err

    def __iter__(self) -> Diterator:
        """Return a fresh iterator over the dataset names."""
        return Diterator(self.data_sets)
|
@@ -6,7 +6,6 @@ import time
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
|
||||||
from sklearn.model_selection import (
|
from sklearn.model_selection import (
|
||||||
StratifiedKFold,
|
StratifiedKFold,
|
||||||
KFold,
|
KFold,
|
||||||
@@ -14,93 +13,14 @@ from sklearn.model_selection import (
|
|||||||
cross_validate,
|
cross_validate,
|
||||||
)
|
)
|
||||||
from .Utils import Folders, Files, NO_RESULTS
|
from .Utils import Folders, Files, NO_RESULTS
|
||||||
|
from .Datasets import Datasets
|
||||||
from .Models import Models
|
from .Models import Models
|
||||||
from .Arguments import EnvData
|
|
||||||
|
|
||||||
|
|
||||||
class Randomized:
|
class Randomized:
|
||||||
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||||
|
|
||||||
|
|
||||||
class Diterator:
|
|
||||||
def __init__(self, data):
|
|
||||||
self._stack = data.copy()
|
|
||||||
|
|
||||||
def __next__(self):
|
|
||||||
if len(self._stack) == 0:
|
|
||||||
raise StopIteration()
|
|
||||||
return self._stack.pop(0)
|
|
||||||
|
|
||||||
|
|
||||||
class DatasetsTanveer:
|
|
||||||
@staticmethod
|
|
||||||
def dataset_names(name):
|
|
||||||
return f"{name}_R.dat"
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def folder():
|
|
||||||
return "data"
|
|
||||||
|
|
||||||
def load(self, name):
|
|
||||||
file_name = os.path.join(self.folder(), self.dataset_names(name))
|
|
||||||
data = pd.read_csv(
|
|
||||||
file_name,
|
|
||||||
sep="\t",
|
|
||||||
index_col=0,
|
|
||||||
)
|
|
||||||
X = data.drop("clase", axis=1).to_numpy()
|
|
||||||
y = data["clase"].to_numpy()
|
|
||||||
return X, y
|
|
||||||
|
|
||||||
|
|
||||||
class DatasetsSurcov:
|
|
||||||
@staticmethod
|
|
||||||
def dataset_names(name):
|
|
||||||
return f"{name}.csv"
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def folder():
|
|
||||||
return "datasets"
|
|
||||||
|
|
||||||
def load(self, name):
|
|
||||||
file_name = os.path.join(self.folder(), self.dataset_names(name))
|
|
||||||
data = pd.read_csv(
|
|
||||||
file_name,
|
|
||||||
index_col=0,
|
|
||||||
)
|
|
||||||
data.dropna(axis=0, how="any", inplace=True)
|
|
||||||
self.columns = data.columns
|
|
||||||
col_list = ["class"]
|
|
||||||
X = data.drop(col_list, axis=1).to_numpy()
|
|
||||||
y = data["class"].to_numpy()
|
|
||||||
return X, y
|
|
||||||
|
|
||||||
|
|
||||||
class Datasets:
|
|
||||||
def __init__(self, dataset_name=None):
|
|
||||||
envData = EnvData.load()
|
|
||||||
class_name = getattr(
|
|
||||||
__import__(__name__),
|
|
||||||
f"Datasets{envData['source_data']}",
|
|
||||||
)
|
|
||||||
self.dataset = class_name()
|
|
||||||
if dataset_name is None:
|
|
||||||
file_name = os.path.join(self.dataset.folder(), Files.index)
|
|
||||||
with open(file_name) as f:
|
|
||||||
self.data_sets = f.read().splitlines()
|
|
||||||
else:
|
|
||||||
self.data_sets = [dataset_name]
|
|
||||||
|
|
||||||
def load(self, name):
|
|
||||||
try:
|
|
||||||
return self.dataset.load(name)
|
|
||||||
except FileNotFoundError:
|
|
||||||
raise ValueError(f"Unknown dataset: {name}")
|
|
||||||
|
|
||||||
def __iter__(self) -> Diterator:
|
|
||||||
return Diterator(self.data_sets)
|
|
||||||
|
|
||||||
|
|
||||||
class BestResults:
|
class BestResults:
|
||||||
def __init__(self, score, model, datasets, quiet=False):
|
def __init__(self, score, model, datasets, quiet=False):
|
||||||
self.score_name = score
|
self.score_name = score
|
||||||
|
@@ -7,7 +7,8 @@ import shutil
|
|||||||
import subprocess
|
import subprocess
|
||||||
import xlsxwriter
|
import xlsxwriter
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from .Experiments import Datasets, BestResults
|
from .Experiments import BestResults
|
||||||
|
from .Datasets import Datasets
|
||||||
from .Utils import (
|
from .Utils import (
|
||||||
Folders,
|
Folders,
|
||||||
Files,
|
Files,
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
from .Experiments import Experiment, Datasets, DatasetsSurcov, DatasetsTanveer
|
from .Datasets import Datasets, DatasetsSurcov, DatasetsTanveer
|
||||||
|
from .Experiments import Experiment
|
||||||
from .Results import Report, Summary
|
from .Results import Report, Summary
|
||||||
|
|
||||||
__author__ = "Ricardo Montañana Gómez"
|
__author__ = "Ricardo Montañana Gómez"
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from benchmark.Results import ReportBest
|
from benchmark.Results import ReportBest
|
||||||
from benchmark.Experiments import Datasets, BestResults
|
from benchmark.Experiments import BestResults
|
||||||
|
from benchmark.Datasets import Datasets
|
||||||
from benchmark.Arguments import Arguments
|
from benchmark.Arguments import Arguments
|
||||||
|
|
||||||
"""Build a json file with the best results of a model and its hyperparameters
|
"""Build a json file with the best results of a model and its hyperparameters
|
||||||
|
@@ -1,5 +1,6 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
from benchmark.Experiments import GridSearch, Datasets
|
from benchmark.Experiments import GridSearch
|
||||||
|
from benchmark.Datasets import Datasets
|
||||||
from benchmark.Arguments import Arguments
|
from benchmark.Arguments import Arguments
|
||||||
|
|
||||||
"""Do experiment and build result file, optionally print report with results
|
"""Do experiment and build result file, optionally print report with results
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
import os
|
import os
|
||||||
from benchmark.Experiments import Experiment, Datasets
|
from benchmark.Experiments import Experiment
|
||||||
|
from benchmark.Datasets import Datasets
|
||||||
from benchmark.Results import Report
|
from benchmark.Results import Report
|
||||||
from benchmark.Arguments import Arguments
|
from benchmark.Arguments import Arguments
|
||||||
|
|
||||||
|
@@ -3,7 +3,7 @@ import os
|
|||||||
import json
|
import json
|
||||||
from stree import Stree
|
from stree import Stree
|
||||||
from graphviz import Source
|
from graphviz import Source
|
||||||
from benchmark.Experiments import Datasets
|
from benchmark.Datasets import Datasets
|
||||||
from benchmark.Utils import Files, Folders
|
from benchmark.Utils import Files, Folders
|
||||||
from benchmark.Arguments import Arguments
|
from benchmark.Arguments import Arguments
|
||||||
|
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
from .TestBase import TestBase
|
from .TestBase import TestBase
|
||||||
from ..Experiments import BestResults, Datasets
|
from ..Experiments import BestResults
|
||||||
|
from ..Datasets import Datasets
|
||||||
|
|
||||||
|
|
||||||
class BestResultTest(TestBase):
|
class BestResultTest(TestBase):
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
import shutil
|
import shutil
|
||||||
from .TestBase import TestBase
|
from .TestBase import TestBase
|
||||||
from ..Experiments import Randomized, Datasets
|
from ..Experiments import Randomized
|
||||||
|
from ..Datasets import Datasets
|
||||||
|
|
||||||
|
|
||||||
class DatasetTest(TestBase):
|
class DatasetTest(TestBase):
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
from .TestBase import TestBase
|
from .TestBase import TestBase
|
||||||
from ..Experiments import Experiment, Datasets
|
from ..Experiments import Experiment
|
||||||
|
from ..Datasets import Datasets
|
||||||
|
|
||||||
|
|
||||||
class ExperimentTest(TestBase):
|
class ExperimentTest(TestBase):
|
||||||
|
@@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
from .TestBase import TestBase
|
from .TestBase import TestBase
|
||||||
from ..Experiments import GridSearch, Datasets
|
from ..Experiments import GridSearch
|
||||||
|
from ..Datasets import Datasets
|
||||||
|
|
||||||
|
|
||||||
class GridSearchTest(TestBase):
|
class GridSearchTest(TestBase):
|
||||||
|
@@ -6,7 +6,7 @@
|
|||||||
"kernel": "liblinear",
|
"kernel": "liblinear",
|
||||||
"multiclass_strategy": "ovr"
|
"multiclass_strategy": "ovr"
|
||||||
},
|
},
|
||||||
"v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
|
"v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
|
||||||
],
|
],
|
||||||
"balloons": [
|
"balloons": [
|
||||||
0.625,
|
0.625,
|
||||||
@@ -15,6 +15,6 @@
|
|||||||
"kernel": "linear",
|
"kernel": "linear",
|
||||||
"multiclass_strategy": "ovr"
|
"multiclass_strategy": "ovr"
|
||||||
},
|
},
|
||||||
"v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
|
"v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
|
||||||
]
|
]
|
||||||
}
|
}
|
@@ -1,5 +1,6 @@
|
|||||||
pandas
|
pandas
|
||||||
scikit-learn
|
scikit-learn
|
||||||
|
scipy
|
||||||
odte
|
odte
|
||||||
mufs
|
mufs
|
||||||
xlsxwriter
|
xlsxwriter
|
||||||
|
Reference in New Issue
Block a user