diff --git a/benchmark/Arguments.py b/benchmark/Arguments.py
index a034934..67a2515 100644
--- a/benchmark/Arguments.py
+++ b/benchmark/Arguments.py
@@ -1,6 +1,6 @@
 import sys
 import argparse
-from .Experiments import Models
+from .Models import Models
 from .Utils import Files, NO_ENV
 
 ALL_METRICS = (
diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py
new file mode 100644
index 0000000..20a4894
--- /dev/null
+++ b/benchmark/Datasets.py
@@ -0,0 +1,111 @@
+import os
+import pandas as pd
+from scipy.io import arff
+from .Utils import Files
+from .Arguments import EnvData
+
+
+class Diterator:
+    def __init__(self, data):
+        self._stack = data.copy()
+
+    def __next__(self):
+        if len(self._stack) == 0:
+            raise StopIteration()
+        return self._stack.pop(0)
+
+
+class DatasetsArff:
+    """Loader for datasets stored as "<name>.arff" in the "datasets" folder."""
+
+    @staticmethod
+    def dataset_names(name):
+        return f"{name}.arff"
+
+    @staticmethod
+    def folder():
+        return "datasets"
+
+    def load(self, name, class_name="class"):
+        """Read the ARFF file of a dataset and split features from labels.
+
+        Returns a tuple (X, y) of numpy arrays: y is the column called
+        class_name and X holds every other column.
+        """
+        file_name = os.path.join(self.folder(), self.dataset_names(name))
+        # loadarff returns a (records, metadata) tuple, so every column
+        # operation has to go through the DataFrame, not through data
+        data = arff.loadarff(file_name)
+        df = pd.DataFrame(data[0])
+        X = df.drop(class_name, axis=1).to_numpy()
+        y = df[class_name].to_numpy()
+        return X, y
+
+
+class DatasetsTanveer:
+    @staticmethod
+    def dataset_names(name):
+        return f"{name}_R.dat"
+
+    @staticmethod
+    def folder():
+        return "data"
+
+    def load(self, name):
+        file_name = os.path.join(self.folder(), self.dataset_names(name))
+        data = pd.read_csv(
+            file_name,
+            sep="\t",
+            index_col=0,
+        )
+        X = data.drop("clase", axis=1).to_numpy()
+        y = data["clase"].to_numpy()
+        return X, y
+
+
+class DatasetsSurcov:
+    @staticmethod
+    def dataset_names(name):
+        return f"{name}.csv"
+
+    @staticmethod
+    def folder():
+        return "datasets"
+
+    def load(self, name):
+        file_name = os.path.join(self.folder(), self.dataset_names(name))
+        data = pd.read_csv(
+            file_name,
+            index_col=0,
+        )
+        data.dropna(axis=0, how="any", inplace=True)
+        self.columns = data.columns
+        col_list = ["class"]
+        X = data.drop(col_list, axis=1).to_numpy()
+        y = data["class"].to_numpy()
+        return X, y
+
+
+class Datasets:
+    def __init__(self, dataset_name=None):
+        envData = EnvData.load()
+        class_name = getattr(
+            __import__(__name__),
+            f"Datasets{envData['source_data']}",
+        )
+        self.dataset = class_name()
+        if dataset_name is None:
+            file_name = os.path.join(self.dataset.folder(), Files.index)
+            with open(file_name) as f:
+                self.data_sets = f.read().splitlines()
+        else:
+            self.data_sets = [dataset_name]
+
+    def load(self, name):
+        try:
+            return self.dataset.load(name)
+        except FileNotFoundError:
+            raise ValueError(f"Unknown dataset: {name}")
+
+    def __iter__(self) -> Diterator:
+        return Diterator(self.data_sets)
diff --git a/benchmark/Experiments.py b/benchmark/Experiments.py
index ab2063a..658805a 100644
--- a/benchmark/Experiments.py
+++ b/benchmark/Experiments.py
@@ -6,7 +6,6 @@ import time
 from datetime import datetime
 from tqdm import tqdm
 import numpy as np
-import pandas as pd
 from sklearn.model_selection import (
     StratifiedKFold,
     KFold,
@@ -14,93 +13,14 @@ from sklearn.model_selection import (
     cross_validate,
 )
 from .Utils import Folders, Files, NO_RESULTS
+from .Datasets import Datasets
 from .Models import Models
-from .Arguments import EnvData
 
 
 class Randomized:
     seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 
 
-class Diterator:
-    def __init__(self, data):
-        self._stack = data.copy()
-
-    def __next__(self):
-        if len(self._stack) == 0:
-            raise StopIteration()
-        return self._stack.pop(0)
-
-
-class DatasetsTanveer:
-    @staticmethod
-    def dataset_names(name):
-        return f"{name}_R.dat"
-
-    @staticmethod
-    def folder():
-        return "data"
-
-    def load(self, name):
-        file_name = os.path.join(self.folder(), self.dataset_names(name))
-        data = pd.read_csv(
-            file_name,
-            sep="\t",
-            index_col=0,
-        )
-        X = data.drop("clase", axis=1).to_numpy()
-        y = data["clase"].to_numpy()
-        return X, y
-
-
-class DatasetsSurcov:
-    @staticmethod
-    def dataset_names(name):
-        return f"{name}.csv"
-
-    @staticmethod
-    def folder():
-        return "datasets"
-
-    def load(self, name):
-        file_name = os.path.join(self.folder(), self.dataset_names(name))
-        data = pd.read_csv(
-            file_name,
-            index_col=0,
-        )
-        data.dropna(axis=0, how="any", inplace=True)
-        self.columns = data.columns
-        col_list = ["class"]
-        X = data.drop(col_list, axis=1).to_numpy()
-        y = data["class"].to_numpy()
-        return X, y
-
-
-class Datasets:
-    def __init__(self, dataset_name=None):
-        envData = EnvData.load()
-        class_name = getattr(
-            __import__(__name__),
-            f"Datasets{envData['source_data']}",
-        )
-        self.dataset = class_name()
-        if dataset_name is None:
-            file_name = os.path.join(self.dataset.folder(), Files.index)
-            with open(file_name) as f:
-                self.data_sets = f.read().splitlines()
-        else:
-            self.data_sets = [dataset_name]
-
-    def load(self, name):
-        try:
-            return self.dataset.load(name)
-        except FileNotFoundError:
-            raise ValueError(f"Unknown dataset: {name}")
-
-    def __iter__(self) -> Diterator:
-        return Diterator(self.data_sets)
-
-
 class BestResults:
     def __init__(self, score, model, datasets, quiet=False):
         self.score_name = score
diff --git a/benchmark/Results.py b/benchmark/Results.py
index f4ff7ca..c5940ab 100644
--- a/benchmark/Results.py
+++ b/benchmark/Results.py
@@ -7,7 +7,8 @@ import shutil
 import subprocess
 import xlsxwriter
 import numpy as np
-from .Experiments import Datasets, BestResults
+from .Experiments import BestResults
+from .Datasets import Datasets
 from .Utils import (
     Folders,
     Files,
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
index bafc822..cac5b02 100644
--- a/benchmark/__init__.py
+++ b/benchmark/__init__.py
@@ -1,4 +1,5 @@
-from .Experiments import Experiment, Datasets, DatasetsSurcov, DatasetsTanveer
+from .Datasets import Datasets, DatasetsArff, DatasetsSurcov, DatasetsTanveer
+from .Experiments import Experiment
 from .Results import Report, Summary
 
 __author__ = "Ricardo Montañana Gómez"
diff --git a/benchmark/scripts/be_build_best.py b/benchmark/scripts/be_build_best.py
index 08d5f96..233bf22 100755
--- a/benchmark/scripts/be_build_best.py
+++ b/benchmark/scripts/be_build_best.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 from benchmark.Results import ReportBest
-from benchmark.Experiments import Datasets, BestResults
+from benchmark.Experiments import BestResults
+from benchmark.Datasets import Datasets
 from benchmark.Arguments import Arguments
 
 """Build a json file with the best results of a model and its hyperparameters
diff --git a/benchmark/scripts/be_grid.py b/benchmark/scripts/be_grid.py
index ec2f8ae..8f10e48 100755
--- a/benchmark/scripts/be_grid.py
+++ b/benchmark/scripts/be_grid.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
-from benchmark.Experiments import GridSearch, Datasets
+from benchmark.Experiments import GridSearch
+from benchmark.Datasets import Datasets
 from benchmark.Arguments import Arguments
 
 """Do experiment and build result file, optionally print report with results
diff --git a/benchmark/scripts/be_main.py b/benchmark/scripts/be_main.py
index 971598f..dcd8b0e 100755
--- a/benchmark/scripts/be_main.py
+++ b/benchmark/scripts/be_main.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Experiments import Experiment, Datasets
+from benchmark.Experiments import Experiment
+from benchmark.Datasets import Datasets
 from benchmark.Results import Report
 from benchmark.Arguments import Arguments
 
diff --git a/benchmark/scripts/be_print_strees.py b/benchmark/scripts/be_print_strees.py
index 0a55a19..1c41e50 100755
--- a/benchmark/scripts/be_print_strees.py
+++ b/benchmark/scripts/be_print_strees.py
@@ -3,7 +3,7 @@ import os
 import json
 from stree import Stree
 from graphviz import Source
-from benchmark.Experiments import Datasets
+from benchmark.Datasets import Datasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
 
diff --git a/benchmark/tests/BestResults_test.py b/benchmark/tests/BestResults_test.py
index f6a4b32..76a5ea8 100644
--- a/benchmark/tests/BestResults_test.py
+++ b/benchmark/tests/BestResults_test.py
@@ -1,6 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Experiments import BestResults, Datasets
+from ..Experiments import BestResults
+from ..Datasets import Datasets
 
 
 class BestResultTest(TestBase):
diff --git a/benchmark/tests/Dataset_test.py b/benchmark/tests/Dataset_test.py
index 63ffc9c..4669922 100644
--- a/benchmark/tests/Dataset_test.py
+++ b/benchmark/tests/Dataset_test.py
@@ -1,6 +1,7 @@
 import shutil
 from .TestBase import TestBase
-from ..Experiments import Randomized, Datasets
+from ..Experiments import Randomized
+from ..Datasets import Datasets
 
 
 class DatasetTest(TestBase):
diff --git a/benchmark/tests/Experiment_test.py b/benchmark/tests/Experiment_test.py
index ca5b37f..0f8ffad 100644
--- a/benchmark/tests/Experiment_test.py
+++ b/benchmark/tests/Experiment_test.py
@@ -1,6 +1,7 @@
 import json
 from .TestBase import TestBase
-from ..Experiments import Experiment, Datasets
+from ..Experiments import Experiment
+from ..Datasets import Datasets
 
 
 class ExperimentTest(TestBase):
diff --git a/benchmark/tests/GridSearch_test.py b/benchmark/tests/GridSearch_test.py
index 4cfb0f6..b8db074 100644
--- a/benchmark/tests/GridSearch_test.py
+++ b/benchmark/tests/GridSearch_test.py
@@ -1,6 +1,7 @@
 import json
 from .TestBase import TestBase
-from ..Experiments import GridSearch, Datasets
+from ..Experiments import GridSearch
+from ..Datasets import Datasets
 
 
 class GridSearchTest(TestBase):
diff --git a/benchmark/tests/results/grid_output_accuracy_STree.json b/benchmark/tests/results/grid_output_accuracy_STree.json
index 7f197d6..731e0b7 100644
--- a/benchmark/tests/results/grid_output_accuracy_STree.json
+++ b/benchmark/tests/results/grid_output_accuracy_STree.json
@@ -6,7 +6,7 @@
             "kernel": "liblinear",
             "multiclass_strategy": "ovr"
         },
-        "v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
+        "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
     ],
     "balloons": [
         0.625,
@@ -15,6 +15,6 @@
             "kernel": "linear",
             "multiclass_strategy": "ovr"
         },
-        "v. 1.2.4, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
+        "v. 1.3.0, Computed on Test on 2022-02-22 at 12:00:00 took 1s"
     ]
 }
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 154ab58..02446a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 pandas
 scikit-learn
+scipy
 odte
 mufs
 xlsxwriter