From c21fd4849ccd325fb1010cdd85c6bdf7ca23a0d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= Date: Wed, 28 Dec 2022 19:13:58 +0100 Subject: [PATCH] Add ignore_nan and fit_params to experiments --- benchmark/Arguments.py | 9 +++++++++ benchmark/Datasets.py | 15 ++++++++++++--- benchmark/Experiments.py | 27 ++++++++++++++++++++++++--- benchmark/scripts/be_main.py | 3 ++- benchmark/tests/.env | 2 ++ benchmark/tests/.env.arff | 3 ++- benchmark/tests/.env.dist | 1 + benchmark/tests/.env.surcov | 3 ++- 8 files changed, 54 insertions(+), 9 deletions(-) diff --git a/benchmark/Arguments.py b/benchmark/Arguments.py index ffc4023..f655dd0 100644 --- a/benchmark/Arguments.py +++ b/benchmark/Arguments.py @@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser): ("-p", "--hyperparameters"), {"type": str, "required": False, "default": "{}"}, ], + "ignore_nan": [ + ("--ignore-nan",), + { + "action": EnvDefault, + "envvar": "ignore_nan", + "required": True, + "help": "Ignore nan results", + }, + ], "key": [ ("-k", "--key"), { diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index e1bb776..dc63a9b 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -35,7 +35,7 @@ class DatasetsArff: df.dropna(axis=0, how="any", inplace=True) self.dataset = df X = df.drop(class_name, axis=1) - self.features = X.columns + self.features = X.columns.to_list() self.class_name = class_name y, _ = pd.factorize(df[class_name]) X = X.to_numpy() @@ -103,12 +103,11 @@ class Datasets: ) self.discretize = envData["discretize"] == "1" self.dataset = source_name() - self.class_names = [] - self.data_sets = [] # initialize self.class_names & self.data_sets class_names, sets = self._init_names(dataset_name) self.class_names = class_names self.data_sets = sets + self.states = {} # states of discretized variables def _init_names(self, dataset_name): file_name = os.path.join(self.dataset.folder(), Files.index) @@ -161,6 +160,9 @@ class Datasets: def get_features(self): return self.dataset.features + def get_states(self, name): + return self.states[name] if name in self.states else None + def get_continuous_features(self): return self.continuous_features_dataset @@ -170,6 +172,12 @@ class Datasets: def get_dataset(self): return self.dataset.dataset + def build_states(self, name, X): + features = self.get_features() + self.states[name] = { + features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1]) + } + def load(self, name, dataframe=False): def get_range_features(X, name): c_features = self.continuous_features[name] @@ -183,6 +191,7 @@ class Datasets: self.continuous_features_dataset = get_range_features(X, name) if self.discretize: X = self.discretize_dataset(X, y) + self.build_states(name, X) dataset = pd.DataFrame(X, columns=self.get_features()) dataset[self.get_class_name()] = y self.dataset.dataset = dataset diff --git a/benchmark/Experiments.py b/benchmark/Experiments.py index 6092955..3696ffe 100644 --- a/benchmark/Experiments.py +++ b/benchmark/Experiments.py @@ -112,6 +112,7 @@ class Experiment: platform, title, progress_bar=True, + ignore_nan=True, folds=5, ): today = datetime.now() @@ -131,6 +132,7 @@ class Experiment: self.score_name = score_name self.model_name = model_name self.title = title + self.ignore_nan = ignore_nan self.stratified = stratified == "1" self.stratified_class = StratifiedKFold if self.stratified else KFold self.datasets = datasets @@ -184,7 +186,14 @@ class Experiment: self.leaves = [] self.depths = [] - def _n_fold_crossval(self, X, y, hyperparameters): + def _build_fit_params(self, name): + states = self.datasets.get_states(name) + if states is None: + return None + features = self.datasets.get_features() + return {"state_names": states, "features": features} + + def _n_fold_crossval(self, name, X, y, hyperparameters): if self.scores != []: raise ValueError("Must init experiment before!") loop = tqdm( @@ -201,6 +210,7 @@ class Experiment: shuffle=True, random_state=random_state, n_splits=self.folds ) clf = self._build_classifier(random_state, hyperparameters) + fit_params = self._build_fit_params(name) self.version = Models.get_version(self.model_name, clf) with warnings.catch_warnings(): warnings.filterwarnings("ignore") @@ -209,10 +219,21 @@ class Experiment: X, y, cv=kfold, + fit_params=fit_params, return_estimator=True, scoring=self.score_name, ) - self.scores.append(res["test_score"]) + if np.isnan(res["test_score"]).any(): + if not self.ignore_nan: + print(res["test_score"]) + raise ValueError("NaN in results") + results = [] + for item in res["test_score"]: + if not np.isnan(item): + results.append(item) + else: + results = res["test_score"] + self.scores.append(results) self.times.append(res["fit_time"]) for result_item in res["estimator"]: nodes_item, leaves_item, depth_item = Models.get_complexity( @@ -273,7 +294,7 @@ class Experiment: n_classes = len(np.unique(y)) hyperparameters = self.hyperparameters_dict[name][1] self._init_experiment() - self._n_fold_crossval(X, y, hyperparameters) + self._n_fold_crossval(name, X, y, hyperparameters) self._add_results(name, hyperparameters, samp, feat, n_classes) self._output_results() self.duration = time.time() - now diff --git a/benchmark/scripts/be_main.py b/benchmark/scripts/be_main.py index 2786967..7d34ac3 100755 --- a/benchmark/scripts/be_main.py +++ b/benchmark/scripts/be_main.py @@ -13,7 +13,7 @@ def main(args_test=None): arguments = Arguments(prog="be_main") arguments.xset("stratified").xset("score").xset("model", mandatory=True) arguments.xset("n_folds").xset("platform").xset("quiet").xset("title") - arguments.xset("report") + arguments.xset("report").xset("ignore_nan") arguments.add_exclusive( ["grid_paramfile", "best_paramfile", "hyperparameters"] ) @@ -35,6 +35,7 @@ def main(args_test=None): grid_paramfile=args.grid_paramfile, progress_bar=not args.quiet, platform=args.platform, + ignore_nan=args.ignore_nan, title=args.title, folds=args.n_folds, ) diff --git a/benchmark/tests/.env b/benchmark/tests/.env index 9641efa..f37499f 100644 --- a/benchmark/tests/.env +++ b/benchmark/tests/.env @@ -7,3 +7,5 @@ stratified=0 source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] discretize=0 +ignore_nan=0 + diff --git a/benchmark/tests/.env.arff b/benchmark/tests/.env.arff index 7f196d9..65ea052 100644 --- a/benchmark/tests/.env.arff +++ b/benchmark/tests/.env.arff @@ -5,4 +5,5 @@ model=ODTE stratified=0 source_data=Arff seeds=[271, 314, 171] -discretize=1 \ No newline at end of file +discretize=1 +ignore_nan=1 \ No newline at end of file diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist index 9641efa..f1b718a 100644 --- a/benchmark/tests/.env.dist +++ b/benchmark/tests/.env.dist @@ -7,3 +7,4 @@ stratified=0 source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] discretize=0 +ignore_nan=0 diff --git a/benchmark/tests/.env.surcov b/benchmark/tests/.env.surcov index 805ec7b..600a83b 100644 --- a/benchmark/tests/.env.surcov +++ b/benchmark/tests/.env.surcov @@ -6,4 +6,5 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Surcov seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] -discretize=0 \ No newline at end of file +discretize=0 +ignore_nan=0 \ No newline at end of file