Add ignore_nan and fit_params to experiments

This commit is contained in:
2022-12-28 19:13:58 +01:00
parent 671e5af45c
commit c21fd4849c
8 changed files with 54 additions and 9 deletions

View File

@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
("-p", "--hyperparameters"),
{"type": str, "required": False, "default": "{}"},
],
"ignore_nan": [
("--ignore-nan",),
{
"action": EnvDefault,
"envvar": "ignore_nan",
"required": True,
"help": "Ignore nan results",
},
],
"key": [
("-k", "--key"),
{

View File

@@ -35,7 +35,7 @@ class DatasetsArff:
df.dropna(axis=0, how="any", inplace=True)
self.dataset = df
X = df.drop(class_name, axis=1)
self.features = X.columns
self.features = X.columns.to_list()
self.class_name = class_name
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
@@ -103,12 +103,11 @@ class Datasets:
)
self.discretize = envData["discretize"] == "1"
self.dataset = source_name()
self.class_names = []
self.data_sets = []
# initialize self.class_names & self.data_sets
class_names, sets = self._init_names(dataset_name)
self.class_names = class_names
self.data_sets = sets
self.states = {} # states of discretized variables
def _init_names(self, dataset_name):
file_name = os.path.join(self.dataset.folder(), Files.index)
@@ -161,6 +160,9 @@ class Datasets:
def get_features(self):
    """Return the ``features`` attribute of the wrapped dataset source."""
    source = self.dataset
    return source.features
def get_states(self, name):
    """Return the states stored for dataset ``name``.

    Returns ``None`` when no entry for ``name`` exists in ``self.states``.
    """
    return self.states.get(name)
def get_continuous_features(self):
    """Return the value held in ``self.continuous_features_dataset``."""
    continuous = self.continuous_features_dataset
    return continuous
@@ -170,6 +172,12 @@ class Datasets:
def get_dataset(self):
    """Return the ``dataset`` attribute of the wrapped dataset source."""
    source = self.dataset
    return source.dataset
def build_states(self, name, X):
    """Record the distinct values of every column of ``X`` for ``name``.

    The result is stored in ``self.states[name]`` as a dict that maps each
    feature name (taken positionally from ``self.get_features()``) to the
    list of unique values found in the matching column of ``X``.
    """
    feature_names = self.get_features()
    per_feature = {}
    for column in range(X.shape[1]):
        # np.unique yields the distinct values of the column; tolist()
        # converts them to plain Python objects.
        per_feature[feature_names[column]] = np.unique(X[:, column]).tolist()
    self.states[name] = per_feature
def load(self, name, dataframe=False):
def get_range_features(X, name):
c_features = self.continuous_features[name]
@@ -183,6 +191,7 @@ class Datasets:
self.continuous_features_dataset = get_range_features(X, name)
if self.discretize:
X = self.discretize_dataset(X, y)
self.build_states(name, X)
dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y
self.dataset.dataset = dataset

View File

@@ -112,6 +112,7 @@ class Experiment:
platform,
title,
progress_bar=True,
ignore_nan=True,
folds=5,
):
today = datetime.now()
@@ -131,6 +132,7 @@ class Experiment:
self.score_name = score_name
self.model_name = model_name
self.title = title
self.ignore_nan = ignore_nan
self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets
@@ -184,7 +186,14 @@ class Experiment:
self.leaves = []
self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters):
def _build_fit_params(self, name):
states = self.datasets.get_states(name)
if states is None:
return None
features = self.datasets.get_features()
return {"state_names": states, "features": features}
def _n_fold_crossval(self, name, X, y, hyperparameters):
if self.scores != []:
raise ValueError("Must init experiment before!")
loop = tqdm(
@@ -201,6 +210,7 @@ class Experiment:
shuffle=True, random_state=random_state, n_splits=self.folds
)
clf = self._build_classifier(random_state, hyperparameters)
fit_params = self._build_fit_params(name)
self.version = Models.get_version(self.model_name, clf)
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
@@ -209,10 +219,21 @@ class Experiment:
X,
y,
cv=kfold,
fit_params=fit_params,
return_estimator=True,
scoring=self.score_name,
)
self.scores.append(res["test_score"])
if np.isnan(res["test_score"]).any():
if not self.ignore_nan:
print(res["test_score"])
raise ValueError("NaN in results")
results = []
for item in res["test_score"]:
if not np.isnan(item):
results.append(item)
else:
results = res["test_score"]
self.scores.append(results)
self.times.append(res["fit_time"])
for result_item in res["estimator"]:
nodes_item, leaves_item, depth_item = Models.get_complexity(
@@ -273,7 +294,7 @@ class Experiment:
n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters)
self._n_fold_crossval(name, X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes)
self._output_results()
self.duration = time.time() - now

View File

@@ -13,7 +13,7 @@ def main(args_test=None):
arguments = Arguments(prog="be_main")
arguments.xset("stratified").xset("score").xset("model", mandatory=True)
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
arguments.xset("report")
arguments.xset("report").xset("ignore_nan")
arguments.add_exclusive(
["grid_paramfile", "best_paramfile", "hyperparameters"]
)
@@ -35,6 +35,7 @@ def main(args_test=None):
grid_paramfile=args.grid_paramfile,
progress_bar=not args.quiet,
platform=args.platform,
ignore_nan=args.ignore_nan,
title=args.title,
folds=args.n_folds,
)

View File

@@ -7,3 +7,5 @@ stratified=0
source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0
ignore_nan=0

View File

@@ -5,4 +5,5 @@ model=ODTE
stratified=0
source_data=Arff
seeds=[271, 314, 171]
discretize=1
discretize=1
ignore_nan=1

View File

@@ -7,3 +7,4 @@ stratified=0
source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0
ignore_nan=0

View File

@@ -6,4 +6,5 @@ stratified=0
# Source of data Tanveer/Surcov
source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0
discretize=0
ignore_nan=0