Add ignore_nan and fit_params to experiments

This commit is contained in:
2022-12-28 19:13:58 +01:00
parent 671e5af45c
commit c21fd4849c
8 changed files with 54 additions and 9 deletions

View File

@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
("-p", "--hyperparameters"), ("-p", "--hyperparameters"),
{"type": str, "required": False, "default": "{}"}, {"type": str, "required": False, "default": "{}"},
], ],
"ignore_nan": [
("--ignore-nan",),
{
"action": EnvDefault,
"envvar": "ignore_nan",
"required": True,
"help": "Ignore nan results",
},
],
"key": [ "key": [
("-k", "--key"), ("-k", "--key"),
{ {

View File

@@ -35,7 +35,7 @@ class DatasetsArff:
df.dropna(axis=0, how="any", inplace=True) df.dropna(axis=0, how="any", inplace=True)
self.dataset = df self.dataset = df
X = df.drop(class_name, axis=1) X = df.drop(class_name, axis=1)
self.features = X.columns self.features = X.columns.to_list()
self.class_name = class_name self.class_name = class_name
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
X = X.to_numpy() X = X.to_numpy()
@@ -103,12 +103,11 @@ class Datasets:
) )
self.discretize = envData["discretize"] == "1" self.discretize = envData["discretize"] == "1"
self.dataset = source_name() self.dataset = source_name()
self.class_names = []
self.data_sets = []
# initialize self.class_names & self.data_sets # initialize self.class_names & self.data_sets
class_names, sets = self._init_names(dataset_name) class_names, sets = self._init_names(dataset_name)
self.class_names = class_names self.class_names = class_names
self.data_sets = sets self.data_sets = sets
self.states = {} # states of discretized variables
def _init_names(self, dataset_name): def _init_names(self, dataset_name):
file_name = os.path.join(self.dataset.folder(), Files.index) file_name = os.path.join(self.dataset.folder(), Files.index)
@@ -161,6 +160,9 @@ class Datasets:
def get_features(self): def get_features(self):
return self.dataset.features return self.dataset.features
def get_states(self, name):
    """Return the states stored for dataset ``name``, or ``None``.

    ``self.states`` maps a dataset name to the per-feature value lists
    recorded by ``build_states``. A dataset that has no entry yields
    ``None`` instead of raising ``KeyError``.
    """
    # dict.get performs a single lookup and already defaults to None.
    return self.states.get(name)
def get_continuous_features(self): def get_continuous_features(self):
return self.continuous_features_dataset return self.continuous_features_dataset
@@ -170,6 +172,12 @@ class Datasets:
def get_dataset(self): def get_dataset(self):
return self.dataset.dataset return self.dataset.dataset
def build_states(self, name, X):
    """Record the distinct values of each column of ``X`` for ``name``.

    Stores in ``self.states[name]`` a dict that maps every feature name
    (taken positionally from ``self.get_features()``) to the sorted list
    of unique values found in the matching column of ``X``.

    NOTE(review): assumes ``X`` is a 2-D NumPy array with one column per
    feature -- confirm against the callers.
    """
    features = self.get_features()
    # Iterating the transpose walks the columns directly, so no manual
    # index arithmetic on X is needed; features are still matched by position.
    self.states[name] = {
        features[i]: np.unique(column).tolist()
        for i, column in enumerate(X.T)
    }
def load(self, name, dataframe=False): def load(self, name, dataframe=False):
def get_range_features(X, name): def get_range_features(X, name):
c_features = self.continuous_features[name] c_features = self.continuous_features[name]
@@ -183,6 +191,7 @@ class Datasets:
self.continuous_features_dataset = get_range_features(X, name) self.continuous_features_dataset = get_range_features(X, name)
if self.discretize: if self.discretize:
X = self.discretize_dataset(X, y) X = self.discretize_dataset(X, y)
self.build_states(name, X)
dataset = pd.DataFrame(X, columns=self.get_features()) dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y dataset[self.get_class_name()] = y
self.dataset.dataset = dataset self.dataset.dataset = dataset

View File

@@ -112,6 +112,7 @@ class Experiment:
platform, platform,
title, title,
progress_bar=True, progress_bar=True,
ignore_nan=True,
folds=5, folds=5,
): ):
today = datetime.now() today = datetime.now()
@@ -131,6 +132,7 @@ class Experiment:
self.score_name = score_name self.score_name = score_name
self.model_name = model_name self.model_name = model_name
self.title = title self.title = title
self.ignore_nan = ignore_nan
self.stratified = stratified == "1" self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets self.datasets = datasets
@@ -184,7 +186,14 @@ class Experiment:
self.leaves = [] self.leaves = []
self.depths = [] self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters): def _build_fit_params(self, name):
states = self.datasets.get_states(name)
if states is None:
return None
features = self.datasets.get_features()
return {"state_names": states, "features": features}
def _n_fold_crossval(self, name, X, y, hyperparameters):
if self.scores != []: if self.scores != []:
raise ValueError("Must init experiment before!") raise ValueError("Must init experiment before!")
loop = tqdm( loop = tqdm(
@@ -201,6 +210,7 @@ class Experiment:
shuffle=True, random_state=random_state, n_splits=self.folds shuffle=True, random_state=random_state, n_splits=self.folds
) )
clf = self._build_classifier(random_state, hyperparameters) clf = self._build_classifier(random_state, hyperparameters)
fit_params = self._build_fit_params(name)
self.version = Models.get_version(self.model_name, clf) self.version = Models.get_version(self.model_name, clf)
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
@@ -209,10 +219,21 @@ class Experiment:
X, X,
y, y,
cv=kfold, cv=kfold,
fit_params=fit_params,
return_estimator=True, return_estimator=True,
scoring=self.score_name, scoring=self.score_name,
) )
self.scores.append(res["test_score"]) if np.isnan(res["test_score"]).any():
if not self.ignore_nan:
print(res["test_score"])
raise ValueError("NaN in results")
results = []
for item in res["test_score"]:
if not np.isnan(item):
results.append(item)
else:
results = res["test_score"]
self.scores.append(results)
self.times.append(res["fit_time"]) self.times.append(res["fit_time"])
for result_item in res["estimator"]: for result_item in res["estimator"]:
nodes_item, leaves_item, depth_item = Models.get_complexity( nodes_item, leaves_item, depth_item = Models.get_complexity(
@@ -273,7 +294,7 @@ class Experiment:
n_classes = len(np.unique(y)) n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1] hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment() self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters) self._n_fold_crossval(name, X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes) self._add_results(name, hyperparameters, samp, feat, n_classes)
self._output_results() self._output_results()
self.duration = time.time() - now self.duration = time.time() - now

View File

@@ -13,7 +13,7 @@ def main(args_test=None):
arguments = Arguments(prog="be_main") arguments = Arguments(prog="be_main")
arguments.xset("stratified").xset("score").xset("model", mandatory=True) arguments.xset("stratified").xset("score").xset("model", mandatory=True)
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title") arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
arguments.xset("report") arguments.xset("report").xset("ignore_nan")
arguments.add_exclusive( arguments.add_exclusive(
["grid_paramfile", "best_paramfile", "hyperparameters"] ["grid_paramfile", "best_paramfile", "hyperparameters"]
) )
@@ -35,6 +35,7 @@ def main(args_test=None):
grid_paramfile=args.grid_paramfile, grid_paramfile=args.grid_paramfile,
progress_bar=not args.quiet, progress_bar=not args.quiet,
platform=args.platform, platform=args.platform,
ignore_nan=args.ignore_nan,
title=args.title, title=args.title,
folds=args.n_folds, folds=args.n_folds,
) )

View File

@@ -7,3 +7,5 @@ stratified=0
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0 discretize=0
ignore_nan=0

View File

@@ -5,4 +5,5 @@ model=ODTE
stratified=0 stratified=0
source_data=Arff source_data=Arff
seeds=[271, 314, 171] seeds=[271, 314, 171]
discretize=1 discretize=1
ignore_nan=1

View File

@@ -7,3 +7,4 @@ stratified=0
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0 discretize=0
ignore_nan=0

View File

@@ -6,4 +6,5 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Surcov source_data=Surcov
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0 discretize=0
ignore_nan=0