mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-16 07:55:54 +00:00
Add ignore_nan and fit_params to experiments
This commit is contained in:
@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
|
||||
("-p", "--hyperparameters"),
|
||||
{"type": str, "required": False, "default": "{}"},
|
||||
],
|
||||
"ignore_nan": [
|
||||
("--ignore-nan",),
|
||||
{
|
||||
"action": EnvDefault,
|
||||
"envvar": "ignore_nan",
|
||||
"required": True,
|
||||
"help": "Ignore nan results",
|
||||
},
|
||||
],
|
||||
"key": [
|
||||
("-k", "--key"),
|
||||
{
|
||||
|
@@ -35,7 +35,7 @@ class DatasetsArff:
|
||||
df.dropna(axis=0, how="any", inplace=True)
|
||||
self.dataset = df
|
||||
X = df.drop(class_name, axis=1)
|
||||
self.features = X.columns
|
||||
self.features = X.columns.to_list()
|
||||
self.class_name = class_name
|
||||
y, _ = pd.factorize(df[class_name])
|
||||
X = X.to_numpy()
|
||||
@@ -103,12 +103,11 @@ class Datasets:
|
||||
)
|
||||
self.discretize = envData["discretize"] == "1"
|
||||
self.dataset = source_name()
|
||||
self.class_names = []
|
||||
self.data_sets = []
|
||||
# initialize self.class_names & self.data_sets
|
||||
class_names, sets = self._init_names(dataset_name)
|
||||
self.class_names = class_names
|
||||
self.data_sets = sets
|
||||
self.states = {} # states of discretized variables
|
||||
|
||||
def _init_names(self, dataset_name):
|
||||
file_name = os.path.join(self.dataset.folder(), Files.index)
|
||||
@@ -161,6 +160,9 @@ class Datasets:
|
||||
def get_features(self):
    """Return the ``features`` attribute of the underlying dataset source."""
    source = self.dataset
    return source.features
|
||||
|
||||
def get_states(self, name):
    """Return the states recorded for the dataset called ``name``.

    Parameters
    ----------
    name : hashable
        Key under which the states were stored in ``self.states``.

    Returns
    -------
    The entry stored in ``self.states`` for ``name``, or ``None`` when
    nothing has been recorded under that key.
    """
    # dict.get performs the membership test and the fetch in one lookup
    return self.states.get(name)
|
||||
|
||||
def get_continuous_features(self):
    """Return the value held in ``self.continuous_features_dataset``."""
    continuous = self.continuous_features_dataset
    return continuous
|
||||
|
||||
@@ -170,6 +172,12 @@ class Datasets:
|
||||
def get_dataset(self):
    """Return the ``dataset`` attribute of the underlying dataset source."""
    source = self.dataset
    return source.dataset
|
||||
|
||||
def build_states(self, name, X):
    """Record, for dataset ``name``, the distinct values of every column of X.

    The result is stored in ``self.states[name]`` as a dict that maps each
    feature name (taken positionally from ``self.get_features()``) to the
    list of unique values found in the matching column of ``X``.

    Parameters
    ----------
    name : hashable
        Key under which the states are stored.
    X : array supporting ``X.shape[1]`` and ``X[:, i]`` indexing
        Data whose columns are inspected.
    """
    feature_names = self.get_features()
    column_states = {}
    for column in range(X.shape[1]):
        distinct = np.unique(X[:, column])
        column_states[feature_names[column]] = distinct.tolist()
    # assign only once the whole mapping is built, as the original did
    self.states[name] = column_states
|
||||
|
||||
def load(self, name, dataframe=False):
|
||||
def get_range_features(X, name):
|
||||
c_features = self.continuous_features[name]
|
||||
@@ -183,6 +191,7 @@ class Datasets:
|
||||
self.continuous_features_dataset = get_range_features(X, name)
|
||||
if self.discretize:
|
||||
X = self.discretize_dataset(X, y)
|
||||
self.build_states(name, X)
|
||||
dataset = pd.DataFrame(X, columns=self.get_features())
|
||||
dataset[self.get_class_name()] = y
|
||||
self.dataset.dataset = dataset
|
||||
|
@@ -112,6 +112,7 @@ class Experiment:
|
||||
platform,
|
||||
title,
|
||||
progress_bar=True,
|
||||
ignore_nan=True,
|
||||
folds=5,
|
||||
):
|
||||
today = datetime.now()
|
||||
@@ -131,6 +132,7 @@ class Experiment:
|
||||
self.score_name = score_name
|
||||
self.model_name = model_name
|
||||
self.title = title
|
||||
self.ignore_nan = ignore_nan
|
||||
self.stratified = stratified == "1"
|
||||
self.stratified_class = StratifiedKFold if self.stratified else KFold
|
||||
self.datasets = datasets
|
||||
@@ -184,7 +186,14 @@ class Experiment:
|
||||
self.leaves = []
|
||||
self.depths = []
|
||||
|
||||
def _n_fold_crossval(self, X, y, hyperparameters):
|
||||
def _build_fit_params(self, name):
    """Assemble the extra fit parameters for dataset ``name``.

    Asks ``self.datasets`` for the states recorded under ``name``. When
    there are none (``None``), returns ``None``; otherwise returns a dict
    with the states under ``"state_names"`` and the result of
    ``self.datasets.get_features()`` under ``"features"``.
    """
    state_names = self.datasets.get_states(name)
    if state_names is not None:
        feature_names = self.datasets.get_features()
        return {"state_names": state_names, "features": feature_names}
    return None
|
||||
|
||||
def _n_fold_crossval(self, name, X, y, hyperparameters):
|
||||
if self.scores != []:
|
||||
raise ValueError("Must init experiment before!")
|
||||
loop = tqdm(
|
||||
@@ -201,6 +210,7 @@ class Experiment:
|
||||
shuffle=True, random_state=random_state, n_splits=self.folds
|
||||
)
|
||||
clf = self._build_classifier(random_state, hyperparameters)
|
||||
fit_params = self._build_fit_params(name)
|
||||
self.version = Models.get_version(self.model_name, clf)
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore")
|
||||
@@ -209,10 +219,21 @@ class Experiment:
|
||||
X,
|
||||
y,
|
||||
cv=kfold,
|
||||
fit_params=fit_params,
|
||||
return_estimator=True,
|
||||
scoring=self.score_name,
|
||||
)
|
||||
self.scores.append(res["test_score"])
|
||||
if np.isnan(res["test_score"]).any():
|
||||
if not self.ignore_nan:
|
||||
print(res["test_score"])
|
||||
raise ValueError("NaN in results")
|
||||
results = []
|
||||
for item in res["test_score"]:
|
||||
if not np.isnan(item):
|
||||
results.append(item)
|
||||
else:
|
||||
results = res["test_score"]
|
||||
self.scores.append(results)
|
||||
self.times.append(res["fit_time"])
|
||||
for result_item in res["estimator"]:
|
||||
nodes_item, leaves_item, depth_item = Models.get_complexity(
|
||||
@@ -273,7 +294,7 @@ class Experiment:
|
||||
n_classes = len(np.unique(y))
|
||||
hyperparameters = self.hyperparameters_dict[name][1]
|
||||
self._init_experiment()
|
||||
self._n_fold_crossval(X, y, hyperparameters)
|
||||
self._n_fold_crossval(name, X, y, hyperparameters)
|
||||
self._add_results(name, hyperparameters, samp, feat, n_classes)
|
||||
self._output_results()
|
||||
self.duration = time.time() - now
|
||||
|
@@ -13,7 +13,7 @@ def main(args_test=None):
|
||||
arguments = Arguments(prog="be_main")
|
||||
arguments.xset("stratified").xset("score").xset("model", mandatory=True)
|
||||
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
|
||||
arguments.xset("report")
|
||||
arguments.xset("report").xset("ignore_nan")
|
||||
arguments.add_exclusive(
|
||||
["grid_paramfile", "best_paramfile", "hyperparameters"]
|
||||
)
|
||||
@@ -35,6 +35,7 @@ def main(args_test=None):
|
||||
grid_paramfile=args.grid_paramfile,
|
||||
progress_bar=not args.quiet,
|
||||
platform=args.platform,
|
||||
ignore_nan=args.ignore_nan,
|
||||
title=args.title,
|
||||
folds=args.n_folds,
|
||||
)
|
||||
|
@@ -7,3 +7,5 @@ stratified=0
|
||||
source_data=Tanveer
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
ignore_nan=0
|
||||
|
||||
|
@@ -6,3 +6,4 @@ stratified=0
|
||||
source_data=Arff
|
||||
seeds=[271, 314, 171]
|
||||
discretize=1
|
||||
ignore_nan=1
|
@@ -7,3 +7,4 @@ stratified=0
|
||||
source_data=Tanveer
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
ignore_nan=0
|
||||
|
@@ -7,3 +7,4 @@ stratified=0
|
||||
source_data=Surcov
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
ignore_nan=0
|
Reference in New Issue
Block a user