Mirror of https://github.com/Doctorado-ML/benchmark.git, synced 2025-08-16 07:55:54 +00:00
Add ignore_nan and fit_params to experiments
@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
             ("-p", "--hyperparameters"),
             {"type": str, "required": False, "default": "{}"},
         ],
+        "ignore_nan": [
+            ("--ignore-nan",),
+            {
+                "action": EnvDefault,
+                "envvar": "ignore_nan",
+                "required": True,
+                "help": "Ignore nan results",
+            },
+        ],
         "key": [
             ("-k", "--key"),
             {
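
EnvDefault is the project's own argparse action; given the envvar and required keys above, it presumably falls back to an environment variable when the flag is not given on the command line. A minimal sketch of that general pattern, with illustrative names only (not the repository's actual implementation):

import argparse
import os


class EnvDefaultSketch(argparse.Action):
    """Illustrative argparse action: fall back to an environment variable."""

    def __init__(self, envvar, required=True, default=None, **kwargs):
        # If the variable is set in the environment, use it as the default
        # and stop requiring the flag on the command line.
        if envvar in os.environ:
            default = os.environ[envvar]
            required = False
        super().__init__(default=default, required=required, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values)


parser = argparse.ArgumentParser()
parser.add_argument(
    "--ignore-nan", action=EnvDefaultSketch, envvar="ignore_nan", required=True
)
args = parser.parse_args(["--ignore-nan", "1"])
print(args.ignore_nan)  # "1"

Under this pattern, setting ignore_nan in the environment would satisfy the otherwise required --ignore-nan option.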
@@ -35,7 +35,7 @@ class DatasetsArff:
         df.dropna(axis=0, how="any", inplace=True)
         self.dataset = df
         X = df.drop(class_name, axis=1)
-        self.features = X.columns
+        self.features = X.columns.to_list()
         self.class_name = class_name
         y, _ = pd.factorize(df[class_name])
         X = X.to_numpy()
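
X.columns is a pandas Index object, while X.columns.to_list() yields a plain Python list of feature names, which iterates and serializes like any other list. A tiny illustration on hypothetical data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "class": [0, 1]})
X = df.drop("class", axis=1)

print(type(X.columns))      # <class 'pandas.core.indexes.base.Index'>
print(X.columns.to_list())  # ['a', 'b']  -- a plain Python list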
@@ -103,12 +103,11 @@ class Datasets:
         )
         self.discretize = envData["discretize"] == "1"
         self.dataset = source_name()
-        self.class_names = []
-        self.data_sets = []
         # initialize self.class_names & self.data_sets
         class_names, sets = self._init_names(dataset_name)
         self.class_names = class_names
         self.data_sets = sets
+        self.states = {}  # states of discretized variables
 
     def _init_names(self, dataset_name):
         file_name = os.path.join(self.dataset.folder(), Files.index)
@@ -161,6 +160,9 @@ class Datasets:
     def get_features(self):
         return self.dataset.features
 
+    def get_states(self, name):
+        return self.states[name] if name in self.states else None
+
     def get_continuous_features(self):
         return self.continuous_features_dataset
 
@@ -170,6 +172,12 @@ class Datasets:
     def get_dataset(self):
         return self.dataset.dataset
 
+    def build_states(self, name, X):
+        features = self.get_features()
+        self.states[name] = {
+            features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
+        }
+
     def load(self, name, dataframe=False):
         def get_range_features(X, name):
             c_features = self.continuous_features[name]
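
build_states records, per dataset, the distinct values each (discretized) feature takes, keyed by feature name. The same dict comprehension on a toy array, as a standalone sketch with hypothetical feature names:

import numpy as np

X = np.array([[0, 1], [1, 2], [0, 1]])  # three samples, two discretized features
features = ["f0", "f1"]                 # hypothetical feature names

states = {
    features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
}
print(states)  # {'f0': [0, 1], 'f1': [1, 2]}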
@@ -183,6 +191,7 @@ class Datasets:
         self.continuous_features_dataset = get_range_features(X, name)
         if self.discretize:
             X = self.discretize_dataset(X, y)
+            self.build_states(name, X)
         dataset = pd.DataFrame(X, columns=self.get_features())
         dataset[self.get_class_name()] = y
         self.dataset.dataset = dataset
@@ -112,6 +112,7 @@ class Experiment:
         platform,
         title,
         progress_bar=True,
+        ignore_nan=True,
         folds=5,
     ):
         today = datetime.now()
@@ -131,6 +132,7 @@ class Experiment:
         self.score_name = score_name
         self.model_name = model_name
         self.title = title
+        self.ignore_nan = ignore_nan
         self.stratified = stratified == "1"
         self.stratified_class = StratifiedKFold if self.stratified else KFold
         self.datasets = datasets
@@ -184,7 +186,14 @@ class Experiment:
         self.leaves = []
         self.depths = []
 
-    def _n_fold_crossval(self, X, y, hyperparameters):
+    def _build_fit_params(self, name):
+        states = self.datasets.get_states(name)
+        if states is None:
+            return None
+        features = self.datasets.get_features()
+        return {"state_names": states, "features": features}
+
+    def _n_fold_crossval(self, name, X, y, hyperparameters):
         if self.scores != []:
             raise ValueError("Must init experiment before!")
         loop = tqdm(
@@ -201,6 +210,7 @@ class Experiment:
                 shuffle=True, random_state=random_state, n_splits=self.folds
             )
             clf = self._build_classifier(random_state, hyperparameters)
+            fit_params = self._build_fit_params(name)
             self.version = Models.get_version(self.model_name, clf)
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore")
@@ -209,10 +219,21 @@ class Experiment:
                     X,
                     y,
                     cv=kfold,
+                    fit_params=fit_params,
                     return_estimator=True,
                     scoring=self.score_name,
                 )
-            self.scores.append(res["test_score"])
+            if np.isnan(res["test_score"]).any():
+                if not self.ignore_nan:
+                    print(res["test_score"])
+                    raise ValueError("NaN in results")
+                results = []
+                for item in res["test_score"]:
+                    if not np.isnan(item):
+                        results.append(item)
+            else:
+                results = res["test_score"]
+            self.scores.append(results)
             self.times.append(res["fit_time"])
             for result_item in res["estimator"]:
                 nodes_item, leaves_item, depth_item = Models.get_complexity(
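
scikit-learn's cross_validate unpacks the fit_params mapping as keyword arguments into each fold's fit call, which is how state_names and features can reach estimators whose fit accepts them; NaN fold scores are then either dropped or treated as fatal depending on ignore_nan. A self-contained sketch of both ideas with a toy estimator (all names below are illustrative; newer scikit-learn releases expose the same mechanism under the params keyword):

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_validate


class ToyClf(ClassifierMixin, BaseEstimator):
    """Toy classifier whose fit accepts the extra keyword arguments."""

    def fit(self, X, y, state_names=None, features=None):
        # state_names/features arrive here because cross_validate unpacks
        # fit_params into every fold's fit call.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        return np.full(len(X), self.classes_[0])


X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)
fit_params = {"state_names": {"f0": [0.0, 1.0]}, "features": ["f0", "f1", "f2"]}

res = cross_validate(
    ToyClf(), X, y, cv=5, fit_params=fit_params, scoring="accuracy"
)

scores = res["test_score"]
ignore_nan = True
if np.isnan(scores).any():
    if not ignore_nan:
        raise ValueError("NaN in results")
    scores = [s for s in scores if not np.isnan(s)]
print(scores)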
@@ -273,7 +294,7 @@ class Experiment:
             n_classes = len(np.unique(y))
             hyperparameters = self.hyperparameters_dict[name][1]
             self._init_experiment()
-            self._n_fold_crossval(X, y, hyperparameters)
+            self._n_fold_crossval(name, X, y, hyperparameters)
             self._add_results(name, hyperparameters, samp, feat, n_classes)
             self._output_results()
         self.duration = time.time() - now
@@ -13,7 +13,7 @@ def main(args_test=None):
     arguments = Arguments(prog="be_main")
     arguments.xset("stratified").xset("score").xset("model", mandatory=True)
     arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
-    arguments.xset("report")
+    arguments.xset("report").xset("ignore_nan")
     arguments.add_exclusive(
         ["grid_paramfile", "best_paramfile", "hyperparameters"]
     )
@@ -35,6 +35,7 @@ def main(args_test=None):
         grid_paramfile=args.grid_paramfile,
         progress_bar=not args.quiet,
         platform=args.platform,
+        ignore_nan=args.ignore_nan,
         title=args.title,
         folds=args.n_folds,
     )
@@ -7,3 +7,5 @@ stratified=0
 source_data=Tanveer
 seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 discretize=0
+ignore_nan=0
+
@@ -5,4 +5,5 @@ model=ODTE
 stratified=0
 source_data=Arff
 seeds=[271, 314, 171]
 discretize=1
+ignore_nan=1
@@ -7,3 +7,4 @@ stratified=0
 source_data=Tanveer
 seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 discretize=0
+ignore_nan=0
@@ -6,4 +6,5 @@ stratified=0
 # Source of data Tanveer/Surcov
 source_data=Surcov
 seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
 discretize=0
+ignore_nan=0