Merge pull request #9 from Doctorado-ML/continuous_features

Continuous features
Authored by Ricardo Montañana Gómez on 2023-01-15 10:55:49 +01:00, committed by GitHub.
30 changed files with 234 additions and 103 deletions

View File

@@ -18,7 +18,7 @@ jobs:
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }} - name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2 uses: actions/setup-python@v4
with: with:
python-version: ${{ matrix.python }} python-version: ${{ matrix.python }}
# Make dot command available in the environment # Make dot command available in the environment
@@ -53,7 +53,7 @@ jobs:
coverage run -m unittest -v benchmark.tests coverage run -m unittest -v benchmark.tests
coverage xml coverage xml
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
uses: codecov/codecov-action@v1 uses: codecov/codecov-action@v3
with: with:
token: ${{ secrets.CODECOV_TOKEN }} token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml files: ./coverage.xml

View File

@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
("-p", "--hyperparameters"), ("-p", "--hyperparameters"),
{"type": str, "required": False, "default": "{}"}, {"type": str, "required": False, "default": "{}"},
], ],
"ignore_nan": [
("--ignore-nan",),
{
"default": False,
"action": "store_true",
"required": False,
"help": "Ignore nan results",
},
],
"key": [ "key": [
("-k", "--key"), ("-k", "--key"),
{ {
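The new ignore_nan entry follows the same pattern as the existing argument definitions: a tuple of command-line flags plus an argparse keyword dictionary. As a rough stand-alone sketch using plain argparse (the Arguments/xset wrapper itself is not shown in this hunk, so the direct add_argument call below is an assumption):

import argparse

# Hypothetical stand-alone equivalent of the "ignore_nan" definition above
parser = argparse.ArgumentParser(prog="be_main")
parser.add_argument(
    "--ignore-nan",
    default=False,
    action="store_true",
    required=False,
    help="Ignore nan results",
)
args = parser.parse_args(["--ignore-nan"])
print(args.ignore_nan)  # True when the flag is present, False otherwise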

View File

@@ -2,10 +2,11 @@ import os
from types import SimpleNamespace from types import SimpleNamespace
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import json
from scipy.io import arff from scipy.io import arff
from .Utils import Files from .Utils import Files
from .Arguments import EnvData from .Arguments import EnvData
from mdlp.discretization import MDLP from fimdlp.mdlp import FImdlp
class Diterator: class Diterator:
@@ -27,6 +28,12 @@ class DatasetsArff:
def folder(): def folder():
return "datasets" return "datasets"
@staticmethod
def get_range_features(X, c_features):
if c_features.strip() == "all":
return list(range(X.shape[1]))
return json.loads(c_features)
def load(self, name, class_name): def load(self, name, class_name):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = arff.loadarff(file_name) data = arff.loadarff(file_name)
@@ -34,7 +41,7 @@ class DatasetsArff:
df.dropna(axis=0, how="any", inplace=True) df.dropna(axis=0, how="any", inplace=True)
self.dataset = df self.dataset = df
X = df.drop(class_name, axis=1) X = df.drop(class_name, axis=1)
self.features = X.columns self.features = X.columns.to_list()
self.class_name = class_name self.class_name = class_name
y, _ = pd.factorize(df[class_name]) y, _ = pd.factorize(df[class_name])
X = X.to_numpy() X = X.to_numpy()
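get_range_features turns the per-dataset continuous-features string from the index file into a list of column positions: the literal "all" selects every column, anything else is parsed as a JSON list of indices. The same logic, lifted out as a minimal sketch:

import json
import numpy as np

def get_range_features(X, c_features):
    # "all" means every column is continuous; otherwise the string is a JSON list
    if c_features.strip() == "all":
        return list(range(X.shape[1]))
    return json.loads(c_features)

X = np.zeros((10, 4))
print(get_range_features(X, "all"))     # [0, 1, 2, 3]
print(get_range_features(X, "[0, 1]"))  # [0, 1]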
@@ -50,6 +57,10 @@ class DatasetsTanveer:
def folder(): def folder():
return "data" return "data"
@staticmethod
def get_range_features(X, name):
return []
def load(self, name, *args): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
@@ -75,6 +86,10 @@ class DatasetsSurcov:
def folder(): def folder():
return "datasets" return "datasets"
@staticmethod
def get_range_features(X, name):
return []
def load(self, name, *args): def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name)) file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv( data = pd.read_csv(
@@ -102,16 +117,16 @@ class Datasets:
) )
self.discretize = envData["discretize"] == "1" self.discretize = envData["discretize"] == "1"
self.dataset = source_name() self.dataset = source_name()
self.class_names = []
self.data_sets = []
# initialize self.class_names & self.data_sets # initialize self.class_names & self.data_sets
class_names, sets = self._init_names(dataset_name) class_names, sets = self._init_names(dataset_name)
self.class_names = class_names self.class_names = class_names
self.data_sets = sets self.data_sets = sets
self.states = {} # states of discretized variables
def _init_names(self, dataset_name): def _init_names(self, dataset_name):
file_name = os.path.join(self.dataset.folder(), Files.index) file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class" default_class = "class"
self.continuous_features = {}
with open(file_name) as f: with open(file_name) as f:
sets = f.read().splitlines() sets = f.read().splitlines()
class_names = [default_class] * len(sets) class_names = [default_class] * len(sets)
@@ -119,10 +134,14 @@ class Datasets:
result = [] result = []
class_names = [] class_names = []
for data in sets: for data in sets:
name, class_name = data.split(",") name, class_name, features = data.split(",", 2)
result.append(name) result.append(name)
class_names.append(class_name) class_names.append(class_name)
self.continuous_features[name] = features
sets = result sets = result
else:
for name in sets:
self.continuous_features[name] = None
# Set as dataset list the dataset passed as argument # Set as dataset list the dataset passed as argument
if dataset_name is None: if dataset_name is None:
return class_names, sets return class_names, sets
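Because the third field of an index line may itself contain commas (a JSON list such as [0, 1]), the loader splits each line at most twice and keeps the remainder as the raw continuous-features string. A minimal sketch over the two index lines that appear later in this changeset:

lines = ["iris,class,all", "wine,class,[0, 1]"]
continuous_features = {}
names, class_names = [], []

for data in lines:
    # maxsplit=2 keeps "[0, 1]" intact even though it contains a comma
    name, class_name, features = data.split(",", 2)
    names.append(name)
    class_names.append(class_name)
    continuous_features[name] = features

print(continuous_features)  # {'iris': 'all', 'wine': '[0, 1]'}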
@@ -137,6 +156,7 @@ class Datasets:
self.discretize = False self.discretize = False
X, y = self.load(name) X, y = self.load(name)
attr = SimpleNamespace() attr = SimpleNamespace()
attr.dataset = name
values, counts = np.unique(y, return_counts=True) values, counts = np.unique(y, return_counts=True)
comp = "" comp = ""
sep = "" sep = ""
@@ -147,24 +167,41 @@ class Datasets:
attr.classes = len(np.unique(y)) attr.classes = len(np.unique(y))
attr.samples = X.shape[0] attr.samples = X.shape[0]
attr.features = X.shape[1] attr.features = X.shape[1]
attr.cont_features = len(self.get_continuous_features())
self.discretize = tmp self.discretize = tmp
return attr return attr
def get_features(self): def get_features(self):
return self.dataset.features return self.dataset.features
def get_states(self, name):
return self.states[name] if name in self.states else None
def get_continuous_features(self):
return self.continuous_features_dataset
def get_class_name(self): def get_class_name(self):
return self.dataset.class_name return self.dataset.class_name
def get_dataset(self): def get_dataset(self):
return self.dataset.dataset return self.dataset.dataset
def build_states(self, name, X):
features = self.get_features()
self.states[name] = {
features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
}
def load(self, name, dataframe=False): def load(self, name, dataframe=False):
try: try:
class_name = self.class_names[self.data_sets.index(name)] class_name = self.class_names[self.data_sets.index(name)]
X, y = self.dataset.load(name, class_name) X, y = self.dataset.load(name, class_name)
self.continuous_features_dataset = self.dataset.get_range_features(
X, self.continuous_features[name]
)
if self.discretize: if self.discretize:
X = self.discretize_dataset(X, y) X = self.discretize_dataset(X, y)
self.build_states(name, X)
dataset = pd.DataFrame(X, columns=self.get_features()) dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y dataset[self.get_class_name()] = y
self.dataset.dataset = dataset self.dataset.dataset = dataset
@@ -188,9 +225,8 @@ class Datasets:
------- -------
tuple (X, y) of numpy.ndarray tuple (X, y) of numpy.ndarray
""" """
discretiz = MDLP(random_state=17, dtype=np.int32) discretiz = FImdlp(algorithm=0)
Xdisc = discretiz.fit_transform(X, y) return discretiz.fit_transform(X, y)
return Xdisc
def __iter__(self) -> Diterator: def __iter__(self) -> Diterator:
return Diterator(self.data_sets) return Diterator(self.data_sets)
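The discretizer moves from mdlp.discretization.MDLP to fimdlp.mdlp.FImdlp, and the discretized value sets are cached per feature in self.states so they can later be handed to classifiers as state_names. A condensed sketch of those two helpers, assuming fimdlp is installed and that FImdlp(algorithm=0) exposes the scikit-learn style fit_transform used above:

import numpy as np
from fimdlp.mdlp import FImdlp  # replaces mdlp-discretization

def discretize_dataset(X, y):
    # MDLP discretization of the continuous columns, as configured in the diff
    return FImdlp(algorithm=0).fit_transform(X, y)

def build_states(features, X):
    # one entry per feature with the list of discrete values it takes
    return {
        features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
    }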

View File

@@ -112,6 +112,7 @@ class Experiment:
platform, platform,
title, title,
progress_bar=True, progress_bar=True,
ignore_nan=True,
folds=5, folds=5,
): ):
today = datetime.now() today = datetime.now()
@@ -131,6 +132,7 @@ class Experiment:
self.score_name = score_name self.score_name = score_name
self.model_name = model_name self.model_name = model_name
self.title = title self.title = title
self.ignore_nan = ignore_nan
self.stratified = stratified == "1" self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets self.datasets = datasets
@@ -184,7 +186,14 @@ class Experiment:
self.leaves = [] self.leaves = []
self.depths = [] self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters): def _build_fit_params(self, name):
states = self.datasets.get_states(name)
if states is None:
return None
features = self.datasets.get_features()
return {"state_names": states, "features": features}
def _n_fold_crossval(self, name, X, y, hyperparameters):
if self.scores != []: if self.scores != []:
raise ValueError("Must init experiment before!") raise ValueError("Must init experiment before!")
loop = tqdm( loop = tqdm(
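_build_fit_params bundles the cached discretization states and the feature names so cross_validate can forward them to the estimator on every fold through fit_params; it returns None when the dataset was not discretized. Schematically, using the iris states expected by the tests further down (the exact shape of what the estimator does with them is up to the model):

# What _build_fit_params returns for a discretized dataset (None otherwise)
fit_params = {
    "state_names": {
        "sepallength": [0, 1, 2],
        "sepalwidth": [0, 1, 3, 4],
        "petallength": [0, 1, 2, 3],
        "petalwidth": [0, 1, 2, 3],
    },
    "features": ["sepallength", "sepalwidth", "petallength", "petalwidth"],
}
# cross_validate(clf, X, y, cv=kfold, fit_params=fit_params, ...) then calls
# clf.fit(X_train, y_train, **fit_params) on every fold.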
@@ -201,6 +210,7 @@ class Experiment:
shuffle=True, random_state=random_state, n_splits=self.folds shuffle=True, random_state=random_state, n_splits=self.folds
) )
clf = self._build_classifier(random_state, hyperparameters) clf = self._build_classifier(random_state, hyperparameters)
fit_params = self._build_fit_params(name)
self.version = Models.get_version(self.model_name, clf) self.version = Models.get_version(self.model_name, clf)
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings("ignore") warnings.filterwarnings("ignore")
@@ -209,11 +219,19 @@ class Experiment:
X, X,
y, y,
cv=kfold, cv=kfold,
fit_params=fit_params,
return_estimator=True, return_estimator=True,
scoring=self.score_name, scoring=self.score_name,
) )
self.scores.append(res["test_score"]) if np.isnan(res["test_score"]).any():
self.times.append(res["fit_time"]) if not self.ignore_nan:
print(res["test_score"])
raise ValueError("NaN in results")
results = res["test_score"][~np.isnan(res["test_score"])]
else:
results = res["test_score"]
self.scores.extend(results)
self.times.extend(res["fit_time"])
for result_item in res["estimator"]: for result_item in res["estimator"]:
nodes_item, leaves_item, depth_item = Models.get_complexity( nodes_item, leaves_item, depth_item = Models.get_complexity(
self.model_name, result_item self.model_name, result_item
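cross_validate can report NaN fold scores when an estimator produces invalid predictions (the MockModel added below exists precisely to provoke this). With ignore_nan set the offending folds are dropped; otherwise the scores are printed and a ValueError is raised. The same filtering as a small stand-alone sketch, where res is the dictionary returned by sklearn.model_selection.cross_validate:

import numpy as np

def filter_scores(res, ignore_nan):
    scores = res["test_score"]
    if np.isnan(scores).any():
        if not ignore_nan:
            print(scores)
            raise ValueError("NaN in results")
        # keep only the folds that produced a valid score
        scores = scores[~np.isnan(scores)]
    return scores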
@@ -273,7 +291,7 @@ class Experiment:
n_classes = len(np.unique(y)) n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1] hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment() self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters) self._n_fold_crossval(name, X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes) self._add_results(name, hyperparameters, samp, feat, n_classes)
self._output_results() self._output_results()
self.duration = time.time() - now self.duration = time.time() - now

View File

@@ -15,6 +15,24 @@ from xgboost import XGBClassifier
import sklearn import sklearn
import xgboost import xgboost
import random
class MockModel(SVC):
# Only used for testing
def predict(self, X):
if random.random() < 0.1:
return [float("NaN")] * len(X)
return super().predict(X)
def nodes_leaves(self):
return 0, 0
def fit(self, X, y, **kwargs):
kwargs.pop("state_names", None)
kwargs.pop("features", None)
return super().fit(X, y, **kwargs)
class Models: class Models:
@staticmethod @staticmethod
@@ -22,27 +40,27 @@ class Models:
return { return {
"STree": Stree(random_state=random_state), "STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state), "TAN": TAN(random_state=random_state),
"KDB": KDB(k=3), "KDB": KDB(k=2),
"AODE": AODE(random_state=random_state), "AODE": AODE(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state), "Wodt": Wodt(random_state=random_state),
"SVC": SVC(random_state=random_state), "SVC": SVC(random_state=random_state),
"ODTE": Odte( "ODTE": Odte(
base_estimator=Stree(random_state=random_state), estimator=Stree(random_state=random_state),
random_state=random_state, random_state=random_state,
), ),
"BaggingStree": BaggingClassifier( "BaggingStree": BaggingClassifier(
base_estimator=Stree(random_state=random_state), estimator=Stree(random_state=random_state),
random_state=random_state, random_state=random_state,
), ),
"BaggingWodt": BaggingClassifier( "BaggingWodt": BaggingClassifier(
base_estimator=Wodt(random_state=random_state), estimator=Wodt(random_state=random_state),
random_state=random_state, random_state=random_state,
), ),
"XGBoost": XGBClassifier(random_state=random_state), "XGBoost": XGBClassifier(random_state=random_state),
"AdaBoostStree": AdaBoostClassifier( "AdaBoostStree": AdaBoostClassifier(
base_estimator=Stree( estimator=Stree(
random_state=random_state, random_state=random_state,
), ),
algorithm="SAMME", algorithm="SAMME",
@@ -50,6 +68,7 @@ class Models:
), ),
"GBC": GradientBoostingClassifier(random_state=random_state), "GBC": GradientBoostingClassifier(random_state=random_state),
"RandomForest": RandomForestClassifier(random_state=random_state), "RandomForest": RandomForestClassifier(random_state=random_state),
"Mock": MockModel(random_state=random_state),
} }
@staticmethod @staticmethod
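The base_estimator to estimator renames across Odte, BaggingClassifier and AdaBoostClassifier (and the base_estimator__* grid keys later in this changeset) track the ensemble parameter rename introduced in scikit-learn 1.2; the version attribution is an inference, not stated in the diff. A minimal sketch of the new spelling:

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Old, deprecated spelling: BaggingClassifier(base_estimator=...)
clf = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=57)

# Nested grid-search keys follow the same rename
param_grid = {"estimator__max_depth": [3, 5], "n_estimators": [100]}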

View File

@@ -684,7 +684,7 @@ class ReportDatasets:
"bg_color": self.color1, "bg_color": self.color1,
} }
) )
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format) self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
self.sheet.merge_range( self.sheet.merge_range(
1, 1,
0, 0,
@@ -697,24 +697,24 @@ class ReportDatasets:
1, 1,
1, 1,
1, 1,
3, 4,
"Cross validation", "Cross validation",
merge_format_subheader_right, merge_format_subheader_right,
) )
self.sheet.write( self.sheet.write(
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left 1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
) )
self.sheet.merge_range( self.sheet.merge_range(
2, 2,
1, 1,
2, 2,
3, 4,
"Stratified", "Stratified",
merge_format_subheader_right, merge_format_subheader_right,
) )
self.sheet.write( self.sheet.write(
2, 2,
4, 5,
f"{'True' if self.env['stratified']=='1' else 'False'}", f"{'True' if self.env['stratified']=='1' else 'False'}",
merge_format_subheader_left, merge_format_subheader_left,
) )
@@ -722,13 +722,13 @@ class ReportDatasets:
3, 3,
1, 1,
3, 3,
3, 4,
"Discretized", "Discretized",
merge_format_subheader_right, merge_format_subheader_right,
) )
self.sheet.write( self.sheet.write(
3, 3,
4, 5,
f"{'True' if self.env['discretize']=='1' else 'False'}", f"{'True' if self.env['discretize']=='1' else 'False'}",
merge_format_subheader_left, merge_format_subheader_left,
) )
@@ -736,18 +736,19 @@ class ReportDatasets:
4, 4,
1, 1,
4, 4,
3, 4,
"Seeds", "Seeds",
merge_format_subheader_right, merge_format_subheader_right,
) )
self.sheet.write( self.sheet.write(
4, 4, f"{self.env['seeds']}", merge_format_subheader_left 4, 5, f"{self.env['seeds']}", merge_format_subheader_left
) )
self.update_max_length(len(self.env["seeds"]) + 1) self.update_max_length(len(self.env["seeds"]) + 1)
header_cols = [ header_cols = [
("Dataset", 30), ("Dataset", 30),
("Samples", 10), ("Samples", 10),
("Features", 10), ("Features", 10),
("Continuous", 10),
("Classes", 10), ("Classes", 10),
("Balance", 50), ("Balance", 50),
] ]
@@ -767,7 +768,7 @@ class ReportDatasets:
def footer(self): def footer(self):
# set Balance column width to max length # set Balance column width to max length
self.sheet.set_column(4, 4, self.max_length) self.sheet.set_column(5, 5, self.max_length)
self.sheet.freeze_panes(6, 1) self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines(2) self.sheet.hide_gridlines(2)
if self.close: if self.close:
@@ -789,8 +790,9 @@ class ReportDatasets:
self.sheet.write(self.row, col, result.dataset, normal) self.sheet.write(self.row, col, result.dataset, normal)
self.sheet.write(self.row, col + 1, result.samples, integer) self.sheet.write(self.row, col + 1, result.samples, integer)
self.sheet.write(self.row, col + 2, result.features, integer) self.sheet.write(self.row, col + 2, result.features, integer)
self.sheet.write(self.row, col + 3, result.classes, normal) self.sheet.write(self.row, col + 3, result.cont_features, integer)
self.sheet.write(self.row, col + 4, result.balance, normal) self.sheet.write(self.row, col + 4, result.classes, normal)
self.sheet.write(self.row, col + 5, result.balance, normal)
self.update_max_length(len(result.balance)) self.update_max_length(len(result.balance))
self.row += 1 self.row += 1
@@ -807,11 +809,11 @@ class ReportDatasets:
print(color_line, end="") print(color_line, end="")
print(self.header_text) print(self.header_text)
print("") print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
print("=" * 30 + " ====== ===== === " + "=" * 60) print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
for dataset in data_sets: for dataset in data_sets:
attributes = data_sets.get_attributes(dataset) attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel: if self.excel:
self.print_line(attributes) self.print_line(attributes)
color_line = ( color_line = (
@@ -823,8 +825,8 @@ class ReportDatasets:
print(color_line, end="") print(color_line, end="")
print( print(
f"{dataset:30s} {attributes.samples:6,d} " f"{dataset:30s} {attributes.samples:6,d} "
f"{attributes.features:5,d} {attributes.classes:3d} " f"{attributes.features:5,d} {attributes.cont_features:4,d}"
f"{attributes.balance:40s}" f" {attributes.classes:3d} {attributes.balance:40s}"
) )
if self.excel: if self.excel:
self.footer() self.footer()
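With the new Continuous column every merge range and column index in the Excel report shifts one position to the right, and the plain-text report gains a Cont field. A small sketch of the new console line, using values taken from the expected report fixtures later in this changeset:

from types import SimpleNamespace

attributes = SimpleNamespace(
    samples=625, features=4, cont_features=0, classes=3,
    balance=" 7.84%/ 46.08%/ 46.08%",
)
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
print(
    f"{'balance-scale':30s} {attributes.samples:6,d} "
    f"{attributes.features:5,d} {attributes.cont_features:4,d}"
    f" {attributes.classes:3d} {attributes.balance:40s}"
)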

View File

@@ -46,7 +46,7 @@ def main(args_test=None):
'{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": ' '{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": '
'"ovr"}', '"ovr"}',
'{"C": 5, "kernel": "rbf", "gamma": "auto"}', '{"C": 5, "kernel": "rbf", "gamma": "auto"}',
'{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", ' '{"C": 0.05, "max_iter": 10000, "kernel": "liblinear", '
'"multiclass_strategy": "ovr"}', '"multiclass_strategy": "ovr"}',
'{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}', '{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": ' '{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": '
@@ -97,7 +97,7 @@ def main(args_test=None):
for item in results: for item in results:
results_tmp = {"n_jobs": [-1], "n_estimators": [100]} results_tmp = {"n_jobs": [-1], "n_estimators": [100]}
for key, value in results[item].items(): for key, value in results[item].items():
new_key = f"base_estimator__{key}" new_key = f"estimator__{key}"
try: try:
results_tmp[new_key] = sorted(value) results_tmp[new_key] = sorted(value)
except TypeError: except TypeError:
@@ -111,6 +111,7 @@ def main(args_test=None):
t2 = sorted([x for x in value if isinstance(x, str)]) t2 = sorted([x for x in value if isinstance(x, str)])
results_tmp[new_key] = t1 + t2 results_tmp[new_key] = t1 + t2
output.append(results_tmp) output.append(results_tmp)
# save results # save results
file_name = Files.grid_input(args.score, args.model) file_name = Files.grid_input(args.score, args.model)
file_output = os.path.join(Folders.results, file_name) file_output = os.path.join(Folders.results, file_name)

View File

@@ -13,7 +13,7 @@ def main(args_test=None):
arguments = Arguments(prog="be_main") arguments = Arguments(prog="be_main")
arguments.xset("stratified").xset("score").xset("model", mandatory=True) arguments.xset("stratified").xset("score").xset("model", mandatory=True)
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title") arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
arguments.xset("report") arguments.xset("report").xset("ignore_nan")
arguments.add_exclusive( arguments.add_exclusive(
["grid_paramfile", "best_paramfile", "hyperparameters"] ["grid_paramfile", "best_paramfile", "hyperparameters"]
) )
@@ -35,6 +35,7 @@ def main(args_test=None):
grid_paramfile=args.grid_paramfile, grid_paramfile=args.grid_paramfile,
progress_bar=not args.quiet, progress_bar=not args.quiet,
platform=args.platform, platform=args.platform,
ignore_nan=args.ignore_nan,
title=args.title, title=args.title,
folds=args.n_folds, folds=args.n_folds,
) )

View File

@@ -6,4 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0 discretize=0

View File

@@ -6,4 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov # Source of data Tanveer/Surcov
source_data=Tanveer source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0 discretize=0

View File

@@ -18,7 +18,7 @@ class BestResultTest(TestBase):
"C": 7, "C": 7,
"gamma": 0.1, "gamma": 0.1,
"kernel": "rbf", "kernel": "rbf",
"max_iter": 10000.0, "max_iter": 10000,
"multiclass_strategy": "ovr", "multiclass_strategy": "ovr",
}, },
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json", "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",

View File

@@ -1,4 +1,3 @@
import shutil
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import Randomized from ..Experiments import Randomized
from ..Datasets import Datasets from ..Datasets import Datasets
@@ -17,10 +16,6 @@ class DatasetTest(TestBase):
self.set_env(".env.dist") self.set_env(".env.dist")
return super().tearDown() return super().tearDown()
@staticmethod
def set_env(env):
shutil.copy(env, ".env")
def test_Randomized(self): def test_Randomized(self):
expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
self.assertSequenceEqual(Randomized.seeds(), expected) self.assertSequenceEqual(Randomized.seeds(), expected)

View File

@@ -1,4 +1,6 @@
import json import json
from io import StringIO
from unittest.mock import patch
from .TestBase import TestBase from .TestBase import TestBase
from ..Experiments import Experiment from ..Experiments import Experiment
from ..Datasets import Datasets from ..Datasets import Datasets
@@ -8,10 +10,12 @@ class ExperimentTest(TestBase):
def setUp(self): def setUp(self):
self.exp = self.build_exp() self.exp = self.build_exp()
def build_exp(self, hyperparams=False, grid=False): def build_exp(
self, hyperparams=False, grid=False, model="STree", ignore_nan=False
):
params = { params = {
"score_name": "accuracy", "score_name": "accuracy",
"model_name": "STree", "model_name": model,
"stratified": "0", "stratified": "0",
"datasets": Datasets(), "datasets": Datasets(),
"hyperparams_dict": "{}", "hyperparams_dict": "{}",
@@ -21,6 +25,7 @@ class ExperimentTest(TestBase):
"title": "Test", "title": "Test",
"progress_bar": False, "progress_bar": False,
"folds": 2, "folds": 2,
"ignore_nan": ignore_nan,
} }
return Experiment(**params) return Experiment(**params)
@@ -31,6 +36,7 @@ class ExperimentTest(TestBase):
], ],
".", ".",
) )
self.set_env(".env.dist")
return super().tearDown() return super().tearDown()
def test_build_hyperparams_file(self): def test_build_hyperparams_file(self):
@@ -46,7 +52,7 @@ class ExperimentTest(TestBase):
"C": 7, "C": 7,
"gamma": 0.1, "gamma": 0.1,
"kernel": "rbf", "kernel": "rbf",
"max_iter": 10000.0, "max_iter": 10000,
"multiclass_strategy": "ovr", "multiclass_strategy": "ovr",
}, },
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json", "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",
@@ -89,7 +95,7 @@ class ExperimentTest(TestBase):
def test_exception_n_fold_crossval(self): def test_exception_n_fold_crossval(self):
self.exp.do_experiment() self.exp.do_experiment()
with self.assertRaises(ValueError): with self.assertRaises(ValueError):
self.exp._n_fold_crossval([], [], {}) self.exp._n_fold_crossval("", [], [], {})
def test_do_experiment(self): def test_do_experiment(self):
self.exp.do_experiment() self.exp.do_experiment()
@@ -131,3 +137,39 @@ class ExperimentTest(TestBase):
): ):
for key, value in expected_result.items(): for key, value in expected_result.items():
self.assertEqual(computed_result[key], value) self.assertEqual(computed_result[key], value)
def test_build_fit_parameters(self):
self.set_env(".env.arff")
expected = {
"state_names": {
"sepallength": [0, 1, 2],
"sepalwidth": [0, 1, 3, 4],
"petallength": [0, 1, 2, 3],
"petalwidth": [0, 1, 2, 3],
},
"features": [
"sepallength",
"sepalwidth",
"petallength",
"petalwidth",
],
}
exp = self.build_exp(model="TAN")
X, y = exp.datasets.load("iris")
computed = exp._build_fit_params("iris")
for key, value in expected["state_names"].items():
self.assertEqual(computed["state_names"][key], value)
for feature in expected["features"]:
self.assertIn(feature, computed["features"])
@patch("sys.stdout", new_callable=StringIO)
def test_experiment_with_nan_not_ignored(self, mock_output):
exp = self.build_exp(model="Mock")
self.assertRaises(ValueError, exp.do_experiment)
output_text = mock_output.getvalue().splitlines()
expected = "[ nan 0.8974359]"
self.assertEqual(expected, output_text[0])
def test_experiment_with_nan_ignored(self):
self.exp = self.build_exp(model="Mock", ignore_nan=True)
self.exp.do_experiment()

View File

@@ -70,19 +70,19 @@ class ModelTest(TestBase):
def test_BaggingStree(self): def test_BaggingStree(self):
clf = Models.get_model("BaggingStree") clf = Models.get_model("BaggingStree")
self.assertIsInstance(clf, BaggingClassifier) self.assertIsInstance(clf, BaggingClassifier)
clf_base = clf.base_estimator clf_base = clf.estimator
self.assertIsInstance(clf_base, Stree) self.assertIsInstance(clf_base, Stree)
def test_BaggingWodt(self): def test_BaggingWodt(self):
clf = Models.get_model("BaggingWodt") clf = Models.get_model("BaggingWodt")
self.assertIsInstance(clf, BaggingClassifier) self.assertIsInstance(clf, BaggingClassifier)
clf_base = clf.base_estimator clf_base = clf.estimator
self.assertIsInstance(clf_base, Wodt) self.assertIsInstance(clf_base, Wodt)
def test_AdaBoostStree(self): def test_AdaBoostStree(self):
clf = Models.get_model("AdaBoostStree") clf = Models.get_model("AdaBoostStree")
self.assertIsInstance(clf, AdaBoostClassifier) self.assertIsInstance(clf, AdaBoostClassifier)
clf_base = clf.base_estimator clf_base = clf.estimator
self.assertIsInstance(clf_base, Stree) self.assertIsInstance(clf_base, Stree)
def test_unknown_classifier(self): def test_unknown_classifier(self):

View File

@@ -4,6 +4,7 @@ import pathlib
import sys import sys
import csv import csv
import unittest import unittest
import shutil
from importlib import import_module from importlib import import_module
from io import StringIO from io import StringIO
from unittest.mock import patch from unittest.mock import patch
@@ -19,6 +20,10 @@ class TestBase(unittest.TestCase):
self.stree_version = "1.2.4" self.stree_version = "1.2.4"
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
@staticmethod
def set_env(env):
shutil.copy(env, ".env")
def remove_files(self, files, folder): def remove_files(self, files, folder):
for file_name in files: for file_name in files:
file_name = os.path.join(folder, file_name) file_name = os.path.join(folder, file_name)

View File

@@ -1,2 +1,2 @@
iris,class iris,class,all
wine,class wine,class,[0, 1]

View File

@@ -1 +1 @@
{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]} {"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]}

View File

@@ -17,10 +17,10 @@
"features": 4, "features": 4,
"classes": 3, "classes": 3,
"hyperparameters": { "hyperparameters": {
"C": 10000.0, "C": 10000,
"gamma": 0.1, "gamma": 0.1,
"kernel": "rbf", "kernel": "rbf",
"max_iter": 10000.0, "max_iter": 10000,
"multiclass_strategy": "ovr" "multiclass_strategy": "ovr"
}, },
"nodes": 7.0, "nodes": 7.0,
@@ -40,7 +40,7 @@
"C": 7, "C": 7,
"gamma": 0.1, "gamma": 0.1,
"kernel": "rbf", "kernel": "rbf",
"max_iter": 10000.0, "max_iter": 10000,
"multiclass_strategy": "ovr" "multiclass_strategy": "ovr"
}, },
"nodes": 3.0, "nodes": 3.0,

View File

@@ -27,7 +27,7 @@ class BePrintStrees(TestBase):
stdout.getvalue(), f"File {file_name} generated\n" stdout.getvalue(), f"File {file_name} generated\n"
) )
computed_size = os.path.getsize(file_name) computed_size = os.path.getsize(file_name)
self.assertGreater(computed_size, 25000) self.assertGreater(computed_size, 24500)
def test_be_print_strees_dataset_color(self): def test_be_print_strees_dataset_color(self):
for name in self.datasets: for name in self.datasets:

View File

@@ -6,13 +6,13 @@
"n_estimators": [ "n_estimators": [
100 100
], ],
"base_estimator__C": [ "estimator__C": [
1.0 1.0
], ],
"base_estimator__kernel": [ "estimator__kernel": [
"linear" "linear"
], ],
"base_estimator__multiclass_strategy": [ "estimator__multiclass_strategy": [
"ovo" "ovo"
] ]
}, },
@@ -23,7 +23,7 @@
"n_estimators": [ "n_estimators": [
100 100
], ],
"base_estimator__C": [ "estimator__C": [
0.001, 0.001,
0.0275, 0.0275,
0.05, 0.05,
@@ -36,10 +36,10 @@
7, 7,
10000.0 10000.0
], ],
"base_estimator__kernel": [ "estimator__kernel": [
"liblinear" "liblinear"
], ],
"base_estimator__multiclass_strategy": [ "estimator__multiclass_strategy": [
"ovr" "ovr"
] ]
}, },
@@ -50,7 +50,7 @@
"n_estimators": [ "n_estimators": [
100 100
], ],
"base_estimator__C": [ "estimator__C": [
0.05, 0.05,
1.0, 1.0,
1.05, 1.05,
@@ -62,7 +62,7 @@
57, 57,
10000.0 10000.0
], ],
"base_estimator__gamma": [ "estimator__gamma": [
0.001, 0.001,
0.1, 0.1,
0.14, 0.14,
@@ -70,10 +70,10 @@
"auto", "auto",
"scale" "scale"
], ],
"base_estimator__kernel": [ "estimator__kernel": [
"rbf" "rbf"
], ],
"base_estimator__multiclass_strategy": [ "estimator__multiclass_strategy": [
"ovr" "ovr"
] ]
}, },
@@ -84,20 +84,20 @@
"n_estimators": [ "n_estimators": [
100 100
], ],
"base_estimator__C": [ "estimator__C": [
0.05, 0.05,
0.2, 0.2,
1.0, 1.0,
8.25 8.25
], ],
"base_estimator__gamma": [ "estimator__gamma": [
0.1, 0.1,
"scale" "scale"
], ],
"base_estimator__kernel": [ "estimator__kernel": [
"poly" "poly"
], ],
"base_estimator__multiclass_strategy": [ "estimator__multiclass_strategy": [
"ovo", "ovo",
"ovr" "ovr"
] ]

View File

@@ -9,7 +9,7 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= =============== ============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 23.32 12.16 6.44 0.840160±0.0304 0.013745±0.0019 {'splitter': 'best', 'max_features': 'auto'} balance-scale 625 4 3 23.32 12.16 6.44 0.840160±0.0304 0.013745±0.0019 {'splitter': 'best', 'max_features': 'auto'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
************************************************************************************************************************* *************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0422 * * accuracy compared to STree_default (liblinear-ovr) .: 0.0422 *
************************************************************************************************************************* *************************************************************************************************************************

View File

@@ -32,7 +32,7 @@
7;9;"0.0150468069702512" 7;9;"0.0150468069702512"
7;10;"0.01404867172241211" 7;10;"0.01404867172241211"
7;11;"0.002026269126958884" 7;11;"0.002026269126958884"
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons" 8;1;"balloons"
8;2;"16" 8;2;"16"
8;3;"4" 8;3;"4"
@@ -44,5 +44,5 @@
8;9;"0.2850146195080759" 8;9;"0.2850146195080759"
8;10;"0.0008541679382324218" 8;10;"0.0008541679382324218"
8;11;"3.629469326417878e-05" 8;11;"3.629469326417878e-05"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454" 10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"

View File

@@ -32,7 +32,7 @@
7;10;0.0150468069702512 7;10;0.0150468069702512
7;11;0.01404867172241211 7;11;0.01404867172241211
7;12;0.002026269126958884 7;12;0.002026269126958884
7;13;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 7;13;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons" 8;1;"balloons"
8;2;16 8;2;16
8;3;4 8;3;4
@@ -45,7 +45,7 @@
8;10;0.2850146195080759 8;10;0.2850146195080759
8;11;0.0008541679382324218 8;11;0.0008541679382324218
8;12;3.629469326417878e-05 8;12;3.629469326417878e-05
8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
11;2;"✔" 11;2;"✔"
11;3;1 11;3;1
11;4;"Equal to best" 11;4;"Equal to best"

View File

@@ -1,25 +1,28 @@
1;1;"Datasets used in benchmark ver. 0.2.0" 1;1;"Datasets used in benchmark ver. 0.4.0"
2;1;" Default score accuracy" 2;1;" Default score accuracy"
2;2;"Cross validation" 2;2;"Cross validation"
2;5;"5 Folds" 2;6;"5 Folds"
3;2;"Stratified" 3;2;"Stratified"
3;5;"False" 3;6;"False"
4;2;"Discretized" 4;2;"Discretized"
4;5;"False" 4;6;"False"
5;2;"Seeds" 5;2;"Seeds"
5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]" 5;6;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
6;1;"Dataset" 6;1;"Dataset"
6;2;"Samples" 6;2;"Samples"
6;3;"Features" 6;3;"Features"
6;4;"Classes" 6;4;"Continuous"
6;5;"Balance" 6;5;"Classes"
6;6;"Balance"
7;1;"balance-scale" 7;1;"balance-scale"
7;2;"625" 7;2;"625"
7;3;"4" 7;3;"4"
7;4;"3" 7;4;"0"
7;5;" 7.84%/ 46.08%/ 46.08%" 7;5;"3"
7;6;" 7.84%/ 46.08%/ 46.08%"
8;1;"balloons" 8;1;"balloons"
8;2;"16" 8;2;"16"
8;3;"4" 8;3;"4"
8;4;"2" 8;4;"0"
8;5;"56.25%/ 43.75%" 8;5;"2"
8;6;"56.25%/ 43.75%"

View File

@@ -32,7 +32,7 @@
7;9;"0.0150468069702512" 7;9;"0.0150468069702512"
7;10;"0.01404867172241211" 7;10;"0.01404867172241211"
7;11;"0.002026269126958884" 7;11;"0.002026269126958884"
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons" 8;1;"balloons"
8;2;"16" 8;2;"16"
8;3;"4" 8;3;"4"
@@ -44,5 +44,5 @@
8;9;"0.2850146195080759" 8;9;"0.2850146195080759"
8;10;"0.0008541679382324218" 8;10;"0.0008541679382324218"
8;11;"3.629469326417878e-05" 8;11;"3.629469326417878e-05"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" 8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454" 10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"

View File

@@ -8,8 +8,8 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= =============== ============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
************************************************************************************************************************* *************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 * * accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *
************************************************************************************************************************* *************************************************************************************************************************

View File

@@ -5,7 +5,7 @@
Dataset Score File/Message Hyperparameters Dataset Score File/Message Hyperparameters
============================== ======== ============================================================================ ============================================= ============================== ======== ============================================================================ =============================================
balance-scale 0.980000 results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json {'splitter': 'best', 'max_features': 'auto'} balance-scale 0.980000 results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json {'splitter': 'best', 'max_features': 'auto'}
balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
****************************************************************************************************************************************************************** ******************************************************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0457 * * accuracy compared to STree_default (liblinear-ovr) .: 0.0457 *
****************************************************************************************************************************************************************** ******************************************************************************************************************************************************************

View File

@@ -8,8 +8,8 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= =============== ============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
************************************************************************************************************************* *************************************************************************************************************************
* ✔ Equal to best .....: 1 * * ✔ Equal to best .....: 1 *
* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 * * accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *

View File

@@ -1,6 +1,6 @@
Datasets used in benchmark ver. 0.2.0 Datasets used in benchmark ver. 0.2.0
Dataset Sampl. Feat. Cls Balance Dataset Sampl. Feat. Cont Cls Balance
============================== ====== ===== === ============================================================ ============================== ====== ===== ==== === ============================================================
balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% balance-scale 625 4 0 3 7.84%/ 46.08%/ 46.08%
balloons 16 4 2 56.25%/ 43.75% balloons 16 4 0 2 56.25%/ 43.75%

View File

@@ -3,7 +3,7 @@ scikit-learn
scipy scipy
odte odte
cython cython
mdlp-discretization fimdlp
mufs mufs
bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git
xlsxwriter xlsxwriter