mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-17 00:15:55 +00:00
Merge pull request #9 from Doctorado-ML/continuous_features
Continuous features
This commit is contained in:
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -18,7 +18,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python ${{ matrix.python }}
|
||||
uses: actions/setup-python@v2
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
# Make dot command available in the environment
|
||||
@@ -53,7 +53,7 @@ jobs:
|
||||
coverage run -m unittest -v benchmark.tests
|
||||
coverage xml
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
uses: codecov/codecov-action@v3
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }}
|
||||
files: ./coverage.xml
|
||||
|
@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
|
||||
("-p", "--hyperparameters"),
|
||||
{"type": str, "required": False, "default": "{}"},
|
||||
],
|
||||
"ignore_nan": [
|
||||
("--ignore-nan",),
|
||||
{
|
||||
"default": False,
|
||||
"action": "store_true",
|
||||
"required": False,
|
||||
"help": "Ignore nan results",
|
||||
},
|
||||
],
|
||||
"key": [
|
||||
("-k", "--key"),
|
||||
{
|
||||
|
@@ -2,10 +2,11 @@ import os
|
||||
from types import SimpleNamespace
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import json
|
||||
from scipy.io import arff
|
||||
from .Utils import Files
|
||||
from .Arguments import EnvData
|
||||
from mdlp.discretization import MDLP
|
||||
from fimdlp.mdlp import FImdlp
|
||||
|
||||
|
||||
class Diterator:
|
||||
@@ -27,6 +28,12 @@ class DatasetsArff:
|
||||
def folder():
|
||||
return "datasets"
|
||||
|
||||
@staticmethod
|
||||
def get_range_features(X, c_features):
|
||||
if c_features.strip() == "all":
|
||||
return list(range(X.shape[1]))
|
||||
return json.loads(c_features)
|
||||
|
||||
def load(self, name, class_name):
|
||||
file_name = os.path.join(self.folder(), self.dataset_names(name))
|
||||
data = arff.loadarff(file_name)
|
||||
@@ -34,7 +41,7 @@ class DatasetsArff:
|
||||
df.dropna(axis=0, how="any", inplace=True)
|
||||
self.dataset = df
|
||||
X = df.drop(class_name, axis=1)
|
||||
self.features = X.columns
|
||||
self.features = X.columns.to_list()
|
||||
self.class_name = class_name
|
||||
y, _ = pd.factorize(df[class_name])
|
||||
X = X.to_numpy()
|
||||
@@ -50,6 +57,10 @@ class DatasetsTanveer:
|
||||
def folder():
|
||||
return "data"
|
||||
|
||||
@staticmethod
|
||||
def get_range_features(X, name):
|
||||
return []
|
||||
|
||||
def load(self, name, *args):
|
||||
file_name = os.path.join(self.folder(), self.dataset_names(name))
|
||||
data = pd.read_csv(
|
||||
@@ -75,6 +86,10 @@ class DatasetsSurcov:
|
||||
def folder():
|
||||
return "datasets"
|
||||
|
||||
@staticmethod
|
||||
def get_range_features(X, name):
|
||||
return []
|
||||
|
||||
def load(self, name, *args):
|
||||
file_name = os.path.join(self.folder(), self.dataset_names(name))
|
||||
data = pd.read_csv(
|
||||
@@ -102,16 +117,16 @@ class Datasets:
|
||||
)
|
||||
self.discretize = envData["discretize"] == "1"
|
||||
self.dataset = source_name()
|
||||
self.class_names = []
|
||||
self.data_sets = []
|
||||
# initialize self.class_names & self.data_sets
|
||||
class_names, sets = self._init_names(dataset_name)
|
||||
self.class_names = class_names
|
||||
self.data_sets = sets
|
||||
self.states = {} # states of discretized variables
|
||||
|
||||
def _init_names(self, dataset_name):
|
||||
file_name = os.path.join(self.dataset.folder(), Files.index)
|
||||
default_class = "class"
|
||||
self.continuous_features = {}
|
||||
with open(file_name) as f:
|
||||
sets = f.read().splitlines()
|
||||
class_names = [default_class] * len(sets)
|
||||
@@ -119,10 +134,14 @@ class Datasets:
|
||||
result = []
|
||||
class_names = []
|
||||
for data in sets:
|
||||
name, class_name = data.split(",")
|
||||
name, class_name, features = data.split(",", 2)
|
||||
result.append(name)
|
||||
class_names.append(class_name)
|
||||
self.continuous_features[name] = features
|
||||
sets = result
|
||||
else:
|
||||
for name in sets:
|
||||
self.continuous_features[name] = None
|
||||
# Set as dataset list the dataset passed as argument
|
||||
if dataset_name is None:
|
||||
return class_names, sets
|
||||
@@ -137,6 +156,7 @@ class Datasets:
|
||||
self.discretize = False
|
||||
X, y = self.load(name)
|
||||
attr = SimpleNamespace()
|
||||
attr.dataset = name
|
||||
values, counts = np.unique(y, return_counts=True)
|
||||
comp = ""
|
||||
sep = ""
|
||||
@@ -147,24 +167,41 @@ class Datasets:
|
||||
attr.classes = len(np.unique(y))
|
||||
attr.samples = X.shape[0]
|
||||
attr.features = X.shape[1]
|
||||
attr.cont_features = len(self.get_continuous_features())
|
||||
self.discretize = tmp
|
||||
return attr
|
||||
|
||||
def get_features(self):
|
||||
return self.dataset.features
|
||||
|
||||
def get_states(self, name):
|
||||
return self.states[name] if name in self.states else None
|
||||
|
||||
def get_continuous_features(self):
|
||||
return self.continuous_features_dataset
|
||||
|
||||
def get_class_name(self):
|
||||
return self.dataset.class_name
|
||||
|
||||
def get_dataset(self):
|
||||
return self.dataset.dataset
|
||||
|
||||
def build_states(self, name, X):
|
||||
features = self.get_features()
|
||||
self.states[name] = {
|
||||
features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
|
||||
}
|
||||
|
||||
def load(self, name, dataframe=False):
|
||||
try:
|
||||
class_name = self.class_names[self.data_sets.index(name)]
|
||||
X, y = self.dataset.load(name, class_name)
|
||||
self.continuous_features_dataset = self.dataset.get_range_features(
|
||||
X, self.continuous_features[name]
|
||||
)
|
||||
if self.discretize:
|
||||
X = self.discretize_dataset(X, y)
|
||||
self.build_states(name, X)
|
||||
dataset = pd.DataFrame(X, columns=self.get_features())
|
||||
dataset[self.get_class_name()] = y
|
||||
self.dataset.dataset = dataset
|
||||
@@ -188,9 +225,8 @@ class Datasets:
|
||||
-------
|
||||
tuple (X, y) of numpy.ndarray
|
||||
"""
|
||||
discretiz = MDLP(random_state=17, dtype=np.int32)
|
||||
Xdisc = discretiz.fit_transform(X, y)
|
||||
return Xdisc
|
||||
discretiz = FImdlp(algorithm=0)
|
||||
return discretiz.fit_transform(X, y)
|
||||
|
||||
def __iter__(self) -> Diterator:
|
||||
return Diterator(self.data_sets)
|
||||
|
@@ -112,6 +112,7 @@ class Experiment:
|
||||
platform,
|
||||
title,
|
||||
progress_bar=True,
|
||||
ignore_nan=True,
|
||||
folds=5,
|
||||
):
|
||||
today = datetime.now()
|
||||
@@ -131,6 +132,7 @@ class Experiment:
|
||||
self.score_name = score_name
|
||||
self.model_name = model_name
|
||||
self.title = title
|
||||
self.ignore_nan = ignore_nan
|
||||
self.stratified = stratified == "1"
|
||||
self.stratified_class = StratifiedKFold if self.stratified else KFold
|
||||
self.datasets = datasets
|
||||
@@ -184,7 +186,14 @@ class Experiment:
|
||||
self.leaves = []
|
||||
self.depths = []
|
||||
|
||||
def _n_fold_crossval(self, X, y, hyperparameters):
|
||||
def _build_fit_params(self, name):
|
||||
states = self.datasets.get_states(name)
|
||||
if states is None:
|
||||
return None
|
||||
features = self.datasets.get_features()
|
||||
return {"state_names": states, "features": features}
|
||||
|
||||
def _n_fold_crossval(self, name, X, y, hyperparameters):
|
||||
if self.scores != []:
|
||||
raise ValueError("Must init experiment before!")
|
||||
loop = tqdm(
|
||||
@@ -201,6 +210,7 @@ class Experiment:
|
||||
shuffle=True, random_state=random_state, n_splits=self.folds
|
||||
)
|
||||
clf = self._build_classifier(random_state, hyperparameters)
|
||||
fit_params = self._build_fit_params(name)
|
||||
self.version = Models.get_version(self.model_name, clf)
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings("ignore")
|
||||
@@ -209,11 +219,19 @@ class Experiment:
|
||||
X,
|
||||
y,
|
||||
cv=kfold,
|
||||
fit_params=fit_params,
|
||||
return_estimator=True,
|
||||
scoring=self.score_name,
|
||||
)
|
||||
self.scores.append(res["test_score"])
|
||||
self.times.append(res["fit_time"])
|
||||
if np.isnan(res["test_score"]).any():
|
||||
if not self.ignore_nan:
|
||||
print(res["test_score"])
|
||||
raise ValueError("NaN in results")
|
||||
results = res["test_score"][~np.isnan(res["test_score"])]
|
||||
else:
|
||||
results = res["test_score"]
|
||||
self.scores.extend(results)
|
||||
self.times.extend(res["fit_time"])
|
||||
for result_item in res["estimator"]:
|
||||
nodes_item, leaves_item, depth_item = Models.get_complexity(
|
||||
self.model_name, result_item
|
||||
@@ -273,7 +291,7 @@ class Experiment:
|
||||
n_classes = len(np.unique(y))
|
||||
hyperparameters = self.hyperparameters_dict[name][1]
|
||||
self._init_experiment()
|
||||
self._n_fold_crossval(X, y, hyperparameters)
|
||||
self._n_fold_crossval(name, X, y, hyperparameters)
|
||||
self._add_results(name, hyperparameters, samp, feat, n_classes)
|
||||
self._output_results()
|
||||
self.duration = time.time() - now
|
||||
|
@@ -15,6 +15,24 @@ from xgboost import XGBClassifier
|
||||
import sklearn
|
||||
import xgboost
|
||||
|
||||
import random
|
||||
|
||||
|
||||
class MockModel(SVC):
|
||||
# Only used for testing
|
||||
def predict(self, X):
|
||||
if random.random() < 0.1:
|
||||
return [float("NaN")] * len(X)
|
||||
return super().predict(X)
|
||||
|
||||
def nodes_leaves(self):
|
||||
return 0, 0
|
||||
|
||||
def fit(self, X, y, **kwargs):
|
||||
kwargs.pop("state_names", None)
|
||||
kwargs.pop("features", None)
|
||||
return super().fit(X, y, **kwargs)
|
||||
|
||||
|
||||
class Models:
|
||||
@staticmethod
|
||||
@@ -22,27 +40,27 @@ class Models:
|
||||
return {
|
||||
"STree": Stree(random_state=random_state),
|
||||
"TAN": TAN(random_state=random_state),
|
||||
"KDB": KDB(k=3),
|
||||
"KDB": KDB(k=2),
|
||||
"AODE": AODE(random_state=random_state),
|
||||
"Cart": DecisionTreeClassifier(random_state=random_state),
|
||||
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
|
||||
"Wodt": Wodt(random_state=random_state),
|
||||
"SVC": SVC(random_state=random_state),
|
||||
"ODTE": Odte(
|
||||
base_estimator=Stree(random_state=random_state),
|
||||
estimator=Stree(random_state=random_state),
|
||||
random_state=random_state,
|
||||
),
|
||||
"BaggingStree": BaggingClassifier(
|
||||
base_estimator=Stree(random_state=random_state),
|
||||
estimator=Stree(random_state=random_state),
|
||||
random_state=random_state,
|
||||
),
|
||||
"BaggingWodt": BaggingClassifier(
|
||||
base_estimator=Wodt(random_state=random_state),
|
||||
estimator=Wodt(random_state=random_state),
|
||||
random_state=random_state,
|
||||
),
|
||||
"XGBoost": XGBClassifier(random_state=random_state),
|
||||
"AdaBoostStree": AdaBoostClassifier(
|
||||
base_estimator=Stree(
|
||||
estimator=Stree(
|
||||
random_state=random_state,
|
||||
),
|
||||
algorithm="SAMME",
|
||||
@@ -50,6 +68,7 @@ class Models:
|
||||
),
|
||||
"GBC": GradientBoostingClassifier(random_state=random_state),
|
||||
"RandomForest": RandomForestClassifier(random_state=random_state),
|
||||
"Mock": MockModel(random_state=random_state),
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
@@ -684,7 +684,7 @@ class ReportDatasets:
|
||||
"bg_color": self.color1,
|
||||
}
|
||||
)
|
||||
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
|
||||
self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
|
||||
self.sheet.merge_range(
|
||||
1,
|
||||
0,
|
||||
@@ -697,24 +697,24 @@ class ReportDatasets:
|
||||
1,
|
||||
1,
|
||||
1,
|
||||
3,
|
||||
4,
|
||||
"Cross validation",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
||||
1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
|
||||
)
|
||||
self.sheet.merge_range(
|
||||
2,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
"Stratified",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
2,
|
||||
4,
|
||||
5,
|
||||
f"{'True' if self.env['stratified']=='1' else 'False'}",
|
||||
merge_format_subheader_left,
|
||||
)
|
||||
@@ -722,13 +722,13 @@ class ReportDatasets:
|
||||
3,
|
||||
1,
|
||||
3,
|
||||
3,
|
||||
4,
|
||||
"Discretized",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
f"{'True' if self.env['discretize']=='1' else 'False'}",
|
||||
merge_format_subheader_left,
|
||||
)
|
||||
@@ -736,18 +736,19 @@ class ReportDatasets:
|
||||
4,
|
||||
1,
|
||||
4,
|
||||
3,
|
||||
4,
|
||||
"Seeds",
|
||||
merge_format_subheader_right,
|
||||
)
|
||||
self.sheet.write(
|
||||
4, 4, f"{self.env['seeds']}", merge_format_subheader_left
|
||||
4, 5, f"{self.env['seeds']}", merge_format_subheader_left
|
||||
)
|
||||
self.update_max_length(len(self.env["seeds"]) + 1)
|
||||
header_cols = [
|
||||
("Dataset", 30),
|
||||
("Samples", 10),
|
||||
("Features", 10),
|
||||
("Continuous", 10),
|
||||
("Classes", 10),
|
||||
("Balance", 50),
|
||||
]
|
||||
@@ -767,7 +768,7 @@ class ReportDatasets:
|
||||
|
||||
def footer(self):
|
||||
# set Balance column width to max length
|
||||
self.sheet.set_column(4, 4, self.max_length)
|
||||
self.sheet.set_column(5, 5, self.max_length)
|
||||
self.sheet.freeze_panes(6, 1)
|
||||
self.sheet.hide_gridlines(2)
|
||||
if self.close:
|
||||
@@ -789,8 +790,9 @@ class ReportDatasets:
|
||||
self.sheet.write(self.row, col, result.dataset, normal)
|
||||
self.sheet.write(self.row, col + 1, result.samples, integer)
|
||||
self.sheet.write(self.row, col + 2, result.features, integer)
|
||||
self.sheet.write(self.row, col + 3, result.classes, normal)
|
||||
self.sheet.write(self.row, col + 4, result.balance, normal)
|
||||
self.sheet.write(self.row, col + 3, result.cont_features, integer)
|
||||
self.sheet.write(self.row, col + 4, result.classes, normal)
|
||||
self.sheet.write(self.row, col + 5, result.balance, normal)
|
||||
self.update_max_length(len(result.balance))
|
||||
self.row += 1
|
||||
|
||||
@@ -807,11 +809,11 @@ class ReportDatasets:
|
||||
print(color_line, end="")
|
||||
print(self.header_text)
|
||||
print("")
|
||||
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
|
||||
print("=" * 30 + " ====== ===== === " + "=" * 60)
|
||||
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
|
||||
print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
|
||||
for dataset in data_sets:
|
||||
attributes = data_sets.get_attributes(dataset)
|
||||
attributes.dataset = dataset
|
||||
|
||||
if self.excel:
|
||||
self.print_line(attributes)
|
||||
color_line = (
|
||||
@@ -823,8 +825,8 @@ class ReportDatasets:
|
||||
print(color_line, end="")
|
||||
print(
|
||||
f"{dataset:30s} {attributes.samples:6,d} "
|
||||
f"{attributes.features:5,d} {attributes.classes:3d} "
|
||||
f"{attributes.balance:40s}"
|
||||
f"{attributes.features:5,d} {attributes.cont_features:4,d}"
|
||||
f" {attributes.classes:3d} {attributes.balance:40s}"
|
||||
)
|
||||
if self.excel:
|
||||
self.footer()
|
||||
|
@@ -46,7 +46,7 @@ def main(args_test=None):
|
||||
'{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": '
|
||||
'"ovr"}',
|
||||
'{"C": 5, "kernel": "rbf", "gamma": "auto"}',
|
||||
'{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", '
|
||||
'{"C": 0.05, "max_iter": 10000, "kernel": "liblinear", '
|
||||
'"multiclass_strategy": "ovr"}',
|
||||
'{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
|
||||
'{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": '
|
||||
@@ -97,7 +97,7 @@ def main(args_test=None):
|
||||
for item in results:
|
||||
results_tmp = {"n_jobs": [-1], "n_estimators": [100]}
|
||||
for key, value in results[item].items():
|
||||
new_key = f"base_estimator__{key}"
|
||||
new_key = f"estimator__{key}"
|
||||
try:
|
||||
results_tmp[new_key] = sorted(value)
|
||||
except TypeError:
|
||||
@@ -111,6 +111,7 @@ def main(args_test=None):
|
||||
t2 = sorted([x for x in value if isinstance(x, str)])
|
||||
results_tmp[new_key] = t1 + t2
|
||||
output.append(results_tmp)
|
||||
|
||||
# save results
|
||||
file_name = Files.grid_input(args.score, args.model)
|
||||
file_output = os.path.join(Folders.results, file_name)
|
||||
|
@@ -13,7 +13,7 @@ def main(args_test=None):
|
||||
arguments = Arguments(prog="be_main")
|
||||
arguments.xset("stratified").xset("score").xset("model", mandatory=True)
|
||||
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
|
||||
arguments.xset("report")
|
||||
arguments.xset("report").xset("ignore_nan")
|
||||
arguments.add_exclusive(
|
||||
["grid_paramfile", "best_paramfile", "hyperparameters"]
|
||||
)
|
||||
@@ -35,6 +35,7 @@ def main(args_test=None):
|
||||
grid_paramfile=args.grid_paramfile,
|
||||
progress_bar=not args.quiet,
|
||||
platform=args.platform,
|
||||
ignore_nan=args.ignore_nan,
|
||||
title=args.title,
|
||||
folds=args.n_folds,
|
||||
)
|
||||
|
@@ -6,4 +6,4 @@ stratified=0
|
||||
# Source of data Tanveer/Surcov
|
||||
source_data=Tanveer
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
discretize=0
|
@@ -6,4 +6,4 @@ stratified=0
|
||||
# Source of data Tanveer/Surcov
|
||||
source_data=Tanveer
|
||||
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
discretize=0
|
||||
discretize=0
|
@@ -18,7 +18,7 @@ class BestResultTest(TestBase):
|
||||
"C": 7,
|
||||
"gamma": 0.1,
|
||||
"kernel": "rbf",
|
||||
"max_iter": 10000.0,
|
||||
"max_iter": 10000,
|
||||
"multiclass_strategy": "ovr",
|
||||
},
|
||||
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",
|
||||
|
@@ -1,4 +1,3 @@
|
||||
import shutil
|
||||
from .TestBase import TestBase
|
||||
from ..Experiments import Randomized
|
||||
from ..Datasets import Datasets
|
||||
@@ -17,10 +16,6 @@ class DatasetTest(TestBase):
|
||||
self.set_env(".env.dist")
|
||||
return super().tearDown()
|
||||
|
||||
@staticmethod
|
||||
def set_env(env):
|
||||
shutil.copy(env, ".env")
|
||||
|
||||
def test_Randomized(self):
|
||||
expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||
self.assertSequenceEqual(Randomized.seeds(), expected)
|
||||
|
@@ -1,4 +1,6 @@
|
||||
import json
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
from .TestBase import TestBase
|
||||
from ..Experiments import Experiment
|
||||
from ..Datasets import Datasets
|
||||
@@ -8,10 +10,12 @@ class ExperimentTest(TestBase):
|
||||
def setUp(self):
|
||||
self.exp = self.build_exp()
|
||||
|
||||
def build_exp(self, hyperparams=False, grid=False):
|
||||
def build_exp(
|
||||
self, hyperparams=False, grid=False, model="STree", ignore_nan=False
|
||||
):
|
||||
params = {
|
||||
"score_name": "accuracy",
|
||||
"model_name": "STree",
|
||||
"model_name": model,
|
||||
"stratified": "0",
|
||||
"datasets": Datasets(),
|
||||
"hyperparams_dict": "{}",
|
||||
@@ -21,6 +25,7 @@ class ExperimentTest(TestBase):
|
||||
"title": "Test",
|
||||
"progress_bar": False,
|
||||
"folds": 2,
|
||||
"ignore_nan": ignore_nan,
|
||||
}
|
||||
return Experiment(**params)
|
||||
|
||||
@@ -31,6 +36,7 @@ class ExperimentTest(TestBase):
|
||||
],
|
||||
".",
|
||||
)
|
||||
self.set_env(".env.dist")
|
||||
return super().tearDown()
|
||||
|
||||
def test_build_hyperparams_file(self):
|
||||
@@ -46,7 +52,7 @@ class ExperimentTest(TestBase):
|
||||
"C": 7,
|
||||
"gamma": 0.1,
|
||||
"kernel": "rbf",
|
||||
"max_iter": 10000.0,
|
||||
"max_iter": 10000,
|
||||
"multiclass_strategy": "ovr",
|
||||
},
|
||||
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",
|
||||
@@ -89,7 +95,7 @@ class ExperimentTest(TestBase):
|
||||
def test_exception_n_fold_crossval(self):
|
||||
self.exp.do_experiment()
|
||||
with self.assertRaises(ValueError):
|
||||
self.exp._n_fold_crossval([], [], {})
|
||||
self.exp._n_fold_crossval("", [], [], {})
|
||||
|
||||
def test_do_experiment(self):
|
||||
self.exp.do_experiment()
|
||||
@@ -131,3 +137,39 @@ class ExperimentTest(TestBase):
|
||||
):
|
||||
for key, value in expected_result.items():
|
||||
self.assertEqual(computed_result[key], value)
|
||||
|
||||
def test_build_fit_parameters(self):
|
||||
self.set_env(".env.arff")
|
||||
expected = {
|
||||
"state_names": {
|
||||
"sepallength": [0, 1, 2],
|
||||
"sepalwidth": [0, 1, 3, 4],
|
||||
"petallength": [0, 1, 2, 3],
|
||||
"petalwidth": [0, 1, 2, 3],
|
||||
},
|
||||
"features": [
|
||||
"sepallength",
|
||||
"sepalwidth",
|
||||
"petallength",
|
||||
"petalwidth",
|
||||
],
|
||||
}
|
||||
exp = self.build_exp(model="TAN")
|
||||
X, y = exp.datasets.load("iris")
|
||||
computed = exp._build_fit_params("iris")
|
||||
for key, value in expected["state_names"].items():
|
||||
self.assertEqual(computed["state_names"][key], value)
|
||||
for feature in expected["features"]:
|
||||
self.assertIn(feature, computed["features"])
|
||||
|
||||
@patch("sys.stdout", new_callable=StringIO)
|
||||
def test_experiment_with_nan_not_ignored(self, mock_output):
|
||||
exp = self.build_exp(model="Mock")
|
||||
self.assertRaises(ValueError, exp.do_experiment)
|
||||
output_text = mock_output.getvalue().splitlines()
|
||||
expected = "[ nan 0.8974359]"
|
||||
self.assertEqual(expected, output_text[0])
|
||||
|
||||
def test_experiment_with_nan_ignored(self):
|
||||
self.exp = self.build_exp(model="Mock", ignore_nan=True)
|
||||
self.exp.do_experiment()
|
||||
|
@@ -70,19 +70,19 @@ class ModelTest(TestBase):
|
||||
def test_BaggingStree(self):
|
||||
clf = Models.get_model("BaggingStree")
|
||||
self.assertIsInstance(clf, BaggingClassifier)
|
||||
clf_base = clf.base_estimator
|
||||
clf_base = clf.estimator
|
||||
self.assertIsInstance(clf_base, Stree)
|
||||
|
||||
def test_BaggingWodt(self):
|
||||
clf = Models.get_model("BaggingWodt")
|
||||
self.assertIsInstance(clf, BaggingClassifier)
|
||||
clf_base = clf.base_estimator
|
||||
clf_base = clf.estimator
|
||||
self.assertIsInstance(clf_base, Wodt)
|
||||
|
||||
def test_AdaBoostStree(self):
|
||||
clf = Models.get_model("AdaBoostStree")
|
||||
self.assertIsInstance(clf, AdaBoostClassifier)
|
||||
clf_base = clf.base_estimator
|
||||
clf_base = clf.estimator
|
||||
self.assertIsInstance(clf_base, Stree)
|
||||
|
||||
def test_unknown_classifier(self):
|
||||
|
@@ -4,6 +4,7 @@ import pathlib
|
||||
import sys
|
||||
import csv
|
||||
import unittest
|
||||
import shutil
|
||||
from importlib import import_module
|
||||
from io import StringIO
|
||||
from unittest.mock import patch
|
||||
@@ -19,6 +20,10 @@ class TestBase(unittest.TestCase):
|
||||
self.stree_version = "1.2.4"
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def set_env(env):
|
||||
shutil.copy(env, ".env")
|
||||
|
||||
def remove_files(self, files, folder):
|
||||
for file_name in files:
|
||||
file_name = os.path.join(folder, file_name)
|
||||
|
@@ -1,2 +1,2 @@
|
||||
iris,class
|
||||
wine,class
|
||||
iris,class,all
|
||||
wine,class,[0, 1]
|
||||
|
@@ -1 +1 @@
|
||||
{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]}
|
||||
{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]}
|
@@ -17,10 +17,10 @@
|
||||
"features": 4,
|
||||
"classes": 3,
|
||||
"hyperparameters": {
|
||||
"C": 10000.0,
|
||||
"C": 10000,
|
||||
"gamma": 0.1,
|
||||
"kernel": "rbf",
|
||||
"max_iter": 10000.0,
|
||||
"max_iter": 10000,
|
||||
"multiclass_strategy": "ovr"
|
||||
},
|
||||
"nodes": 7.0,
|
||||
@@ -40,7 +40,7 @@
|
||||
"C": 7,
|
||||
"gamma": 0.1,
|
||||
"kernel": "rbf",
|
||||
"max_iter": 10000.0,
|
||||
"max_iter": 10000,
|
||||
"multiclass_strategy": "ovr"
|
||||
},
|
||||
"nodes": 3.0,
|
||||
|
@@ -27,7 +27,7 @@ class BePrintStrees(TestBase):
|
||||
stdout.getvalue(), f"File {file_name} generated\n"
|
||||
)
|
||||
computed_size = os.path.getsize(file_name)
|
||||
self.assertGreater(computed_size, 25000)
|
||||
self.assertGreater(computed_size, 24500)
|
||||
|
||||
def test_be_print_strees_dataset_color(self):
|
||||
for name in self.datasets:
|
||||
|
@@ -6,13 +6,13 @@
|
||||
"n_estimators": [
|
||||
100
|
||||
],
|
||||
"base_estimator__C": [
|
||||
"estimator__C": [
|
||||
1.0
|
||||
],
|
||||
"base_estimator__kernel": [
|
||||
"estimator__kernel": [
|
||||
"linear"
|
||||
],
|
||||
"base_estimator__multiclass_strategy": [
|
||||
"estimator__multiclass_strategy": [
|
||||
"ovo"
|
||||
]
|
||||
},
|
||||
@@ -23,7 +23,7 @@
|
||||
"n_estimators": [
|
||||
100
|
||||
],
|
||||
"base_estimator__C": [
|
||||
"estimator__C": [
|
||||
0.001,
|
||||
0.0275,
|
||||
0.05,
|
||||
@@ -36,10 +36,10 @@
|
||||
7,
|
||||
10000.0
|
||||
],
|
||||
"base_estimator__kernel": [
|
||||
"estimator__kernel": [
|
||||
"liblinear"
|
||||
],
|
||||
"base_estimator__multiclass_strategy": [
|
||||
"estimator__multiclass_strategy": [
|
||||
"ovr"
|
||||
]
|
||||
},
|
||||
@@ -50,7 +50,7 @@
|
||||
"n_estimators": [
|
||||
100
|
||||
],
|
||||
"base_estimator__C": [
|
||||
"estimator__C": [
|
||||
0.05,
|
||||
1.0,
|
||||
1.05,
|
||||
@@ -62,7 +62,7 @@
|
||||
57,
|
||||
10000.0
|
||||
],
|
||||
"base_estimator__gamma": [
|
||||
"estimator__gamma": [
|
||||
0.001,
|
||||
0.1,
|
||||
0.14,
|
||||
@@ -70,10 +70,10 @@
|
||||
"auto",
|
||||
"scale"
|
||||
],
|
||||
"base_estimator__kernel": [
|
||||
"estimator__kernel": [
|
||||
"rbf"
|
||||
],
|
||||
"base_estimator__multiclass_strategy": [
|
||||
"estimator__multiclass_strategy": [
|
||||
"ovr"
|
||||
]
|
||||
},
|
||||
@@ -84,20 +84,20 @@
|
||||
"n_estimators": [
|
||||
100
|
||||
],
|
||||
"base_estimator__C": [
|
||||
"estimator__C": [
|
||||
0.05,
|
||||
0.2,
|
||||
1.0,
|
||||
8.25
|
||||
],
|
||||
"base_estimator__gamma": [
|
||||
"estimator__gamma": [
|
||||
0.1,
|
||||
"scale"
|
||||
],
|
||||
"base_estimator__kernel": [
|
||||
"estimator__kernel": [
|
||||
"poly"
|
||||
],
|
||||
"base_estimator__multiclass_strategy": [
|
||||
"estimator__multiclass_strategy": [
|
||||
"ovo",
|
||||
"ovr"
|
||||
]
|
||||
|
@@ -9,7 +9,7 @@
|
||||
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
|
||||
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
|
||||
[96mbalance-scale 625 4 3 23.32 12.16 6.44 0.840160±0.0304 0.013745±0.0019 {'splitter': 'best', 'max_features': 'auto'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
[94m*************************************************************************************************************************
|
||||
[94m* accuracy compared to STree_default (liblinear-ovr) .: 0.0422 *
|
||||
[94m*************************************************************************************************************************
|
||||
|
@@ -32,7 +32,7 @@
|
||||
7;9;"0.0150468069702512"
|
||||
7;10;"0.01404867172241211"
|
||||
7;11;"0.002026269126958884"
|
||||
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
8;1;"balloons"
|
||||
8;2;"16"
|
||||
8;3;"4"
|
||||
@@ -44,5 +44,5 @@
|
||||
8;9;"0.2850146195080759"
|
||||
8;10;"0.0008541679382324218"
|
||||
8;11;"3.629469326417878e-05"
|
||||
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"
|
@@ -32,7 +32,7 @@
|
||||
7;10;0.0150468069702512
|
||||
7;11;0.01404867172241211
|
||||
7;12;0.002026269126958884
|
||||
7;13;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
7;13;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
8;1;"balloons"
|
||||
8;2;16
|
||||
8;3;4
|
||||
@@ -45,7 +45,7 @@
|
||||
8;10;0.2850146195080759
|
||||
8;11;0.0008541679382324218
|
||||
8;12;3.629469326417878e-05
|
||||
8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
11;2;"✔"
|
||||
11;3;1
|
||||
11;4;"Equal to best"
|
||||
|
@@ -1,25 +1,28 @@
|
||||
1;1;"Datasets used in benchmark ver. 0.2.0"
|
||||
1;1;"Datasets used in benchmark ver. 0.4.0"
|
||||
2;1;" Default score accuracy"
|
||||
2;2;"Cross validation"
|
||||
2;5;"5 Folds"
|
||||
2;6;"5 Folds"
|
||||
3;2;"Stratified"
|
||||
3;5;"False"
|
||||
3;6;"False"
|
||||
4;2;"Discretized"
|
||||
4;5;"False"
|
||||
4;6;"False"
|
||||
5;2;"Seeds"
|
||||
5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
|
||||
5;6;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
|
||||
6;1;"Dataset"
|
||||
6;2;"Samples"
|
||||
6;3;"Features"
|
||||
6;4;"Classes"
|
||||
6;5;"Balance"
|
||||
6;4;"Continuous"
|
||||
6;5;"Classes"
|
||||
6;6;"Balance"
|
||||
7;1;"balance-scale"
|
||||
7;2;"625"
|
||||
7;3;"4"
|
||||
7;4;"3"
|
||||
7;5;" 7.84%/ 46.08%/ 46.08%"
|
||||
7;4;"0"
|
||||
7;5;"3"
|
||||
7;6;" 7.84%/ 46.08%/ 46.08%"
|
||||
8;1;"balloons"
|
||||
8;2;"16"
|
||||
8;3;"4"
|
||||
8;4;"2"
|
||||
8;5;"56.25%/ 43.75%"
|
||||
8;4;"0"
|
||||
8;5;"2"
|
||||
8;6;"56.25%/ 43.75%"
|
||||
|
@@ -32,7 +32,7 @@
|
||||
7;9;"0.0150468069702512"
|
||||
7;10;"0.01404867172241211"
|
||||
7;11;"0.002026269126958884"
|
||||
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
8;1;"balloons"
|
||||
8;2;"16"
|
||||
8;3;"4"
|
||||
@@ -44,5 +44,5 @@
|
||||
8;9;"0.2850146195080759"
|
||||
8;10;"0.0008541679382324218"
|
||||
8;11;"3.629469326417878e-05"
|
||||
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
|
||||
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
|
||||
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"
|
||||
|
@@ -8,8 +8,8 @@
|
||||
|
||||
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
|
||||
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
|
||||
[96mbalance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
[96mbalance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
[94m*************************************************************************************************************************
|
||||
[94m* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *
|
||||
[94m*************************************************************************************************************************
|
||||
|
@@ -5,7 +5,7 @@
|
||||
Dataset Score File/Message Hyperparameters
|
||||
============================== ======== ============================================================================ =============================================
|
||||
balance-scale 0.980000 results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json {'splitter': 'best', 'max_features': 'auto'}
|
||||
balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
******************************************************************************************************************************************************************
|
||||
* accuracy compared to STree_default (liblinear-ovr) .: 0.0457 *
|
||||
******************************************************************************************************************************************************************
|
||||
|
@@ -8,8 +8,8 @@
|
||||
|
||||
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
|
||||
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
|
||||
[96mbalance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
|
||||
[96mbalance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
[94mballoons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
|
||||
[94m*************************************************************************************************************************
|
||||
[94m* ✔ Equal to best .....: 1 *
|
||||
[94m* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *
|
||||
|
@@ -1,6 +1,6 @@
|
||||
[94mDatasets used in benchmark ver. 0.2.0
|
||||
|
||||
Dataset Sampl. Feat. Cls Balance
|
||||
============================== ====== ===== === ============================================================
|
||||
[96mbalance-scale 625 4 3 7.84%/ 46.08%/ 46.08%
|
||||
[94mballoons 16 4 2 56.25%/ 43.75%
|
||||
Dataset Sampl. Feat. Cont Cls Balance
|
||||
============================== ====== ===== ==== === ============================================================
|
||||
[96mbalance-scale 625 4 0 3 7.84%/ 46.08%/ 46.08%
|
||||
[94mballoons 16 4 0 2 56.25%/ 43.75%
|
||||
|
@@ -3,7 +3,7 @@ scikit-learn
|
||||
scipy
|
||||
odte
|
||||
cython
|
||||
mdlp-discretization
|
||||
fimdlp
|
||||
mufs
|
||||
bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git
|
||||
xlsxwriter
|
||||
|
Reference in New Issue
Block a user