Merge pull request #9 from Doctorado-ML/continuous_features

Continuous features
This commit is contained in:
Ricardo Montañana Gómez
2023-01-15 10:55:49 +01:00
committed by GitHub
30 changed files with 234 additions and 103 deletions

View File

@@ -18,7 +18,7 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
# Make dot command available in the environment
@@ -53,7 +53,7 @@ jobs:
coverage run -m unittest -v benchmark.tests
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml

View File

@@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser):
("-p", "--hyperparameters"),
{"type": str, "required": False, "default": "{}"},
],
"ignore_nan": [
("--ignore-nan",),
{
"default": False,
"action": "store_true",
"required": False,
"help": "Ignore nan results",
},
],
"key": [
("-k", "--key"),
{

View File

@@ -2,10 +2,11 @@ import os
from types import SimpleNamespace
import pandas as pd
import numpy as np
import json
from scipy.io import arff
from .Utils import Files
from .Arguments import EnvData
from mdlp.discretization import MDLP
from fimdlp.mdlp import FImdlp
class Diterator:
@@ -27,6 +28,12 @@ class DatasetsArff:
def folder():
return "datasets"
@staticmethod
def get_range_features(X, c_features):
    """Return the column indices of the continuous features of ``X``.

    Parameters
    ----------
    X : array with a ``shape`` attribute; only ``X.shape[1]`` is read.
    c_features : str or None
        Third field of the dataset index entry: the literal ``all`` (every
        column is continuous) or a JSON list of indices such as ``[0, 1]``.
        ``None`` means the index entry carried no such field.

    Returns
    -------
    list of column indices (empty when ``c_features`` is ``None``).
    """
    # Index files without a features field store None for every dataset;
    # calling .strip() on it would raise AttributeError.
    if c_features is None:
        return []
    if c_features.strip() == "all":
        return list(range(X.shape[1]))
    return json.loads(c_features)
def load(self, name, class_name):
file_name = os.path.join(self.folder(), self.dataset_names(name))
data = arff.loadarff(file_name)
@@ -34,7 +41,7 @@ class DatasetsArff:
df.dropna(axis=0, how="any", inplace=True)
self.dataset = df
X = df.drop(class_name, axis=1)
self.features = X.columns
self.features = X.columns.to_list()
self.class_name = class_name
y, _ = pd.factorize(df[class_name])
X = X.to_numpy()
@@ -50,6 +57,10 @@ class DatasetsTanveer:
def folder():
return "data"
@staticmethod
def get_range_features(X, name):
    """Return the continuous-feature indices for this data source.

    Always an empty list: neither ``X`` nor ``name`` is consulted.
    """
    return []
def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv(
@@ -75,6 +86,10 @@ class DatasetsSurcov:
def folder():
return "datasets"
@staticmethod
def get_range_features(X, name):
    """Return the continuous-feature indices for this data source.

    Always an empty list: neither ``X`` nor ``name`` is consulted.
    """
    return []
def load(self, name, *args):
file_name = os.path.join(self.folder(), self.dataset_names(name))
data = pd.read_csv(
@@ -102,16 +117,16 @@ class Datasets:
)
self.discretize = envData["discretize"] == "1"
self.dataset = source_name()
self.class_names = []
self.data_sets = []
# initialize self.class_names & self.data_sets
class_names, sets = self._init_names(dataset_name)
self.class_names = class_names
self.data_sets = sets
self.states = {} # states of discretized variables
def _init_names(self, dataset_name):
file_name = os.path.join(self.dataset.folder(), Files.index)
default_class = "class"
self.continuous_features = {}
with open(file_name) as f:
sets = f.read().splitlines()
class_names = [default_class] * len(sets)
@@ -119,10 +134,14 @@ class Datasets:
result = []
class_names = []
for data in sets:
name, class_name = data.split(",")
name, class_name, features = data.split(",", 2)
result.append(name)
class_names.append(class_name)
self.continuous_features[name] = features
sets = result
else:
for name in sets:
self.continuous_features[name] = None
# Set as dataset list the dataset passed as argument
if dataset_name is None:
return class_names, sets
@@ -137,6 +156,7 @@ class Datasets:
self.discretize = False
X, y = self.load(name)
attr = SimpleNamespace()
attr.dataset = name
values, counts = np.unique(y, return_counts=True)
comp = ""
sep = ""
@@ -147,24 +167,41 @@ class Datasets:
attr.classes = len(np.unique(y))
attr.samples = X.shape[0]
attr.features = X.shape[1]
attr.cont_features = len(self.get_continuous_features())
self.discretize = tmp
return attr
def get_features(self):
    """Return the ``features`` attribute of the underlying source object."""
    return self.dataset.features
def get_states(self, name):
    """Return the states recorded for dataset ``name``.

    Yields ``None`` when no states have been stored under that name.
    """
    return self.states.get(name)
def get_continuous_features(self):
    """Return the continuous-feature list of the last loaded dataset.

    The attribute is assigned inside ``load``; presumably ``load`` must
    have run before this is called -- TODO confirm.
    """
    return self.continuous_features_dataset
def get_class_name(self):
    """Return the ``class_name`` attribute of the underlying source object."""
    return self.dataset.class_name
def get_dataset(self):
    """Return the ``dataset`` attribute of the underlying source object.

    ``load`` stores there a DataFrame with the feature columns plus the
    class column.
    """
    return self.dataset.dataset
def build_states(self, name, X):
    """Record, for dataset ``name``, the distinct values of every column.

    The result is stored in ``self.states[name]`` as a dict that maps each
    feature name to the sorted list of unique values found in its column.
    """
    feature_names = self.get_features()
    per_feature = {}
    # X.T walks the columns of the 2-D array in order
    for position, column in enumerate(X.T):
        per_feature[feature_names[position]] = np.unique(column).tolist()
    self.states[name] = per_feature
def load(self, name, dataframe=False):
try:
class_name = self.class_names[self.data_sets.index(name)]
X, y = self.dataset.load(name, class_name)
self.continuous_features_dataset = self.dataset.get_range_features(
X, self.continuous_features[name]
)
if self.discretize:
X = self.discretize_dataset(X, y)
self.build_states(name, X)
dataset = pd.DataFrame(X, columns=self.get_features())
dataset[self.get_class_name()] = y
self.dataset.dataset = dataset
@@ -188,9 +225,8 @@ class Datasets:
-------
tuple (X, y) of numpy.ndarray
"""
discretiz = MDLP(random_state=17, dtype=np.int32)
Xdisc = discretiz.fit_transform(X, y)
return Xdisc
discretiz = FImdlp(algorithm=0)
return discretiz.fit_transform(X, y)
def __iter__(self) -> Diterator:
    """Return a ``Diterator`` built from ``self.data_sets``."""
    return Diterator(self.data_sets)

View File

@@ -112,6 +112,7 @@ class Experiment:
platform,
title,
progress_bar=True,
ignore_nan=True,
folds=5,
):
today = datetime.now()
@@ -131,6 +132,7 @@ class Experiment:
self.score_name = score_name
self.model_name = model_name
self.title = title
self.ignore_nan = ignore_nan
self.stratified = stratified == "1"
self.stratified_class = StratifiedKFold if self.stratified else KFold
self.datasets = datasets
@@ -184,7 +186,14 @@ class Experiment:
self.leaves = []
self.depths = []
def _n_fold_crossval(self, X, y, hyperparameters):
def _build_fit_params(self, name):
    """Build the extra ``fit`` keyword arguments for dataset ``name``.

    Returns ``None`` when the datasets object has no states for ``name``;
    otherwise a dict with the keys ``state_names`` and ``features``.
    """
    state_names = self.datasets.get_states(name)
    if state_names is not None:
        return {
            "state_names": state_names,
            "features": self.datasets.get_features(),
        }
    return None
def _n_fold_crossval(self, name, X, y, hyperparameters):
if self.scores != []:
raise ValueError("Must init experiment before!")
loop = tqdm(
@@ -201,6 +210,7 @@ class Experiment:
shuffle=True, random_state=random_state, n_splits=self.folds
)
clf = self._build_classifier(random_state, hyperparameters)
fit_params = self._build_fit_params(name)
self.version = Models.get_version(self.model_name, clf)
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
@@ -209,11 +219,19 @@ class Experiment:
X,
y,
cv=kfold,
fit_params=fit_params,
return_estimator=True,
scoring=self.score_name,
)
self.scores.append(res["test_score"])
self.times.append(res["fit_time"])
if np.isnan(res["test_score"]).any():
if not self.ignore_nan:
print(res["test_score"])
raise ValueError("NaN in results")
results = res["test_score"][~np.isnan(res["test_score"])]
else:
results = res["test_score"]
self.scores.extend(results)
self.times.extend(res["fit_time"])
for result_item in res["estimator"]:
nodes_item, leaves_item, depth_item = Models.get_complexity(
self.model_name, result_item
@@ -273,7 +291,7 @@ class Experiment:
n_classes = len(np.unique(y))
hyperparameters = self.hyperparameters_dict[name][1]
self._init_experiment()
self._n_fold_crossval(X, y, hyperparameters)
self._n_fold_crossval(name, X, y, hyperparameters)
self._add_results(name, hyperparameters, samp, feat, n_classes)
self._output_results()
self.duration = time.time() - now

View File

@@ -15,6 +15,24 @@ from xgboost import XGBClassifier
import sklearn
import xgboost
import random
class MockModel(SVC):
    """SVC subclass used only for testing.

    ``predict`` returns a list of NaN values when a random draw falls
    below 0.1, and ``fit`` drops the ``state_names`` / ``features``
    keyword arguments before delegating to ``SVC.fit``.
    """

    def predict(self, X):
        """Return NaN for every sample on an unlucky draw, else SVC's."""
        unlucky = random.random() < 0.1
        if unlucky:
            return [float("NaN")] * len(X)
        return super().predict(X)

    def nodes_leaves(self):
        """Return a fixed ``(0, 0)`` pair."""
        return 0, 0

    def fit(self, X, y, **kwargs):
        """Fit the SVC, discarding the keywords it does not accept."""
        for extra in ("state_names", "features"):
            kwargs.pop(extra, None)
        return super().fit(X, y, **kwargs)
class Models:
@staticmethod
@@ -22,27 +40,27 @@ class Models:
return {
"STree": Stree(random_state=random_state),
"TAN": TAN(random_state=random_state),
"KDB": KDB(k=3),
"KDB": KDB(k=2),
"AODE": AODE(random_state=random_state),
"Cart": DecisionTreeClassifier(random_state=random_state),
"ExtraTree": ExtraTreeClassifier(random_state=random_state),
"Wodt": Wodt(random_state=random_state),
"SVC": SVC(random_state=random_state),
"ODTE": Odte(
base_estimator=Stree(random_state=random_state),
estimator=Stree(random_state=random_state),
random_state=random_state,
),
"BaggingStree": BaggingClassifier(
base_estimator=Stree(random_state=random_state),
estimator=Stree(random_state=random_state),
random_state=random_state,
),
"BaggingWodt": BaggingClassifier(
base_estimator=Wodt(random_state=random_state),
estimator=Wodt(random_state=random_state),
random_state=random_state,
),
"XGBoost": XGBClassifier(random_state=random_state),
"AdaBoostStree": AdaBoostClassifier(
base_estimator=Stree(
estimator=Stree(
random_state=random_state,
),
algorithm="SAMME",
@@ -50,6 +68,7 @@ class Models:
),
"GBC": GradientBoostingClassifier(random_state=random_state),
"RandomForest": RandomForestClassifier(random_state=random_state),
"Mock": MockModel(random_state=random_state),
}
@staticmethod

View File

@@ -684,7 +684,7 @@ class ReportDatasets:
"bg_color": self.color1,
}
)
self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format)
self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format)
self.sheet.merge_range(
1,
0,
@@ -697,24 +697,24 @@ class ReportDatasets:
1,
1,
1,
3,
4,
"Cross validation",
merge_format_subheader_right,
)
self.sheet.write(
1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left
1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left
)
self.sheet.merge_range(
2,
1,
2,
3,
4,
"Stratified",
merge_format_subheader_right,
)
self.sheet.write(
2,
4,
5,
f"{'True' if self.env['stratified']=='1' else 'False'}",
merge_format_subheader_left,
)
@@ -722,13 +722,13 @@ class ReportDatasets:
3,
1,
3,
3,
4,
"Discretized",
merge_format_subheader_right,
)
self.sheet.write(
3,
4,
5,
f"{'True' if self.env['discretize']=='1' else 'False'}",
merge_format_subheader_left,
)
@@ -736,18 +736,19 @@ class ReportDatasets:
4,
1,
4,
3,
4,
"Seeds",
merge_format_subheader_right,
)
self.sheet.write(
4, 4, f"{self.env['seeds']}", merge_format_subheader_left
4, 5, f"{self.env['seeds']}", merge_format_subheader_left
)
self.update_max_length(len(self.env["seeds"]) + 1)
header_cols = [
("Dataset", 30),
("Samples", 10),
("Features", 10),
("Continuous", 10),
("Classes", 10),
("Balance", 50),
]
@@ -767,7 +768,7 @@ class ReportDatasets:
def footer(self):
# set Balance column width to max length
self.sheet.set_column(4, 4, self.max_length)
self.sheet.set_column(5, 5, self.max_length)
self.sheet.freeze_panes(6, 1)
self.sheet.hide_gridlines(2)
if self.close:
@@ -789,8 +790,9 @@ class ReportDatasets:
self.sheet.write(self.row, col, result.dataset, normal)
self.sheet.write(self.row, col + 1, result.samples, integer)
self.sheet.write(self.row, col + 2, result.features, integer)
self.sheet.write(self.row, col + 3, result.classes, normal)
self.sheet.write(self.row, col + 4, result.balance, normal)
self.sheet.write(self.row, col + 3, result.cont_features, integer)
self.sheet.write(self.row, col + 4, result.classes, normal)
self.sheet.write(self.row, col + 5, result.balance, normal)
self.update_max_length(len(result.balance))
self.row += 1
@@ -807,11 +809,11 @@ class ReportDatasets:
print(color_line, end="")
print(self.header_text)
print("")
print(f"{'Dataset':30s} Sampl. Feat. Cls Balance")
print("=" * 30 + " ====== ===== === " + "=" * 60)
print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance")
print("=" * 30 + " ====== ===== ==== === " + "=" * 60)
for dataset in data_sets:
attributes = data_sets.get_attributes(dataset)
attributes.dataset = dataset
if self.excel:
self.print_line(attributes)
color_line = (
@@ -823,8 +825,8 @@ class ReportDatasets:
print(color_line, end="")
print(
f"{dataset:30s} {attributes.samples:6,d} "
f"{attributes.features:5,d} {attributes.classes:3d} "
f"{attributes.balance:40s}"
f"{attributes.features:5,d} {attributes.cont_features:4,d}"
f" {attributes.classes:3d} {attributes.balance:40s}"
)
if self.excel:
self.footer()

View File

@@ -46,7 +46,7 @@ def main(args_test=None):
'{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": '
'"ovr"}',
'{"C": 5, "kernel": "rbf", "gamma": "auto"}',
'{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", '
'{"C": 0.05, "max_iter": 10000, "kernel": "liblinear", '
'"multiclass_strategy": "ovr"}',
'{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}',
'{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": '
@@ -97,7 +97,7 @@ def main(args_test=None):
for item in results:
results_tmp = {"n_jobs": [-1], "n_estimators": [100]}
for key, value in results[item].items():
new_key = f"base_estimator__{key}"
new_key = f"estimator__{key}"
try:
results_tmp[new_key] = sorted(value)
except TypeError:
@@ -111,6 +111,7 @@ def main(args_test=None):
t2 = sorted([x for x in value if isinstance(x, str)])
results_tmp[new_key] = t1 + t2
output.append(results_tmp)
# save results
file_name = Files.grid_input(args.score, args.model)
file_output = os.path.join(Folders.results, file_name)

View File

@@ -13,7 +13,7 @@ def main(args_test=None):
arguments = Arguments(prog="be_main")
arguments.xset("stratified").xset("score").xset("model", mandatory=True)
arguments.xset("n_folds").xset("platform").xset("quiet").xset("title")
arguments.xset("report")
arguments.xset("report").xset("ignore_nan")
arguments.add_exclusive(
["grid_paramfile", "best_paramfile", "hyperparameters"]
)
@@ -35,6 +35,7 @@ def main(args_test=None):
grid_paramfile=args.grid_paramfile,
progress_bar=not args.quiet,
platform=args.platform,
ignore_nan=args.ignore_nan,
title=args.title,
folds=args.n_folds,
)

View File

@@ -6,4 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov
source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0
discretize=0

View File

@@ -6,4 +6,4 @@ stratified=0
# Source of data Tanveer/Surcov
source_data=Tanveer
seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
discretize=0
discretize=0

View File

@@ -18,7 +18,7 @@ class BestResultTest(TestBase):
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"max_iter": 10000,
"multiclass_strategy": "ovr",
},
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",

View File

@@ -1,4 +1,3 @@
import shutil
from .TestBase import TestBase
from ..Experiments import Randomized
from ..Datasets import Datasets
@@ -17,10 +16,6 @@ class DatasetTest(TestBase):
self.set_env(".env.dist")
return super().tearDown()
@staticmethod
def set_env(env):
shutil.copy(env, ".env")
def test_Randomized(self):
expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
self.assertSequenceEqual(Randomized.seeds(), expected)

View File

@@ -1,4 +1,6 @@
import json
from io import StringIO
from unittest.mock import patch
from .TestBase import TestBase
from ..Experiments import Experiment
from ..Datasets import Datasets
@@ -8,10 +10,12 @@ class ExperimentTest(TestBase):
def setUp(self):
self.exp = self.build_exp()
def build_exp(self, hyperparams=False, grid=False):
def build_exp(
self, hyperparams=False, grid=False, model="STree", ignore_nan=False
):
params = {
"score_name": "accuracy",
"model_name": "STree",
"model_name": model,
"stratified": "0",
"datasets": Datasets(),
"hyperparams_dict": "{}",
@@ -21,6 +25,7 @@ class ExperimentTest(TestBase):
"title": "Test",
"progress_bar": False,
"folds": 2,
"ignore_nan": ignore_nan,
}
return Experiment(**params)
@@ -31,6 +36,7 @@ class ExperimentTest(TestBase):
],
".",
)
self.set_env(".env.dist")
return super().tearDown()
def test_build_hyperparams_file(self):
@@ -46,7 +52,7 @@ class ExperimentTest(TestBase):
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"max_iter": 10000,
"multiclass_strategy": "ovr",
},
"results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json",
@@ -89,7 +95,7 @@ class ExperimentTest(TestBase):
def test_exception_n_fold_crossval(self):
self.exp.do_experiment()
with self.assertRaises(ValueError):
self.exp._n_fold_crossval([], [], {})
self.exp._n_fold_crossval("", [], [], {})
def test_do_experiment(self):
self.exp.do_experiment()
@@ -131,3 +137,39 @@ class ExperimentTest(TestBase):
):
for key, value in expected_result.items():
self.assertEqual(computed_result[key], value)
def test_build_fit_parameters(self):
    # Switch to the .env.arff environment before building the experiment.
    self.set_env(".env.arff")
    expected_states = {
        "sepallength": [0, 1, 2],
        "sepalwidth": [0, 1, 3, 4],
        "petallength": [0, 1, 2, 3],
        "petalwidth": [0, 1, 2, 3],
    }
    expected_features = [
        "sepallength",
        "sepalwidth",
        "petallength",
        "petalwidth",
    ]
    exp = self.build_exp(model="TAN")
    # The dataset is loaded before asking for the fit parameters; the
    # returned (X, y) pair itself is not needed by the assertions.
    exp.datasets.load("iris")
    computed = exp._build_fit_params("iris")
    for feature, states in expected_states.items():
        self.assertEqual(computed["state_names"][feature], states)
    for feature in expected_features:
        self.assertIn(feature, computed["features"])
@patch("sys.stdout", new_callable=StringIO)
def test_experiment_with_nan_not_ignored(self, mock_output):
    # build_exp leaves ignore_nan at its default (False), so the experiment
    # is expected to raise ValueError and to print the offending scores.
    exp = self.build_exp(model="Mock")
    with self.assertRaises(ValueError):
        exp.do_experiment()
    first_line = mock_output.getvalue().splitlines()[0]
    self.assertEqual("[ nan 0.8974359]", first_line)
def test_experiment_with_nan_ignored(self):
    # With ignore_nan=True the experiment on the "Mock" model is expected to
    # run to completion; the test passes when no exception is raised.
    self.exp = self.build_exp(model="Mock", ignore_nan=True)
    self.exp.do_experiment()

View File

@@ -70,19 +70,19 @@ class ModelTest(TestBase):
def test_BaggingStree(self):
clf = Models.get_model("BaggingStree")
self.assertIsInstance(clf, BaggingClassifier)
clf_base = clf.base_estimator
clf_base = clf.estimator
self.assertIsInstance(clf_base, Stree)
def test_BaggingWodt(self):
clf = Models.get_model("BaggingWodt")
self.assertIsInstance(clf, BaggingClassifier)
clf_base = clf.base_estimator
clf_base = clf.estimator
self.assertIsInstance(clf_base, Wodt)
def test_AdaBoostStree(self):
clf = Models.get_model("AdaBoostStree")
self.assertIsInstance(clf, AdaBoostClassifier)
clf_base = clf.base_estimator
clf_base = clf.estimator
self.assertIsInstance(clf_base, Stree)
def test_unknown_classifier(self):

View File

@@ -4,6 +4,7 @@ import pathlib
import sys
import csv
import unittest
import shutil
from importlib import import_module
from io import StringIO
from unittest.mock import patch
@@ -19,6 +20,10 @@ class TestBase(unittest.TestCase):
self.stree_version = "1.2.4"
super().__init__(*args, **kwargs)
@staticmethod
def set_env(env):
    """Copy the file ``env`` to ``.env`` (relative to the working directory)."""
    shutil.copy(env, ".env")
def remove_files(self, files, folder):
for file_name in files:
file_name = os.path.join(folder, file_name)

View File

@@ -1,2 +1,2 @@
iris,class
wine,class
iris,class,all
wine,class,[0, 1]

View File

@@ -1 +1 @@
{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]}
{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]}

View File

@@ -17,10 +17,10 @@
"features": 4,
"classes": 3,
"hyperparameters": {
"C": 10000.0,
"C": 10000,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"max_iter": 10000,
"multiclass_strategy": "ovr"
},
"nodes": 7.0,
@@ -40,7 +40,7 @@
"C": 7,
"gamma": 0.1,
"kernel": "rbf",
"max_iter": 10000.0,
"max_iter": 10000,
"multiclass_strategy": "ovr"
},
"nodes": 3.0,

View File

@@ -27,7 +27,7 @@ class BePrintStrees(TestBase):
stdout.getvalue(), f"File {file_name} generated\n"
)
computed_size = os.path.getsize(file_name)
self.assertGreater(computed_size, 25000)
self.assertGreater(computed_size, 24500)
def test_be_print_strees_dataset_color(self):
for name in self.datasets:

View File

@@ -6,13 +6,13 @@
"n_estimators": [
100
],
"base_estimator__C": [
"estimator__C": [
1.0
],
"base_estimator__kernel": [
"estimator__kernel": [
"linear"
],
"base_estimator__multiclass_strategy": [
"estimator__multiclass_strategy": [
"ovo"
]
},
@@ -23,7 +23,7 @@
"n_estimators": [
100
],
"base_estimator__C": [
"estimator__C": [
0.001,
0.0275,
0.05,
@@ -36,10 +36,10 @@
7,
10000.0
],
"base_estimator__kernel": [
"estimator__kernel": [
"liblinear"
],
"base_estimator__multiclass_strategy": [
"estimator__multiclass_strategy": [
"ovr"
]
},
@@ -50,7 +50,7 @@
"n_estimators": [
100
],
"base_estimator__C": [
"estimator__C": [
0.05,
1.0,
1.05,
@@ -62,7 +62,7 @@
57,
10000.0
],
"base_estimator__gamma": [
"estimator__gamma": [
0.001,
0.1,
0.14,
@@ -70,10 +70,10 @@
"auto",
"scale"
],
"base_estimator__kernel": [
"estimator__kernel": [
"rbf"
],
"base_estimator__multiclass_strategy": [
"estimator__multiclass_strategy": [
"ovr"
]
},
@@ -84,20 +84,20 @@
"n_estimators": [
100
],
"base_estimator__C": [
"estimator__C": [
0.05,
0.2,
1.0,
8.25
],
"base_estimator__gamma": [
"estimator__gamma": [
0.1,
"scale"
],
"base_estimator__kernel": [
"estimator__kernel": [
"poly"
],
"base_estimator__multiclass_strategy": [
"estimator__multiclass_strategy": [
"ovo",
"ovr"
]

View File

@@ -9,7 +9,7 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 23.32 12.16 6.44 0.840160±0.0304 0.013745±0.0019 {'splitter': 'best', 'max_features': 'auto'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
*************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0422 *
*************************************************************************************************************************

View File

@@ -32,7 +32,7 @@
7;9;"0.0150468069702512"
7;10;"0.01404867172241211"
7;11;"0.002026269126958884"
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons"
8;2;"16"
8;3;"4"
@@ -44,5 +44,5 @@
8;9;"0.2850146195080759"
8;10;"0.0008541679382324218"
8;11;"3.629469326417878e-05"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"

View File

@@ -32,7 +32,7 @@
7;10;0.0150468069702512
7;11;0.01404867172241211
7;12;0.002026269126958884
7;13;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
7;13;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons"
8;2;16
8;3;4
@@ -45,7 +45,7 @@
8;10;0.2850146195080759
8;11;0.0008541679382324218
8;12;3.629469326417878e-05
8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
11;2;"✔"
11;3;1
11;4;"Equal to best"

View File

@@ -1,25 +1,28 @@
1;1;"Datasets used in benchmark ver. 0.2.0"
1;1;"Datasets used in benchmark ver. 0.4.0"
2;1;" Default score accuracy"
2;2;"Cross validation"
2;5;"5 Folds"
2;6;"5 Folds"
3;2;"Stratified"
3;5;"False"
3;6;"False"
4;2;"Discretized"
4;5;"False"
4;6;"False"
5;2;"Seeds"
5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
5;6;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]"
6;1;"Dataset"
6;2;"Samples"
6;3;"Features"
6;4;"Classes"
6;5;"Balance"
6;4;"Continuous"
6;5;"Classes"
6;6;"Balance"
7;1;"balance-scale"
7;2;"625"
7;3;"4"
7;4;"3"
7;5;" 7.84%/ 46.08%/ 46.08%"
7;4;"0"
7;5;"3"
7;6;" 7.84%/ 46.08%/ 46.08%"
8;1;"balloons"
8;2;"16"
8;3;"4"
8;4;"2"
8;5;"56.25%/ 43.75%"
8;4;"0"
8;5;"2"
8;6;"56.25%/ 43.75%"

View File

@@ -32,7 +32,7 @@
7;9;"0.0150468069702512"
7;10;"0.01404867172241211"
7;11;"0.002026269126958884"
7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
8;1;"balloons"
8;2;"16"
8;3;"4"
@@ -44,5 +44,5 @@
8;9;"0.2850146195080759"
8;10;"0.0008541679382324218"
8;11;"3.629469326417878e-05"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}"
8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}"
10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454"

View File

@@ -8,8 +8,8 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
*************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *
*************************************************************************************************************************

View File

@@ -5,7 +5,7 @@
Dataset Score File/Message Hyperparameters
============================== ======== ============================================================================ =============================================
balance-scale 0.980000 results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json {'splitter': 'best', 'max_features': 'auto'}
balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
******************************************************************************************************************************************************************
* accuracy compared to STree_default (liblinear-ovr) .: 0.0457 *
******************************************************************************************************************************************************************

View File

@@ -8,8 +8,8 @@
Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters
============================== ====== ===== === ======= ======= ======= =============== ================= ===============
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}
balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}
*************************************************************************************************************************
* ✔ Equal to best .....: 1 *
* accuracy compared to STree_default (liblinear-ovr) .: 0.0454 *

View File

@@ -1,6 +1,6 @@
Datasets used in benchmark ver. 0.2.0
Dataset Sampl. Feat. Cls Balance
============================== ====== ===== === ============================================================
balance-scale 625 4 3 7.84%/ 46.08%/ 46.08%
balloons 16 4 2 56.25%/ 43.75%
Dataset Sampl. Feat. Cont Cls Balance
============================== ====== ===== ==== === ============================================================
balance-scale 625 4 0 3 7.84%/ 46.08%/ 46.08%
balloons 16 4 0 2 56.25%/ 43.75%

View File

@@ -3,7 +3,7 @@ scikit-learn
scipy
odte
cython
mdlp-discretization
fimdlp
mufs
bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git
xlsxwriter