diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a78698d..243b1df 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -18,7 +18,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python }} # Make dot command available in the environment @@ -53,7 +53,7 @@ jobs: coverage run -m unittest -v benchmark.tests coverage xml - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml diff --git a/benchmark/Arguments.py b/benchmark/Arguments.py index ffc4023..1ffd623 100644 --- a/benchmark/Arguments.py +++ b/benchmark/Arguments.py @@ -123,6 +123,15 @@ class Arguments(argparse.ArgumentParser): ("-p", "--hyperparameters"), {"type": str, "required": False, "default": "{}"}, ], + "ignore_nan": [ + ("--ignore-nan",), + { + "default": False, + "action": "store_true", + "required": False, + "help": "Ignore nan results", + }, + ], "key": [ ("-k", "--key"), { diff --git a/benchmark/Datasets.py b/benchmark/Datasets.py index 2a966e7..eac6ed7 100644 --- a/benchmark/Datasets.py +++ b/benchmark/Datasets.py @@ -2,10 +2,11 @@ import os from types import SimpleNamespace import pandas as pd import numpy as np +import json from scipy.io import arff from .Utils import Files from .Arguments import EnvData -from mdlp.discretization import MDLP +from fimdlp.mdlp import FImdlp class Diterator: @@ -27,6 +28,12 @@ class DatasetsArff: def folder(): return "datasets" + @staticmethod + def get_range_features(X, c_features): + if c_features.strip() == "all": + return list(range(X.shape[1])) + return json.loads(c_features) + def load(self, name, class_name): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = arff.loadarff(file_name) @@ -34,7 +41,7 @@ class DatasetsArff: df.dropna(axis=0, how="any", inplace=True) self.dataset = df X = df.drop(class_name, axis=1) - self.features = X.columns + self.features = X.columns.to_list() self.class_name = class_name y, _ = pd.factorize(df[class_name]) X = X.to_numpy() @@ -50,6 +57,10 @@ class DatasetsTanveer: def folder(): return "data" + @staticmethod + def get_range_features(X, name): + return [] + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( @@ -75,6 +86,10 @@ class DatasetsSurcov: def folder(): return "datasets" + @staticmethod + def get_range_features(X, name): + return [] + def load(self, name, *args): file_name = os.path.join(self.folder(), self.dataset_names(name)) data = pd.read_csv( @@ -102,16 +117,16 @@ class Datasets: ) self.discretize = envData["discretize"] == "1" self.dataset = source_name() - self.class_names = [] - self.data_sets = [] # initialize self.class_names & self.data_sets class_names, sets = self._init_names(dataset_name) self.class_names = class_names self.data_sets = sets + self.states = {} # states of discretized variables def _init_names(self, dataset_name): file_name = os.path.join(self.dataset.folder(), Files.index) default_class = "class" + self.continuous_features = {} with open(file_name) as f: sets = f.read().splitlines() class_names = [default_class] * len(sets) @@ -119,10 +134,14 @@ class Datasets: result = [] class_names = [] for data in sets: - name, class_name = data.split(",") + name, class_name, features = data.split(",", 2) result.append(name) 
class_names.append(class_name) + self.continuous_features[name] = features sets = result + else: + for name in sets: + self.continuous_features[name] = None # Set as dataset list the dataset passed as argument if dataset_name is None: return class_names, sets @@ -137,6 +156,7 @@ class Datasets: self.discretize = False X, y = self.load(name) attr = SimpleNamespace() + attr.dataset = name values, counts = np.unique(y, return_counts=True) comp = "" sep = "" @@ -147,24 +167,41 @@ class Datasets: attr.classes = len(np.unique(y)) attr.samples = X.shape[0] attr.features = X.shape[1] + attr.cont_features = len(self.get_continuous_features()) self.discretize = tmp return attr def get_features(self): return self.dataset.features + def get_states(self, name): + return self.states[name] if name in self.states else None + + def get_continuous_features(self): + return self.continuous_features_dataset + def get_class_name(self): return self.dataset.class_name def get_dataset(self): return self.dataset.dataset + def build_states(self, name, X): + features = self.get_features() + self.states[name] = { + features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1]) + } + def load(self, name, dataframe=False): try: class_name = self.class_names[self.data_sets.index(name)] X, y = self.dataset.load(name, class_name) + self.continuous_features_dataset = self.dataset.get_range_features( + X, self.continuous_features[name] + ) if self.discretize: X = self.discretize_dataset(X, y) + self.build_states(name, X) dataset = pd.DataFrame(X, columns=self.get_features()) dataset[self.get_class_name()] = y self.dataset.dataset = dataset @@ -188,9 +225,8 @@ class Datasets: ------- tuple (X, y) of numpy.ndarray """ - discretiz = MDLP(random_state=17, dtype=np.int32) - Xdisc = discretiz.fit_transform(X, y) - return Xdisc + discretiz = FImdlp(algorithm=0) + return discretiz.fit_transform(X, y) def __iter__(self) -> Diterator: return Diterator(self.data_sets) diff --git a/benchmark/Experiments.py b/benchmark/Experiments.py index 6092955..6865f98 100644 --- a/benchmark/Experiments.py +++ b/benchmark/Experiments.py @@ -112,6 +112,7 @@ class Experiment: platform, title, progress_bar=True, + ignore_nan=True, folds=5, ): today = datetime.now() @@ -131,6 +132,7 @@ class Experiment: self.score_name = score_name self.model_name = model_name self.title = title + self.ignore_nan = ignore_nan self.stratified = stratified == "1" self.stratified_class = StratifiedKFold if self.stratified else KFold self.datasets = datasets @@ -184,7 +186,14 @@ class Experiment: self.leaves = [] self.depths = [] - def _n_fold_crossval(self, X, y, hyperparameters): + def _build_fit_params(self, name): + states = self.datasets.get_states(name) + if states is None: + return None + features = self.datasets.get_features() + return {"state_names": states, "features": features} + + def _n_fold_crossval(self, name, X, y, hyperparameters): if self.scores != []: raise ValueError("Must init experiment before!") loop = tqdm( @@ -201,6 +210,7 @@ class Experiment: shuffle=True, random_state=random_state, n_splits=self.folds ) clf = self._build_classifier(random_state, hyperparameters) + fit_params = self._build_fit_params(name) self.version = Models.get_version(self.model_name, clf) with warnings.catch_warnings(): warnings.filterwarnings("ignore") @@ -209,11 +219,19 @@ class Experiment: X, y, cv=kfold, + fit_params=fit_params, return_estimator=True, scoring=self.score_name, ) - self.scores.append(res["test_score"]) - self.times.append(res["fit_time"]) + if 
np.isnan(res["test_score"]).any(): + if not self.ignore_nan: + print(res["test_score"]) + raise ValueError("NaN in results") + results = res["test_score"][~np.isnan(res["test_score"])] + else: + results = res["test_score"] + self.scores.extend(results) + self.times.extend(res["fit_time"]) for result_item in res["estimator"]: nodes_item, leaves_item, depth_item = Models.get_complexity( self.model_name, result_item @@ -273,7 +291,7 @@ class Experiment: n_classes = len(np.unique(y)) hyperparameters = self.hyperparameters_dict[name][1] self._init_experiment() - self._n_fold_crossval(X, y, hyperparameters) + self._n_fold_crossval(name, X, y, hyperparameters) self._add_results(name, hyperparameters, samp, feat, n_classes) self._output_results() self.duration = time.time() - now diff --git a/benchmark/Models.py b/benchmark/Models.py index 215dbe8..2e06c5c 100644 --- a/benchmark/Models.py +++ b/benchmark/Models.py @@ -15,6 +15,24 @@ from xgboost import XGBClassifier import sklearn import xgboost +import random + + +class MockModel(SVC): + # Only used for testing + def predict(self, X): + if random.random() < 0.1: + return [float("NaN")] * len(X) + return super().predict(X) + + def nodes_leaves(self): + return 0, 0 + + def fit(self, X, y, **kwargs): + kwargs.pop("state_names", None) + kwargs.pop("features", None) + return super().fit(X, y, **kwargs) + class Models: @staticmethod @@ -22,27 +40,27 @@ class Models: return { "STree": Stree(random_state=random_state), "TAN": TAN(random_state=random_state), - "KDB": KDB(k=3), + "KDB": KDB(k=2), "AODE": AODE(random_state=random_state), "Cart": DecisionTreeClassifier(random_state=random_state), "ExtraTree": ExtraTreeClassifier(random_state=random_state), "Wodt": Wodt(random_state=random_state), "SVC": SVC(random_state=random_state), "ODTE": Odte( - base_estimator=Stree(random_state=random_state), + estimator=Stree(random_state=random_state), random_state=random_state, ), "BaggingStree": BaggingClassifier( - base_estimator=Stree(random_state=random_state), + estimator=Stree(random_state=random_state), random_state=random_state, ), "BaggingWodt": BaggingClassifier( - base_estimator=Wodt(random_state=random_state), + estimator=Wodt(random_state=random_state), random_state=random_state, ), "XGBoost": XGBClassifier(random_state=random_state), "AdaBoostStree": AdaBoostClassifier( - base_estimator=Stree( + estimator=Stree( random_state=random_state, ), algorithm="SAMME", @@ -50,6 +68,7 @@ class Models: ), "GBC": GradientBoostingClassifier(random_state=random_state), "RandomForest": RandomForestClassifier(random_state=random_state), + "Mock": MockModel(random_state=random_state), } @staticmethod diff --git a/benchmark/Results.py b/benchmark/Results.py index 28376b0..2188ea3 100644 --- a/benchmark/Results.py +++ b/benchmark/Results.py @@ -684,7 +684,7 @@ class ReportDatasets: "bg_color": self.color1, } ) - self.sheet.merge_range(0, 0, 0, 4, self.header_text, merge_format) + self.sheet.merge_range(0, 0, 0, 5, self.header_text, merge_format) self.sheet.merge_range( 1, 0, @@ -697,24 +697,24 @@ class ReportDatasets: 1, 1, 1, - 3, + 4, "Cross validation", merge_format_subheader_right, ) self.sheet.write( - 1, 4, f"{self.env['n_folds']} Folds", merge_format_subheader_left + 1, 5, f"{self.env['n_folds']} Folds", merge_format_subheader_left ) self.sheet.merge_range( 2, 1, 2, - 3, + 4, "Stratified", merge_format_subheader_right, ) self.sheet.write( 2, - 4, + 5, f"{'True' if self.env['stratified']=='1' else 'False'}", merge_format_subheader_left, ) @@ -722,13 +722,13 @@ 
class ReportDatasets: 3, 1, 3, - 3, + 4, "Discretized", merge_format_subheader_right, ) self.sheet.write( 3, - 4, + 5, f"{'True' if self.env['discretize']=='1' else 'False'}", merge_format_subheader_left, ) @@ -736,18 +736,19 @@ class ReportDatasets: 4, 1, 4, - 3, + 4, "Seeds", merge_format_subheader_right, ) self.sheet.write( - 4, 4, f"{self.env['seeds']}", merge_format_subheader_left + 4, 5, f"{self.env['seeds']}", merge_format_subheader_left ) self.update_max_length(len(self.env["seeds"]) + 1) header_cols = [ ("Dataset", 30), ("Samples", 10), ("Features", 10), + ("Continuous", 10), ("Classes", 10), ("Balance", 50), ] @@ -767,7 +768,7 @@ class ReportDatasets: def footer(self): # set Balance column width to max length - self.sheet.set_column(4, 4, self.max_length) + self.sheet.set_column(5, 5, self.max_length) self.sheet.freeze_panes(6, 1) self.sheet.hide_gridlines(2) if self.close: @@ -789,8 +790,9 @@ class ReportDatasets: self.sheet.write(self.row, col, result.dataset, normal) self.sheet.write(self.row, col + 1, result.samples, integer) self.sheet.write(self.row, col + 2, result.features, integer) - self.sheet.write(self.row, col + 3, result.classes, normal) - self.sheet.write(self.row, col + 4, result.balance, normal) + self.sheet.write(self.row, col + 3, result.cont_features, integer) + self.sheet.write(self.row, col + 4, result.classes, normal) + self.sheet.write(self.row, col + 5, result.balance, normal) self.update_max_length(len(result.balance)) self.row += 1 @@ -807,11 +809,11 @@ class ReportDatasets: print(color_line, end="") print(self.header_text) print("") - print(f"{'Dataset':30s} Sampl. Feat. Cls Balance") - print("=" * 30 + " ====== ===== === " + "=" * 60) + print(f"{'Dataset':30s} Sampl. Feat. Cont Cls Balance") + print("=" * 30 + " ====== ===== ==== === " + "=" * 60) for dataset in data_sets: attributes = data_sets.get_attributes(dataset) - attributes.dataset = dataset + if self.excel: self.print_line(attributes) color_line = ( @@ -823,8 +825,8 @@ class ReportDatasets: print(color_line, end="") print( f"{dataset:30s} {attributes.samples:6,d} " - f"{attributes.features:5,d} {attributes.classes:3d} " - f"{attributes.balance:40s}" + f"{attributes.features:5,d} {attributes.cont_features:4,d}" + f" {attributes.classes:3d} {attributes.balance:40s}" ) if self.excel: self.footer() diff --git a/benchmark/scripts/be_build_grid.py b/benchmark/scripts/be_build_grid.py index d4ea653..5dd5bb7 100755 --- a/benchmark/scripts/be_build_grid.py +++ b/benchmark/scripts/be_build_grid.py @@ -46,7 +46,7 @@ def main(args_test=None): '{"C": 7, "gamma": 0.1, "kernel": "rbf", "multiclass_strategy": ' '"ovr"}', '{"C": 5, "kernel": "rbf", "gamma": "auto"}', - '{"C": 0.05, "max_iter": 10000.0, "kernel": "liblinear", ' + '{"C": 0.05, "max_iter": 10000, "kernel": "liblinear", ' '"multiclass_strategy": "ovr"}', '{"C":0.0275, "kernel": "liblinear", "multiclass_strategy": "ovr"}', '{"C": 7, "gamma": 10.0, "kernel": "rbf", "multiclass_strategy": ' @@ -97,7 +97,7 @@ def main(args_test=None): for item in results: results_tmp = {"n_jobs": [-1], "n_estimators": [100]} for key, value in results[item].items(): - new_key = f"base_estimator__{key}" + new_key = f"estimator__{key}" try: results_tmp[new_key] = sorted(value) except TypeError: @@ -111,6 +111,7 @@ def main(args_test=None): t2 = sorted([x for x in value if isinstance(x, str)]) results_tmp[new_key] = t1 + t2 output.append(results_tmp) + # save results file_name = Files.grid_input(args.score, args.model) file_output = os.path.join(Folders.results, 
file_name) diff --git a/benchmark/scripts/be_main.py b/benchmark/scripts/be_main.py index 2786967..7d34ac3 100755 --- a/benchmark/scripts/be_main.py +++ b/benchmark/scripts/be_main.py @@ -13,7 +13,7 @@ def main(args_test=None): arguments = Arguments(prog="be_main") arguments.xset("stratified").xset("score").xset("model", mandatory=True) arguments.xset("n_folds").xset("platform").xset("quiet").xset("title") - arguments.xset("report") + arguments.xset("report").xset("ignore_nan") arguments.add_exclusive( ["grid_paramfile", "best_paramfile", "hyperparameters"] ) @@ -35,6 +35,7 @@ def main(args_test=None): grid_paramfile=args.grid_paramfile, progress_bar=not args.quiet, platform=args.platform, + ignore_nan=args.ignore_nan, title=args.title, folds=args.n_folds, ) diff --git a/benchmark/tests/.env b/benchmark/tests/.env index 9641efa..f554eaf 100644 --- a/benchmark/tests/.env +++ b/benchmark/tests/.env @@ -6,4 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] -discretize=0 +discretize=0 \ No newline at end of file diff --git a/benchmark/tests/.env.dist b/benchmark/tests/.env.dist index 9641efa..f554eaf 100644 --- a/benchmark/tests/.env.dist +++ b/benchmark/tests/.env.dist @@ -6,4 +6,4 @@ stratified=0 # Source of data Tanveer/Surcov source_data=Tanveer seeds=[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] -discretize=0 +discretize=0 \ No newline at end of file diff --git a/benchmark/tests/BestResults_test.py b/benchmark/tests/BestResults_test.py index 76a5ea8..f4df24b 100644 --- a/benchmark/tests/BestResults_test.py +++ b/benchmark/tests/BestResults_test.py @@ -18,7 +18,7 @@ class BestResultTest(TestBase): "C": 7, "gamma": 0.1, "kernel": "rbf", - "max_iter": 10000.0, + "max_iter": 10000, "multiclass_strategy": "ovr", }, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json", diff --git a/benchmark/tests/Dataset_test.py b/benchmark/tests/Dataset_test.py index 3c1ca49..ce5521c 100644 --- a/benchmark/tests/Dataset_test.py +++ b/benchmark/tests/Dataset_test.py @@ -1,4 +1,3 @@ -import shutil from .TestBase import TestBase from ..Experiments import Randomized from ..Datasets import Datasets @@ -17,10 +16,6 @@ class DatasetTest(TestBase): self.set_env(".env.dist") return super().tearDown() - @staticmethod - def set_env(env): - shutil.copy(env, ".env") - def test_Randomized(self): expected = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] self.assertSequenceEqual(Randomized.seeds(), expected) diff --git a/benchmark/tests/Experiment_test.py b/benchmark/tests/Experiment_test.py index 0f8ffad..4ac6b24 100644 --- a/benchmark/tests/Experiment_test.py +++ b/benchmark/tests/Experiment_test.py @@ -1,4 +1,6 @@ import json +from io import StringIO +from unittest.mock import patch from .TestBase import TestBase from ..Experiments import Experiment from ..Datasets import Datasets @@ -8,10 +10,12 @@ class ExperimentTest(TestBase): def setUp(self): self.exp = self.build_exp() - def build_exp(self, hyperparams=False, grid=False): + def build_exp( + self, hyperparams=False, grid=False, model="STree", ignore_nan=False + ): params = { "score_name": "accuracy", - "model_name": "STree", + "model_name": model, "stratified": "0", "datasets": Datasets(), "hyperparams_dict": "{}", @@ -21,6 +25,7 @@ class ExperimentTest(TestBase): "title": "Test", "progress_bar": False, "folds": 2, + "ignore_nan": ignore_nan, } return Experiment(**params) @@ -31,6 +36,7 @@ class ExperimentTest(TestBase): ], ".", ) + self.set_env(".env.dist") return super().tearDown() def 
test_build_hyperparams_file(self): @@ -46,7 +52,7 @@ class ExperimentTest(TestBase): "C": 7, "gamma": 0.1, "kernel": "rbf", - "max_iter": 10000.0, + "max_iter": 10000, "multiclass_strategy": "ovr", }, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json", @@ -89,7 +95,7 @@ class ExperimentTest(TestBase): def test_exception_n_fold_crossval(self): self.exp.do_experiment() with self.assertRaises(ValueError): - self.exp._n_fold_crossval([], [], {}) + self.exp._n_fold_crossval("", [], [], {}) def test_do_experiment(self): self.exp.do_experiment() @@ -131,3 +137,39 @@ class ExperimentTest(TestBase): ): for key, value in expected_result.items(): self.assertEqual(computed_result[key], value) + + def test_build_fit_parameters(self): + self.set_env(".env.arff") + expected = { + "state_names": { + "sepallength": [0, 1, 2], + "sepalwidth": [0, 1, 3, 4], + "petallength": [0, 1, 2, 3], + "petalwidth": [0, 1, 2, 3], + }, + "features": [ + "sepallength", + "sepalwidth", + "petallength", + "petalwidth", + ], + } + exp = self.build_exp(model="TAN") + X, y = exp.datasets.load("iris") + computed = exp._build_fit_params("iris") + for key, value in expected["state_names"].items(): + self.assertEqual(computed["state_names"][key], value) + for feature in expected["features"]: + self.assertIn(feature, computed["features"]) + + @patch("sys.stdout", new_callable=StringIO) + def test_experiment_with_nan_not_ignored(self, mock_output): + exp = self.build_exp(model="Mock") + self.assertRaises(ValueError, exp.do_experiment) + output_text = mock_output.getvalue().splitlines() + expected = "[ nan 0.8974359]" + self.assertEqual(expected, output_text[0]) + + def test_experiment_with_nan_ignored(self): + self.exp = self.build_exp(model="Mock", ignore_nan=True) + self.exp.do_experiment() diff --git a/benchmark/tests/Models_test.py b/benchmark/tests/Models_test.py index 911cc95..e804f95 100644 --- a/benchmark/tests/Models_test.py +++ b/benchmark/tests/Models_test.py @@ -70,19 +70,19 @@ class ModelTest(TestBase): def test_BaggingStree(self): clf = Models.get_model("BaggingStree") self.assertIsInstance(clf, BaggingClassifier) - clf_base = clf.base_estimator + clf_base = clf.estimator self.assertIsInstance(clf_base, Stree) def test_BaggingWodt(self): clf = Models.get_model("BaggingWodt") self.assertIsInstance(clf, BaggingClassifier) - clf_base = clf.base_estimator + clf_base = clf.estimator self.assertIsInstance(clf_base, Wodt) def test_AdaBoostStree(self): clf = Models.get_model("AdaBoostStree") self.assertIsInstance(clf, AdaBoostClassifier) - clf_base = clf.base_estimator + clf_base = clf.estimator self.assertIsInstance(clf_base, Stree) def test_unknown_classifier(self): diff --git a/benchmark/tests/TestBase.py b/benchmark/tests/TestBase.py index 96d5e7d..b25bc81 100644 --- a/benchmark/tests/TestBase.py +++ b/benchmark/tests/TestBase.py @@ -4,6 +4,7 @@ import pathlib import sys import csv import unittest +import shutil from importlib import import_module from io import StringIO from unittest.mock import patch @@ -19,6 +20,10 @@ class TestBase(unittest.TestCase): self.stree_version = "1.2.4" super().__init__(*args, **kwargs) + @staticmethod + def set_env(env): + shutil.copy(env, ".env") + def remove_files(self, files, folder): for file_name in files: file_name = os.path.join(folder, file_name) diff --git a/benchmark/tests/datasets/all.txt b/benchmark/tests/datasets/all.txt index ddf732a..48584fd 100644 --- a/benchmark/tests/datasets/all.txt +++ b/benchmark/tests/datasets/all.txt @@ -1,2 +1,2 @@ -iris,class -wine,class 
+iris,class,all +wine,class,[0, 1] diff --git a/benchmark/tests/results/best_results_accuracy_STree.json b/benchmark/tests/results/best_results_accuracy_STree.json index 1cbd82e..4a4ee75 100644 --- a/benchmark/tests/results/best_results_accuracy_STree.json +++ b/benchmark/tests/results/best_results_accuracy_STree.json @@ -1 +1 @@ -{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000.0, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]} \ No newline at end of file +{"balance-scale": [0.98, {"splitter": "best", "max_features": "auto"}, "results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json"], "balloons": [0.86, {"C": 7, "gamma": 0.1, "kernel": "rbf", "max_iter": 10000, "multiclass_strategy": "ovr"}, "results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json"]} \ No newline at end of file diff --git a/benchmark/tests/results/results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json b/benchmark/tests/results/results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json index 197b1f6..4988acf 100644 --- a/benchmark/tests/results/results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json +++ b/benchmark/tests/results/results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json @@ -17,10 +17,10 @@ "features": 4, "classes": 3, "hyperparameters": { - "C": 10000.0, + "C": 10000, "gamma": 0.1, "kernel": "rbf", - "max_iter": 10000.0, + "max_iter": 10000, "multiclass_strategy": "ovr" }, "nodes": 7.0, @@ -40,7 +40,7 @@ "C": 7, "gamma": 0.1, "kernel": "rbf", - "max_iter": 10000.0, + "max_iter": 10000, "multiclass_strategy": "ovr" }, "nodes": 3.0, diff --git a/benchmark/tests/scripts/Be_Print_Strees_test.py b/benchmark/tests/scripts/Be_Print_Strees_test.py index 3e7dde9..a234a2c 100644 --- a/benchmark/tests/scripts/Be_Print_Strees_test.py +++ b/benchmark/tests/scripts/Be_Print_Strees_test.py @@ -27,7 +27,7 @@ class BePrintStrees(TestBase): stdout.getvalue(), f"File {file_name} generated\n" ) computed_size = os.path.getsize(file_name) - self.assertGreater(computed_size, 25000) + self.assertGreater(computed_size, 24500) def test_be_print_strees_dataset_color(self): for name in self.datasets: diff --git a/benchmark/tests/test_files/be_build_grid.test b/benchmark/tests/test_files/be_build_grid.test index 332abc1..f3549e2 100644 --- a/benchmark/tests/test_files/be_build_grid.test +++ b/benchmark/tests/test_files/be_build_grid.test @@ -6,13 +6,13 @@ "n_estimators": [ 100 ], - "base_estimator__C": [ + "estimator__C": [ 1.0 ], - "base_estimator__kernel": [ + "estimator__kernel": [ "linear" ], - "base_estimator__multiclass_strategy": [ + "estimator__multiclass_strategy": [ "ovo" ] }, @@ -23,7 +23,7 @@ "n_estimators": [ 100 ], - "base_estimator__C": [ + "estimator__C": [ 0.001, 0.0275, 0.05, @@ -36,10 +36,10 @@ 7, 10000.0 ], - "base_estimator__kernel": [ + "estimator__kernel": [ "liblinear" ], - "base_estimator__multiclass_strategy": [ + "estimator__multiclass_strategy": [ "ovr" ] }, @@ -50,7 +50,7 @@ "n_estimators": [ 100 ], - "base_estimator__C": [ + "estimator__C": [ 0.05, 1.0, 1.05, @@ -62,7 +62,7 @@ 57, 10000.0 ], - "base_estimator__gamma": [ + "estimator__gamma": [ 0.001, 0.1, 0.14, @@ -70,10 +70,10 @@ "auto", "scale" ], - "base_estimator__kernel": [ + "estimator__kernel": [ "rbf" ], - "base_estimator__multiclass_strategy": [ + "estimator__multiclass_strategy": [ "ovr" ] }, @@ -84,20 +84,20 @@ "n_estimators": [ 100 ], - 
"base_estimator__C": [ + "estimator__C": [ 0.05, 0.2, 1.0, 8.25 ], - "base_estimator__gamma": [ + "estimator__gamma": [ 0.1, "scale" ], - "base_estimator__kernel": [ + "estimator__kernel": [ "poly" ], - "base_estimator__multiclass_strategy": [ + "estimator__multiclass_strategy": [ "ovo", "ovr" ] diff --git a/benchmark/tests/test_files/be_main_best.test b/benchmark/tests/test_files/be_main_best.test index a18372b..a694c57 100644 --- a/benchmark/tests/test_files/be_main_best.test +++ b/benchmark/tests/test_files/be_main_best.test @@ -9,7 +9,7 @@ Dataset Sampl. Feat. Cls Nodes Leaves Depth Score Time Hyperparameters ============================== ====== ===== === ======= ======= ======= =============== ================= =============== balance-scale 625 4 3 23.32 12.16 6.44 0.840160±0.0304 0.013745±0.0019 {'splitter': 'best', 'max_features': 'auto'} -balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} +balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000388±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} ************************************************************************************************************************* * accuracy compared to STree_default (liblinear-ovr) .: 0.0422 * ************************************************************************************************************************* diff --git a/benchmark/tests/test_files/excel.test b/benchmark/tests/test_files/excel.test index dbb9675..33e813f 100644 --- a/benchmark/tests/test_files/excel.test +++ b/benchmark/tests/test_files/excel.test @@ -32,7 +32,7 @@ 7;9;"0.0150468069702512" 7;10;"0.01404867172241211" 7;11;"0.002026269126958884" -7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 8;1;"balloons" 8;2;"16" 8;3;"4" @@ -44,5 +44,5 @@ 8;9;"0.2850146195080759" 8;10;"0.0008541679382324218" 8;11;"3.629469326417878e-05" -8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454" \ No newline at end of file diff --git a/benchmark/tests/test_files/excel_compared.test b/benchmark/tests/test_files/excel_compared.test index 2b44c1e..92b678d 100644 --- a/benchmark/tests/test_files/excel_compared.test +++ b/benchmark/tests/test_files/excel_compared.test @@ -32,7 +32,7 @@ 7;10;0.0150468069702512 7;11;0.01404867172241211 7;12;0.002026269126958884 -7;13;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +7;13;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 8;1;"balloons" 8;2;16 8;3;4 @@ -45,7 +45,7 @@ 8;10;0.2850146195080759 8;11;0.0008541679382324218 8;12;3.629469326417878e-05 -8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +8;13;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 11;2;"✔" 11;3;1 11;4;"Equal to best" diff --git a/benchmark/tests/test_files/exreport_excel_Datasets.test b/benchmark/tests/test_files/exreport_excel_Datasets.test index 5c2f35a..054b981 100644 --- a/benchmark/tests/test_files/exreport_excel_Datasets.test +++ 
b/benchmark/tests/test_files/exreport_excel_Datasets.test @@ -1,25 +1,28 @@ -1;1;"Datasets used in benchmark ver. 0.2.0" +1;1;"Datasets used in benchmark ver. 0.4.0" 2;1;" Default score accuracy" 2;2;"Cross validation" -2;5;"5 Folds" +2;6;"5 Folds" 3;2;"Stratified" -3;5;"False" +3;6;"False" 4;2;"Discretized" -4;5;"False" +4;6;"False" 5;2;"Seeds" -5;5;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]" +5;6;"[57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]" 6;1;"Dataset" 6;2;"Samples" 6;3;"Features" -6;4;"Classes" -6;5;"Balance" +6;4;"Continuous" +6;5;"Classes" +6;6;"Balance" 7;1;"balance-scale" 7;2;"625" 7;3;"4" -7;4;"3" -7;5;" 7.84%/ 46.08%/ 46.08%" +7;4;"0" +7;5;"3" +7;6;" 7.84%/ 46.08%/ 46.08%" 8;1;"balloons" 8;2;"16" 8;3;"4" -8;4;"2" -8;5;"56.25%/ 43.75%" +8;4;"0" +8;5;"2" +8;6;"56.25%/ 43.75%" diff --git a/benchmark/tests/test_files/exreport_excel_STree.test b/benchmark/tests/test_files/exreport_excel_STree.test index 6a164b5..dd1728a 100644 --- a/benchmark/tests/test_files/exreport_excel_STree.test +++ b/benchmark/tests/test_files/exreport_excel_STree.test @@ -32,7 +32,7 @@ 7;9;"0.0150468069702512" 7;10;"0.01404867172241211" 7;11;"0.002026269126958884" -7;12;"{'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +7;12;"{'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 8;1;"balloons" 8;2;"16" 8;3;"4" @@ -44,5 +44,5 @@ 8;9;"0.2850146195080759" 8;10;"0.0008541679382324218" 8;11;"3.629469326417878e-05" -8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'}" +8;12;"{'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'}" 10;1;"** accuracy compared to STree_default (liblinear-ovr) .: 0.0454" diff --git a/benchmark/tests/test_files/report.test b/benchmark/tests/test_files/report.test index 0e5ffa9..c4b783f 100644 --- a/benchmark/tests/test_files/report.test +++ b/benchmark/tests/test_files/report.test @@ -8,8 +8,8 @@ Dataset Sampl. Feat. 
Cls Nodes Leaves Depth Score Time Hyperparameters ============================== ====== ===== === ======= ======= ======= =============== ================= =============== -balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} -balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} +balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} +balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} ************************************************************************************************************************* * accuracy compared to STree_default (liblinear-ovr) .: 0.0454 * ************************************************************************************************************************* diff --git a/benchmark/tests/test_files/report_best.test b/benchmark/tests/test_files/report_best.test index 03ffc30..77beb05 100644 --- a/benchmark/tests/test_files/report_best.test +++ b/benchmark/tests/test_files/report_best.test @@ -5,7 +5,7 @@ Dataset Score File/Message Hyperparameters ============================== ======== ============================================================================ ============================================= balance-scale 0.980000 results_accuracy_STree_iMac27_2021-10-27_09:40:40_0.json {'splitter': 'best', 'max_features': 'auto'} -balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} +balloons 0.860000 results_accuracy_STree_iMac27_2021-09-30_11:42:07_0.json {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} ****************************************************************************************************************************************************************** * accuracy compared to STree_default (liblinear-ovr) .: 0.0457 * ****************************************************************************************************************************************************************** diff --git a/benchmark/tests/test_files/report_compared.test b/benchmark/tests/test_files/report_compared.test index 103fb5d..6229759 100644 --- a/benchmark/tests/test_files/report_compared.test +++ b/benchmark/tests/test_files/report_compared.test @@ -8,8 +8,8 @@ Dataset Sampl. Feat. 
Cls Nodes Leaves Depth Score Time Hyperparameters ============================== ====== ===== === ======= ======= ======= =============== ================= =============== -balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000.0, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} -balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000.0, 'multiclass_strategy': 'ovr'} +balance-scale 625 4 3 7.00 4.00 3.00 0.970560±0.0150 0.014049±0.0020 {'C': 10000, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} +balloons 16 4 2 3.00 2.00 2.00 0.860000±0.2850✔ 0.000854±0.0000 {'C': 7, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': 10000, 'multiclass_strategy': 'ovr'} ************************************************************************************************************************* * ✔ Equal to best .....: 1 * * accuracy compared to STree_default (liblinear-ovr) .: 0.0454 * diff --git a/benchmark/tests/test_files/report_datasets.test b/benchmark/tests/test_files/report_datasets.test index 16c7bd7..3fa0aeb 100644 --- a/benchmark/tests/test_files/report_datasets.test +++ b/benchmark/tests/test_files/report_datasets.test @@ -1,6 +1,6 @@ Datasets used in benchmark ver. 0.2.0 -Dataset Sampl. Feat. Cls Balance -============================== ====== ===== === ============================================================ -balance-scale 625 4 3 7.84%/ 46.08%/ 46.08% -balloons 16 4 2 56.25%/ 43.75% +Dataset Sampl. Feat. Cont Cls Balance +============================== ====== ===== ==== === ============================================================ +balance-scale 625 4 0 3 7.84%/ 46.08%/ 46.08% +balloons 16 4 0 2 56.25%/ 43.75% diff --git a/requirements.txt b/requirements.txt index cba834e..32065e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ scikit-learn scipy odte cython -mdlp-discretization +fimdlp mufs bayesclass @ git+ssh://git@github.com/doctorado-ml/bayesclass.git xlsxwriter
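
Notes on selected changes (illustrative sketches, not part of the patch):

The `--ignore-nan` switch added to `benchmark/Arguments.py` and wired through `be_main.py` into `Experiment` is a plain argparse boolean flag. The sketch below uses a throwaway parser rather than the project's `Arguments` wrapper, just to show the `store_true` pattern registered under the `ignore_nan` key.

```python
import argparse

# Stand-alone parser, not the project's Arguments class: it only demonstrates
# the store_true pattern the patch registers for "--ignore-nan".
parser = argparse.ArgumentParser(prog="be_main")
parser.add_argument(
    "--ignore-nan",
    action="store_true",
    default=False,
    required=False,
    help="Ignore nan results",
)

print(parser.parse_args([]).ignore_nan)                # False (default)
print(parser.parse_args(["--ignore-nan"]).ignore_nan)  # True
```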
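
`Datasets.discretize_dataset` swaps `mdlp.discretization.MDLP` for `fimdlp.mdlp.FImdlp`, matching the `requirements.txt` change from `mdlp-discretization` to `fimdlp`. A hedged sketch of the call pattern, assuming only what the patch itself uses — the `FImdlp(algorithm=0)` constructor and `fit_transform`:

```python
import numpy as np
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp  # replaces: from mdlp.discretization import MDLP

X, y = load_iris(return_X_y=True)
discretizer = FImdlp(algorithm=0)      # same arguments as in discretize_dataset
Xd = discretizer.fit_transform(X, y)   # continuous columns -> integer bin codes
print(np.unique(Xd[:, 0]))             # per-feature codes later stored in self.states
```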
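
The dataset index (`datasets/all.txt`) gains a third field describing which columns are continuous: `all` or a JSON list of column indices. `Datasets._init_names` splits with `maxsplit=2` so the commas inside the JSON list survive, and `DatasetsArff.get_range_features` turns the field into a list of indices. A small sketch mirroring both pieces:

```python
import json
import numpy as np

def get_range_features(X, c_features):
    # Mirrors DatasetsArff.get_range_features: "all" means every column,
    # anything else is parsed as a JSON list of column indices.
    if c_features.strip() == "all":
        return list(range(X.shape[1]))
    return json.loads(c_features)

# maxsplit=2 keeps the comma inside "[0, 1]" from being split away
name, class_name, features = "wine,class,[0, 1]".split(",", 2)

X = np.zeros((10, 13))
print(get_range_features(X, "all"))     # [0, 1, ..., 12]
print(get_range_features(X, features))  # [0, 1]
```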
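
After discretization, `Datasets.build_states` records the possible values of every (now integer-coded) feature, keyed by feature name — the same structure `test_build_fit_parameters` checks for the iris dataset. A minimal reproduction of the dictionary comprehension:

```python
import numpy as np

features = ["sepallength", "sepalwidth"]
X = np.array([[0, 1],
              [1, 1],
              [2, 0],
              [0, 2]])
states = {
    features[i]: np.unique(X[:, i]).tolist() for i in range(X.shape[1])
}
print(states)  # {'sepallength': [0, 1, 2], 'sepalwidth': [0, 1, 2]}
```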
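
`Experiment._build_fit_params` packages those states and feature names, and `_n_fold_crossval` now forwards them through `cross_validate(..., fit_params=...)`, so they arrive as keyword arguments of the classifier's `fit`. The sketch below uses a hypothetical `DemoClassifier` (not a project class) to show the hand-off; entries whose length differs from the number of samples are passed through to `fit` unchanged. Note that recent scikit-learn releases are renaming this argument to `params`.

```python
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_validate


class DemoClassifier(ClassifierMixin, BaseEstimator):
    """Stand-in estimator whose fit() accepts the extra metadata."""

    def fit(self, X, y, state_names=None, features=None):
        self.state_names_ = state_names   # metadata delivered via fit_params
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        return np.full(len(X), self.classes_[0])


X, y = load_iris(return_X_y=True)
fit_params = {
    "state_names": {"sepallength": [0, 1, 2]},
    "features": ["sepallength", "sepalwidth", "petallength", "petalwidth"],
}
res = cross_validate(DemoClassifier(), X, y, cv=3, fit_params=fit_params)
print(res["test_score"])  # one accuracy value per fold
```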
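
The NaN handling in `_n_fold_crossval` exists because the tests' `MockModel` occasionally returns NaN predictions, which surfaces as NaN entries in `res["test_score"]`. With `--ignore-nan` those folds are dropped before aggregating; otherwise the scores are printed and a `ValueError` is raised, which is what `test_experiment_with_nan_not_ignored` asserts. The core of the branch, extracted:

```python
import numpy as np

def filter_scores(test_score, ignore_nan):
    # Same branch as in _n_fold_crossval, applied to the per-fold score array
    if np.isnan(test_score).any():
        if not ignore_nan:
            print(test_score)
            raise ValueError("NaN in results")
        return test_score[~np.isnan(test_score)]
    return test_score

scores = np.array([np.nan, 0.8974359])
print(filter_scores(scores, ignore_nan=True))   # [0.8974359]
# filter_scores(scores, ignore_nan=False) would print the array and raise
```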
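
Several changes (`Models.py`, `be_build_grid.py`, the grid test fixtures) track scikit-learn's rename of the ensemble parameter `base_estimator` to `estimator` (deprecated around version 1.2), which also changes the nested hyperparameter prefix from `base_estimator__*` to `estimator__*`. A quick check with stock estimators:

```python
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

clf = BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=3))
print(type(clf.estimator).__name__)                # DecisionTreeClassifier
print("estimator__max_depth" in clf.get_params())  # True: grid keys now use estimator__
```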
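
The `ReportDatasets` worksheet gains a `Continuous` column (fed by the new `attr.cont_features`), which is why every later `write`/`merge_range` call, and the text report's column layout, shift one position to the right. A hedged, layout-only sketch with xlsxwriter; the row indices are illustrative, not the report's exact layout:

```python
import xlsxwriter

workbook = xlsxwriter.Workbook("datasets_demo.xlsx")
sheet = workbook.add_worksheet()

# The title now spans columns 0..5 instead of 0..4 because of the extra column.
sheet.merge_range(0, 0, 0, 5, "Datasets used in benchmark")

headers = ["Dataset", "Samples", "Features", "Continuous", "Classes", "Balance"]
for col, title in enumerate(headers):
    sheet.write(5, col, title)  # header row index chosen for illustration only

workbook.close()
```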