diff --git a/experimentation/Database.py b/experimentation/Database.py index 22d9955..d2a9325 100644 --- a/experimentation/Database.py +++ b/experimentation/Database.py @@ -45,12 +45,15 @@ class MySQL: if self._config_db["sqlite"] == "None": del self._config_db["sqlite"] self._config_db["buffered"] = True + # self._config_db["auth_plugin"] = "mysql_native_password" self._database = mysql.connector.connect(**self._config_db) + # kk = mysql.connector.connect(**self._config_db) + # self._database = sqlite3.connect("./data/stree.sqlite") + # self._database.row_factory = sqlite3.Row else: self._database = sqlite3.connect(self._config_db["sqlite"]) # return dict as a result of select self._database.row_factory = sqlite3.Row - return self._database def find_best( diff --git a/experimentation/local/.myconfig b/experimentation/local/.myconfig new file mode 100644 index 0000000..7b79ceb --- /dev/null +++ b/experimentation/local/.myconfig @@ -0,0 +1,6 @@ +host=127.0.0.1 +port=3306 +user=stree +password=xtree +database=stree +sqlite=None diff --git a/experimentation/local/.tunnel b/experimentation/local/.tunnel new file mode 100644 index 0000000..22469f7 --- /dev/null +++ b/experimentation/local/.tunnel @@ -0,0 +1,5 @@ +ssh_address_or_host=("","") +ssh_username= +ssh_private_key=/id_rsa +remote_bind_address=('127.0.0.1', 3306) +enabled=0 diff --git a/report_score.py b/report_score.py index fa8143b..a3e0e05 100644 --- a/report_score.py +++ b/report_score.py @@ -1,6 +1,7 @@ import argparse import random import time +import warnings from datetime import datetime import json import numpy as np @@ -132,8 +133,9 @@ def process_dataset(dataset, verbose, model, params): np.random.seed(random_state) kfold = KFold(shuffle=True, random_state=random_state, n_splits=5) clf = get_classifier(model, random_state, hyperparameters) - print(hyperparameters) - res = cross_validate(clf, X, y, cv=kfold, return_estimator=True) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + 
res = cross_validate(clf, X, y, cv=kfold, return_estimator=True) scores.append(res["test_score"]) times.append(res["fit_time"]) for result_item in res["estimator"]: diff --git a/report_test.py b/report_test.py new file mode 100644 index 0000000..93f045e --- /dev/null +++ b/report_test.py @@ -0,0 +1,313 @@ +import argparse +import random +import time +import json +import numpy as np +import xlsxwriter +from stree import Stree +from sklearn.model_selection import KFold, cross_validate +from experimentation.Sets import Datasets +from experimentation.Database import MySQL +from experimentation.Utils import TextColor + + +CHECK_MARK = "\N{heavy check mark}" +EXCLAMATION_MARK = "\N{heavy exclamation mark symbol}" +BLACK_STAR = "\N{black star}" + + +def parse_arguments(): + ap = argparse.ArgumentParser() + ap.add_argument( + "-x", + "--excel", + type=str, + default="", + required=False, + help="generate excel file", + ) + ap.add_argument( + "-p", "--parameters", type=str, required=False, default="{}" + ) + args = ap.parse_args() + return ( + args.parameters, + args.excel, + ) + + +def get_classifier(model, random_state, hyperparameters): + clf = Stree(random_state=random_state) + clf.set_params(**hyperparameters) + return clf + + +def process_dataset(dataset, verbose, model, params): + X, y = dt.load(dataset) + scores = [] + times = [] + nodes = [] + leaves = [] + depths = [] + hyperparameters = json.loads(params) + for random_state in random_seeds: + random.seed(random_state) + np.random.seed(random_state) + kfold = KFold(shuffle=True, random_state=random_state, n_splits=5) + clf = get_classifier(model, random_state, hyperparameters) + res = cross_validate(clf, X, y, cv=kfold, return_estimator=True) + scores.append(res["test_score"]) + times.append(res["fit_time"]) + for result_item in res["estimator"]: + nodes_item, leaves_item = result_item.nodes_leaves() + depth_item = result_item.depth_ + nodes.append(nodes_item) + leaves.append(leaves_item) + 
depths.append(depth_item) + return scores, times, json.dumps(hyperparameters), nodes, leaves, depths + + +def excel_write_line( + book, + sheet, + name, + samples, + features, + classes, + accuracy, + times, + hyperparameters, + complexity, + status, +): + try: + excel_write_line.row += 1 + except AttributeError: + excel_write_line.row = 4 + size_n = 14 + decimal = book.add_format({"num_format": "0.000000", "font_size": size_n}) + integer = book.add_format({"num_format": "#,###", "font_size": size_n}) + normal = book.add_format({"font_size": size_n}) + col = 0 + status, _ = excel_status(status) + sheet.write(excel_write_line.row, col, name, normal) + sheet.write(excel_write_line.row, col + 1, samples, integer) + sheet.write(excel_write_line.row, col + 2, features, normal) + sheet.write(excel_write_line.row, col + 3, classes, normal) + sheet.write(excel_write_line.row, col + 4, complexity["nodes"], normal) + sheet.write(excel_write_line.row, col + 5, complexity["leaves"], normal) + sheet.write(excel_write_line.row, col + 6, complexity["depth"], normal) + sheet.write(excel_write_line.row, col + 7, accuracy, decimal) + sheet.write(excel_write_line.row, col + 8, status, normal) + sheet.write(excel_write_line.row, col + 9, np.mean(times), decimal) + sheet.write(excel_write_line.row, col + 10, hyperparameters, normal) + + +def excel_write_header(book, sheet, parameters): + header = book.add_format() + header.set_font_size(18) + subheader = book.add_format() + subheader.set_font_size(16) + sheet.write( + 0, + 0, + f"Process all datasets set with STree: Parameters: {parameters} ", + header, + ) + sheet.write( + 1, + 0, + "5 Fold Cross Validation with 10 random seeds", + subheader, + ) + sheet.write(1, 5, f"{random_seeds}", subheader) + header_cols = [ + ("Dataset", 30), + ("Samples", 10), + ("Variables", 7), + ("Classes", 7), + ("Nodes", 7), + ("Leaves", 7), + ("Depth", 7), + ("Accuracy", 10), + ("Stat", 3), + ("Time", 10), + ("Parameters", 50), + ] + bold = 
book.add_format({"bold": True, "font_size": 14}) + i = 0 + for item, length in header_cols: + sheet.write(3, i, item, bold) + sheet.set_column(i, i, length) + i += 1 + + +def excel_status(status): + if status == TextColor.GREEN + CHECK_MARK + TextColor.ENDC: + return EXCLAMATION_MARK, "Accuracy better than stree optimized" + elif status == TextColor.RED + BLACK_STAR + TextColor.ENDC: + return BLACK_STAR, "Best accuracy of all models" + elif status != " ": + return CHECK_MARK, "Accuracy better than original stree_default" + return " ", "" + + +def excel_write_totals(book, sheet, totals, start, accuracy_total): + i = 2 + bold = book.add_format({"bold": True, "font_size": 16}) + for key, total in totals.items(): + status, text = excel_status(key) + sheet.write(excel_write_line.row + i, 1, status, bold) + sheet.write(excel_write_line.row + i, 2, total, bold) + sheet.write(excel_write_line.row + i, 3, text, bold) + i += 1 + message = ( + f"** Accuracy compared to stree_default (liblinear-ovr) .: " + f"{accuracy_total/40.282203:7.4f}" + ) + sheet.write(excel_write_line.row + i + 1, 0, message, bold) + time_spent = get_time(start, time.time()) + sheet.write(excel_write_line.row + i + 3, 0, time_spent, bold) + + +def compute_status(dbh, name, model, accuracy): + n_dig = 6 + ac_round = round(accuracy, n_dig) + better_default = CHECK_MARK + better_stree = TextColor.GREEN + CHECK_MARK + TextColor.ENDC + best = TextColor.RED + BLACK_STAR + TextColor.ENDC + best_default, _ = get_best_score(dbh, name, model) + best_stree, _ = get_best_score(dbh, name, "stree") + best_all, _ = get_best_score(dbh, name, models_tree) + status = better_default if ac_round > round(best_default, n_dig) else " " + status = better_stree if ac_round > round(best_stree, n_dig) else status + status = best if ac_round > round(best_all, n_dig) else status + return status + + +def get_best_score(dbh, name, model): + record = dbh.find_best(name, model, "crossval") + accuracy = record[5] if record is not None 
else 0.0 + acc_std = record[11] if record is not None else 0.0 + return accuracy, acc_std + + +def get_time(start, stop): + hours, rem = divmod(stop - start, 3600) + minutes, seconds = divmod(rem, 60) + return f"Time: {int(hours):2d}h {int(minutes):2d}m {int(seconds):2d}s" + + +random_seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1] +models_tree = [ + "stree", + "stree_default", + "wodt", + "j48svm", + "oc1", + "cart", + "baseRaF", +] +( + parameters, + excel, +) = parse_arguments() +dbh = MySQL() +if excel != "": + file_name = f"{excel}.xlsx" + excel_wb = xlsxwriter.Workbook(file_name) + excel_ws = excel_wb.add_worksheet("STree") + excel_write_header(excel_wb, excel_ws, parameters) +database = dbh.get_connection() +dt = Datasets(normalize=False, standardize=False, set_of_files="tanveer") +start = time.time() +print(f"* Process all datasets set with STree: {parameters}") +print(f"5 Fold Cross Validation with 10 random seeds {random_seeds}\n") +header_cols = [ + "Dataset", + "Samp", + "Var", + "Cls", + "Nodes", + "Leaves", + "Depth", + "Accuracy", + "Time", + "Parameters", +] +header_lengths = [30, 5, 3, 3, 7, 7, 7, 15, 15, 10] +model = "stree_default" +parameters = json.dumps(json.loads(parameters)) +if parameters != "{}" and len(parameters) > 10: + header_lengths.pop() + header_lengths.append(len(parameters)) +line_col = "" +for field, underscore in zip(header_cols, header_lengths): + print(f"{field:{underscore}s} ", end="") + line_col += "=" * underscore + " " +print(f"\n{line_col}") +totals = {} +accuracy_total = 0.0 +for dataset in dt: + name = dataset[0] + X, y = dt.load(name) # type: ignore + samples, features = X.shape + classes = len(np.unique(y)) + print( + f"{name:30s} {samples:5d} {features:3d} {classes:3d} ", + end="", + ) + scores, times, hyperparameters, nodes, leaves, depth = process_dataset( + dataset[0], verbose=False, model=model, params=parameters + ) + complexity = dict( + nodes=float(np.mean(nodes)), + leaves=float(np.mean(leaves)), + 
depth=float(np.mean(depth)), + ) + nodes_item, leaves_item, depth_item = complexity.values() + print( + f"{nodes_item:7.2f} {leaves_item:7.2f} {depth_item:7.2f} ", + end="", + ) + accuracy = np.mean(scores) + accuracy_total += accuracy + status = ( + compute_status(dbh, name, model, accuracy) + if model == "stree_default" + else " " + ) + if status != " ": + if status not in totals: + totals[status] = 1 + else: + totals[status] += 1 + print(f"{accuracy:8.6f}±{np.std(scores):6.4f}{status}", end="") + print(f"{np.mean(times):8.6f}±{np.std(times):6.4f} {hyperparameters}") + if excel != "": + excel_write_line( + excel_wb, + excel_ws, + name, + samples, + features, + classes, + accuracy, + times, + hyperparameters, + complexity, + status, + ) +for key, value in totals.items(): + print(f"{key} .....: {value:2d}") +print( + f"** Accuracy compared to stree_default (liblinear-ovr) .: " + f"{accuracy_total/40.282203:7.4f}" +) +if excel != "": + excel_write_totals(excel_wb, excel_ws, totals, start, accuracy_total) + excel_wb.close() +stop = time.time() +time_spent = get_time(start, time.time()) +print(f"{time_spent}") +dbh.close() diff --git a/score_all_cfs.py b/score_all_cfs.py index 5671989..e06669b 100755 --- a/score_all_cfs.py +++ b/score_all_cfs.py @@ -4,7 +4,6 @@ import warnings from experimentation.Sets import Datasets from stree import Stree -from mdlp import MDLP from mfs import MFS @@ -33,11 +32,9 @@ datasets = Datasets(False, False, "tanveer") header(filter_name) better = worse = equal = 0 for dataset in datasets: - # mdlp = MDLP(random_state=1) X, y = datasets.load(dataset[0]) mfs = MFS(discrete=False) now_disc = time.time() - # X_disc = mdlp.fit_transform(X, y) X_disc = X time_disc = time.time() - now_disc now_selec = time.time()