mirror of https://github.com/Doctorado-ML/Stree_datasets.git
synced 2025-08-15 23:46:03 +00:00
289 lines
8.1 KiB
Python
import argparse
from typing import Tuple

import numpy as np

from experimentation.Sets import Datasets
from experimentation.Utils import TextColor
from experimentation.Database import MySQL

report_csv = "report.csv"
table_tex = "table.tex"
models_tree = [
    "stree",
    "stree_default",
    "wodt",
    "j48svm",
    "oc1",
    "cart",
    "baseRaF",
]
models_ensemble = ["odte", "adaBoost", "bagging", "TBRaF", "TBRoF", "TBRRoF"]
description = ["samp", "var", "cls"]
complexity = ["nodes", "leaves", "depth"]
title = "Best model results"
lengths = [30, 4, 3, 3, 3, 3, 3, 12, 12, 12, 12, 12, 12, 12]
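# lengths holds the console column widths and mirrors the fields tuple built
# below: one entry for the dataset name, one for each descriptor
# (samp/var/cls and nodes/leaves/depth) and one 12-character column per model.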


def parse_arguments() -> Tuple[str, str, bool, bool, bool]:
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "-e",
        "--experiment",
        type=str,
        choices=["gridsearch", "crossval", "any"],
        required=False,
        default="gridsearch",
    )
    ap.add_argument(
        "-m",
        "--model",
        type=str,
        choices=["tree", "ensemble"],
        required=False,
        default="tree",
    )
    # Boolean switches: passing the flag enables the option.
    ap.add_argument(
        "-c",
        "--csv-output",
        action="store_true",
        required=False,
        default=False,
    )
    ap.add_argument(
        "-t",
        "--tex-output",
        action="store_true",
        required=False,
        default=False,
    )
    ap.add_argument(
        "-o",
        "--compare",
        action="store_true",
        required=False,
        default=False,
    )
    args = ap.parse_args()
    return (
        args.experiment,
        args.model,
        args.csv_output,
        args.tex_output,
        args.compare,
    )
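
# Example invocation (the script name is hypothetical, shown only for
# illustration):
#   python best_results.py --experiment crossval --model ensemble --tex-output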


def print_header_tex(file_tex, second=False):
    # old_header = (
    #     "\\begin{table}[ht]\n"
    #     "\\centering"
    #     "\\resizebox{\\textwidth}{!}{\\begin{tabular}{|r|l|r|r|r|c|c|c|c|c|c|"
    #     "c"
    #     "|}"
    #     "\\hline\n"
    #     "\\# & Dataset & Samples & Features & Classes & stree & stree def. & "
    #     "wodt & j48svm & oc1 & cart & baseRaF\\\\\n"
    #     "\\hline"
    # )
    cont = ""
    num = ""
    if second:
        cont = " (cont.)"
        num = "2"
    header = (
        "\\begin{sidewaystable}[ht]\n"
        "\\centering\n"
        "\\renewcommand{\\arraystretch}{1.2}\n"
        "\\renewcommand{\\tabcolsep}{0.07cm}\n"
        "\\caption{Datasets used during the experimentation" + cont + "}\n"
        "\\label{table:datasets" + num + "}\n"
        "\\resizebox{0.95\\textwidth}{!}{\n"
        "\\begin{tabular}{rlrrrccccccc}\\hline\n"
        "\\# & Dataset & \\#S & \\#F & \\#L & stree & stree default & wodt & "
        "j48svm & oc1 & cart & baseRaF\\\\\n"
        "\\hline\n"
    )
    print(header, file=file_tex)
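
# print_footer_tex below emits the matching closing fragment for the
# sidewaystable opened here; the driver code splits the dataset table into a
# second sidewaystable after 25 rows (see the number == 24 check in the main
# loop).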


def print_line_tex(number, dataset, line, file_tex):
    dataset_name = dataset.replace("_", "\\_")
    print_line = (
        f"{number} & {dataset_name} & {line['samp']} & {line['var']} "
        f"& {line['cls']}"
    )
    for model in models:
        item = line[model]
        print_line += f" & {item}"
    print_line += "\\\\"
    print(f"{print_line}", file=file_tex)


def print_footer_tex(file_tex):
    # old_footer = (
    #     "\\hline\n"
    #     "\\csname @@input\\endcsname wintieloss\n"
    #     "\\hline\n"
    #     "\\end{tabular}}\n"
    #     "\\caption{Datasets used during the experimentation}\n"
    #     "\\label{table:datasets}\n"
    #     "\\end{table}"
    # )
    footer = "\\hline\n\\end{tabular}}\n\\end{sidewaystable}\n"
    print(f"{footer}", file=file_tex)


def report_header_content(title, experiment, model_type):
    length = sum(lengths) + len(lengths) - 1
    output = "\n" + "*" * length + "\n"
    titles_length = len(title) + len(experiment) + len(model_type) + 21
    num = (length - titles_length) // 2 - 3
    num2 = length - titles_length - 7 - 2 * num
    output += (
        "*"
        + " " * num
        + f"{title} - Experiment: {experiment} - Models: {model_type}"
        + " " * (num + num2)
        + "*\n"
    )
    output += "*" * length + "\n\n"
    lines = ""
    for item, field in enumerate(fields):
        output += f"{field:^{lengths[item]}} "
        lines += "=" * lengths[item] + " "
    output += f"\n{lines}"
    return output


def report_header(title, experiment, model_type):
    print(
        TextColor.HEADER
        + report_header_content(title, experiment, model_type)
        + TextColor.ENDC
    )


def report_line(line):
    output = f"{line['dataset']:{lengths[0] + 5}s} "
    for key, item in enumerate(description + complexity):
        output += f"{line[item]:{lengths[key + 1]}d} "
    data = models.copy()
    for key, model in enumerate(data):
        output += f"{line[model]:{lengths[key + 7]}s} "
    return output


def report_footer(agg):
    length = sum(lengths) + len(lengths) - 1
    print("-" * length)
    color = TextColor.LINE2
    print(color + "|{0:15s}|{1:6s}|".format("Classifier", "# Best"))
    print(color + "=" * 24)
    for item in models:
        print(color + f"|{item:15s}", end="|")
        print(color + f"{agg[item]['best']:6d}|")
        color = (
            TextColor.LINE2 if color == TextColor.LINE1 else TextColor.LINE1
        )
    print("-" * 24)


(experiment, model_type, csv_output, tex_output, compare) = parse_arguments()
dbh = MySQL()
database = dbh.get_connection()
dt = Datasets(False, False, "tanveer")
fields = (
    "Dataset",
    "Samp",
    "Var",
    "Cls",
    "Nod",
    "Lea",
    "Dep",
)
if tex_output:
    # We need the stree_std column for the tex output
    compare = True
if not compare:
    # remove stree_default from fields list and lengths
    models_tree.pop(1)
    lengths.pop(7)
models = models_tree if model_type == "tree" else models_ensemble
for item in models:
    fields += (f"{item}",)
report_header(title, experiment, model_type)
color = TextColor.LINE1
agg = {}
for item in [
    "better",
    "worse",
] + models:
    agg[item] = {}
    agg[item]["best"] = 0
if csv_output:
    file_csv = open(report_csv, "w")
    print("dataset, classifier, accuracy", file=file_csv)
if tex_output:
    file_tex = open(table_tex, "w")
    print_header_tex(file_tex, second=False)
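
# Main loop: one console row (and optionally one CSV/LaTeX row) per dataset.
# The records returned by dbh.find_best() are indexed positionally below;
# judging from their usage, column 5 holds the accuracy, column 11 its
# standard deviation and columns 12-14 the stree nodes/leaves/depth counts.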
for number, dataset in enumerate(dt):
    find_one = False
    # Look for max accuracy for any given dataset
    line = {"dataset": color + dataset[0]}
    X, y = dt.load(dataset[0])  # type: ignore
    line["samp"], line["var"] = X.shape
    line["cls"] = len(np.unique(y))
    record = dbh.find_best(dataset[0], models, experiment)
    max_accuracy = 0.0 if record is None else record[5]
    line["nodes"] = 0
    line["leaves"] = 0
    line["depth"] = 0
    line_tex = line.copy()
    for model in models:
        record = dbh.find_best(dataset[0], model, experiment)
        if record is None:
            line[model] = color + "-" * 12
        else:
            if model == "stree":
                line["nodes"] = record[12]
                line["leaves"] = record[13]
                line["depth"] = record[14]
                reference = record[13]
            accuracy = record[5]
            acc_std = record[11]
            find_one = True
            item = f"{accuracy:.4f}±{acc_std:.3f}"
            line_tex[model] = item
            if round(accuracy, 4) == round(max_accuracy, 4):
                line_tex[model] = "\\textbf{" + item + "}"
            if accuracy == max_accuracy:
                line[model] = (
                    TextColor.GREEN + TextColor.BOLD + item + TextColor.ENDC
                )
                agg[model]["best"] += 1
            else:
                line[model] = color + item
            if csv_output:
                print(f"{dataset[0]}, {model}, {accuracy}", file=file_csv)
    if tex_output:
        print_line_tex(number + 1, dataset[0], line_tex, file_tex)
        if number == 24:
            print_footer_tex(file_tex)
            print_header_tex(file_tex, second=True)
    if not find_one:
        print(TextColor.FAIL + f"*No results found for {dataset[0]}")
    else:
        color = (
            TextColor.LINE2 if color == TextColor.LINE1 else TextColor.LINE1
        )
        print(report_line(line))
report_footer(agg)
if csv_output:
    file_csv.close()
    print(f"{report_csv} file generated")
if tex_output:
    print_footer_tex(file_tex)
    file_tex.close()
    print(f"{table_tex} file generated")
dbh.close()