From 5aa2ea8984819a89875d9bd218b083155ca314ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?= Date: Mon, 14 Dec 2020 00:12:27 +0100 Subject: [PATCH] Refactor MySQL class and develop param_analysis --- analysis_mysql.py | 24 +-- experimentation/Database.py | 35 +++- param_analysis.ipynb | 395 ------------------------------------ param_analysis.py | 156 ++++++++++++++ report_mysql.py | 22 +- 5 files changed, 187 insertions(+), 445 deletions(-) delete mode 100644 param_analysis.ipynb create mode 100644 param_analysis.py diff --git a/analysis_mysql.py b/analysis_mysql.py index 15ac666..b787118 100644 --- a/analysis_mysql.py +++ b/analysis_mysql.py @@ -7,26 +7,6 @@ title = "Best model results" lengths = (30, 9, 11, 11, 11, 11) -def find_best(dataset, classifier): - cursor = database.cursor(buffered=True) - if classifier == "any": - command = ( - f"select * from results r inner join reference e on " - f"r.dataset=e.dataset where r.dataset='{dataset}' " - ) - else: - command = ( - f"select * from results r inner join reference e on " - f"r.dataset=e.dataset where r.dataset='{dataset}' and classifier" - f"='{classifier}'" - ) - command += ( - " order by r.dataset, accuracy desc, classifier desc, type, date, time" - ) - cursor.execute(command) - return cursor.fetchone() - - def report_header_content(title): length = sum(lengths) + len(lengths) - 1 output = "\n" + "*" * length + "\n" @@ -99,10 +79,10 @@ for item in [ for dataset in dt: find_one = False line = {"dataset": color + dataset[0]} - record = find_best(dataset[0], "any") + record = dbh.find_best(dataset[0], "any") max_accuracy = 0.0 if record is None else record[5] for model in models: - record = find_best(dataset[0], model) + record = dbh.find_best(dataset[0], model) if record is None: line[model] = color + "-" * 9 + " " else: diff --git a/experimentation/Database.py b/experimentation/Database.py index 00052b3..a7a2ee9 100644 --- a/experimentation/Database.py +++ b/experimentation/Database.py @@ -12,7 +12,7 @@ from .Utils import TextColor class MySQL: def __init__(self): - self.server = None + self._server = None def get_connection(self): config_db = dict() @@ -32,14 +32,35 @@ class MySQL: config_tunnel["ssh_address_or_host"] = make_tuple( config_tunnel["ssh_address_or_host"] ) - self.server = SSHTunnelForwarder(**config_tunnel) - self.server.daemon_forward_servers = True - self.server.start() - config_db["port"] = self.server.local_bind_port - return mysql.connector.connect(**config_db) + self._server = SSHTunnelForwarder(**config_tunnel) + self._server.daemon_forward_servers = True + self._server.start() + config_db["port"] = self._server.local_bind_port + self._database = mysql.connector.connect(**config_db) + return self._database + + def find_best(self, dataset, classifier="any"): + cursor = self._database.cursor(buffered=True) + if classifier == "any": + command = ( + f"select * from results r inner join reference e on " + f"r.dataset=e.dataset where r.dataset='{dataset}' " + ) + else: + command = ( + f"select * from results r inner join reference e on " + f"r.dataset=e.dataset where r.dataset='{dataset}' and " + f"classifier='{classifier}'" + ) + command += ( + " order by r.dataset, accuracy desc, classifier desc, " + "type, date, time" + ) + cursor.execute(command) + return cursor.fetchone() def close(self): - self.server.close() + self._server.close() class BD(ABC): diff --git a/param_analysis.ipynb b/param_analysis.ipynb deleted file mode 100644 index 7f9e533..0000000 --- a/param_analysis.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "metadata": { - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.2-final" - }, - "orig_nbformat": 2, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import sqlite3\n", - "import mysql.connector\n", - "from experimentation.Database import MySQL\n", - "from experimentation.Sets import Datasets\n", - "dbh = MySQL()\n", - "database = dbh.get_connection()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "classifier = 'bagging'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "def find_best(dataset):\n", - " cursor = database.cursor(buffered=True)\n", - " if classifier == \"any\":\n", - " command = (\n", - " f\"select * from results r inner join reference e on \"\n", - " f\"r.dataset=e.dataset where r.dataset='{dataset}' \"\n", - " )\n", - " else:\n", - " command = (\n", - " f\"select * from results r inner join reference e on \"\n", - " f\"r.dataset=e.dataset where r.dataset='{dataset}' and classifier\"\n", - " f\"='{classifier}'\"\n", - " )\n", - " command += (\n", - " \" order by r.dataset, accuracy desc, classifier desc, type, date, time\"\n", - " )\n", - " cursor.execute(command)\n", - " return cursor.fetchone()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "def find_values(parameter, kernel_chosen):\n", - " result = []\n", - " for data in agg[kernel_chosen]:\n", - " base_parameter = f\"base_estimator__{parameter}\"\n", - " if parameter in data.keys():\n", - " result.append(data[parameter])\n", - " if base_parameter in data.keys():\n", - " result.append(data[base_parameter])\n", - " try:\n", - " result_ordered = sorted(result)\n", - " return result_ordered\n", - " except TypeError:\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Aggregating data ..................................................\n", - "stree has 0 results\n", - "adaBoost has 0 results\n", - "bagging has 43 results\n", - "odte has 0 results\n" - ] - } - ], - "source": [ - "dt = Datasets(False, False, 'tanveer')\n", - "models = ['stree', 'adaBoost', 'bagging', 'odte']\n", - "agg_models = {}\n", - "for i in models:\n", - " agg_models[i] = 0\n", - "agg = {'linear': [], 'rbf': [], 'poly': []}\n", - "print(\"Aggregating data .\", end='')\n", - "for dataset in dt:\n", - " result = find_best(dataset[0])\n", - " print('.', end='')\n", - " if result:\n", - " agg_models[result[3]] += 1\n", - " json_result = json.loads(result[8])\n", - " key = json_result['kernel'] if 'kernel' in json_result.keys() else 'linear'\n", - " agg[key].append(json_result)\n", - "print('')\n", - "for i in models:\n", - " print(f\"{i:10} has {agg_models[i]:2} results\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Used kernel linear: 43 times\nUsed kernel poly: 0 times\nUsed kernel rbf: 0 times\n" - ] - } - ], - "source": [ - "print(\"Used kernel linear: \", len(agg['linear']), ' times')\n", - "print(\"Used kernel poly: \", len(agg['poly']), ' times')\n", - "print(\"Used kernel rbf: \", len(agg['rbf']), ' times')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[]" - ] - }, - "metadata": {}, - "execution_count": 7 - } - ], - "source": [ - "find_values('gamma', 'poly')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[0.05,\n", - " 0.05,\n", - " 0.05,\n", - " 0.05,\n", - " 0.05,\n", - " 0.05,\n", - " 0.2,\n", - " 0.2,\n", - " 0.2,\n", - " 0.2,\n", - " 0.2,\n", - " 0.2,\n", - " 0.2,\n", - " 0.55,\n", - " 0.55,\n", - " 0.55,\n", - " 1.0,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 7,\n", - " 55,\n", - " 55,\n", - " 55,\n", - " 55,\n", - " 55,\n", - " 55,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0,\n", - " 10000.0]" - ] - }, - "metadata": {}, - "execution_count": 8 - } - ], - "source": [ - "find_values('C', 'linear')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[]" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ], - "source": [ - "find_values('C', 'poly')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[]" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ], - "source": [ - "find_values('C', 'rbf')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.2,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.2,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.2,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.2,\n", - " None,\n", - " 0.6,\n", - " None,\n", - " 0.6,\n", - " 'auto',\n", - " 0.6,\n", - " 'auto']" - ] - }, - "metadata": {}, - "execution_count": 11 - } - ], - "source": [ - "find_values('max_features', 'linear')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "dbh.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ] -} \ No newline at end of file diff --git a/param_analysis.py b/param_analysis.py new file mode 100644 index 0000000..8756a7f --- /dev/null +++ b/param_analysis.py @@ -0,0 +1,156 @@ +import json +import argparse +import collections +from typing import Tuple +from experimentation.Database import MySQL +from experimentation.Sets import Datasets +from experimentation.Utils import TextColor + +kernel_names = ["linear", "rbf", "poly"] + + +class Aggregation: + def __init__(self, dbh): + self._dbh = dbh + self._report = {} + self._model_names = ["stree", "adaBoost", "bagging", "odte"] + self._kernel_names = kernel_names + + def find_values(self, dataset, parameter): + result = [] + for data in self._report[dataset]: + base_parameter = f"base_estimator__{parameter}" + if parameter in data.keys(): + result.append(data[parameter]) + if base_parameter in data.keys(): + result.append(data[base_parameter]) + try: + result_ordered = sorted(result) + return result_ordered + except TypeError: + return result + + def load(self): + dt = Datasets(False, False, "tanveer") + print("Aggregating data of best results ...") + for dataset in dt: + if result := self._dbh.find_best(dataset[0]): + accuracy = result[5] + expected = result[10] + model = result[3] + json_result = json.loads(result[8]) + if "kernel" in json_result.keys(): + kernel = json_result["kernel"] + elif "base_estimator__kernel" in json_result.keys(): + kernel = json_result["base_estimator__kernel"] + else: + kernel = "linear" + best = accuracy > expected + self._report[dataset[0]] = { + "model": model, + "kernel": kernel, + "parameters": json_result, + "best": best, + } + + @staticmethod + def report_header(title, lengths, fields, parameter): + length = sum(lengths) + len(lengths) - 1 + output = "\n" + "*" * length + "\n" + title = title + f" -- {parameter} parameter --" + num = (length - len(title) - 2) // 2 + num2 = length - len(title) - 2 - 2 * num + output += "*" + " " * num + title + " " * (num + num2) + "*\n" + output += "*" * length + "\n\n" + lines = "" + for item, data in enumerate(fields): + output += f"{fields[item]:{lengths[item]}} " + lines += "=" * lengths[item] + " " + output += f"\n{lines}" + return output + + def report(self, parameter): + agg = {} + agg_result = collections.OrderedDict() + title = "Best Hyperparameters found for datasets" + lengths = (32, 10, 7, 20) + fields = ( + "Dataset", + "Classifier", + "Kernel", + "Parameter Value", + ) + print(Aggregation.report_header(title, lengths, fields, parameter)) + for i in self._kernel_names + self._model_names: + agg[i] = {} + agg[i]["total"] = 0 + agg[i]["better"] = 0 + agg[i]["worse"] = 0 + for dataset, data in self._report.items(): + kernel = data["kernel"] + model = data["model"] + if data["best"]: + key = "better" + sign = "+" + else: + key = "worse" + sign = "-" + base_parameter = f"base_estimator__{parameter}" + result = "" + if parameter in data["parameters"]: + result = data["parameters"][parameter] + try: + agg_result[result] += 1 + except KeyError: + agg_result[result] = 1 + elif base_parameter in data["parameters"]: + result = data["parameters"][base_parameter] + try: + agg_result[result] += 1 + except KeyError: + agg_result[result] = 1 + print(f"{sign} {dataset:30s} {model:10s} {kernel:7s} {result}") + agg[kernel]["total"] += 1 + agg[kernel][key] += 1 + agg[model]["total"] += 1 + agg[model][key] += 1 + print(TextColor.BOLD, "Models", TextColor.ENDC) + for i in self._model_names: + print( + f"{i:10} has {agg[i]['total']:2} results {agg[i]['better']:2} " + f"better {agg[i]['worse']:2} worse" + ) + print(TextColor.BOLD, "Kernels", TextColor.ENDC) + for i in self._kernel_names: + print( + f"{i:10} has {agg[i]['total']:2} results {agg[i]['better']:2} " + f"better {agg[i]['worse']:2} worse" + ) + print(TextColor.BOLD, f"{parameter} Values:", TextColor.ENDC) + try: + max_len = f"{len(max(agg_result.keys(), key=len))}s" + except TypeError: + max_len = "10.2f" + for key in sorted(agg_result): + print(f"{key:{max_len}} -> {agg_result[key]:2d} times") + + +def parse_arguments() -> Tuple[str, str, str, bool, bool]: + ap = argparse.ArgumentParser() + ap.add_argument( + "-p", + "--param", + type=str, + default="C", + ) + args = ap.parse_args() + return (args.param,) + + +(param,) = parse_arguments() +dbh = MySQL() +dbh.get_connection() +agg = Aggregation(dbh) +agg.load() +agg.report(param) +dbh.close() diff --git a/report_mysql.py b/report_mysql.py index 49e5c56..cba707f 100644 --- a/report_mysql.py +++ b/report_mysql.py @@ -32,26 +32,6 @@ def parse_arguments() -> Tuple[str, str, str, bool, bool]: ) -def find_best(dataset): - cursor = database.cursor(buffered=True) - if classifier == "any": - command = ( - f"select * from results r inner join reference e on " - f"r.dataset=e.dataset where r.dataset='{dataset}' " - ) - else: - command = ( - f"select * from results r inner join reference e on " - f"r.dataset=e.dataset where r.dataset='{dataset}' and classifier" - f"='{classifier}'" - ) - command += ( - " order by r.dataset, accuracy desc, classifier desc, type, date, time" - ) - cursor.execute(command) - return cursor.fetchone() - - def report_header_content(title): length = sum(lengths) + len(lengths) - 1 output = "\n" + "*" * length + "\n" @@ -144,7 +124,7 @@ for item in [ ] + models: agg[item] = 0 for dataset in dt: - record = find_best(dataset[0]) + record = dbh.find_best(dataset[0], classifier) if record is None: print(TextColor.FAIL + f"*No results found for {dataset[0]}") else: