Begin refactor Results

commit 9041c412d5
parent b55553847b
2023-05-21 21:05:58 +02:00
17 changed files with 1265 additions and 1221 deletions

benchmark/Datasets.py

@@ -163,16 +163,18 @@ class Datasets:
         attr = SimpleNamespace()
         attr.dataset = name
         values, counts = np.unique(y, return_counts=True)
-        comp = ""
-        sep = ""
-        for count in counts:
-            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
-            sep = "/ "
-        attr.balance = comp
-        attr.classes = len(np.unique(y))
+        attr.classes = len(values)
         attr.samples = X.shape[0]
         attr.features = X.shape[1]
         attr.cont_features = len(self.get_continuous_features())
+        attr.distribution = {}
+        comp = ""
+        sep = ""
+        for value, count in zip(values, counts):
+            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
+            sep = "/ "
+            attr.distribution[value.item()] = count / sum(counts)
+        attr.balance = comp
         self.discretize = tmp
         return attr
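
Note: a standalone sketch of what the new balance/distribution computation produces, run on a made-up label vector (the array below is illustrative, not taken from the benchmark datasets):

    import numpy as np

    # Toy label vector: 6 samples of class 0 and 4 of class 1 (illustrative only).
    y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])

    values, counts = np.unique(y, return_counts=True)
    distribution = {}
    comp = ""
    sep = ""
    for value, count in zip(values, counts):
        comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
        sep = "/ "
        distribution[value.item()] = count / sum(counts)

    print(comp)          # 60.00% (6) / 40.00% (4)
    print(distribution)  # per-class proportions: class 0 -> 0.6, class 1 -> 0.4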

File diff suppressed because it is too large

benchmark/ResultsBase.py (new file, 163 lines)

@@ -0,0 +1,163 @@
import abc
import json
import os

from .Arguments import ALL_METRICS, EnvData
from .Datasets import Datasets
from .Experiments import BestResults
from .Utils import Folders, Symbols


def get_input(message="", is_test=False):
    return "test" if is_test else input(message)


class BestResultsEver:
    def __init__(self):
        self.data = {}
        for i in ["Tanveer", "Surcov", "Arff"]:
            self.data[i] = {}
            for metric in ALL_METRICS:
                self.data[i][metric.replace("-", "_")] = ["self", 1.0]
                self.data[i][metric] = ["self", 1.0]
        self.data["Tanveer"]["accuracy"] = [
            "STree_default (liblinear-ovr)",
            40.282203,
        ]
        self.data["Arff"]["accuracy"] = [
            "STree_default (linear-ovo)",
            22.109799,
        ]

    def get_name_value(self, key, score):
        return self.data[key][score]


class BaseReport(abc.ABC):
    def __init__(self, file_name, best_file=False):
        self.file_name = file_name
        if not os.path.isfile(file_name):
            if not os.path.isfile(os.path.join(Folders.results, file_name)):
                raise FileNotFoundError(f"{file_name} does not exists!")
            else:
                self.file_name = os.path.join(Folders.results, file_name)
        with open(self.file_name) as f:
            self.data = json.load(f)
        self.best_acc_file = best_file
        if best_file:
            self.lines = self.data
        else:
            self.lines = self.data["results"]
        self.score_name = self.data["score_name"]
        self.__load_env_data()
        self.__compute_best_results_ever()

    def __load_env_data(self):
        # Set the labels for nodes, leaves, depth
        env_data = EnvData.load()
        self.nodes_label = env_data["nodes"]
        self.leaves_label = env_data["leaves"]
        self.depth_label = env_data["depth"]
        self.key = env_data["source_data"]
        self.margin = float(env_data["margin"])

    def __compute_best_results_ever(self):
        best = BestResultsEver()
        self.best_score_name, self.best_score_value = best.get_name_value(
            self.key, self.score_name
        )

    def _get_accuracy(self, item):
        return self.data[item][0] if self.best_acc_file else item["score"]

    def report(self):
        self.header()
        accuracy_total = 0.0
        for result in self.lines:
            self.print_line(result)
            accuracy_total += self._get_accuracy(result)
        self.footer(accuracy_total)

    def _load_best_results(self, score, model):
        best = BestResults(score, model, Datasets())
        self.best_results = best.load({})

    def _compute_status(self, dataset, accuracy: float):
        status = " "
        if self.compare:
            # Compare with best results
            best = self.best_results[dataset][0]
            if accuracy == best:
                status = Symbols.equal_best
            elif accuracy > best:
                status = Symbols.better_best
        else:
            # compare with dataset label distribution only if its a binary one
            # down_arrow if accuracy is less than the ZeroR
            # black_star if accuracy is greater than the ZeroR + margin%
            if self.score_name == "accuracy":
                dt = Datasets()
                attr = dt.get_attributes(dataset)
                if attr.classes == 2:
                    max_category = max(attr.distribution.values())
                    max_value = max_category * (1 + self.margin)
                    if max_value > 1:
                        max_value = 0.9995
                    status = (
                        Symbols.cross
                        if accuracy <= max_value
                        else Symbols.upward_arrow
                        if accuracy > max_value
                        else " "
                    )
        if status != " ":
            if status not in self._compare_totals:
                self._compare_totals[status] = 1
            else:
                self._compare_totals[status] += 1
        return status

    def _status_meaning(self, status):
        meaning = {
            Symbols.equal_best: "Equal to best",
            Symbols.better_best: "Better than best",
            Symbols.cross: "Less than or equal to ZeroR",
            Symbols.upward_arrow: f"Better than ZeroR + "
            f"{self.margin*100:3.1f}%",
        }
        return meaning[status]

    def _get_best_accuracy(self):
        return self.best_score_value

    def _get_message_best_accuracy(self):
        return f"{self.score_name} compared to {self.best_score_name} .:"

    @abc.abstractmethod
    def header(self) -> None:
        pass

    @abc.abstractmethod
    def print_line(self, result) -> None:
        pass

    @abc.abstractmethod
    def footer(self, accuracy: float) -> None:
        pass


class StubReport(BaseReport):
    def __init__(self, file_name):
        super().__init__(file_name=file_name, best_file=False)

    def print_line(self, line) -> None:
        pass

    def header(self) -> None:
        self.title = self.data["title"]
        self.duration = self.data["duration"]

    def footer(self, accuracy: float) -> None:
        self.accuracy = accuracy
        self.score = accuracy / self._get_best_accuracy()
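
Note: a quick, self-contained illustration of the ZeroR-plus-margin threshold that _compute_status applies to binary datasets; the 65%/35% class split and the 0.70 accuracy are invented for the example, and margin=0.1 matches the value added to the .env files below.

    # Invented example values; only the threshold arithmetic mirrors _compute_status.
    margin = 0.1
    distribution = {0: 0.65, 1: 0.35}   # per-class proportions of a binary dataset

    max_category = max(distribution.values())   # ZeroR accuracy: 0.65
    max_value = max_category * (1 + margin)     # threshold: 0.715
    if max_value > 1:
        max_value = 0.9995                      # cap, as in _compute_status

    accuracy = 0.70
    symbol = "cross" if accuracy <= max_value else "upward_arrow"
    print(f"threshold={max_value:.3f} -> {symbol}")   # threshold=0.715 -> cross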

benchmark/ResultsFiles.py (new file, 1044 lines)

File diff suppressed because it is too large

benchmark/Utils.py

@@ -14,6 +14,7 @@ class Folders:
     report = os.path.join(exreport, "exreport_output")
     img = "img"
     excel = "excel"
+    sql = "sql"
 
     @staticmethod
     def src():
@@ -127,6 +128,9 @@ class Symbols:
     check_mark = "\N{heavy check mark}"
     exclamation = "\N{heavy exclamation mark symbol}"
     black_star = "\N{black star}"
+    cross = "\N{Ballot X}"
+    upward_arrow = "\N{Black-feathered north east arrow}"
+    down_arrow = "\N{downwards black arrow}"
     equal_best = check_mark
     better_best = black_star
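
Note: a throwaway snippet to check which glyphs the three new escape names resolve to; the \N{...} strings are copied from the hunk above, and the code points in the comments are my reading of the Unicode names, not part of the commit.

    cross = "\N{Ballot X}"                                 # U+2717
    upward_arrow = "\N{Black-feathered north east arrow}"  # U+27B6
    down_arrow = "\N{downwards black arrow}"               # U+2B07
    print(cross, upward_arrow, down_arrow)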

@@ -16,6 +16,8 @@ def main(args_test=None):
     folders.append(os.path.join(args.project_name, Folders.report))
     folders.append(os.path.join(args.project_name, Folders.img))
     folders.append(os.path.join(args.project_name, Folders.excel))
+    folders.append(os.path.join(args.project_name, Folders.sql))
     try:
         for folder in folders:
             print(f"Creating folder {folder}")

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Results import Report, Excel, SQL, ReportBest, ReportDatasets
+from benchmark.Results import Report, ReportBest
+from benchmark.ResultsFiles import Excel, SQLFile, ReportDatasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
 from pathlib import Path
@@ -67,7 +68,7 @@ def main(args_test=None):
         print(e)
         return
     if args.sql:
-        sql = SQL(args.file_name)
+        sql = SQLFile(args.file_name)
         sql.report()
     if args.excel:
         excel = Excel(

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -10,3 +10,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=1
+margin=0.1

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -4,7 +4,7 @@ from unittest.mock import patch
 from openpyxl import load_workbook
 from .TestBase import TestBase
 from ..Utils import Folders, Files, NO_RESULTS
-from ..Results import Benchmark
+from ..ResultsFiles import Benchmark
 from .._version import __version__

@@ -2,7 +2,7 @@ import os
 from openpyxl import load_workbook
 from xlsxwriter import Workbook
 from .TestBase import TestBase
-from ..Results import Excel
+from ..ResultsFiles import Excel
 from ..Utils import Folders

@@ -2,7 +2,9 @@ import os
 from io import StringIO
 from unittest.mock import patch
 from .TestBase import TestBase
-from ..Results import Report, BaseReport, ReportBest, ReportDatasets, get_input
+from ..Results import Report, ReportBest
+from ..ResultsFiles import ReportDatasets
+from ..ResultsBase import BaseReport, get_input
 from ..Utils import Symbols

@@ -1,7 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Results import SQL
-from ..Utils import Folders
+from ..ResultsFiles import SQLFile
+from ..Utils import Folders, Files
 
 
 class SQLTest(TestBase):
@@ -9,14 +9,14 @@ class SQLTest(TestBase):
         files = [
             "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.sql",
         ]
-        self.remove_files(files, Folders.results)
+        self.remove_files(files, Folders.sql)
         return super().tearDown()
 
     def test_report_SQL(self):
         file_name = "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.json"
-        report = SQL(file_name)
+        report = SQLFile(file_name)
         report.report()
         file_name = os.path.join(
-            Folders.results, file_name.replace(".json", ".sql")
+            Folders.sql, file_name.replace(Files.report_ext, ".sql")
         )
         self.check_file_file(file_name, "sql")

@@ -186,6 +186,7 @@ class UtilTest(TestBase):
"leaves": "Leaves",
"depth": "Depth",
"fit_features": "0",
"margin": "0.1",
}
computed = EnvData().load()
self.assertDictEqual(computed, expected)

@@ -5,6 +5,7 @@ Creating folder test_project/exreport
 Creating folder test_project/exreport/exreport_output
 Creating folder test_project/img
 Creating folder test_project/excel
+Creating folder test_project/sql
 Done!
 Please, edit .env file with your settings and add a datasets folder
 with an all.txt file with the datasets you want to use.