Begin refactor Results

commit 9041c412d5
parent b55553847b
2023-05-21 21:05:58 +02:00
17 changed files with 1265 additions and 1221 deletions

benchmark/Datasets.py

@@ -163,16 +163,18 @@ class Datasets:
         attr = SimpleNamespace()
         attr.dataset = name
         values, counts = np.unique(y, return_counts=True)
-        comp = ""
-        sep = ""
-        for count in counts:
-            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
-            sep = "/ "
-        attr.balance = comp
-        attr.classes = len(np.unique(y))
+        attr.classes = len(values)
         attr.samples = X.shape[0]
         attr.features = X.shape[1]
         attr.cont_features = len(self.get_continuous_features())
+        attr.distribution = {}
+        comp = ""
+        sep = ""
+        for value, count in zip(values, counts):
+            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
+            sep = "/ "
+            attr.distribution[value.item()] = count / sum(counts)
+        attr.balance = comp
         self.discretize = tmp
         return attr
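
Note: a standalone sketch of what the new balance/distribution computation produces, run on a made-up label vector (the array below is illustrative, not taken from the benchmark datasets):

    import numpy as np

    # Toy label vector: 6 samples of class 0 and 4 of class 1 (illustrative only).
    y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])

    values, counts = np.unique(y, return_counts=True)
    distribution = {}
    comp = ""
    sep = ""
    for value, count in zip(values, counts):
        comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
        sep = "/ "
        distribution[value.item()] = count / sum(counts)

    print(comp)          # 60.00% (6) / 40.00% (4)
    print(distribution)  # per-class proportions: class 0 -> 0.6, class 1 -> 0.4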

File diff suppressed because it is too large

benchmark/ResultsBase.py (new file, 163 lines)

@@ -0,0 +1,163 @@
import abc
import json
import os

from .Arguments import ALL_METRICS, EnvData
from .Datasets import Datasets
from .Experiments import BestResults
from .Utils import Folders, Symbols


def get_input(message="", is_test=False):
    return "test" if is_test else input(message)


class BestResultsEver:
    def __init__(self):
        self.data = {}
        for i in ["Tanveer", "Surcov", "Arff"]:
            self.data[i] = {}
            for metric in ALL_METRICS:
                self.data[i][metric.replace("-", "_")] = ["self", 1.0]
                self.data[i][metric] = ["self", 1.0]
        self.data["Tanveer"]["accuracy"] = [
            "STree_default (liblinear-ovr)",
            40.282203,
        ]
        self.data["Arff"]["accuracy"] = [
            "STree_default (linear-ovo)",
            22.109799,
        ]

    def get_name_value(self, key, score):
        return self.data[key][score]


class BaseReport(abc.ABC):
    def __init__(self, file_name, best_file=False):
        self.file_name = file_name
        if not os.path.isfile(file_name):
            if not os.path.isfile(os.path.join(Folders.results, file_name)):
                raise FileNotFoundError(f"{file_name} does not exists!")
            else:
                self.file_name = os.path.join(Folders.results, file_name)
        with open(self.file_name) as f:
            self.data = json.load(f)
        self.best_acc_file = best_file
        if best_file:
            self.lines = self.data
        else:
            self.lines = self.data["results"]
        self.score_name = self.data["score_name"]
        self.__load_env_data()
        self.__compute_best_results_ever()

    def __load_env_data(self):
        # Set the labels for nodes, leaves, depth
        env_data = EnvData.load()
        self.nodes_label = env_data["nodes"]
        self.leaves_label = env_data["leaves"]
        self.depth_label = env_data["depth"]
        self.key = env_data["source_data"]
        self.margin = float(env_data["margin"])

    def __compute_best_results_ever(self):
        best = BestResultsEver()
        self.best_score_name, self.best_score_value = best.get_name_value(
            self.key, self.score_name
        )

    def _get_accuracy(self, item):
        return self.data[item][0] if self.best_acc_file else item["score"]

    def report(self):
        self.header()
        accuracy_total = 0.0
        for result in self.lines:
            self.print_line(result)
            accuracy_total += self._get_accuracy(result)
        self.footer(accuracy_total)

    def _load_best_results(self, score, model):
        best = BestResults(score, model, Datasets())
        self.best_results = best.load({})

    def _compute_status(self, dataset, accuracy: float):
        status = " "
        if self.compare:
            # Compare with best results
            best = self.best_results[dataset][0]
            if accuracy == best:
                status = Symbols.equal_best
            elif accuracy > best:
                status = Symbols.better_best
        else:
            # compare with dataset label distribution only if its a binary one
            # down_arrow if accuracy is less than the ZeroR
            # black_star if accuracy is greater than the ZeroR + margin%
            if self.score_name == "accuracy":
                dt = Datasets()
                attr = dt.get_attributes(dataset)
                if attr.classes == 2:
                    max_category = max(attr.distribution.values())
                    max_value = max_category * (1 + self.margin)
                    if max_value > 1:
                        max_value = 0.9995
                    status = (
                        Symbols.cross
                        if accuracy <= max_value
                        else Symbols.upward_arrow
                        if accuracy > max_value
                        else " "
                    )
        if status != " ":
            if status not in self._compare_totals:
                self._compare_totals[status] = 1
            else:
                self._compare_totals[status] += 1
        return status

    def _status_meaning(self, status):
        meaning = {
            Symbols.equal_best: "Equal to best",
            Symbols.better_best: "Better than best",
            Symbols.cross: "Less than or equal to ZeroR",
            Symbols.upward_arrow: f"Better than ZeroR + "
            f"{self.margin*100:3.1f}%",
        }
        return meaning[status]

    def _get_best_accuracy(self):
        return self.best_score_value

    def _get_message_best_accuracy(self):
        return f"{self.score_name} compared to {self.best_score_name} .:"

    @abc.abstractmethod
    def header(self) -> None:
        pass

    @abc.abstractmethod
    def print_line(self, result) -> None:
        pass

    @abc.abstractmethod
    def footer(self, accuracy: float) -> None:
        pass


class StubReport(BaseReport):
    def __init__(self, file_name):
        super().__init__(file_name=file_name, best_file=False)

    def print_line(self, line) -> None:
        pass

    def header(self) -> None:
        self.title = self.data["title"]
        self.duration = self.data["duration"]

    def footer(self, accuracy: float) -> None:
        self.accuracy = accuracy
        self.score = accuracy / self._get_best_accuracy()
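
Note: a quick, self-contained illustration of the ZeroR-plus-margin threshold that _compute_status applies to binary datasets; the 65%/35% class split and the 0.70 accuracy are invented for the example, and margin=0.1 matches the value added to the .env files below.

    # Invented example values; only the threshold arithmetic mirrors _compute_status.
    margin = 0.1
    distribution = {0: 0.65, 1: 0.35}   # per-class proportions of a binary dataset

    max_category = max(distribution.values())   # ZeroR accuracy: 0.65
    max_value = max_category * (1 + margin)     # threshold: 0.715
    if max_value > 1:
        max_value = 0.9995                      # cap, as in _compute_status

    accuracy = 0.70
    symbol = "cross" if accuracy <= max_value else "upward_arrow"
    print(f"threshold={max_value:.3f} -> {symbol}")   # threshold=0.715 -> cross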

benchmark/ResultsFiles.py (new file, 1044 lines)

File diff suppressed because it is too large

benchmark/Utils.py

@@ -14,6 +14,7 @@ class Folders:
     report = os.path.join(exreport, "exreport_output")
     img = "img"
     excel = "excel"
+    sql = "sql"
 
     @staticmethod
     def src():
@@ -127,6 +128,9 @@ class Symbols:
     check_mark = "\N{heavy check mark}"
     exclamation = "\N{heavy exclamation mark symbol}"
     black_star = "\N{black star}"
+    cross = "\N{Ballot X}"
+    upward_arrow = "\N{Black-feathered north east arrow}"
+    down_arrow = "\N{downwards black arrow}"
     equal_best = check_mark
     better_best = black_star
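
Note: a throwaway snippet to check which glyphs the three new escape names resolve to; the \N{...} strings are copied from the hunk above, and the code points in the comments are my reading of the Unicode names, not part of the commit.

    cross = "\N{Ballot X}"                                 # U+2717
    upward_arrow = "\N{Black-feathered north east arrow}"  # U+27B6
    down_arrow = "\N{downwards black arrow}"               # U+2B07
    print(cross, upward_arrow, down_arrow)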

@@ -16,6 +16,8 @@ def main(args_test=None):
     folders.append(os.path.join(args.project_name, Folders.report))
     folders.append(os.path.join(args.project_name, Folders.img))
     folders.append(os.path.join(args.project_name, Folders.excel))
+    folders.append(os.path.join(args.project_name, Folders.sql))
     try:
         for folder in folders:
             print(f"Creating folder {folder}")

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Results import Report, Excel, SQL, ReportBest, ReportDatasets
+from benchmark.Results import Report, ReportBest
+from benchmark.ResultsFiles import Excel, SQLFile, ReportDatasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
 from pathlib import Path
@@ -67,7 +68,7 @@ def main(args_test=None):
         print(e)
         return
     if args.sql:
-        sql = SQL(args.file_name)
+        sql = SQLFile(args.file_name)
         sql.report()
     if args.excel:
         excel = Excel(

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -10,3 +10,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=1
+margin=0.1

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -11,3 +11,4 @@ nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -4,7 +4,7 @@ from unittest.mock import patch
 from openpyxl import load_workbook
 from .TestBase import TestBase
 from ..Utils import Folders, Files, NO_RESULTS
-from ..Results import Benchmark
+from ..ResultsFiles import Benchmark
 from .._version import __version__

@@ -2,7 +2,7 @@ import os
 from openpyxl import load_workbook
 from xlsxwriter import Workbook
 from .TestBase import TestBase
-from ..Results import Excel
+from ..ResultsFiles import Excel
 from ..Utils import Folders

@@ -2,7 +2,9 @@ import os
 from io import StringIO
 from unittest.mock import patch
 from .TestBase import TestBase
-from ..Results import Report, BaseReport, ReportBest, ReportDatasets, get_input
+from ..Results import Report, ReportBest
+from ..ResultsFiles import ReportDatasets
+from ..ResultsBase import BaseReport, get_input
 from ..Utils import Symbols

@@ -1,7 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Results import SQL
-from ..Utils import Folders
+from ..ResultsFiles import SQLFile
+from ..Utils import Folders, Files
 
 
 class SQLTest(TestBase):
@@ -9,14 +9,14 @@ class SQLTest(TestBase):
         files = [
             "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.sql",
         ]
-        self.remove_files(files, Folders.results)
+        self.remove_files(files, Folders.sql)
         return super().tearDown()
 
     def test_report_SQL(self):
         file_name = "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.json"
-        report = SQL(file_name)
+        report = SQLFile(file_name)
         report.report()
         file_name = os.path.join(
-            Folders.results, file_name.replace(".json", ".sql")
+            Folders.sql, file_name.replace(Files.report_ext, ".sql")
         )
         self.check_file_file(file_name, "sql")

@@ -186,6 +186,7 @@ class UtilTest(TestBase):
"leaves": "Leaves",
"depth": "Depth",
"fit_features": "0",
"margin": "0.1",
}
computed = EnvData().load()
self.assertDictEqual(computed, expected)

@@ -5,6 +5,7 @@ Creating folder test_project/exreport
 Creating folder test_project/exreport/exreport_output
 Creating folder test_project/img
 Creating folder test_project/excel
+Creating folder test_project/sql
 Done!
 Please, edit .env file with your settings and add a datasets folder
 with an all.txt file with the datasets you want to use.