Mirror of https://github.com/Doctorado-ML/benchmark.git

Begin refactor Results
@@ -163,16 +163,18 @@ class Datasets:
         attr = SimpleNamespace()
         attr.dataset = name
         values, counts = np.unique(y, return_counts=True)
-        comp = ""
-        sep = ""
-        for count in counts:
-            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
-            sep = "/ "
-        attr.balance = comp
-        attr.classes = len(np.unique(y))
+        attr.classes = len(values)
         attr.samples = X.shape[0]
         attr.features = X.shape[1]
         attr.cont_features = len(self.get_continuous_features())
+        attr.distribution = {}
+        comp = ""
+        sep = ""
+        for value, count in zip(values, counts):
+            comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
+            sep = "/ "
+            attr.distribution[value.item()] = count / sum(counts)
+        attr.balance = comp
         self.discretize = tmp
         return attr

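Note (not part of the commit): a minimal sketch of the per-class bookkeeping the refactored get_attributes now performs, using a toy label array in place of a real dataset. attr.balance keeps the human-readable summary, while the new attr.distribution maps each class label to its fraction and later feeds the ZeroR comparison in ResultsBase.

import numpy as np

# Toy labels; in the real code y comes from the loaded dataset.
y = np.array([0, 0, 0, 1, 1, 0, 1, 0])
values, counts = np.unique(y, return_counts=True)

distribution = {}
comp, sep = "", ""
for value, count in zip(values, counts):
    comp += f"{sep}{count/sum(counts)*100:5.2f}% ({count}) "
    sep = "/ "
    distribution[value.item()] = count / sum(counts)

print(comp)  # 62.50% (5) / 37.50% (3)
# distribution maps each class label to its fraction: 0 -> 0.625, 1 -> 0.375
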
benchmark/Results.py | 1220 lines (diff suppressed because it is too large)

benchmark/ResultsBase.py | 163 lines (new file)

@@ -0,0 +1,163 @@
+import abc
+import json
+import os
+
+from .Arguments import ALL_METRICS, EnvData
+from .Datasets import Datasets
+from .Experiments import BestResults
+from .Utils import Folders, Symbols
+
+
+def get_input(message="", is_test=False):
+    return "test" if is_test else input(message)
+
+
+class BestResultsEver:
+    def __init__(self):
+        self.data = {}
+        for i in ["Tanveer", "Surcov", "Arff"]:
+            self.data[i] = {}
+            for metric in ALL_METRICS:
+                self.data[i][metric.replace("-", "_")] = ["self", 1.0]
+                self.data[i][metric] = ["self", 1.0]
+        self.data["Tanveer"]["accuracy"] = [
+            "STree_default (liblinear-ovr)",
+            40.282203,
+        ]
+        self.data["Arff"]["accuracy"] = [
+            "STree_default (linear-ovo)",
+            22.109799,
+        ]
+
+    def get_name_value(self, key, score):
+        return self.data[key][score]
+
+
+class BaseReport(abc.ABC):
+    def __init__(self, file_name, best_file=False):
+        self.file_name = file_name
+        if not os.path.isfile(file_name):
+            if not os.path.isfile(os.path.join(Folders.results, file_name)):
+                raise FileNotFoundError(f"{file_name} does not exists!")
+            else:
+                self.file_name = os.path.join(Folders.results, file_name)
+        with open(self.file_name) as f:
+            self.data = json.load(f)
+        self.best_acc_file = best_file
+        if best_file:
+            self.lines = self.data
+        else:
+            self.lines = self.data["results"]
+        self.score_name = self.data["score_name"]
+        self.__load_env_data()
+        self.__compute_best_results_ever()
+
+    def __load_env_data(self):
+        # Set the labels for nodes, leaves, depth
+        env_data = EnvData.load()
+        self.nodes_label = env_data["nodes"]
+        self.leaves_label = env_data["leaves"]
+        self.depth_label = env_data["depth"]
+        self.key = env_data["source_data"]
+        self.margin = float(env_data["margin"])
+
+    def __compute_best_results_ever(self):
+        best = BestResultsEver()
+        self.best_score_name, self.best_score_value = best.get_name_value(
+            self.key, self.score_name
+        )
+
+    def _get_accuracy(self, item):
+        return self.data[item][0] if self.best_acc_file else item["score"]
+
+    def report(self):
+        self.header()
+        accuracy_total = 0.0
+        for result in self.lines:
+            self.print_line(result)
+            accuracy_total += self._get_accuracy(result)
+        self.footer(accuracy_total)
+
+    def _load_best_results(self, score, model):
+        best = BestResults(score, model, Datasets())
+        self.best_results = best.load({})
+
+    def _compute_status(self, dataset, accuracy: float):
+        status = " "
+        if self.compare:
+            # Compare with best results
+            best = self.best_results[dataset][0]
+            if accuracy == best:
+                status = Symbols.equal_best
+            elif accuracy > best:
+                status = Symbols.better_best
+        else:
+            # compare with dataset label distribution only if its a binary one
+            # down_arrow if accuracy is less than the ZeroR
+            # black_star if accuracy is greater than the ZeroR + margin%
+            if self.score_name == "accuracy":
+                dt = Datasets()
+                attr = dt.get_attributes(dataset)
+                if attr.classes == 2:
+                    max_category = max(attr.distribution.values())
+                    max_value = max_category * (1 + self.margin)
+                    if max_value > 1:
+                        max_value = 0.9995
+                    status = (
+                        Symbols.cross
+                        if accuracy <= max_value
+                        else Symbols.upward_arrow
+                        if accuracy > max_value
+                        else " "
+                    )
+        if status != " ":
+            if status not in self._compare_totals:
+                self._compare_totals[status] = 1
+            else:
+                self._compare_totals[status] += 1
+        return status
+
+    def _status_meaning(self, status):
+        meaning = {
+            Symbols.equal_best: "Equal to best",
+            Symbols.better_best: "Better than best",
+            Symbols.cross: "Less than or equal to ZeroR",
+            Symbols.upward_arrow: f"Better than ZeroR + "
+            f"{self.margin*100:3.1f}%",
+        }
+        return meaning[status]
+
+    def _get_best_accuracy(self):
+        return self.best_score_value
+
+    def _get_message_best_accuracy(self):
+        return f"{self.score_name} compared to {self.best_score_name} .:"
+
+    @abc.abstractmethod
+    def header(self) -> None:
+        pass
+
+    @abc.abstractmethod
+    def print_line(self, result) -> None:
+        pass
+
+    @abc.abstractmethod
+    def footer(self, accuracy: float) -> None:
+        pass
+
+
+class StubReport(BaseReport):
+    def __init__(self, file_name):
+        super().__init__(file_name=file_name, best_file=False)
+
+    def print_line(self, line) -> None:
+        pass
+
+    def header(self) -> None:
+        self.title = self.data["title"]
+        self.duration = self.data["duration"]
+
+    def footer(self, accuracy: float) -> None:
+        self.accuracy = accuracy
+        self.score = accuracy / self._get_best_accuracy()

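Note (not part of the commit): a standalone sketch of the ZeroR-plus-margin check that BaseReport._compute_status applies to binary datasets, using placeholder glyphs instead of benchmark.Utils.Symbols and the margin=0.1 value set in the .env examples below.

# Placeholder glyphs; the real report uses Symbols.cross / Symbols.upward_arrow.
CROSS, UP = "\N{BALLOT X}", "\N{NORTH EAST ARROW}"


def zero_r_status(distribution, accuracy, margin=0.1):
    # ZeroR always predicts the majority class, so its accuracy is the largest
    # class fraction; the score must beat that fraction plus margin% to earn
    # the upward arrow, otherwise it gets the cross.
    max_value = max(distribution.values()) * (1 + margin)
    if max_value > 1:
        max_value = 0.9995
    return CROSS if accuracy <= max_value else UP


# Majority class at 62.5% -> threshold is 0.625 * 1.1 = 0.6875
print(zero_r_status({0: 0.625, 1: 0.375}, accuracy=0.66))  # below threshold: cross
print(zero_r_status({0: 0.625, 1: 0.375}, accuracy=0.70))  # above threshold: arrow
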
benchmark/ResultsFiles.py | 1044 lines (new file, diff suppressed because it is too large)

@@ -14,6 +14,7 @@ class Folders:
     report = os.path.join(exreport, "exreport_output")
     img = "img"
     excel = "excel"
+    sql = "sql"

     @staticmethod
     def src():

@@ -127,6 +128,9 @@ class Symbols:
     check_mark = "\N{heavy check mark}"
     exclamation = "\N{heavy exclamation mark symbol}"
     black_star = "\N{black star}"
+    cross = "\N{Ballot X}"
+    upward_arrow = "\N{Black-feathered north east arrow}"
+    down_arrow = "\N{downwards black arrow}"
     equal_best = check_mark
     better_best = black_star

@@ -16,6 +16,8 @@ def main(args_test=None):
     folders.append(os.path.join(args.project_name, Folders.report))
     folders.append(os.path.join(args.project_name, Folders.img))
     folders.append(os.path.join(args.project_name, Folders.excel))
+    folders.append(os.path.join(args.project_name, Folders.sql))
+
     try:
         for folder in folders:
             print(f"Creating folder {folder}")

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import os
-from benchmark.Results import Report, Excel, SQL, ReportBest, ReportDatasets
+from benchmark.Results import Report, ReportBest
+from benchmark.ResultsFiles import Excel, SQLFile, ReportDatasets
 from benchmark.Utils import Files, Folders
 from benchmark.Arguments import Arguments
 from pathlib import Path

@@ -67,7 +68,7 @@ def main(args_test=None):
         print(e)
         return
     if args.sql:
-        sql = SQL(args.file_name)
+        sql = SQLFile(args.file_name)
         sql.report()
     if args.excel:
         excel = Excel(

@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -9,4 +9,5 @@ discretize=1
 nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=1
+margin=0.1

@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -10,4 +10,5 @@ discretize=0
 nodes=Nodes
 leaves=Leaves
 depth=Depth
 fit_features=0
+margin=0.1

@@ -4,7 +4,7 @@ from unittest.mock import patch
 from openpyxl import load_workbook
 from .TestBase import TestBase
 from ..Utils import Folders, Files, NO_RESULTS
-from ..Results import Benchmark
+from ..ResultsFiles import Benchmark
 from .._version import __version__


@@ -2,7 +2,7 @@ import os
 from openpyxl import load_workbook
 from xlsxwriter import Workbook
 from .TestBase import TestBase
-from ..Results import Excel
+from ..ResultsFiles import Excel
 from ..Utils import Folders


@@ -2,7 +2,9 @@ import os
 from io import StringIO
 from unittest.mock import patch
 from .TestBase import TestBase
-from ..Results import Report, BaseReport, ReportBest, ReportDatasets, get_input
+from ..Results import Report, ReportBest
+from ..ResultsFiles import ReportDatasets
+from ..ResultsBase import BaseReport, get_input
 from ..Utils import Symbols


@@ -1,7 +1,7 @@
 import os
 from .TestBase import TestBase
-from ..Results import SQL
-from ..Utils import Folders
+from ..ResultsFiles import SQLFile
+from ..Utils import Folders, Files


 class SQLTest(TestBase):

@@ -9,14 +9,14 @@ class SQLTest(TestBase):
         files = [
             "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.sql",
         ]
-        self.remove_files(files, Folders.results)
+        self.remove_files(files, Folders.sql)
         return super().tearDown()

     def test_report_SQL(self):
         file_name = "results_accuracy_ODTE_Galgo_2022-04-20_10:52:20_0.json"
-        report = SQL(file_name)
+        report = SQLFile(file_name)
         report.report()
         file_name = os.path.join(
-            Folders.results, file_name.replace(".json", ".sql")
+            Folders.sql, file_name.replace(Files.report_ext, ".sql")
         )
         self.check_file_file(file_name, "sql")

@@ -186,6 +186,7 @@ class UtilTest(TestBase):
             "leaves": "Leaves",
             "depth": "Depth",
             "fit_features": "0",
+            "margin": "0.1",
         }
         computed = EnvData().load()
         self.assertDictEqual(computed, expected)

@@ -5,6 +5,7 @@ Creating folder test_project/exreport
 Creating folder test_project/exreport/exreport_output
 Creating folder test_project/img
 Creating folder test_project/excel
+Creating folder test_project/sql
 Done!
 Please, edit .env file with your settings and add a datasets folder
 with an all.txt file with the datasets you want to use.