Add report models in benchmark

Add SVC, WODT & ExtraTree models
2025-08-17 16:35:54 +00:00 · 2021-09-25 12:54:46 +02:00
parent d630dfaeab
commit 60374c6613
6 changed files with 406 additions and 36 deletions
--- a/src/Experiments.py
+++ b/src/Experiments.py
@@ -8,31 +8,14 @@ from tqdm import tqdm
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import StratifiedKFold, cross_validate
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
 from stree import Stree
 from Utils import Folders, Files
 from Models import Models
 class Randomized:
    """Namespace holding the fixed list of integer random seeds.

    How the seeds are consumed is not visible in this part of the file.
    """
    seeds = [
        57,
        31,
        1714,
        17,
        23,
        79,
        83,
        97,
        7,
        1,
    ]
 class Models:
    """Maps the model names used by the experiments to classifier classes."""
    @staticmethod
    def get_model(name):
        """Return the classifier class registered under ``name``.

        Recognized names are "STree", "Cart" and "ExtraTree".

        Raises:
            ValueError: if ``name`` is not recognized; the message carries a
                hint when the name looks like a misspelling of "STree".
        """
        if name == "STree":
            return Stree
        if name == "Cart":
            return DecisionTreeClassifier
        if name == "ExtraTree":
            return ExtraTreeClassifier
        # Unknown name: build the message once, adding the spelling hint.
        hint = ", did you mean STree?" if name in ("Stree", "stree") else ""
        raise ValueError(f"No model recognized {name}{hint}")
 class Diterator:
    def __init__(self, data):
        self._stack = data.copy()
@@ -178,20 +161,6 @@ class Experiment:
        self.leaves = []
        self.depths = []
    def _get_complexity(self, result):
        if self.model_name == "Cart":
            nodes = result.tree_.node_count
            depth = result.tree_.max_depth
            leaves = result.get_n_leaves()
        if self.model_name == "ExtraTree":
            nodes = 0
            leaves = result.get_n_leaves()
            depth = 0
        else:
            nodes, leaves = result.nodes_leaves()
            depth = result.depth_ if hasattr(result, "depth_") else 0
        return nodes, leaves, depth
    def _n_fold_crossval(self, X, y, hyperparameters):
        if self.scores != []:
            raise ValueError("Must init experiment before!")
@@ -217,8 +186,8 @@ class Experiment:
            self.scores.append(res["test_score"])
            self.times.append(res["fit_time"])
            for result_item in res["estimator"]:
-                nodes_item, leaves_item, depth_item = self._get_complexity(
+                nodes_item, leaves_item, depth_item = Models.get_complexity(
-                    result_item
+                    self.model_name, result_item
                )
                self.nodes.append(nodes_item)
                self.leaves.append(leaves_item)
--- a/src/Models.py
+++ b/src/Models.py
@@ -0,0 +1,41 @@
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
 from sklearn.svm import SVC
 from stree import Stree
 from wodt import TreeClassifier
 class Models:
    """Name-based access to the benchmark classifiers and their size metrics."""
    @staticmethod
    def get_model(name):
        """Return the classifier class registered under ``name``.

        Recognized names are "STree", "Cart", "ExtraTree", "Wodt" and "SVC".

        Raises:
            ValueError: if ``name`` is not recognized; the message carries a
                hint when the name looks like a misspelling of "STree".
        """
        if name == "STree":
            return Stree
        if name == "Cart":
            return DecisionTreeClassifier
        if name == "ExtraTree":
            return ExtraTreeClassifier
        if name == "Wodt":
            return TreeClassifier
        if name == "SVC":
            return SVC
        # Unknown name: build the message once, adding the spelling hint.
        hint = ", did you mean STree?" if name in ("Stree", "stree") else ""
        raise ValueError(f"No model recognized {name}{hint}")
    @staticmethod
    def get_complexity(name, result):
        """Return ``(nodes, leaves, depth)`` for the fitted estimator ``result``.

        * "Cart": node count and depth come from ``result.tree_``, leaves
          from ``result.get_n_leaves()``.
        * "ExtraTree": only the leaf count is reported; nodes and depth
          are 0.
        * "SVC": all three values are 0.
        * anything else: ``result.nodes_leaves()`` supplies nodes and
          leaves, and ``result.depth_`` the depth (0 when absent).
        """
        if name == "Cart":
            n_nodes = result.tree_.node_count
            n_depth = result.tree_.max_depth
            return n_nodes, result.get_n_leaves(), n_depth
        if name == "ExtraTree":
            return 0, result.get_n_leaves(), 0
        if name == "SVC":
            return 0, 0, 0
        # nodes_leaves() runs first: an estimator may only set depth_ there.
        n_nodes, n_leaves = result.nodes_leaves()
        return n_nodes, n_leaves, getattr(result, "depth_", 0)
--- a/src/Results.py
+++ b/src/Results.py
@@ -400,6 +400,10 @@ class SQL(BaseReport):
 class Benchmark:
    @staticmethod
    def get_result_file_name():
        """Return the path of the compiled results file inside the results folder."""
        folder, file_name = Folders.results, Files.exreport
        return os.path.join(folder, file_name)
    @staticmethod
    def _process_dataset(results, data):
        model = data["model"]
@@ -414,7 +418,7 @@ class Benchmark:
    @staticmethod
    def compile_results():
        # build Files.exreport
-        result_file_name = os.path.join(Folders.results, Files.exreport)
+        result_file_name = Benchmark.get_result_file_name()
        results = {}
        init_suffix, end_suffix = Files.results_suffixes("")
        all_files = list(os.walk(Folders.results))
@@ -432,7 +436,7 @@ class Benchmark:
                f.write(f"{model}, {dataset}, {accuracy}\n")
    @staticmethod
-    def report():
+    def exreport():
        def end_message(message, file):
            length = 100
            print("*" * length)
@@ -471,3 +475,35 @@ class Benchmark:
        if is_exe(Files.cmd_open):
            subprocess.run([Files.cmd_open, Files.exreport_pdf])
    @staticmethod
    def report():
        """Print a dataset-by-model accuracy table from the compiled results file.

        The file named by ``Benchmark.get_result_file_name()`` is read as
        ``model, dataset, accuracy`` lines. Rows of the table are the
        datasets of the first model found in the file; a model without a
        result for one of those datasets shows ``N/A`` in that cell. A file
        with no data lines prints only the table header.
        """
        def build():
            # Build results data structure: {model: {dataset: accuracy text}}
            file_name = Benchmark.get_result_file_name()
            results = {}
            with open(file_name) as f:
                # The first line is dropped (presumably a header row;
                # confirm against compile_results).
                data = f.read().splitlines()[1:]
            for line in data:
                if not line.strip():
                    # A blank line cannot be unpacked into three fields.
                    continue
                model, dataset, accuracy = line.split(", ")
                results.setdefault(model, {})[dataset] = accuracy
            return results
        def show(results):
            # An empty default keeps an empty results file from raising.
            datasets = next(iter(results.values()), {})
            print(f"{'Dataset':30s} ", end="")
            lines = "=" * 30 + " "
            for model in results:
                print(f"{model:9s} ", end="")
                lines += "=" * 9 + " "
            print(f"\n{lines}")
            for dataset in datasets:
                print(f"{dataset:30s} ", end="")
                for model in results:
                    accuracy = results[model].get(dataset)
                    if accuracy is None:
                        # Keep the 9-character column width of the numbers.
                        print(f"{'N/A':>9s} ", end="")
                    else:
                        print(f"{float(accuracy):.7f} ", end="")
                print("")
        show(build())
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -3,3 +3,4 @@ from Results import Benchmark
 # The Benchmark methods called below are declared static; the instance
 # only serves as a handle to call them on.
 benchmark = Benchmark()
 # Build the compiled results file (Files.exreport) from the contents of
 # the results folder.
 benchmark.compile_results()
 # Print the dataset-by-model accuracy table read back from that file.
 benchmark.report()
 # Run the exreport step; only its tail is visible here, which opens
 # Files.exreport_pdf when Files.cmd_open is an executable.
 benchmark.exreport()
--- a/src/wodt/WODT.py
+++ b/src/wodt/WODT.py
@@ -0,0 +1,318 @@
 ########################
 """import"""
 import numpy as np
 import random
 from scipy.optimize import minimize
 from sklearn.base import BaseEstimator, ClassifierMixin
 """global var"""
 # Tiny positive offset added to the arguments of np.log2 in
 # Node.find_best_split so the logarithm never receives an exact zero.
 epsilonepsilon = 1e-220
 # Not referenced anywhere in the visible part of this file.
 epsilon = 1e-50
 """class"""
 class SplitQuestion(object):
    """Oblique split test: ``dot(x[attrIDs], paras) <= threshold``.

    Attributes:
        attrIDs: indices of the features taking part in the test.
        paras: one coefficient per selected feature.
        threshold: value the weighted sum is compared against.
    """
    def __init__(self, attrIDs=None, paras=None, threshold=0):
        """Store the split definition.

        ``attrIDs`` and ``paras`` default to ``[0]``. A fresh list is built
        for every instance so that defaults are never shared between
        SplitQuestion objects.
        """
        super(SplitQuestion, self).__init__()
        self.attrIDs = [0] if attrIDs is None else attrIDs
        self.paras = [0] if paras is None else paras
        self.threshold = threshold
    # we only consider continuous attributes for simplicity
    def test_forOneInstance(self, x):
        """Return True when the single sample ``x`` (1-D array) passes the test."""
        return np.dot(x[self.attrIDs], self.paras) <= self.threshold
    def test(self, X):
        """Return a boolean array with the test result for every row of the 2-D array ``X``."""
        return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold
 class Node(object):
    """One node of the oblique decision tree.

    Every node keeps references to the full training arrays ``X`` and ``Y``
    plus ``sample_ids``, the row indices of the samples that reached it.
    ``sample_ids`` is used for fancy indexing and boolean masking, so it
    must be a NumPy integer array.

    Attributes:
        depth: depth of the node; children get ``depth + 1``.
        split: SplitQuestion applied at this node.
        class_num: number of classes.
        is_leaf: False on creation; set to True by grow_stump for an empty
            child, or by the tree builder.
        class_distribution: per-class ratios, only present on leaves.
        LChild, RChild: children, only present after grow_stump.
    """
    def __init__(self, depth, split, sample_ids, X, Y, class_num):
        super(Node, self).__init__()
        self.sample_ids = sample_ids
        self.split = split
        self.depth = depth
        self.X = X
        self.Y = Y
        self.class_num = class_num
        # Nodes start as internal nodes; leaves are flagged later.
        self.is_leaf = False
    def find_best_split(self, max_features="sqrt"):
        """Fit this node's split on a random subset of features.

        The split parameters minimize a soft (sigmoid-weighted) version of
        the summed child entropies, using L-BFGS-B with an analytic gradient
        and at most 10 iterations. The result is stored in ``self.split``.

        Args:
            max_features: "sqrt", "log" (base 2) or "all"; an int (number
                of features) or a float (fraction of the features). Any
                other value uses all features. The resulting count is
                clamped to ``[1, number of features]``.

        Returns:
            Always 1.
        """
        feature_num = self.X.shape[1]
        subset_feature_num = feature_num
        if max_features == "sqrt":
            subset_feature_num = int(np.sqrt(feature_num))
        if max_features == "all":
            subset_feature_num = feature_num
        if max_features == "log":
            subset_feature_num = int(np.log2(feature_num))
        if isinstance(max_features, int):
            subset_feature_num = max_features
        if isinstance(max_features, float):
            subset_feature_num = int(feature_num * max_features)
        # "log" on a single feature or a small fraction yields 0 features,
        # and an int above feature_num makes random.sample raise; keep the
        # subset size inside the valid range.
        subset_feature_num = min(max(subset_feature_num, 1), feature_num)
        # ### get random subset of features
        # ### feature 0 is threshold
        feature_ids = range(feature_num)
        subset_feature_ids = random.sample(feature_ids, subset_feature_num)
        self.split.attrIDs = subset_feature_ids
        subset_feature_ids = np.array(subset_feature_ids)
        X = self.X
        # Rows of this node restricted to the selected feature columns.
        subFeatures_X = X[
            self.sample_ids[:, None], subset_feature_ids[None, :]
        ]
        Y = self.Y[self.sample_ids]
        class_num = self.class_num
        # ##############################
        # define func and func_gradient for optimization
        def func(a):
            # a[0] is the threshold, a[1:] the feature coefficients.
            paras = a[1:]
            threshold = a[0]
            # Soft membership of every sample in the right child.
            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_sum = w_R.sum()
            w_L_sum = w_L.sum()
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            # Equals (left weight * left entropy) + (right weight * right
            # entropy), with the entropies taken over the soft class weights.
            fun = (
                w_L_sum * np.log2(w_L_sum + epsilonepsilon)
                + w_R_sum * np.log2(w_R_sum + epsilonepsilon)
                - np.sum(
                    w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
                )
                - np.sum(
                    w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
                )
            )
            return fun
        def func_gradient(a):
            paras = a[1:]
            threshold = a[0]
            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            # Derivative of the objective with respect to each sample's p.
            # Indexing by Y assumes integer labels in [0, class_num).
            la = np.log2(
                w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
            ) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
            # Chain rule through the sigmoid: dp/dz = p * (1 - p).
            beta = la * p * (1 - p)
            jac = np.zeros(a.shape)
            jac[0] = -np.sum(beta)
            jac[1:] = np.dot(subFeatures_X.T, beta)
            return jac
        ################################################
        # Random start point in [-0.5, 0.5) drawn from NumPy's global RNG.
        initial_a = np.random.rand(subset_feature_num + 1) - 0.5
        result = minimize(
            func,
            initial_a,
            method="L-BFGS-B",
            jac=func_gradient,
            options={"maxiter": 10, "disp": False},
        )
        ##########################################
        self.split.paras = result.x[1:]
        self.split.threshold = result.x[0]
        return 1
    def grow_stump(self):
        """Create ``LChild`` and ``RChild`` by applying ``self.split``.

        Samples passing the split test go left, the rest go right. A child
        that receives no samples is marked as a leaf and takes the class
        distribution of this node's samples.
        """
        L_bool = self.split.test(self.X[self.sample_ids])
        L_sample_ids = self.sample_ids[L_bool]
        R_sample_ids = self.sample_ids[~L_bool]
        LChild = Node(
            self.depth + 1,
            SplitQuestion(),
            L_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )
        RChild = Node(
            self.depth + 1,
            SplitQuestion(),
            R_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )
        if len(L_sample_ids) == 0:
            LChild.is_leaf = True
            LChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )
        if len(R_sample_ids) == 0:
            RChild.is_leaf = True
            RChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )
        self.LChild = LChild
        self.RChild = RChild
 class TreeClassifier(BaseEstimator, ClassifierMixin):
    """Oblique decision tree classifier with a scikit-learn style interface.

    Parameters:
        max_depth: a node at this depth is not split (the root has depth 1).
        min_samples_split: a node with fewer samples is not split.
        max_features: forwarded to ``Node.find_best_split``.
        random_state: when not None, seeds the random generators used
            while fitting.

    ``fit`` derives the number of classes as ``Y.max() + 1``, so labels are
    assumed to be non-negative integers starting at 0 (confirm against the
    callers).
    """
    def __init__(
        self,
        max_depth=50,
        min_samples_split=2,
        max_features="all",
        random_state=None,
    ):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state
    def fit(self, X, Y):
        """Build the tree from samples ``X`` (2-D array) and labels ``Y``.

        Returns:
            self, so calls can be chained as with scikit-learn estimators.
        """
        self.X = X
        self.Y = Y
        self.classNum = self.Y.max() + 1
        self.sampleNum = self.X.shape[0]
        if self.random_state is not None:
            random.seed(self.random_state)
            # Node.find_best_split draws its start point from np.random, so
            # NumPy's global generator must be seeded too for reproducible
            # fits. The seed is derived from the already seeded ``random``
            # module so it is always a valid 32-bit value.
            np.random.seed(random.getrandbits(32))
        ###########
        self.root_node = Node(
            1,
            SplitQuestion(),
            np.arange(self.sampleNum, dtype=np.uint32),
            self.X,
            self.Y,
            self.classNum,
        )
        self.leaf_num = 1
        self.tree_depth = self.bulid_subtree(self.root_node)
        return self
    def nodes_leaves(self):
        """Return ``(nodes, leaves)`` of the fitted tree.

        ``nodes`` counts every node, leaves included. As a side effect the
        depth of the deepest node is stored in ``self.depth_``.
        """
        def count(node):
            if node.is_leaf:
                return 1, 1
            nodes_left, leaves_left = count(node.LChild)
            nodes_right, leaves_right = count(node.RChild)
            return nodes_left + nodes_right + 1, leaves_left + leaves_right
        def compute_depth(node):
            if node.is_leaf:
                return node.depth
            return max(
                node.depth,
                compute_depth(node.LChild),
                compute_depth(node.RChild),
            )
        self.depth_ = compute_depth(self.root_node)
        return count(self.root_node)
    def bulid_subtree(self, node):
        """Recursively grow the subtree rooted at ``node``.

        Returns:
            The depth of the deepest leaf in that subtree.
        """
        if node.is_leaf:
            return node.depth
        # stopping conditions
        is_leaf = (
            node.depth >= self.max_depth
            or len(node.sample_ids) < self.min_samples_split
            or is_all_equal(self.Y[node.sample_ids])
        )
        # find_best_split only runs when no stopping condition holds; as
        # written in this file it always returns 1, so "< 0" never fires.
        if is_leaf or node.find_best_split(self.max_features) < 0:
            node.is_leaf = True
            node.class_distribution = compute_class_distribution(
                self.Y[node.sample_ids], self.classNum
            )
            return node.depth
        node.grow_stump()
        node.is_leaf = False
        # Every split turns one leaf into two, i.e. adds one leaf.
        self.leaf_num += 1
        L_subtree_depth = self.bulid_subtree(node.LChild)
        R_subtree_depth = self.bulid_subtree(node.RChild)
        return max(L_subtree_depth, R_subtree_depth)
    def predict_forOneInstance(self, x):
        """Return the predicted class index for the single sample ``x``."""
        present_node = self.root_node
        while not (present_node.is_leaf):
            if present_node.split.test_forOneInstance(x):
                present_node = present_node.LChild
            else:
                present_node = present_node.RChild
        # Most frequent class among the training samples of the leaf.
        return np.argmax(present_node.class_distribution)
    def predict(self, X):
        """Return an integer array with the predicted class of every row of ``X``."""
        m = X.shape[0]
        Y_predicted = np.zeros((m,), dtype=int)
        for i in range(m):
            Y_predicted[i] = self.predict_forOneInstance(X[i])
        return Y_predicted
    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against ``y``.

        Args:
            X: samples to classify.
            y: true labels.
            sample_weight: optional per-sample weights; when given, the
                weighted accuracy is returned.
        """
        hits = self.predict(X) == y
        if sample_weight is None:
            return np.mean(hits)
        return np.average(hits, weights=sample_weight)
 ####################
 """function"""
 def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    ``z`` may be a scalar (int or float) or an array of any shape; the input
    is never modified. Values below -500 are treated as -500.
    """
    # Clip from below so np.exp(-z) does not overflow and emit a
    # RuntimeWarning. np.maximum returns a new object, which leaves the
    # caller's array untouched.
    z = np.maximum(z, -500)
    return 1 / (np.exp(-z) + 1)
 def is_all_equal(x):
    """Tell whether every element of the array ``x`` has the same value."""
    # All values coincide exactly when the smallest equals the largest.
    return x.min() == x.max()
 def compute_class_distribution(Y, class_num):
    """Return the fraction of labels in ``Y`` equal to each class ``0..class_num-1``.

    The result is a NumPy array of length ``class_num``.
    """
    total = len(Y)
    fractions = []
    for label in range(class_num):
        # Count the samples carrying this label, then normalize.
        fractions.append(sum(Y == label) / total)
    return np.array(fractions)
--- a/src/wodt/init.py
+++ b/src/wodt/init.py
@@ -0,0 +1,5 @@
 from .WODT import TreeClassifier
 __all__ = [
    "TreeClassifier",
 ]