From 60374c6613dbf982bc482cb1b2f45dbf95cef094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sat, 25 Sep 2021 12:54:46 +0200
Subject: [PATCH] Add new models to the benchmark report

Add SVC, WODT & ExtraTree models

---
 src/Experiments.py   |  37 +---
 src/Models.py        |  41 ++++++
 src/Results.py       |  40 +++++-
 src/benchmark.py     |   1 +
 src/wodt/WODT.py     | 318 +++++++++++++++++++++++++++++++++++++++++++
 src/wodt/__init__.py |   5 +
 6 files changed, 406 insertions(+), 36 deletions(-)
 create mode 100644 src/Models.py
 create mode 100644 src/wodt/WODT.py
 create mode 100644 src/wodt/__init__.py

diff --git a/src/Experiments.py b/src/Experiments.py
index a1f6c2e..cdd52c4 100644
--- a/src/Experiments.py
+++ b/src/Experiments.py
@@ -8,31 +8,14 @@ from tqdm import tqdm
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import StratifiedKFold, cross_validate
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from stree import Stree
 from Utils import Folders, Files
+from Models import Models


 class Randomized:
     seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]


-class Models:
-    @staticmethod
-    def get_model(name):
-        if name == "STree":
-            return Stree
-        elif name == "Cart":
-            return DecisionTreeClassifier
-        elif name == "ExtraTree":
-            return ExtraTreeClassifier
-        else:
-            msg = f"No model recognized {name}"
-            if name == "Stree" or name == "stree":
-                msg += ", did you mean STree?"
-            raise ValueError(msg)
-
-
 class Diterator:
     def __init__(self, data):
         self._stack = data.copy()
@@ -178,20 +161,6 @@
         self.leaves = []
         self.depths = []

-    def _get_complexity(self, result):
-        if self.model_name == "Cart":
-            nodes = result.tree_.node_count
-            depth = result.tree_.max_depth
-            leaves = result.get_n_leaves()
-        if self.model_name == "ExtraTree":
-            nodes = 0
-            leaves = result.get_n_leaves()
-            depth = 0
-        else:
-            nodes, leaves = result.nodes_leaves()
-            depth = result.depth_ if hasattr(result, "depth_") else 0
-        return nodes, leaves, depth
-
     def _n_fold_crossval(self, X, y, hyperparameters):
         if self.scores != []:
             raise ValueError("Must init experiment before!")
@@ -217,8 +186,8 @@
             self.scores.append(res["test_score"])
             self.times.append(res["fit_time"])
             for result_item in res["estimator"]:
-                nodes_item, leaves_item, depth_item = self._get_complexity(
-                    result_item
+                nodes_item, leaves_item, depth_item = Models.get_complexity(
+                    self.model_name, result_item
                 )
                 self.nodes.append(nodes_item)
                 self.leaves.append(leaves_item)
diff --git a/src/Models.py b/src/Models.py
new file mode 100644
index 0000000..97c0b4e
--- /dev/null
+++ b/src/Models.py
@@ -0,0 +1,41 @@
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.svm import SVC
+from stree import Stree
+from wodt import TreeClassifier
+
+
+class Models:
+    @staticmethod
+    def get_model(name):
+        if name == "STree":
+            return Stree
+        elif name == "Cart":
+            return DecisionTreeClassifier
+        elif name == "ExtraTree":
+            return ExtraTreeClassifier
+        elif name == "Wodt":
+            return TreeClassifier
+        elif name == "SVC":
+            return SVC
+        else:
+            msg = f"Unrecognized model: {name}"
+            if name == "Stree" or name == "stree":
+                msg += ", did you mean STree?"
+            raise ValueError(msg)
+
+    @staticmethod
+    def get_complexity(name, result):
+        if name == "Cart":
+            nodes = result.tree_.node_count
+            depth = result.tree_.max_depth
+            leaves = result.get_n_leaves()
+        elif name == "ExtraTree":
+            nodes = 0
+            leaves = result.get_n_leaves()
+            depth = 0
+        elif name == "SVC":
+            nodes = leaves = depth = 0
+        else:
+            nodes, leaves = result.nodes_leaves()
+            depth = result.depth_ if hasattr(result, "depth_") else 0
+        return nodes, leaves, depth
diff --git a/src/Results.py b/src/Results.py
index eddca0d..b0b34e1 100644
--- a/src/Results.py
+++ b/src/Results.py
@@ -400,6 +400,10 @@ class SQL(BaseReport):


 class Benchmark:
+    @staticmethod
+    def get_result_file_name():
+        return os.path.join(Folders.results, Files.exreport)
+
     @staticmethod
     def _process_dataset(results, data):
         model = data["model"]
@@ -414,7 +418,7 @@ class Benchmark:
     @staticmethod
     def compile_results():
         # build Files.exreport
-        result_file_name = os.path.join(Folders.results, Files.exreport)
+        result_file_name = Benchmark.get_result_file_name()
         results = {}
         init_suffix, end_suffix = Files.results_suffixes("")
         all_files = list(os.walk(Folders.results))
@@ -432,7 +436,7 @@
             f.write(f"{model}, {dataset}, {accuracy}\n")

     @staticmethod
-    def report():
+    def exreport():
         def end_message(message, file):
             length = 100
             print("*" * length)
@@ -471,3 +475,35 @@

         if is_exe(Files.cmd_open):
             subprocess.run([Files.cmd_open, Files.exreport_pdf])
+
+    @staticmethod
+    def report():
+        def build():
+            # build {model: {dataset: accuracy}} from the exreport file
+            file_name = Benchmark.get_result_file_name()
+            results = {}
+            with open(file_name) as f:
+                data = f.read().splitlines()
+            data = data[1:]  # skip the header line
+            for line in data:
+                model, dataset, accuracy = line.split(", ")
+                if model not in results:
+                    results[model] = {}
+                results[model][dataset] = accuracy
+            return results
+
+        def show(results):
+            datasets = results[list(results)[0]]
+            print(f"{'Dataset':30s} ", end="")
+            lines = "=" * 30 + " "
+            for model in results:
+                print(f"{model:9s} ", end="")
+                lines += "=" * 9 + " "
+            print(f"\n{lines}")
+            for dataset, _ in datasets.items():
+                print(f"{dataset:30s} ", end="")
+                for model in results:
+                    print(f"{float(results[model][dataset]):.7f} ", end="")
+                print("")
+
+        show(build())
diff --git a/src/benchmark.py b/src/benchmark.py
index d6a66a4..e1b015d 100644
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -3,3 +3,4 @@ from Results import Benchmark
 benchmark = Benchmark()
 benchmark.compile_results()
 benchmark.report()
+benchmark.exreport()
diff --git a/src/wodt/WODT.py b/src/wodt/WODT.py
new file mode 100644
index 0000000..f20a9fa
--- /dev/null
+++ b/src/wodt/WODT.py
@@ -0,0 +1,318 @@
+########################
+"""import"""
+import numpy as np
+import random
+from scipy.optimize import minimize
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+
+"""global var"""
+epsilonepsilon = 1e-220  # guards log2(0) in the entropy objective below
+epsilon = 1e-50
+
+"""class"""
+
+
+class SplitQuestion(object):
+    """Linear (oblique) split test over a subset of attributes."""
+
+    def __init__(self, attrIDs=[0], paras=[0], threshold=0):
+        super(SplitQuestion, self).__init__()
+        self.attrIDs = attrIDs
+        self.paras = paras
+        self.threshold = threshold
+
+    # we only consider continuous attributes for simplicity
+    def test_forOneInstance(self, x):
+        return np.dot(x[self.attrIDs], self.paras) <= self.threshold
+
+    def test(self, X):
+        return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold
+
+
+class Node(object):
+    """Tree node: holds a split and the ids of the samples reaching it."""
+
+    def __init__(self, depth, split, sample_ids, X, Y, class_num):
+        super(Node, self).__init__()
+        self.sample_ids = sample_ids
+        self.split = split
+        self.depth = depth
+        self.X = X
+        self.Y = Y
+        self.class_num = class_num
+        self.is_leaf = False
+        # build_subtree() makes this node internal after grow_stump()
+
+    def find_best_split(self, max_features="sqrt"):
+        feature_num = self.X.shape[1]
+        subset_feature_num = feature_num
+        if max_features == "sqrt":
+            subset_feature_num = int(np.sqrt(feature_num))
+        if max_features == "all":
+            subset_feature_num = feature_num
+        if max_features == "log":
+            subset_feature_num = int(np.log2(feature_num))
+        if isinstance(max_features, int):
+            subset_feature_num = max_features
+        if isinstance(max_features, float):
+            subset_feature_num = int(feature_num * max_features)
+
+        # ### get random subset of features
+        # ### index 0 of the optimized vector "a" below is the threshold
+        feature_ids = range(feature_num)
+        subset_feature_ids = random.sample(feature_ids, subset_feature_num)
+        self.split.attrIDs = subset_feature_ids
+        subset_feature_ids = np.array(subset_feature_ids)
+
+        X = self.X
+        subFeatures_X = X[
+            self.sample_ids[:, None], subset_feature_ids[None, :]
+        ]
+        Y = self.Y[self.sample_ids]
+        class_num = self.class_num
+
+        # ##############################
+        # define func and func_gradient for optimization
+        def func(a):
+            paras = a[1:]
+            threshold = a[0]
+            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
+            w_R = p
+            w_L = 1 - w_R
+            w_R_sum = w_R.sum()
+            w_L_sum = w_L.sum()
+            w_R_eachClass = np.array(
+                [sum(w_R[Y == k]) for k in range(class_num)]
+            )
+            w_L_eachClass = np.array(
+                [sum(w_L[Y == k]) for k in range(class_num)]
+            )
+            fun = (
+                w_L_sum * np.log2(w_L_sum + epsilonepsilon)
+                + w_R_sum * np.log2(w_R_sum + epsilonepsilon)
+                - np.sum(
+                    w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
+                )
+                - np.sum(
+                    w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
+                )
+            )
+            # fun = w_L.sum() * compute_entropy(Y, w_L) + w_R.sum()
+            # * compute_entropy(Y, w_R)
+            return fun
+
+        def func_gradient(a):
+            paras = a[1:]
+            threshold = a[0]
+
+            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
+            w_R = p
+            w_L = 1 - w_R
+            w_R_eachClass = np.array(
+                [sum(w_R[Y == k]) for k in range(class_num)]
+            )
+            w_L_eachClass = np.array(
+                [sum(w_L[Y == k]) for k in range(class_num)]
+            )
+            la = np.log2(
+                w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
+            ) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
+            beta = la * p * (1 - p)
+
+            jac = np.zeros(a.shape)
+            jac[0] = -np.sum(beta)
+            jac[1:] = np.dot(subFeatures_X.T, beta)
+
+            return jac
+
+        ################################################
+        initial_a = np.random.rand(subset_feature_num + 1) - 0.5
+        result = minimize(
+            func,
+            initial_a,
+            method="L-BFGS-B",
+            jac=func_gradient,
+            options={"maxiter": 10, "disp": False},
+        )
+
+        ##########################################
+        self.split.paras = result.x[1:]
+        self.split.threshold = result.x[0]
+
+        return 1  # success; build_subtree() treats values < 0 as failure
+
+    def grow_stump(self):
+        L_bool = self.split.test(self.X[self.sample_ids])
+        L_sample_ids = self.sample_ids[L_bool]
+        R_sample_ids = self.sample_ids[~L_bool]
+        # if len(R_sample_ids) * len(L_sample_ids) == 0 :
+        #     print('some branch is 0 sample')
+        LChild = Node(
+            self.depth + 1,
+            SplitQuestion(),
+            L_sample_ids,
+            self.X,
+            self.Y,
+            self.class_num,
+        )
+        RChild = Node(
+            self.depth + 1,
+            SplitQuestion(),
+            R_sample_ids,
+            self.X,
+            self.Y,
+            self.class_num,
+        )
+
+        if len(L_sample_ids) == 0:
+            LChild.is_leaf = True
+            LChild.class_distribution = compute_class_distribution(
+                self.Y[self.sample_ids], self.class_num
+            )
+        if len(R_sample_ids) == 0:
+            RChild.is_leaf = True
+            RChild.class_distribution = compute_class_distribution(
+                self.Y[self.sample_ids], self.class_num
+            )
+
+        self.LChild = LChild
+        self.RChild = RChild
+
+
+class TreeClassifier(BaseEstimator, ClassifierMixin):
+    """Oblique decision tree classifier (WODT)."""
+
+    def __init__(
+        self,
+        max_depth=50,
+        min_samples_split=2,
+        max_features="all",
+        random_state=None,
+    ):
+        # super(TreeClassifier, self).__init__()
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.max_features = max_features
+        self.random_state = random_state
+
+    def fit(self, X, Y):
+        self.X = X
+        self.Y = Y
+        self.classNum = self.Y.max() + 1  # assumes labels 0..classNum - 1
+        self.sampleNum = self.X.shape[0]
+        if self.random_state is not None:
+            random.seed(self.random_state)
+        ###########
+        self.root_node = Node(
+            1,
+            SplitQuestion(),
+            np.arange(self.sampleNum, dtype=np.uint32),
+            self.X,
+            self.Y,
+            self.classNum,
+        )
+        self.leaf_num = 1
+        self.tree_depth = self.build_subtree(self.root_node)
+
+    def nodes_leaves(self):
+        def num_leaves(node):
+            leaves = 0
+            nodes = 0
+            nodes_left = 0
+            nodes_right = 0
+            leaves_left = 0
+            leaves_right = 0
+            if node.is_leaf:
+                leaves += 1
+            else:
+                nodes_left, leaves_left = num_leaves(node.LChild)
+                nodes_right, leaves_right = num_leaves(node.RChild)
+            nodes = nodes_left + nodes_right + 1
+            leaves += leaves_left + leaves_right
+            return nodes, leaves
+
+        def compute_depth(node):
+            if node.is_leaf:
+                return node.depth
+            return max(
+                node.depth,
+                compute_depth(node.LChild),
+                compute_depth(node.RChild),
+            )
+
+        self.depth_ = compute_depth(self.root_node)
+        return num_leaves(self.root_node)
+
+    def build_subtree(self, node):
+        if node.is_leaf:
+            return node.depth
+
+        # stopping conditions
+        is_leaf = (
+            node.depth >= self.max_depth
+            or len(node.sample_ids) < self.min_samples_split
+            or is_all_equal(self.Y[node.sample_ids])
+        )
+
+        if is_leaf or node.find_best_split(self.max_features) < 0:
+            node.is_leaf = True
+            node.class_distribution = compute_class_distribution(
+                self.Y[node.sample_ids], self.classNum
+            )
+            return node.depth
+
+        node.grow_stump()
+        node.is_leaf = False
+        self.leaf_num += 1
+        L_subtree_depth = self.build_subtree(node.LChild)
+        R_subtree_depth = self.build_subtree(node.RChild)
+        return max(L_subtree_depth, R_subtree_depth)
+
+    def predict_forOneInstance(self, x):
+        present_node = self.root_node
+        while not present_node.is_leaf:
+            if present_node.split.test_forOneInstance(x):
+                present_node = present_node.LChild
+            else:
+                present_node = present_node.RChild
+        return np.argmax(present_node.class_distribution)
+
+    def predict(self, X):
+        m = X.shape[0]
+        Y_predicted = np.zeros((m,), dtype=int)
+        for i in range(m):
+            x = X[i]
+            Y_predicted[i] = self.predict_forOneInstance(x)
+        return Y_predicted
+
+    def score(
+        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
+    ) -> float:
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)
+
+
+####################
+"""function"""
+
+
+def sigmoid(z):
+    # clip very negative z so that np.exp(-z) does not overflow
+    if isinstance(z, float) and (z < -500):
+        z = -500
+    elif not isinstance(z, float):
+        z[z < -500] = -500
+
+    return 1 / (np.exp(-z) + 1)
+
+
+def is_all_equal(x):
+    x_min, x_max = x.min(), x.max()
+    return x_min == x_max
+
+
+def compute_class_distribution(Y, class_num):
+    sample_num = len(Y)
+    ratio_each_class = [sum(Y == k) / sample_num for k in range(class_num)]
+    return np.array(ratio_each_class)
diff --git a/src/wodt/__init__.py b/src/wodt/__init__.py
new file mode 100644
index 0000000..6a86cb0
--- /dev/null
+++ b/src/wodt/__init__.py
@@ -0,0 +1,5 @@
+from .WODT import TreeClassifier
+
+__all__ = [
+    "TreeClassifier",
+]
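
Usage sketch: a minimal, illustrative driver for the registry added above,
assuming `src/` is on the import path and `stree`, `scikit-learn` and
`scipy` are installed; the wine dataset and the train/test split are
assumptions for the example only, not part of the patch.

    # hypothetical smoke test for Models.get_model / Models.get_complexity
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split
    from Models import Models

    X, y = load_wine(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for name in ("STree", "Cart", "ExtraTree", "Wodt", "SVC"):
        clf = Models.get_model(name)(random_state=0)
        clf.fit(X_train, y_train)
        nodes, leaves, depth = Models.get_complexity(name, clf)
        print(name, clf.score(X_test, y_test), nodes, leaves, depth)

After `benchmark.compile_results()` has written the exreport file
(`model, dataset, accuracy` rows), `Benchmark.report()` prints one accuracy
column per model and `Benchmark.exreport()` runs the full exreport analysis.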