From 60374c6613dbf982bc482cb1b2f45dbf95cef094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Montan=CC=83ana?=
Date: Sat, 25 Sep 2021 12:54:46 +0200
Subject: [PATCH] Add new models to the benchmark report

Add SVC, WODT & ExtraTree models

---
 src/Experiments.py   |  37 +---
 src/Models.py        |  41 ++++++
 src/Results.py       |  40 +++++-
 src/benchmark.py     |   1 +
 src/wodt/WODT.py     | 318 +++++++++++++++++++++++++++++++++++++++++++
 src/wodt/__init__.py |   5 +
 6 files changed, 406 insertions(+), 36 deletions(-)
 create mode 100644 src/Models.py
 create mode 100644 src/wodt/WODT.py
 create mode 100644 src/wodt/__init__.py

diff --git a/src/Experiments.py b/src/Experiments.py
index a1f6c2e..cdd52c4 100644
--- a/src/Experiments.py
+++ b/src/Experiments.py
@@ -8,31 +8,14 @@ from tqdm import tqdm
 import numpy as np
 import pandas as pd
 from sklearn.model_selection import StratifiedKFold, cross_validate
-from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
-from stree import Stree
 from Utils import Folders, Files
+from Models import Models


 class Randomized:
     seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]


-class Models:
-    @staticmethod
-    def get_model(name):
-        if name == "STree":
-            return Stree
-        elif name == "Cart":
-            return DecisionTreeClassifier
-        elif name == "ExtraTree":
-            return ExtraTreeClassifier
-        else:
-            msg = f"No model recognized {name}"
-            if name == "Stree" or name == "stree":
-                msg += ", did you mean STree?"
-            raise ValueError(msg)
-
-
 class Diterator:
     def __init__(self, data):
         self._stack = data.copy()
@@ -178,20 +161,6 @@
         self.leaves = []
         self.depths = []

-    def _get_complexity(self, result):
-        if self.model_name == "Cart":
-            nodes = result.tree_.node_count
-            depth = result.tree_.max_depth
-            leaves = result.get_n_leaves()
-        if self.model_name == "ExtraTree":
-            nodes = 0
-            leaves = result.get_n_leaves()
-            depth = 0
-        else:
-            nodes, leaves = result.nodes_leaves()
-            depth = result.depth_ if hasattr(result, "depth_") else 0
-        return nodes, leaves, depth
-
     def _n_fold_crossval(self, X, y, hyperparameters):
         if self.scores != []:
             raise ValueError("Must init experiment before!")
@@ -217,8 +186,8 @@
             self.scores.append(res["test_score"])
             self.times.append(res["fit_time"])
             for result_item in res["estimator"]:
-                nodes_item, leaves_item, depth_item = self._get_complexity(
-                    result_item
+                nodes_item, leaves_item, depth_item = Models.get_complexity(
+                    self.model_name, result_item
                 )
                 self.nodes.append(nodes_item)
                 self.leaves.append(leaves_item)
diff --git a/src/Models.py b/src/Models.py
new file mode 100644
index 0000000..97c0b4e
--- /dev/null
+++ b/src/Models.py
@@ -0,0 +1,41 @@
+from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.svm import SVC
+from stree import Stree
+from wodt import TreeClassifier
+
+
+class Models:
+    @staticmethod
+    def get_model(name):
+        if name == "STree":
+            return Stree
+        elif name == "Cart":
+            return DecisionTreeClassifier
+        elif name == "ExtraTree":
+            return ExtraTreeClassifier
+        elif name == "Wodt":
+            return TreeClassifier
+        elif name == "SVC":
+            return SVC
+        else:
+            msg = f"Unrecognized model: {name}"
+            if name == "Stree" or name == "stree":
+                msg += ", did you mean STree?"
+            raise ValueError(msg)
+
+    @staticmethod
+    def get_complexity(name, result):
+        if name == "Cart":
+            nodes = result.tree_.node_count
+            depth = result.tree_.max_depth
+            leaves = result.get_n_leaves()
+        elif name == "ExtraTree":
+            nodes = 0
+            leaves = result.get_n_leaves()
+            depth = 0
+        elif name == "SVC":
+            nodes = leaves = depth = 0
+        else:
+            nodes, leaves = result.nodes_leaves()
+            depth = result.depth_ if hasattr(result, "depth_") else 0
+        return nodes, leaves, depth
diff --git a/src/Results.py b/src/Results.py
index eddca0d..b0b34e1 100644
--- a/src/Results.py
+++ b/src/Results.py
@@ -400,6 +400,10 @@ class SQL(BaseReport):


 class Benchmark:
+    @staticmethod
+    def get_result_file_name():
+        return os.path.join(Folders.results, Files.exreport)
+
     @staticmethod
     def _process_dataset(results, data):
         model = data["model"]
@@ -414,7 +418,7 @@ class Benchmark:
     @staticmethod
     def compile_results():
         # build Files.exreport
-        result_file_name = os.path.join(Folders.results, Files.exreport)
+        result_file_name = Benchmark.get_result_file_name()
         results = {}
         init_suffix, end_suffix = Files.results_suffixes("")
         all_files = list(os.walk(Folders.results))
@@ -432,7 +436,7 @@
             f.write(f"{model}, {dataset}, {accuracy}\n")

     @staticmethod
-    def report():
+    def exreport():
         def end_message(message, file):
             length = 100
             print("*" * length)
@@ -471,3 +475,35 @@

         if is_exe(Files.cmd_open):
             subprocess.run([Files.cmd_open, Files.exreport_pdf])
+
+    @staticmethod
+    def report():
+        def build():
+            # build {model: {dataset: accuracy}} from the exreport file
+            file_name = Benchmark.get_result_file_name()
+            results = {}
+            with open(file_name) as f:
+                data = f.read().splitlines()
+            data = data[1:]  # skip the header line
+            for line in data:
+                model, dataset, accuracy = line.split(", ")
+                if model not in results:
+                    results[model] = {}
+                results[model][dataset] = accuracy
+            return results
+
+        def show(results):
+            datasets = results[list(results)[0]]
+            print(f"{'Dataset':30s} ", end="")
+            lines = "=" * 30 + " "
+            for model in results:
+                print(f"{model:9s} ", end="")
+                lines += "=" * 9 + " "
+            print(f"\n{lines}")
+            for dataset, _ in datasets.items():
+                print(f"{dataset:30s} ", end="")
+                for model in results:
+                    print(f"{float(results[model][dataset]):.7f} ", end="")
+                print("")
+
+        show(build())
diff --git a/src/benchmark.py b/src/benchmark.py
index d6a66a4..e1b015d 100644
--- a/src/benchmark.py
+++ b/src/benchmark.py
@@ -3,3 +3,4 @@ from Results import Benchmark
 benchmark = Benchmark()
 benchmark.compile_results()
 benchmark.report()
+benchmark.exreport()
diff --git a/src/wodt/WODT.py b/src/wodt/WODT.py
new file mode 100644
index 0000000..f20a9fa
--- /dev/null
+++ b/src/wodt/WODT.py
@@ -0,0 +1,318 @@
+########################
+"""import"""
+import numpy as np
+import random
+from scipy.optimize import minimize
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+
+"""global var"""
+epsilonepsilon = 1e-220  # guards log2(0) in the entropy objective below
+epsilon = 1e-50
+
+"""class"""
+
+
+class SplitQuestion(object):
+    """Linear (oblique) split test over a subset of attributes."""
+
+    def __init__(self, attrIDs=[0], paras=[0], threshold=0):
+        super(SplitQuestion, self).__init__()
+        self.attrIDs = attrIDs
+        self.paras = paras
+        self.threshold = threshold
+
+    # we only consider continuous attributes for simplicity
+    def test_forOneInstance(self, x):
+        return np.dot(x[self.attrIDs], self.paras) <= self.threshold
+
+    def test(self, X):
+        return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold
+
+
+class Node(object):
+    """Tree node: holds a split and the ids of the samples reaching it."""
+
+    def __init__(self, depth, split, sample_ids, X, Y, class_num):
+        super(Node, self).__init__()
+        self.sample_ids = sample_ids
+        self.split = split
+        self.depth = depth
+        self.X = X
+        self.Y = Y
+        self.class_num = class_num
+        self.is_leaf = False
+        # build_subtree() makes this node internal after grow_stump()
+
+    def find_best_split(self, max_features="sqrt"):
+        feature_num = self.X.shape[1]
+        subset_feature_num = feature_num
+        if max_features == "sqrt":
+            subset_feature_num = int(np.sqrt(feature_num))
+        if max_features == "all":
+            subset_feature_num = feature_num
+        if max_features == "log":
+            subset_feature_num = int(np.log2(feature_num))
+        if isinstance(max_features, int):
+            subset_feature_num = max_features
+        if isinstance(max_features, float):
+            subset_feature_num = int(feature_num * max_features)
+
+        # ### get random subset of features
+        # ### index 0 of the optimized vector "a" below is the threshold
+        feature_ids = range(feature_num)
+        subset_feature_ids = random.sample(feature_ids, subset_feature_num)
+        self.split.attrIDs = subset_feature_ids
+        subset_feature_ids = np.array(subset_feature_ids)
+
+        X = self.X
+        subFeatures_X = X[
+            self.sample_ids[:, None], subset_feature_ids[None, :]
+        ]
+        Y = self.Y[self.sample_ids]
+        class_num = self.class_num
+
+        # ##############################
+        # define func and func_gradient for optimization
+        def func(a):
+            paras = a[1:]
+            threshold = a[0]
+            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
+            w_R = p
+            w_L = 1 - w_R
+            w_R_sum = w_R.sum()
+            w_L_sum = w_L.sum()
+            w_R_eachClass = np.array(
+                [sum(w_R[Y == k]) for k in range(class_num)]
+            )
+            w_L_eachClass = np.array(
+                [sum(w_L[Y == k]) for k in range(class_num)]
+            )
+            fun = (
+                w_L_sum * np.log2(w_L_sum + epsilonepsilon)
+                + w_R_sum * np.log2(w_R_sum + epsilonepsilon)
+                - np.sum(
+                    w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
+                )
+                - np.sum(
+                    w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
+                )
+            )
+            # fun = w_L.sum() * compute_entropy(Y, w_L) + w_R.sum()
+            # * compute_entropy(Y, w_R)
+            return fun
+
+        def func_gradient(a):
+            paras = a[1:]
+            threshold = a[0]
+
+            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
+            w_R = p
+            w_L = 1 - w_R
+            w_R_eachClass = np.array(
+                [sum(w_R[Y == k]) for k in range(class_num)]
+            )
+            w_L_eachClass = np.array(
+                [sum(w_L[Y == k]) for k in range(class_num)]
+            )
+            la = np.log2(
+                w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
+            ) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
+            beta = la * p * (1 - p)
+
+            jac = np.zeros(a.shape)
+            jac[0] = -np.sum(beta)
+            jac[1:] = np.dot(subFeatures_X.T, beta)
+
+            return jac
+
+        ################################################
+        initial_a = np.random.rand(subset_feature_num + 1) - 0.5
+        result = minimize(
+            func,
+            initial_a,
+            method="L-BFGS-B",
+            jac=func_gradient,
+            options={"maxiter": 10, "disp": False},
+        )
+
+        ##########################################
+        self.split.paras = result.x[1:]
+        self.split.threshold = result.x[0]
+
+        return 1  # success; build_subtree() treats values < 0 as failure
+
+    def grow_stump(self):
+        L_bool = self.split.test(self.X[self.sample_ids])
+        L_sample_ids = self.sample_ids[L_bool]
+        R_sample_ids = self.sample_ids[~L_bool]
+        # if len(R_sample_ids) * len(L_sample_ids) == 0 :
+        #     print('some branch is 0 sample')
+        LChild = Node(
+            self.depth + 1,
+            SplitQuestion(),
+            L_sample_ids,
+            self.X,
+            self.Y,
+            self.class_num,
+        )
+        RChild = Node(
+            self.depth + 1,
+            SplitQuestion(),
+            R_sample_ids,
+            self.X,
+            self.Y,
+            self.class_num,
+        )
+
+        if len(L_sample_ids) == 0:
+            LChild.is_leaf = True
+            LChild.class_distribution = compute_class_distribution(
+                self.Y[self.sample_ids], self.class_num
+            )
+        if len(R_sample_ids) == 0:
+            RChild.is_leaf = True
+            RChild.class_distribution = compute_class_distribution(
+                self.Y[self.sample_ids], self.class_num
+            )
+
+        self.LChild = LChild
+        self.RChild = RChild
+
+
+class TreeClassifier(BaseEstimator, ClassifierMixin):
+    """Oblique decision tree classifier (WODT)."""
+
+    def __init__(
+        self,
+        max_depth=50,
+        min_samples_split=2,
+        max_features="all",
+        random_state=None,
+    ):
+        # super(TreeClassifier, self).__init__()
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.max_features = max_features
+        self.random_state = random_state
+
+    def fit(self, X, Y):
+        self.X = X
+        self.Y = Y
+        self.classNum = self.Y.max() + 1  # assumes labels 0..classNum - 1
+        self.sampleNum = self.X.shape[0]
+        if self.random_state is not None:
+            random.seed(self.random_state)
+        ###########
+        self.root_node = Node(
+            1,
+            SplitQuestion(),
+            np.arange(self.sampleNum, dtype=np.uint32),
+            self.X,
+            self.Y,
+            self.classNum,
+        )
+        self.leaf_num = 1
+        self.tree_depth = self.build_subtree(self.root_node)
+
+    def nodes_leaves(self):
+        def num_leaves(node):
+            leaves = 0
+            nodes = 0
+            nodes_left = 0
+            nodes_right = 0
+            leaves_left = 0
+            leaves_right = 0
+            if node.is_leaf:
+                leaves += 1
+            else:
+                nodes_left, leaves_left = num_leaves(node.LChild)
+                nodes_right, leaves_right = num_leaves(node.RChild)
+            nodes = nodes_left + nodes_right + 1
+            leaves += leaves_left + leaves_right
+            return nodes, leaves
+
+        def compute_depth(node):
+            if node.is_leaf:
+                return node.depth
+            return max(
+                node.depth,
+                compute_depth(node.LChild),
+                compute_depth(node.RChild),
+            )
+
+        self.depth_ = compute_depth(self.root_node)
+        return num_leaves(self.root_node)
+
+    def build_subtree(self, node):
+        if node.is_leaf:
+            return node.depth
+
+        # stopping conditions
+        is_leaf = (
+            node.depth >= self.max_depth
+            or len(node.sample_ids) < self.min_samples_split
+            or is_all_equal(self.Y[node.sample_ids])
+        )
+
+        if is_leaf or node.find_best_split(self.max_features) < 0:
+            node.is_leaf = True
+            node.class_distribution = compute_class_distribution(
+                self.Y[node.sample_ids], self.classNum
+            )
+            return node.depth
+
+        node.grow_stump()
+        node.is_leaf = False
+        self.leaf_num += 1
+        L_subtree_depth = self.build_subtree(node.LChild)
+        R_subtree_depth = self.build_subtree(node.RChild)
+        return max(L_subtree_depth, R_subtree_depth)
+
+    def predict_forOneInstance(self, x):
+        present_node = self.root_node
+        while not present_node.is_leaf:
+            if present_node.split.test_forOneInstance(x):
+                present_node = present_node.LChild
+            else:
+                present_node = present_node.RChild
+        return np.argmax(present_node.class_distribution)
+
+    def predict(self, X):
+        m = X.shape[0]
+        Y_predicted = np.zeros((m,), dtype=int)
+        for i in range(m):
+            x = X[i]
+            Y_predicted[i] = self.predict_forOneInstance(x)
+        return Y_predicted
+
+    def score(
+        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
+    ) -> float:
+        y_pred = self.predict(X)
+        return np.mean(y_pred == y)
+
+
+####################
+"""function"""
+
+
+def sigmoid(z):
+    # clip very negative z so that np.exp(-z) does not overflow
+    if isinstance(z, float) and (z < -500):
+        z = -500
+    elif not isinstance(z, float):
+        z[z < -500] = -500
+
+    return 1 / (np.exp(-z) + 1)
+
+
+def is_all_equal(x):
+    x_min, x_max = x.min(), x.max()
+    return x_min == x_max
+
+
+def compute_class_distribution(Y, class_num):
+    sample_num = len(Y)
+    ratio_each_class = [sum(Y == k) / sample_num for k in range(class_num)]
+    return np.array(ratio_each_class)
diff --git a/src/wodt/__init__.py b/src/wodt/__init__.py
new file mode 100644
index 0000000..6a86cb0
--- /dev/null
+++ b/src/wodt/__init__.py
@@ -0,0 +1,5 @@
+from .WODT import TreeClassifier
+
+__all__ = [
+    "TreeClassifier",
+]
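
Usage sketch: a minimal, illustrative driver for the registry added above,
assuming `src/` is on the import path and `stree`, `scikit-learn` and
`scipy` are installed; the wine dataset and the train/test split are
assumptions for the example only, not part of the patch.

    # hypothetical smoke test for Models.get_model / Models.get_complexity
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split
    from Models import Models

    X, y = load_wine(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    for name in ("STree", "Cart", "ExtraTree", "Wodt", "SVC"):
        clf = Models.get_model(name)(random_state=0)
        clf.fit(X_train, y_train)
        nodes, leaves, depth = Models.get_complexity(name, clf)
        print(name, clf.score(X_test, y_test), nodes, leaves, depth)

After `benchmark.compile_results()` has written the exreport file
(`model, dataset, accuracy` rows), `Benchmark.report()` prints one accuracy
column per model and `Benchmark.exreport()` runs the full exreport analysis.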