Add Random Forest

This commit is contained in:
2022-01-14 14:07:58 +01:00
parent bae3b676ec
commit f43622504c
9 changed files with 31 additions and 325 deletions

File diff suppressed because one or more lines are too long (5 files)

View File

@@ -10,6 +10,9 @@ import pandas as pd
 from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
 from Utils import Folders, Files
 from Models import Models
+from stree import Stree
+from wodt import Wodt
+from sklearn.tree import DecisionTreeClassifier
 class Randomized:

View File

@@ -1,7 +1,9 @@
+from statistics import mean
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 from sklearn.svm import SVC
 from stree import Stree
-from wodt import TreeClassifier
+from wodt import Wodt
 from odte import Odte
@@ -15,11 +17,15 @@ class Models:
         if name == "ExtraTree":
             return ExtraTreeClassifier
         if name == "Wodt":
-            return TreeClassifier
+            return Wodt
         if name == "SVC":
             return SVC
         if name == "ODTE":
             return Odte
+        if name == "Bagging":
+            return BaggingClassifier
+        if name == "RandomForest":
+            return RandomForestClassifier
         msg = f"No model recognized {name}"
         if name in ("Stree", "stree"):
             msg += ", did you mean STree?"
@@ -37,6 +43,21 @@ class Models:
             nodes = 0
             leaves = result.get_n_leaves()
             depth = 0
+        elif name == "Bagging":
+            if hasattr(result.base_estimator_, "nodes_leaves"):
+                nodes, leaves = list(zip(*[x.nodes_leaves() for x in result.estimators_]))
+                nodes, leaves = mean(nodes), mean(leaves)
+                depth = mean([x.depth_ for x in result.estimators_])
+            elif hasattr(result.base_estimator_, "tree_"):
+                nodes = mean([x.tree_.node_count for x in result.estimators_])
+                leaves = mean([x.get_n_leaves() for x in result.estimators_])
+                depth = mean([x.get_depth() for x in result.estimators_])
+            else:
+                nodes = leaves = depth = 0
+        elif name == "RandomForest":
+            leaves = mean([x.get_n_leaves() for x in result.estimators_])
+            depth = mean([x.get_depth() for x in result.estimators_])
+            nodes = mean([x.tree_.node_count for x in result.estimators_])
         elif name == "SVC":
             nodes = leaves = depth = 0
         else:
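
For reference, the added RandomForest branch reports ensemble complexity as per-tree averages. A self-contained sketch of the same computation (data set and hyperparameters are illustrative):

    from statistics import mean
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier

    X, y = load_iris(return_X_y=True)
    result = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
    # every fitted tree exposes tree_.node_count, get_n_leaves() and get_depth()
    nodes = mean(x.tree_.node_count for x in result.estimators_)
    leaves = mean(x.get_n_leaves() for x in result.estimators_)
    depth = mean(x.get_depth() for x in result.estimators_)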

View File

@@ -1,318 +0,0 @@
########################
"""import"""
import numpy as np
import random
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin
"""global var"""
epsilonepsilon = 1e-220  # tiny constant to keep log2 arguments strictly positive
epsilon = 1e-50
"""class"""
class SplitQuestion(object):
"""docstring for SplitQuestion"""
def __init__(self, attrIDs=[0], paras=[0], threshold=0):
super(SplitQuestion, self).__init__()
self.attrIDs = attrIDs
self.paras = paras
self.threshold = threshold
# we only consider continuous attributes for simplicity
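    # An oblique split: a sample goes left when a linear combination of the
    # selected attributes is at most the threshold, instead of testing a
    # single attribute.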
def test_forOneInstance(self, x):
return np.dot(x[self.attrIDs], self.paras) <= self.threshold
def test(self, X):
return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold
class Node(object):
"""docstring for RBNode"""
def __init__(self, depth, split, sample_ids, X, Y, class_num):
super(Node, self).__init__()
self.sample_ids = sample_ids
self.split = split
self.depth = depth
self.X = X
self.Y = Y
self.class_num = class_num
self.is_leaf = False
# after grow_stump, set the node as an internal node
def find_best_split(self, max_features="sqrt"):
feature_num = self.X.shape[1]
subset_feature_num = feature_num
if max_features == "sqrt":
subset_feature_num = int(np.sqrt(feature_num))
if max_features == "all":
subset_feature_num = feature_num
if max_features == "log":
subset_feature_num = int(np.log2(feature_num))
if isinstance(max_features, int):
subset_feature_num = max_features
if isinstance(max_features, float):
subset_feature_num = int(feature_num * max_features)
# ### get random subset of features
# ### feature 0 is threshold
feature_ids = range(feature_num)
subset_feature_ids = random.sample(feature_ids, subset_feature_num)
self.split.attrIDs = subset_feature_ids
subset_feature_ids = np.array(subset_feature_ids)
X = self.X
subFeatures_X = X[
self.sample_ids[:, None], subset_feature_ids[None, :]
]
Y = self.Y[self.sample_ids]
class_num = self.class_num
# ##############################
# define func and func_gradient for optimization
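        # func(a) is the weighted entropy of the two children of a soft split:
        # each sample falls on the right side with probability
        # sigmoid(x . paras - threshold), so class counts become fractional
        # weights; minimizing func maximizes the information gain.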
def func(a):
paras = a[1:]
threshold = a[0]
p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
w_R = p
w_L = 1 - w_R
w_R_sum = w_R.sum()
w_L_sum = w_L.sum()
w_R_eachClass = np.array(
[sum(w_R[Y == k]) for k in range(class_num)]
)
w_L_eachClass = np.array(
[sum(w_L[Y == k]) for k in range(class_num)]
)
fun = (
w_L_sum * np.log2(w_L_sum + epsilonepsilon)
+ w_R_sum * np.log2(w_R_sum + epsilonepsilon)
- np.sum(
w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
)
- np.sum(
w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
)
)
# fun = w_L.sum() * compute_entropy(Y, w_L) + w_R.sum()
# * compute_entropy(Y, w_R)
return fun
def func_gradient(a):
paras = a[1:]
threshold = a[0]
p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
w_R = p
w_L = 1 - w_R
w_R_eachClass = np.array(
[sum(w_R[Y == k]) for k in range(class_num)]
)
w_L_eachClass = np.array(
[sum(w_L[Y == k]) for k in range(class_num)]
)
la = np.log2(
w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
beta = la * p * (1 - p)
jac = np.zeros(a.shape)
jac[0] = -np.sum(beta)
jac[1:] = np.dot(subFeatures_X.T, beta)
return jac
################################################
initial_a = np.random.rand(subset_feature_num + 1) - 0.5
result = minimize(
func,
initial_a,
method="L-BFGS-B",
jac=func_gradient,
options={"maxiter": 10, "disp": False},
)
##########################################
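        # a[0] is the split threshold and a[1:] the attribute weights; they
        # are optimized jointly, capped at 10 L-BFGS-B iterations per node.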
self.split.paras = result.x[1:]
self.split.threshold = result.x[0]
return 1
def grow_stump(self):
L_bool = self.split.test(self.X[self.sample_ids])
L_sample_ids = self.sample_ids[L_bool]
R_sample_ids = self.sample_ids[~L_bool]
# if len(R_sample_ids) * len(L_sample_ids) == 0 :
# print('some branch is 0 sample')
LChild = Node(
self.depth + 1,
SplitQuestion(),
L_sample_ids,
self.X,
self.Y,
self.class_num,
)
RChild = Node(
self.depth + 1,
SplitQuestion(),
R_sample_ids,
self.X,
self.Y,
self.class_num,
)
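        # a child that receives no samples becomes a leaf predicting the
        # parent's class distribution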
if len(L_sample_ids) == 0:
LChild.is_leaf = True
LChild.class_distribution = compute_class_distribution(
self.Y[self.sample_ids], self.class_num
)
if len(R_sample_ids) == 0:
RChild.is_leaf = True
RChild.class_distribution = compute_class_distribution(
self.Y[self.sample_ids], self.class_num
)
self.LChild = LChild
self.RChild = RChild
class TreeClassifier(BaseEstimator, ClassifierMixin):
"""docstring for TreeClassifier"""
def __init__(
self,
max_depth=50,
min_samples_split=2,
max_features="all",
random_state=None,
):
# super(TreeClassifier, self).__init__()
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.max_features = max_features
self.random_state = random_state
def fit(self, X, Y):
self.X = X
self.Y = Y
self.classNum = self.Y.max() + 1
self.sampleNum = self.X.shape[0]
if self.random_state is not None:
random.seed(self.random_state)
###########
self.root_node = Node(
1,
SplitQuestion(),
np.arange(self.sampleNum, dtype=np.uint32),
self.X,
self.Y,
self.classNum,
)
self.leaf_num = 1
        self.tree_depth = self.build_subtree(self.root_node)
def nodes_leaves(self):
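        # count internal nodes and leaves recursively; as a side effect,
        # compute_depth records the tree depth in self.depth_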
def num_leaves(node):
leaves = 0
nodes = 0
nodes_left = 0
nodes_right = 0
leaves_left = 0
leaves_right = 0
if node.is_leaf:
leaves += 1
else:
nodes_left, leaves_left = num_leaves(node.LChild)
nodes_right, leaves_right = num_leaves(node.RChild)
nodes = nodes_left + nodes_right + 1
leaves += leaves_left + leaves_right
return nodes, leaves
def compute_depth(node):
if node.is_leaf:
return node.depth
return max(
node.depth,
compute_depth(node.LChild),
compute_depth(node.RChild),
)
self.depth_ = compute_depth(self.root_node)
return num_leaves(self.root_node)
    def build_subtree(self, node):
if node.is_leaf:
return node.depth
# stopping conditions
is_leaf = (
node.depth >= self.max_depth
or len(node.sample_ids) < self.min_samples_split
or is_all_equal(self.Y[node.sample_ids])
)
if is_leaf or node.find_best_split(self.max_features) < 0:
node.is_leaf = True
node.class_distribution = compute_class_distribution(
self.Y[node.sample_ids], self.classNum
)
return node.depth
node.grow_stump()
node.is_leaf = False
self.leaf_num += 1
        L_subtree_depth = self.build_subtree(node.LChild)
        R_subtree_depth = self.build_subtree(node.RChild)
return max(L_subtree_depth, R_subtree_depth)
def predict_forOneInstance(self, x):
present_node = self.root_node
while not (present_node.is_leaf):
if present_node.split.test_forOneInstance(x):
present_node = present_node.LChild
else:
present_node = present_node.RChild
return np.argmax(present_node.class_distribution)
def predict(self, X):
m = X.shape[0]
Y_predicted = np.zeros((m,), dtype=int)
for i in range(m):
x = X[i]
Y_predicted[i] = self.predict_forOneInstance(x)
return Y_predicted
    def score(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
    ) -> float:
y_pred = self.predict(X)
return np.mean(y_pred == y)
####################
"""function"""
def sigmoid(z):
    # clip z at -500: np.exp(-z) overflows (RuntimeWarning) for very negative z
    if isinstance(z, float) and (z < -500):
        z = -500
    elif not isinstance(z, float):
        z[z < -500] = -500
    return 1 / (np.exp(-z) + 1)
def is_all_equal(x):
x_min, x_max = x.min(), x.max()
return x_min == x_max
def compute_class_distribution(Y, class_num):
sample_num = len(Y)
ratio_each_class = [sum(Y == k) / sample_num for k in range(class_num)]
return np.array(ratio_each_class)
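
Before its removal, TreeClassifier followed the scikit-learn fit/predict convention, so it could be exercised as in this illustrative sketch (data set and hyperparameters are arbitrary):

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=8, random_state=0)
    clf = TreeClassifier(max_depth=5, max_features="sqrt", random_state=0)
    clf.fit(X, y)
    nodes, leaves = clf.nodes_leaves()  # also sets clf.depth_
    print(clf.score(X, y), nodes, leaves, clf.depth_)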

View File

@@ -1,5 +0,0 @@
from .WODT import TreeClassifier
__all__ = [
"TreeClassifier",
]