Mirror of https://github.com/Doctorado-ML/benchmark.git, synced 2025-08-15 23:45:54 +00:00
Add Random Forest
File diff suppressed because one or more lines are too long (5 files)
@@ -10,6 +10,9 @@ import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, cross_validate
from Utils import Folders, Files
from Models import Models
from stree import Stree
from wodt import Wodt
from sklearn.tree import DecisionTreeClassifier


class Randomized:
@@ -1,7 +1,9 @@
+from statistics import mean
 from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 from sklearn.svm import SVC
 from stree import Stree
-from wodt import Wodt
+from wodt import TreeClassifier
 from odte import Odte
 
 
@@ -15,11 +17,15 @@ class Models:
         if name == "ExtraTree":
             return ExtraTreeClassifier
         if name == "Wodt":
-            return Wodt
+            return TreeClassifier
         if name == "SVC":
             return SVC
         if name == "ODTE":
             return Odte
+        if name == "Bagging":
+            return BaggingClassifier
+        if name == "RandomForest":
+            return RandomForestClassifier
         msg = f"No model recognized {name}"
         if name in ("Stree", "stree"):
             msg += ", did you mean STree?"
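For context, the branch above maps benchmark model names to estimator classes inside class Models. A minimal usage sketch follows, assuming the enclosing method is a class-level helper named get_model; the diff only shows the branch bodies, so that name is an assumption.

# Hypothetical sketch; "get_model" is an assumed name for the method
# wrapping the branch above.
from sklearn.datasets import load_wine

X, y = load_wine(return_X_y=True)
clf_class = Models.get_model("RandomForest")  # -> RandomForestClassifier
clf = clf_class(n_estimators=50, random_state=0)
clf.fit(X, y)
print(clf.score(X, y))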
@@ -37,6 +43,21 @@ class Models:
             nodes = 0
             leaves = result.get_n_leaves()
             depth = 0
+        elif name == "Bagging":
+            if hasattr(result.base_estimator_, "nodes_leaves"):
+                nodes, leaves = list(zip(*[x.nodes_leaves() for x in result.estimators_]))
+                nodes, leaves = mean(nodes), mean(leaves)
+                depth = mean([x.depth_ for x in result.estimators_])
+            elif hasattr(result.base_estimator_, "tree_"):
+                nodes = mean([x.tree_.node_count for x in result.estimators_])
+                leaves = mean([x.get_n_leaves() for x in result.estimators_])
+                depth = mean([x.get_depth() for x in result.estimators_])
+            else:
+                nodes = leaves = depth = 0
+        elif name == "RandomForest":
+            leaves = mean([x.get_n_leaves() for x in result.estimators_])
+            depth = mean([x.get_depth() for x in result.estimators_])
+            nodes = mean([x.tree_.node_count for x in result.estimators_])
         elif name == "SVC":
             nodes = leaves = depth = 0
         else:
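The zip(*...) idiom above transposes a list of per-estimator (nodes, leaves) pairs into separate tuples before averaging. A self-contained sketch with made-up numbers:

from statistics import mean

per_tree = [(15, 8), (11, 6), (19, 10)]  # one (nodes, leaves) per tree
nodes, leaves = zip(*per_tree)           # (15, 11, 19) and (8, 6, 10)
nodes, leaves = mean(nodes), mean(leaves)
print(nodes, leaves)                     # 15 8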
318 src/wodt/WODT.py
@@ -1,318 +0,0 @@
########################
"""import"""
import numpy as np
import random
from scipy.optimize import minimize
from sklearn.base import BaseEstimator, ClassifierMixin


"""global var"""
epsilonepsilon = 1e-220
epsilon = 1e-50

"""class"""


class SplitQuestion(object):
    """docstring for SplitQuestion"""

    def __init__(self, attrIDs=[0], paras=[0], threshold=0):
        super(SplitQuestion, self).__init__()
        self.attrIDs = attrIDs
        self.paras = paras
        self.threshold = threshold

    # we only consider continuous attributes for simplicity
    def test_forOneInstance(self, x):
        return np.dot(x[self.attrIDs], self.paras) <= self.threshold

    def test(self, X):
        return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold

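# Note: both tests implement an oblique split. Rather than thresholding
# a single feature, they threshold a linear combination of the selected
# features, i.e. the hyperplane dot(x[attrIDs], paras) <= threshold.
# For example, attrIDs=[0, 2], paras=[0.5, -1.0], threshold=0.1 sends a
# sample to the left child whenever 0.5 * x[0] - 1.0 * x[2] <= 0.1.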

class Node(object):
    """docstring for RBNode"""

    def __init__(self, depth, split, sample_ids, X, Y, class_num):
        super(Node, self).__init__()
        self.sample_ids = sample_ids
        self.split = split
        self.depth = depth
        self.X = X
        self.Y = Y
        self.class_num = class_num
        self.is_leaf = False
        # after grow_stump, set the node as an internal node

    def find_best_split(self, max_features="sqrt"):
        feature_num = self.X.shape[1]
        subset_feature_num = feature_num
        if max_features == "sqrt":
            subset_feature_num = int(np.sqrt(feature_num))
        if max_features == "all":
            subset_feature_num = feature_num
        if max_features == "log":
            subset_feature_num = int(np.log2(feature_num))
        if isinstance(max_features, int):
            subset_feature_num = max_features
        if isinstance(max_features, float):
            subset_feature_num = int(feature_num * max_features)

        # ### get random subset of features
        # ### feature 0 is threshold
        feature_ids = range(feature_num)
        subset_feature_ids = random.sample(feature_ids, subset_feature_num)
        self.split.attrIDs = subset_feature_ids
        subset_feature_ids = np.array(subset_feature_ids)

        X = self.X
        subFeatures_X = X[
            self.sample_ids[:, None], subset_feature_ids[None, :]
        ]
        Y = self.Y[self.sample_ids]
        class_num = self.class_num

        # ##############################
        # define func and func_gradient for optimization
        def func(a):
            paras = a[1:]
            threshold = a[0]
            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_sum = w_R.sum()
            w_L_sum = w_L.sum()
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            fun = (
                w_L_sum * np.log2(w_L_sum + epsilonepsilon)
                + w_R_sum * np.log2(w_R_sum + epsilonepsilon)
                - np.sum(
                    w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
                )
                - np.sum(
                    w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
                )
            )
            # fun = w_L.sum() * compute_entropy(Y, w_L) + w_R.sum()
            # * compute_entropy(Y, w_R)
            return fun

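        # Note: func is the weighted-entropy objective of WODT. sigmoid()
        # makes the split soft: each sample goes right with probability p
        # and left with 1 - p, so class counts become fractional weights
        # and the objective is differentiable in the hyperplane
        # parameters a. The tiny epsilonepsilon only guards log2(0).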
        def func_gradient(a):
            paras = a[1:]
            threshold = a[0]

            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            la = np.log2(
                w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
            ) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
            beta = la * p * (1 - p)

            jac = np.zeros(a.shape)
            jac[0] = -np.sum(beta)
            jac[1:] = np.dot(subFeatures_X.T, beta)

            return jac

        ################################################
        initial_a = np.random.rand(subset_feature_num + 1) - 0.5
        result = minimize(
            func,
            initial_a,
            method="L-BFGS-B",
            jac=func_gradient,
            options={"maxiter": 10, "disp": False},
        )

        ##########################################
        self.split.paras = result.x[1:]
        self.split.threshold = result.x[0]

        return 1

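    # Note: the split is found by numerical optimization rather than by
    # exhaustive threshold search: L-BFGS-B minimizes func over the
    # vector a = (threshold, weights), using func_gradient as the
    # analytic Jacobian, capped at 10 iterations. find_best_split always
    # returns 1, so the "< 0" check in bulid_subtree below never fires
    # as written.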
    def grow_stump(self):
        L_bool = self.split.test(self.X[self.sample_ids])
        L_sample_ids = self.sample_ids[L_bool]
        R_sample_ids = self.sample_ids[~L_bool]
        # if len(R_sample_ids) * len(L_sample_ids) == 0:
        #     print('some branch is 0 sample')
        LChild = Node(
            self.depth + 1,
            SplitQuestion(),
            L_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )
        RChild = Node(
            self.depth + 1,
            SplitQuestion(),
            R_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )

        if len(L_sample_ids) == 0:
            LChild.is_leaf = True
            LChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )
        if len(R_sample_ids) == 0:
            RChild.is_leaf = True
            RChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )

        self.LChild = LChild
        self.RChild = RChild


class TreeClassifier(BaseEstimator, ClassifierMixin):
    """docstring for TreeClassifier"""

    def __init__(
        self,
        max_depth=50,
        min_samples_split=2,
        max_features="all",
        random_state=None,
    ):
        # super(TreeClassifier, self).__init__()
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, Y):
        self.X = X
        self.Y = Y
        self.classNum = self.Y.max() + 1
        self.sampleNum = self.X.shape[0]
        if self.random_state is not None:
            random.seed(self.random_state)
        ###########
        self.root_node = Node(
            1,
            SplitQuestion(),
            np.arange(self.sampleNum, dtype=np.uint32),
            self.X,
            self.Y,
            self.classNum,
        )
        self.leaf_num = 1
        self.tree_depth = self.bulid_subtree(self.root_node)

    def nodes_leaves(self):
        def num_leaves(node):
            leaves = 0
            nodes = 0
            nodes_left = 0
            nodes_right = 0
            leaves_left = 0
            leaves_right = 0
            if node.is_leaf:
                leaves += 1
            else:
                nodes_left, leaves_left = num_leaves(node.LChild)
                nodes_right, leaves_right = num_leaves(node.RChild)
                nodes = nodes_left + nodes_right + 1
                leaves += leaves_left + leaves_right
            return nodes, leaves

        def compute_depth(node):
            if node.is_leaf:
                return node.depth
            return max(
                node.depth,
                compute_depth(node.LChild),
                compute_depth(node.RChild),
            )

        self.depth_ = compute_depth(self.root_node)
        return num_leaves(self.root_node)

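    # Note: nodes_leaves() and the depth_ attribute it sets are exactly
    # what the Models.py change above consumes, via
    # hasattr(result.base_estimator_, "nodes_leaves"), to average node
    # and leaf counts over a Bagging ensemble of these trees.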
    def bulid_subtree(self, node):
        if node.is_leaf:
            return node.depth

        # stopping conditions
        is_leaf = (
            node.depth >= self.max_depth
            or len(node.sample_ids) < self.min_samples_split
            or is_all_equal(self.Y[node.sample_ids])
        )

        if is_leaf or node.find_best_split(self.max_features) < 0:
            node.is_leaf = True
            node.class_distribution = compute_class_distribution(
                self.Y[node.sample_ids], self.classNum
            )
            return node.depth

        node.grow_stump()
        node.is_leaf = False
        self.leaf_num += 1
        L_subtree_depth = self.bulid_subtree(node.LChild)
        R_subtree_depth = self.bulid_subtree(node.RChild)
        return max(L_subtree_depth, R_subtree_depth)

    def predict_forOneInstance(self, x):
        present_node = self.root_node
        while not present_node.is_leaf:
            if present_node.split.test_forOneInstance(x):
                present_node = present_node.LChild
            else:
                present_node = present_node.RChild
        return np.argmax(present_node.class_distribution)

    def predict(self, X):
        m = X.shape[0]
        Y_predicted = np.zeros((m,), dtype=int)
        for i in range(m):
            x = X[i]
            Y_predicted[i] = self.predict_forOneInstance(x)
        return Y_predicted

    def score(
        self, X: np.array, y: np.array, sample_weight: np.array = None
    ) -> float:
        y_pred = self.predict(X)
        return np.mean(y_pred == y)


####################
"""function"""


def sigmoid(z):
    # clip because a very large -z would raise a RuntimeWarning in np.exp()
    if isinstance(z, float) and (z < -500):
        z = -500
    elif not isinstance(z, float):
        z[z < -500] = (-500) * np.ones(sum(z < -500))

    return 1 / (np.exp(-z) + 1)

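# Note: the clip at -500 prevents overflow in np.exp(-z) (float64
# overflows near exp(710)); below that cutoff the sigmoid is 0 to
# machine precision anyway. np.clip(z, -500, None) would be an
# equivalent vectorized form.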

def is_all_equal(x):
    x_min, x_max = x.min(), x.max()
    return x_min == x_max


def compute_class_distribution(Y, class_num):
    sample_num = len(Y)
    ratio_each_class = [sum(Y == k) / sample_num for k in range(class_num)]
    return np.array(ratio_each_class)
@@ -1,5 +0,0 @@
from .WODT import TreeClassifier

__all__ = [
    "TreeClassifier",
]
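With src/wodt removed from the repository, TreeClassifier now comes from the installed wodt package (see from wodt import TreeClassifier in the Models.py hunk above). A minimal sketch of the classifier API shown in the deleted file; that the external package keeps the same interface is an assumption.

from sklearn.datasets import load_iris
from wodt import TreeClassifier

X, y = load_iris(return_X_y=True)
clf = TreeClassifier(max_depth=10, random_state=0)
clf.fit(X, y)                        # fit does not return self
nodes, leaves = clf.nodes_leaves()   # also sets clf.depth_
print(clf.score(X, y), nodes, leaves, clf.depth_)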