mirror of
https://github.com/Doctorado-ML/benchmark.git
synced 2025-08-17 16:35:54 +00:00
Add report models in benchmark
Add SVC, WODT & ExtraTree models
This commit is contained in:
@@ -8,31 +8,14 @@ from tqdm import tqdm
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.model_selection import StratifiedKFold, cross_validate
|
from sklearn.model_selection import StratifiedKFold, cross_validate
|
||||||
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
|
|
||||||
from stree import Stree
|
|
||||||
from Utils import Folders, Files
|
from Utils import Folders, Files
|
||||||
|
from Models import Models
|
||||||
|
|
||||||
|
|
||||||
class Randomized:
|
class Randomized:
|
||||||
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
seeds = [57, 31, 1714, 17, 23, 79, 83, 97, 7, 1]
|
||||||
|
|
||||||
|
|
||||||
class Models:
|
|
||||||
@staticmethod
|
|
||||||
def get_model(name):
|
|
||||||
if name == "STree":
|
|
||||||
return Stree
|
|
||||||
elif name == "Cart":
|
|
||||||
return DecisionTreeClassifier
|
|
||||||
elif name == "ExtraTree":
|
|
||||||
return ExtraTreeClassifier
|
|
||||||
else:
|
|
||||||
msg = f"No model recognized {name}"
|
|
||||||
if name == "Stree" or name == "stree":
|
|
||||||
msg += ", did you mean STree?"
|
|
||||||
raise ValueError(msg)
|
|
||||||
|
|
||||||
|
|
||||||
class Diterator:
|
class Diterator:
|
||||||
def __init__(self, data):
|
def __init__(self, data):
|
||||||
self._stack = data.copy()
|
self._stack = data.copy()
|
||||||
@@ -178,20 +161,6 @@ class Experiment:
|
|||||||
self.leaves = []
|
self.leaves = []
|
||||||
self.depths = []
|
self.depths = []
|
||||||
|
|
||||||
def _get_complexity(self, result):
|
|
||||||
if self.model_name == "Cart":
|
|
||||||
nodes = result.tree_.node_count
|
|
||||||
depth = result.tree_.max_depth
|
|
||||||
leaves = result.get_n_leaves()
|
|
||||||
if self.model_name == "ExtraTree":
|
|
||||||
nodes = 0
|
|
||||||
leaves = result.get_n_leaves()
|
|
||||||
depth = 0
|
|
||||||
else:
|
|
||||||
nodes, leaves = result.nodes_leaves()
|
|
||||||
depth = result.depth_ if hasattr(result, "depth_") else 0
|
|
||||||
return nodes, leaves, depth
|
|
||||||
|
|
||||||
def _n_fold_crossval(self, X, y, hyperparameters):
|
def _n_fold_crossval(self, X, y, hyperparameters):
|
||||||
if self.scores != []:
|
if self.scores != []:
|
||||||
raise ValueError("Must init experiment before!")
|
raise ValueError("Must init experiment before!")
|
||||||
@@ -217,8 +186,8 @@ class Experiment:
|
|||||||
self.scores.append(res["test_score"])
|
self.scores.append(res["test_score"])
|
||||||
self.times.append(res["fit_time"])
|
self.times.append(res["fit_time"])
|
||||||
for result_item in res["estimator"]:
|
for result_item in res["estimator"]:
|
||||||
nodes_item, leaves_item, depth_item = self._get_complexity(
|
nodes_item, leaves_item, depth_item = Models.get_complexity(
|
||||||
result_item
|
self.model_name, result_item
|
||||||
)
|
)
|
||||||
self.nodes.append(nodes_item)
|
self.nodes.append(nodes_item)
|
||||||
self.leaves.append(leaves_item)
|
self.leaves.append(leaves_item)
|
||||||
|
41
src/Models.py
Normal file
41
src/Models.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from stree import Stree
|
||||||
|
from wodt import TreeClassifier
|
||||||
|
|
||||||
|
|
||||||
|
class Models:
    """Registry that maps benchmark model names to classifier classes."""

    # Names accepted by get_model; used to suggest a fix for case typos.
    _names = ("STree", "Cart", "ExtraTree", "Wodt", "SVC")

    @staticmethod
    def get_model(name):
        """Return the classifier class (not an instance) registered as name.

        Raises:
            ValueError: if name is not one of the registered names. When the
                name only differs in letter case from a registered one, the
                message suggests the correct spelling.
        """
        if name == "STree":
            return Stree
        if name == "Cart":
            return DecisionTreeClassifier
        if name == "ExtraTree":
            return ExtraTreeClassifier
        if name == "Wodt":
            return TreeClassifier
        if name == "SVC":
            return SVC
        msg = f"No model recognized {name}"
        lowered = str(name).lower()
        for known in Models._names:
            if known.lower() == lowered:
                msg += f", did you mean {known}?"
                break
        raise ValueError(msg)

    @staticmethod
    def get_complexity(name, result):
        """Return (nodes, leaves, depth) for the fitted estimator result.

        Args:
            name: model name as accepted by get_model.
            result: fitted estimator of that model.
        """
        if name in ("Cart", "ExtraTree"):
            # ExtraTreeClassifier exposes the same fitted tree_ structure as
            # DecisionTreeClassifier, so report its real size instead of 0.
            nodes = result.tree_.node_count
            depth = result.tree_.max_depth
            leaves = result.get_n_leaves()
        elif name == "SVC":
            # Not a tree: there is no structural complexity to report.
            nodes = leaves = depth = 0
        else:
            # nodes_leaves() must run before depth_ is read: the Wodt
            # classifier only sets depth_ as a side effect of that call.
            nodes, leaves = result.nodes_leaves()
            depth = getattr(result, "depth_", 0)
        return nodes, leaves, depth
|
@@ -400,6 +400,10 @@ class SQL(BaseReport):
|
|||||||
|
|
||||||
|
|
||||||
class Benchmark:
|
class Benchmark:
|
||||||
|
@staticmethod
def get_result_file_name():
    """Return the path of the exreport file inside the results folder."""
    exreport_path = os.path.join(Folders.results, Files.exreport)
    return exreport_path
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _process_dataset(results, data):
|
def _process_dataset(results, data):
|
||||||
model = data["model"]
|
model = data["model"]
|
||||||
@@ -414,7 +418,7 @@ class Benchmark:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def compile_results():
|
def compile_results():
|
||||||
# build Files.exreport
|
# build Files.exreport
|
||||||
result_file_name = os.path.join(Folders.results, Files.exreport)
|
result_file_name = Benchmark.get_result_file_name()
|
||||||
results = {}
|
results = {}
|
||||||
init_suffix, end_suffix = Files.results_suffixes("")
|
init_suffix, end_suffix = Files.results_suffixes("")
|
||||||
all_files = list(os.walk(Folders.results))
|
all_files = list(os.walk(Folders.results))
|
||||||
@@ -432,7 +436,7 @@ class Benchmark:
|
|||||||
f.write(f"{model}, {dataset}, {accuracy}\n")
|
f.write(f"{model}, {dataset}, {accuracy}\n")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def report():
|
def exreport():
|
||||||
def end_message(message, file):
|
def end_message(message, file):
|
||||||
length = 100
|
length = 100
|
||||||
print("*" * length)
|
print("*" * length)
|
||||||
@@ -471,3 +475,35 @@ class Benchmark:
|
|||||||
|
|
||||||
if is_exe(Files.cmd_open):
|
if is_exe(Files.cmd_open):
|
||||||
subprocess.run([Files.cmd_open, Files.exreport_pdf])
|
subprocess.run([Files.cmd_open, Files.exreport_pdf])
|
||||||
|
|
||||||
|
@staticmethod
def report():
    """Print a dataset-by-model accuracy table read from the exreport file."""

    def build():
        """Parse the results file into {model: {dataset: accuracy_str}}."""
        file_name = Benchmark.get_result_file_name()
        results = {}
        with open(file_name) as f:
            data = f.read().splitlines()
        # The first line is skipped (presumably a header row).
        for line in data[1:]:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            # Rows are written as "model, dataset, accuracy"; split from
            # both ends so a dataset name containing ", " stays intact.
            model, rest = line.split(", ", 1)
            dataset, accuracy = rest.rsplit(", ", 1)
            results.setdefault(model, {})[dataset] = accuracy
        return results

    def show(results):
        """Print one row per dataset and one column per model."""
        if not results:
            print("No results found")
            return
        # Union of every model's datasets, in first-seen order, so a model
        # missing a dataset (or having an extra one) cannot raise KeyError.
        datasets = dict.fromkeys(
            dataset for scores in results.values() for dataset in scores
        )
        print(f"{'Dataset':30s} ", end="")
        lines = "=" * 30 + " "
        for model in results:
            print(f"{model:9s} ", end="")
            lines += "=" * 9 + " "
        print(f"\n{lines}")
        for dataset in datasets:
            print(f"{dataset:30s} ", end="")
            for model in results:
                accuracy = results[model].get(dataset)
                if accuracy is None:
                    # Keep the column aligned when there is no result.
                    print(f"{'-':9s} ", end="")
                else:
                    print(f"{float(accuracy):.7f} ", end="")
            print("")

    show(build())
|
||||||
|
@@ -3,3 +3,4 @@ from Results import Benchmark
|
|||||||
benchmark = Benchmark()
|
benchmark = Benchmark()
|
||||||
benchmark.compile_results()
|
benchmark.compile_results()
|
||||||
benchmark.report()
|
benchmark.report()
|
||||||
|
benchmark.exreport()
|
||||||
|
318
src/wodt/WODT.py
Normal file
318
src/wodt/WODT.py
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
########################
|
||||||
|
"""import"""
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
from scipy.optimize import minimize
|
||||||
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
|
|
||||||
|
|
||||||
|
"""global var"""
|
||||||
|
epsilonepsilon = 1e-220
|
||||||
|
epsilon = 1e-50
|
||||||
|
|
||||||
|
"""class"""
|
||||||
|
|
||||||
|
|
||||||
|
class SplitQuestion(object):
    """Linear split: a sample goes left when paras . x[attrIDs] <= threshold.

    Only continuous attributes are considered, for simplicity.
    """

    def __init__(self, attrIDs=None, paras=None, threshold=0):
        """Store the split definition.

        Args:
            attrIDs: indices of the features used by the split ([0] if None).
            paras: one weight per selected feature ([0] if None).
            threshold: value the weighted sum is compared against.
        """
        super(SplitQuestion, self).__init__()
        # Build the defaults per instance: list defaults in the signature
        # would be shared (and mutable) across every SplitQuestion.
        self.attrIDs = [0] if attrIDs is None else attrIDs
        self.paras = [0] if paras is None else paras
        self.threshold = threshold

    def test_forOneInstance(self, x):
        """Return True when the single sample x falls on the left side."""
        return np.dot(x[self.attrIDs], self.paras) <= self.threshold

    def test(self, X):
        """Return a boolean array, True for each row of X that goes left."""
        return np.dot(X[:, self.attrIDs], self.paras) <= self.threshold
|
||||||
|
|
||||||
|
|
||||||
|
class Node(object):
    """Tree node holding the ids of the samples that reached it and a split.

    X and Y are the arrays handed to the root node; sample_ids selects the
    rows of X / entries of Y that belong to this node.
    """

    def __init__(self, depth, split, sample_ids, X, Y, class_num):
        super(Node, self).__init__()
        self.sample_ids = sample_ids
        self.split = split
        self.depth = depth
        self.X = X
        self.Y = Y
        self.class_num = class_num
        # Every node starts as internal; leaves are flagged afterwards.
        self.is_leaf = False

    @staticmethod
    def _subset_size(max_features, feature_num):
        """Translate max_features into a feature count in [1, feature_num]."""
        size = feature_num
        if max_features == "sqrt":
            size = int(np.sqrt(feature_num))
        if max_features == "all":
            size = feature_num
        if max_features == "log":
            size = int(np.log2(feature_num))
        if isinstance(max_features, int):
            size = max_features
        if isinstance(max_features, float):
            size = int(feature_num * max_features)
        # "log" with one feature or a small fraction would give 0 features
        # (a split that cannot separate anything), and an int larger than
        # feature_num would make random.sample raise; clamp both ends.
        return min(feature_num, max(1, size))

    def find_best_split(self, max_features="sqrt"):
        """Fit this node's split on a random subset of the features.

        The weights and threshold are found by a few L-BFGS-B iterations
        from a random start, and stored in self.split.

        Args:
            max_features: "sqrt", "all", "log", an int (count) or a float
                (fraction of the features).

        Returns:
            Always 1.
        """
        feature_num = self.X.shape[1]
        subset_feature_num = self._subset_size(max_features, feature_num)

        # Random subset of features used by this split.
        feature_ids = range(feature_num)
        subset_feature_ids = random.sample(feature_ids, subset_feature_num)
        self.split.attrIDs = subset_feature_ids
        subset_feature_ids = np.array(subset_feature_ids)

        X = self.X
        subFeatures_X = X[
            self.sample_ids[:, None], subset_feature_ids[None, :]
        ]
        Y = self.Y[self.sample_ids]
        class_num = self.class_num

        # Objective and gradient for the optimiser. In the parameter vector
        # a, a[0] is the threshold and a[1:] are the feature weights.
        def func(a):
            paras = a[1:]
            threshold = a[0]
            # Soft assignment: p is each sample's weight on the right side.
            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_sum = w_R.sum()
            w_L_sum = w_L.sum()
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            # Equals w_L_sum * entropy(left) + w_R_sum * entropy(right);
            # epsilonepsilon keeps log2 finite when a weight is 0.
            fun = (
                w_L_sum * np.log2(w_L_sum + epsilonepsilon)
                + w_R_sum * np.log2(w_R_sum + epsilonepsilon)
                - np.sum(
                    w_R_eachClass * np.log2(w_R_eachClass + epsilonepsilon)
                )
                - np.sum(
                    w_L_eachClass * np.log2(w_L_eachClass + epsilonepsilon)
                )
            )
            return fun

        def func_gradient(a):
            paras = a[1:]
            threshold = a[0]

            p = sigmoid(np.dot(subFeatures_X, paras) - threshold)
            w_R = p
            w_L = 1 - w_R
            w_R_eachClass = np.array(
                [sum(w_R[Y == k]) for k in range(class_num)]
            )
            w_L_eachClass = np.array(
                [sum(w_L[Y == k]) for k in range(class_num)]
            )
            la = np.log2(
                w_L_eachClass[Y] * w_R.sum() + epsilonepsilon
            ) - np.log2(w_R_eachClass[Y] * w_L.sum() + epsilonepsilon)
            # p * (1 - p) is the derivative of the sigmoid.
            beta = la * p * (1 - p)

            jac = np.zeros(a.shape)
            jac[0] = -np.sum(beta)
            jac[1:] = np.dot(subFeatures_X.T, beta)

            return jac

        # Random start in [-0.5, 0.5) for the threshold and every weight.
        initial_a = np.random.rand(subset_feature_num + 1) - 0.5
        result = minimize(
            func,
            initial_a,
            method="L-BFGS-B",
            jac=func_gradient,
            options={"maxiter": 10, "disp": False},
        )

        self.split.paras = result.x[1:]
        self.split.threshold = result.x[0]

        return 1

    def grow_stump(self):
        """Create LChild and RChild by applying this node's split.

        A child that receives no samples is marked as a leaf and given the
        class distribution of this (parent) node's samples.
        """
        L_bool = self.split.test(self.X[self.sample_ids])
        L_sample_ids = self.sample_ids[L_bool]
        R_sample_ids = self.sample_ids[~L_bool]
        LChild = Node(
            self.depth + 1,
            SplitQuestion(),
            L_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )
        RChild = Node(
            self.depth + 1,
            SplitQuestion(),
            R_sample_ids,
            self.X,
            self.Y,
            self.class_num,
        )

        if len(L_sample_ids) == 0:
            LChild.is_leaf = True
            LChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )
        if len(R_sample_ids) == 0:
            RChild.is_leaf = True
            RChild.class_distribution = compute_class_distribution(
                self.Y[self.sample_ids], self.class_num
            )

        self.LChild = LChild
        self.RChild = RChild
|
||||||
|
|
||||||
|
|
||||||
|
class TreeClassifier(BaseEstimator, ClassifierMixin):
    """Decision tree classifier whose nodes use linear (oblique) splits.

    Args:
        max_depth: nodes at this depth (root is depth 1) become leaves.
        min_samples_split: nodes with fewer samples become leaves.
        max_features: features considered per split: "sqrt", "all", "log",
            an int (count) or a float (fraction).
        random_state: seed for the random number generators, or None.
    """

    def __init__(
        self,
        max_depth=50,
        min_samples_split=2,
        max_features="all",
        random_state=None,
    ):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.random_state = random_state

    def fit(self, X, Y):
        """Build the tree from samples X and labels Y and return self.

        Labels are used as array indices and the number of classes is taken
        as Y.max() + 1, so Y must hold non-negative integers.
        """
        self.X = np.asarray(X)
        self.Y = np.asarray(Y)
        self.classNum = self.Y.max() + 1
        self.sampleNum = self.X.shape[0]
        if self.random_state is not None:
            random.seed(self.random_state)
            # The optimiser start point in Node.find_best_split is drawn
            # with np.random, so NumPy must be seeded too for the fit to be
            # reproducible. Only integer seeds are forwarded, reduced to
            # the range np.random.seed accepts.
            if isinstance(self.random_state, (int, np.integer)):
                np.random.seed(int(self.random_state) % 2 ** 32)
        self.root_node = Node(
            1,
            SplitQuestion(),
            np.arange(self.sampleNum, dtype=np.uint32),
            self.X,
            self.Y,
            self.classNum,
        )
        self.leaf_num = 1
        self.tree_depth = self.bulid_subtree(self.root_node)
        # Returning self follows the scikit-learn estimator convention.
        return self

    def nodes_leaves(self):
        """Return (nodes, leaves) of the fitted tree and set self.depth_.

        nodes counts internal (split) nodes only; leaves are not included.
        """

        def count(node):
            if node.is_leaf:
                return 0, 1
            nodes_left, leaves_left = count(node.LChild)
            nodes_right, leaves_right = count(node.RChild)
            return nodes_left + nodes_right + 1, leaves_left + leaves_right

        def compute_depth(node):
            if node.is_leaf:
                return node.depth
            return max(
                node.depth,
                compute_depth(node.LChild),
                compute_depth(node.RChild),
            )

        self.depth_ = compute_depth(self.root_node)
        return count(self.root_node)

    def bulid_subtree(self, node):
        """Recursively grow the subtree under node and return its depth."""
        if node.is_leaf:
            return node.depth

        # stopping conditions
        is_leaf = (
            node.depth >= self.max_depth
            or len(node.sample_ids) < self.min_samples_split
            or is_all_equal(self.Y[node.sample_ids])
        )

        # The split is only searched when no stopping condition holds.
        if is_leaf or node.find_best_split(self.max_features) < 0:
            node.is_leaf = True
            node.class_distribution = compute_class_distribution(
                self.Y[node.sample_ids], self.classNum
            )
            return node.depth

        node.grow_stump()
        node.is_leaf = False
        self.leaf_num += 1
        L_subtree_depth = self.bulid_subtree(node.LChild)
        R_subtree_depth = self.bulid_subtree(node.RChild)
        return max(L_subtree_depth, R_subtree_depth)

    def predict_forOneInstance(self, x):
        """Return the most frequent class of the leaf that x falls into."""
        present_node = self.root_node
        while not present_node.is_leaf:
            if present_node.split.test_forOneInstance(x):
                present_node = present_node.LChild
            else:
                present_node = present_node.RChild
        return np.argmax(present_node.class_distribution)

    def predict(self, X):
        """Return an int array with the predicted class of each row of X."""
        X = np.asarray(X)
        m = X.shape[0]
        Y_predicted = np.zeros((m,), dtype=int)
        for i in range(m):
            Y_predicted[i] = self.predict_forOneInstance(X[i])
        return Y_predicted

    def score(
        self, X: np.ndarray, y: np.ndarray, sample_weight: np.ndarray = None
    ) -> float:
        """Return the accuracy on (X, y), weighted by sample_weight if given.
        """
        y_pred = self.predict(X)
        # With weights=None np.average is the plain mean.
        return np.average(y_pred == y, weights=sample_weight)
|
||||||
|
|
||||||
|
|
||||||
|
####################
|
||||||
|
"""function"""
|
||||||
|
|
||||||
|
|
||||||
|
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of a scalar or array.

    Values below -500 are treated as -500 so np.exp(-z) stays far from
    float overflow and no RuntimeWarning is raised. The argument is never
    modified; int scalars and sequences are accepted as well.
    """
    # np.maximum returns a new value, so the caller's array is not clipped
    # in place, and it handles scalars and arrays uniformly.
    z = np.maximum(z, -500)
    return 1 / (np.exp(-z) + 1)
|
||||||
|
|
||||||
|
|
||||||
|
def is_all_equal(x):
    """Tell whether every element of the array x has the same value."""
    largest = x.max()
    smallest = x.min()
    return smallest == largest
|
||||||
|
|
||||||
|
|
||||||
|
def compute_class_distribution(Y, class_num):
    """Return the fraction of samples in Y for each class 0..class_num-1.

    Args:
        Y: integer class labels (array or sequence).
        class_num: number of classes; labels outside range(class_num) are
            not counted.

    Returns:
        Array of length class_num with each class's share of len(Y).

    Raises:
        ZeroDivisionError: if Y is empty and class_num > 0.
    """
    # asarray makes the elementwise comparison work for plain sequences too.
    Y = np.asarray(Y)
    sample_num = len(Y)
    # count_nonzero counts inside NumPy instead of iterating the boolean
    # mask element by element with the builtin sum.
    ratio_each_class = [
        np.count_nonzero(Y == k) / sample_num for k in range(class_num)
    ]
    return np.array(ratio_each_class)
|
5
src/wodt/__init__.py
Normal file
5
src/wodt/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from .WODT import TreeClassifier
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"TreeClassifier",
|
||||||
|
]
|
Reference in New Issue
Block a user