Complete refactor of KDB with BayesNet library

This commit is contained in:
2023-07-12 12:07:01 +02:00
parent 2ff38f73e7
commit aef22306ef
3 changed files with 24 additions and 42 deletions

View File

@@ -2879,7 +2879,12 @@ static PyObject *__pyx_pf_10bayesclass_8BayesNet_12BayesNetwork_14addEdge(struct
__Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
__pyx_t_5 = __pyx_convert_string_from_py_std__in_string(__pyx_t_1); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error) __pyx_t_5 = __pyx_convert_string_from_py_std__in_string(__pyx_t_1); if (unlikely(PyErr_Occurred())) __PYX_ERR(0, 45, __pyx_L1_error)
__Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
try {
__pyx_v_self->thisptr->addEdge(__pyx_t_4, __pyx_t_5); __pyx_v_self->thisptr->addEdge(__pyx_t_4, __pyx_t_5);
} catch(...) {
__Pyx_CppExn2PyErr();
__PYX_ERR(0, 45, __pyx_L1_error)
}
/* "bayesclass/BayesNetwork.pyx":44 /* "bayesclass/BayesNetwork.pyx":44
* def addNode(self, name, states): * def addNode(self, name, states):

View File

@@ -11,12 +11,12 @@ cdef extern from "Network.h" namespace "bayesnet":
vector[int] predict(vector[vector[int]]&) vector[int] predict(vector[vector[int]]&)
vector[vector[double]] predict_proba(vector[vector[int]]&) vector[vector[double]] predict_proba(vector[vector[int]]&)
float score(const vector[vector[int]]&, const vector[int]&) float score(const vector[vector[int]]&, const vector[int]&)
void addNode(string, int); void addNode(string, int)
void addEdge(string, string); void addEdge(string, string) except +
vector[string] getFeatures(); vector[string] getFeatures()
int getClassNumStates(); int getClassNumStates()
int getStates(); int getStates()
string getClassName(); string getClassName()
string version() string version()
cdef class BayesNetwork: cdef class BayesNetwork:

View File

@@ -368,7 +368,7 @@ class KDB(BayesBase):
max_minfo = np.argmax(cond_w[idx, :]) max_minfo = np.argmax(cond_w[idx, :])
if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta: if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
try: try:
self.add_edge( self.model_.addEdge(
self.feature_names_in_[max_minfo], self.feature_names_in_[max_minfo],
self.feature_names_in_[idx], self.feature_names_in_[idx],
) )
@@ -399,7 +399,6 @@ class KDB(BayesBase):
Compute the conditional probabilility infered by the structure of BN by Compute the conditional probabilility infered by the structure of BN by
using counts from DB, and output BN. using counts from DB, and output BN.
""" """
super()._build(kwargs)
# 1. get the mutual information between each feature and the class # 1. get the mutual information between each feature and the class
mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True) mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
# 2. symmetric matrix where each element represents I(X, Y| class_node) # 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -410,53 +409,31 @@ class KDB(BayesBase):
self.class_name_, self.class_name_,
self.n_classes_, self.n_classes_,
) )
conditional_weights = metrics.conditionalEdgeWeights(self.n_features_in_ + 1) conditional_weights = metrics.conditionalEdgeWeights(
''' self.n_features_in_ + 1
# Step 1: Compute edge weights for a fully connected graph.
n_vars = len(data.columns)
pbar = combinations(data.columns, 2)
if show_progress and SHOW_PROGRESS:
pbar = tqdm(pbar, total=(n_vars * (n_vars - 1) / 2), desc="Building tree")
def _conditional_edge_weights_fn(u, v):
"""
Computes the conditional edge weight of variable index u and v conditioned on class_node
"""
cond_marginal = data.loc[:, class_node].value_counts() / data.shape[0]
cond_edge_weight = 0
for index, marg_prob in cond_marginal.items():
df_cond_subset = data[data.loc[:, class_node] == index]
cond_edge_weight += marg_prob * edge_weights_fn(
df_cond_subset.loc[:, u], df_cond_subset.loc[:, v]
) )
return cond_edge_weight
vals = Parallel(n_jobs=1, prefer="threads")(
delayed(_conditional_edge_weights_fn)(u, v) for u, v in pbar
)
weights = np.zeros((n_vars, n_vars))
indices = np.triu_indices(n_vars, k=1)
weights[indices] = vals
weights.T[indices] = vals
return weights
'''
# 3. Let the used variable list, S, be empty. # 3. Let the used variable list, S, be empty.
S_nodes = [] S_nodes = []
num_states = {
feature: len(states)
for feature, states in kwargs["state_names"].items()
}
# 4. Let the DAG being constructed, BN, begin with a single class node # 4. Let the DAG being constructed, BN, begin with a single class node
self.model_ = BayesNetwork()
self.model_.addNode(self.class_name_, self.n_classes_)
# 5. Repeat until S includes all domain features # 5. Repeat until S includes all domain features
# 5.1 Select feature Xmax which is not in S and has the largest value # 5.1 Select feature Xmax which is not in S and has the largest value
for idx in np.argsort(mutual): for idx in np.argsort(mutual):
# 5.2 Add a node to BN representing Xmax. # 5.2 Add a node to BN representing Xmax.
feature = self.feature_names_in_[idx] feature = self.feature_names_in_[idx]
self.model_.addNode(feature, num_states[feature])
# 5.3 Add an arc from C to Xmax in BN. # 5.3 Add an arc from C to Xmax in BN.
self.edges_.append(self.class_name_, feature) self.model_.addEdge(self.class_name_, feature)
# 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S # 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S
self._add_m_edges(idx, S_nodes, conditional_weights) self._add_m_edges(idx, S_nodes, conditional_weights)
# 5.5 Add Xmax to S. # 5.5 Add Xmax to S.
S_nodes.append(idx) S_nodes.append(idx)
self.dag_ = dag self.edges_ = []
def build_spodes(features, class_name): def build_spodes(features, class_name):