Add getStates

2025-08-17 16:45:54 +00:00 · 2023-07-11 21:28:29 +02:00
parent 36cc875615
commit 8b6624e08a
5 changed files with 306 additions and 197 deletions
--- a/bayesclass/BayesNetwork.cpp
+++ b/bayesclass/BayesNetwork.cpp
--- a/bayesclass/BayesNetwork.pyx
+++ b/bayesclass/BayesNetwork.pyx
@@ -15,6 +15,7 @@ cdef extern from "Network.h" namespace "bayesnet":
        void addEdge(string, string);
        vector[string] getFeatures();
        int getClassNumStates();
+        int getStates();
        string getClassName();
        string version()
        
@@ -45,6 +46,8 @@ cdef class BayesNetwork:
    def getFeatures(self):
        res = self.thisptr.getFeatures()
        return [x.decode() for x in res]
+    def getStates(self):
+        return self.thisptr.getStates()  # forwards to the wrapped C++ object's getStates(), declared above as returning int
    def getClassName(self):
        return self.thisptr.getClassName().decode()
    def getClassNumStates(self):
--- a/bayesclass/Network.cc
+++ b/bayesclass/Network.cc
@@ -38,6 +38,14 @@ namespace bayesnet {
    {
        return classNumStates;
    }
+    int Network::getStates() // Returns the sum of getNumStates() over every entry in `nodes` (0 when empty).
+    {
+        int result = 0;
+        for (const auto& node : nodes) { // bind by reference: avoids copying each (key, Node*) entry per iteration
+            result += node.second->getNumStates();
+        }
+        return result;
+    }
    string Network::getClassName()
    {
        return className;
--- a/bayesclass/Network.h
+++ b/bayesclass/Network.h
@@ -30,6 +30,7 @@ namespace bayesnet {
        void addEdge(const string, const string);
        map<string, Node*>& getNodes();
        vector<string> getFeatures();
+        int getStates();
        int getClassNumStates();
        string getClassName();
        void fit(const vector<vector<int>>&, const vector<int>&, const vector<string>&, const string&);
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -95,7 +95,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
    @property
    def states_(self):
        if hasattr(self, "fitted_"):
-            return sum([len(item) for _, item in self.model_.states.items()])
+            return self.states_computed_
        return 0

    @property
@@ -180,14 +180,15 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        # )
        self.model_ = BayesNetwork()
        features = kwargs["features"]
-        for i, feature in enumerate(features):
-            maxf = max(self.X_[:, i] + 1)
-            self.model_.addNode(feature, maxf)
+        states = kwargs["state_names"]
+        for feature in features:
+            self.model_.addNode(feature, len(states[feature]))
        class_name = kwargs["class_name"]
        self.model_.addNode(class_name, max(self.y_) + 1)
        for source, destination in self.dag_.edges():
            self.model_.addEdge(source, destination)
        self.model_.fit(self.X_, self.y_, features, class_name)
+        self.states_computed_ = self.model_.getStates()

    def predict(self, X):
        """A reference implementation of a prediction for a classifier.
@@ -381,7 +382,7 @@ class KDB(BayesBase):

    def _build(self):
        """
-        1. For each feature Xi, compute mutual information, I(X;;C),
+        1. For each feature Xi, compute mutual information, I(X;C),
        where C is the class.
        2. Compute class conditional mutual information I(Xi;Xj|C), for each
        pair of features Xi and Xj, where i≠j.
@@ -407,6 +408,37 @@ class KDB(BayesBase):
        )._get_conditional_weights(
            self.dataset_, self.class_name_, show_progress=self.show_progress
        )
+        '''
+        # Step 1: Compute edge weights for a fully connected graph.
+        n_vars = len(data.columns)
+        pbar = combinations(data.columns, 2)
+        if show_progress and SHOW_PROGRESS:
+            pbar = tqdm(pbar, total=(n_vars * (n_vars - 1) / 2), desc="Building tree")
+
+        def _conditional_edge_weights_fn(u, v):
+            """
+            Computes the conditional edge weight of variable index u and v conditioned on class_node
+            """
+            cond_marginal = data.loc[:, class_node].value_counts() / data.shape[0]
+            cond_edge_weight = 0
+            for index, marg_prob in cond_marginal.items():
+                df_cond_subset = data[data.loc[:, class_node] == index]
+                cond_edge_weight += marg_prob * edge_weights_fn(
+                    df_cond_subset.loc[:, u], df_cond_subset.loc[:, v]
+                )
+            return cond_edge_weight
+
+        vals = Parallel(n_jobs=1, prefer="threads")(
+            delayed(_conditional_edge_weights_fn)(u, v) for u, v in pbar
+        )
+        weights = np.zeros((n_vars, n_vars))
+        indices = np.triu_indices(n_vars, k=1)
+        weights[indices] = vals
+        weights.T[indices] = vals
+
+        return weights
+        '''
+
        # 3. Let the used variable list, S, be empty.
        S_nodes = []
        # 4. Let the DAG being constructed, BN, begin with a single class node