Add KDBNew estimator

2025-08-18 09:05:55 +00:00 · 2023-02-04 17:39:32 +01:00
parent 9019b878f0
commit de45a94c9b
2 changed files with 70 additions and 32 deletions
--- a/bayesclass/init.py
+++ b/bayesclass/init.py
@@ -16,4 +16,5 @@ __all__ = [
    "TAN",
    "KDB",
    "AODE",
+    "KDBNew",
 ]
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -12,6 +12,7 @@ import networkx as nx
 from pgmpy.estimators import TreeSearch, BayesianEstimator
 from pgmpy.models import BayesianNetwork
 import matplotlib.pyplot as plt
+from fimdlp.mdlp import MultiDiscretizer
 from ._version import __version__


@@ -75,7 +76,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        return self.states_

    def fit(self, X, y, **kwargs):
-        """A reference implementation of a fitting function for a classifier.
+        """Fit classifier

        Parameters
        ----------
@@ -130,6 +131,9 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        # Return the classifier
        return self

+    def _build(self):
+        """Hook where subclasses build ``self.dag_``; no-op by default."""
+
    def _train(self, kwargs):
        self.model_ = BayesianNetwork(
            self.dag_.edges(), show_progress=self.show_progress
@@ -260,37 +264,38 @@ class TAN(BayesBase):
        return X, y

    def _build(self):
-        # est = TreeSearch(self.dataset_,
-        # root_node=self.feature_names_in_[self.head_])
-        # self.dag_ = est.estimate(
-        #     estimator_type="tan",
-        #     class_node=self.class_name_,
-        #     show_progress=self.show_progress,
-        # )
+        # Let pgmpy's TreeSearch estimate the TAN structure from the
+        # dataset, rooted at the feature selected by head_
+        root = self.feature_names_in_[self.head_]
+        self.dag_ = TreeSearch(self.dataset_, root_node=root).estimate(
+            estimator_type="tan",
+            class_node=self.class_name_,
+            show_progress=self.show_progress,
+        )
        # Code taken from pgmpy
-        n_jobs = -1
-        weights = TreeSearch._get_conditional_weights(
-            self.dataset_,
-            self.class_name_,
-            "mutual_info",
-            n_jobs,
-            self.show_progress,
-        )
-        # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
-        class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
-            0
-        ][0]
-        weights = np.delete(weights, class_node_idx, axis=0)
-        weights = np.delete(weights, class_node_idx, axis=1)
-        reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
-        D = TreeSearch._create_tree_and_dag(
-            weights, reduced_columns, self.feature_names_in_[self.head_]
-        )
-        # Step 4.3: Add edges from class_node to all other nodes.
-        D.add_edges_from(
-            [(self.class_name_, node) for node in reduced_columns]
-        )
-        self.dag_ = D
+        # n_jobs = -1
+        # weights = TreeSearch._get_conditional_weights(
+        #     self.dataset_,
+        #     self.class_name_,
+        #     "mutual_info",
+        #     n_jobs,
+        #     self.show_progress,
+        # )
+        # # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
+        # class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
+        #     0
+        # ][0]
+        # weights = np.delete(weights, class_node_idx, axis=0)
+        # weights = np.delete(weights, class_node_idx, axis=1)
+        # reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
+        # D = TreeSearch._create_tree_and_dag(
+        #     weights, reduced_columns, self.feature_names_in_[self.head_]
+        # )
+        # # Step 4.3: Add edges from class_node to all other nodes.
+        # D.add_edges_from(
+        #     [(self.class_name_, node) for node in reduced_columns]
+        # )
+        # self.dag_ = D


 class KDB(BayesBase):
@@ -345,7 +350,6 @@ class KDB(BayesBase):
        Compute the conditional probabilility infered by the structure of BN by
        using counts from DB, and output BN.
        """
-
        # 1. get the mutual information between each feature and the class
        mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
        # 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -449,3 +453,36 @@ class AODE(BayesBase, BaseEnsemble):
        for index, model in enumerate(self.models_):
            result[:, index] = model.predict(dataset).values.ravel()
        return mode(result, axis=1, keepdims=False).mode.ravel()
+
+
+class KDBNew(KDB):
+    """KDB variant that discretizes X with fimdlp's MultiDiscretizer."""
+
+    def fit(self, X, y, **kwargs):
+        """Discretize X, derive each feature's states and fit the model.
+
+        ``kwargs`` must contain ``features``; ``state_names`` is overwritten.
+        """
+        self.discretizer_ = MultiDiscretizer(n_jobs=1)
+        Xd = self.discretizer_.fit_transform(X, y)
+        features = kwargs["features"]
+        kwargs["state_names"] = {
+            features[i]: np.unique(Xd[:, i]).tolist()
+            for i in range(Xd.shape[1])
+        }
+        return super().fit(Xd, y, **kwargs)
+
+    def predict(self, X, **kwargs):
+        """Discretize X with the fitted discretizer, then predict."""
+        return super().predict(self.discretizer_.transform(X))
+
+    def check_integrity(self, X, state_names, features):
+        """Raise ValueError if a column's values differ from its states."""
+        for i in range(X.shape[1]):
+            found = np.unique(X[:, i])
+            expected = np.array(state_names[features[i]])
+            if not np.array_equal(found, expected):
+                raise ValueError(
+                    f"Discretization error: feature {features[i]!r} "
+                    f"(column {i}) has {found}, expected {expected}"
+                )