From de45a94c9b148f6f74d0c4a222f34d2ca54df262 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?=
Date: Sat, 4 Feb 2023 17:39:32 +0100
Subject: [PATCH] Add KDBNew estimator

---
 bayesclass/__init__.py |   1 +
 bayesclass/clfs.py     | 101 ++++++++++++++++++++++++++++-------------
 2 files changed, 70 insertions(+), 32 deletions(-)

diff --git a/bayesclass/__init__.py b/bayesclass/__init__.py
index fb89952..0ef73dd 100644
--- a/bayesclass/__init__.py
+++ b/bayesclass/__init__.py
@@ -16,4 +16,5 @@ __all__ = [
     "TAN",
     "KDB",
     "AODE",
+    "KDBNew",
 ]
diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index c5242ad..2e4a98b 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -12,6 +12,7 @@ import networkx as nx
 from pgmpy.estimators import TreeSearch, BayesianEstimator
 from pgmpy.models import BayesianNetwork
 import matplotlib.pyplot as plt
+from fimdlp.mdlp import MultiDiscretizer
 
 from ._version import __version__
 
@@ -75,7 +76,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         return self.states_
 
     def fit(self, X, y, **kwargs):
-        """A reference implementation of a fitting function for a classifier.
+        """Fit the classifier.
 
         Parameters
         ----------
@@ -130,6 +131,9 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         # Return the classifier
         return self
 
+    def _build(self):
+        pass
+
     def _train(self, kwargs):
         self.model_ = BayesianNetwork(
             self.dag_.edges(), show_progress=self.show_progress
@@ -260,37 +264,38 @@ class TAN(BayesBase):
         return X, y
 
     def _build(self):
-        # est = TreeSearch(self.dataset_,
-        #     root_node=self.feature_names_in_[self.head_])
-        # self.dag_ = est.estimate(
-        #     estimator_type="tan",
-        #     class_node=self.class_name_,
-        #     show_progress=self.show_progress,
-        # )
+        est = TreeSearch(
+            self.dataset_, root_node=self.feature_names_in_[self.head_]
+        )
+        self.dag_ = est.estimate(
+            estimator_type="tan",
+            class_node=self.class_name_,
+            show_progress=self.show_progress,
+        )
         # Code taken from pgmpy
-        n_jobs = -1
-        weights = TreeSearch._get_conditional_weights(
-            self.dataset_,
-            self.class_name_,
-            "mutual_info",
-            n_jobs,
-            self.show_progress,
-        )
-        # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
-        class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
-            0
-        ][0]
-        weights = np.delete(weights, class_node_idx, axis=0)
-        weights = np.delete(weights, class_node_idx, axis=1)
-        reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
-        D = TreeSearch._create_tree_and_dag(
-            weights, reduced_columns, self.feature_names_in_[self.head_]
-        )
-        # Step 4.3: Add edges from class_node to all other nodes.
-        D.add_edges_from(
-            [(self.class_name_, node) for node in reduced_columns]
-        )
-        self.dag_ = D
+        # n_jobs = -1
+        # weights = TreeSearch._get_conditional_weights(
+        #     self.dataset_,
+        #     self.class_name_,
+        #     "mutual_info",
+        #     n_jobs,
+        #     self.show_progress,
+        # )
+        # # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
+        # class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
+        #     0
+        # ][0]
+        # weights = np.delete(weights, class_node_idx, axis=0)
+        # weights = np.delete(weights, class_node_idx, axis=1)
+        # reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
+        # D = TreeSearch._create_tree_and_dag(
+        #     weights, reduced_columns, self.feature_names_in_[self.head_]
+        # )
+        # # Step 4.3: Add edges from class_node to all other nodes.
+        # D.add_edges_from(
+        #     [(self.class_name_, node) for node in reduced_columns]
+        # )
+        # self.dag_ = D
 
 
 class KDB(BayesBase):
@@ -345,7 +350,6 @@ class KDB(BayesBase):
         Compute the conditional probability inferred by the structure of
         BN by using counts from DB, and output BN.
         """
-        # 1. get the mutual information between each feature and the class
         mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
         # 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -449,3 +453,36 @@ class AODE(BayesBase, BaseEnsemble):
         for index, model in enumerate(self.models_):
             result[:, index] = model.predict(dataset).values.ravel()
         return mode(result, axis=1, keepdims=False).mode.ravel()
+
+
+class KDBNew(KDB):
+    def fit(self, X, y, **kwargs):
+        self.discretizer_ = MultiDiscretizer(n_jobs=1)
+        Xd = self.discretizer_.fit_transform(X, y)
+        features = kwargs["features"]
+        states = {
+            features[i]: np.unique(Xd[:, i]).tolist()
+            for i in range(Xd.shape[1])
+        }
+        kwargs["state_names"] = states
+        return super().fit(Xd, y, **kwargs)
+
+    def predict(self, X, **kwargs):
+        return super().predict(self.discretizer_.transform(X))
+
+    def check_integrity(self, X, state_names, features):
+        for i in range(X.shape[1]):
+            if not np.array_equal(
+                np.unique(X[:, i]), np.array(state_names[features[i]])
+            ):
+                print(
+                    "i",
+                    i,
+                    "features[i]",
+                    features[i],
+                    "np.unique(X[:, i])",
+                    np.unique(X[:, i]),
+                    "np.array(state_names[features[i]])",
+                    np.array(state_names[features[i]]),
+                )
+                raise ValueError("Discretization error")
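Note (annotation, not part of the commit): KDBNew wraps KDB with FIMDLP
discretization. fit() fits a MultiDiscretizer on the raw continuous
features, records the discrete states per feature as state_names, and
trains the KDB structure on the discretized copy; predict() pushes test
data through the same fitted discretizer, so train and test share
identical cut points and states. check_integrity() is a debugging aid
that verifies each discretized column uses exactly the recorded states.
Below is a minimal usage sketch under stated assumptions: the
constructor parameter k (maximum number of parents in KDB) and the
features/class_name fit keywords are inferred from the surrounding
BayesBase/KDB code and are not shown in this diff.

    import numpy as np
    from sklearn.datasets import load_iris
    from bayesclass.clfs import KDBNew

    # Continuous features; KDBNew discretizes them internally with FIMDLP.
    X, y = load_iris(return_X_y=True)
    features = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]

    clf = KDBNew(k=2)  # assumed: KDB exposes a max-parents parameter k
    clf.fit(X, y, features=features, class_name="class")  # assumed kwargs

    # predict() re-uses the discretizer fitted in fit(), so test data is
    # binned with the training cut points before inference.
    y_pred = clf.predict(X)
    print("train accuracy:", (np.asarray(y_pred) == y).mean())

Fitting the discretizer once in fit() and reusing it in predict() is
what keeps the pgmpy state space consistent; discretizing train and
test independently could yield unseen states and trigger the
"Discretization error" raised by check_integrity().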