From e837c6cef7db728f345d485fc728da6902aa5594 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ricardo=20Monta=C3=B1ana?= <rmontanana@gmail.com>
Date: Fri, 27 Jan 2023 19:25:01 +0100
Subject: [PATCH] feat: Add feature_names_in_ to classifiers

---
 bayesclass/clfs.py              | 44 +++++++++++++++++++--------------
 bayesclass/tests/test_AODE.py   | 13 +++++++---
 bayesclass/tests/test_KDB.py    | 21 ++++++++++------
 bayesclass/tests/test_TAN.py    | 15 ++++++++---
 bayesclass/tests/test_common.py |  6 ++---
 5 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/bayesclass/clfs.py b/bayesclass/clfs.py
index 563c83f..18510bd 100644
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -19,14 +19,12 @@ class BayesBase(BaseEstimator, ClassifierMixin):
     def __init__(self, random_state, show_progress):
         self.random_state = random_state
         self.show_progress = show_progress
-        # To keep compatiblity with the benchmark platform
-        self.nodes_leaves = self.nodes_edges
 
     def _more_tags(self):
         return {
             "requires_positive_X": True,
             "requires_positive_y": True,
-            "preserve_dtype": [np.int64, np.int32],
+            "preserve_dtype": [np.int32, np.int64],
             "requires_y": True,
         }
 
@@ -44,6 +42,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         """Check the common parameters passed to fit"""
         # Check that X and y have correct shape
         X, y = check_X_y(X, y)
+        X = self._validate_data(X, reset=True)
         # Store the classes seen during fit
         self.classes_ = unique_labels(y)
         self.n_classes_ = self.classes_.shape[0]
@@ -55,9 +54,10 @@ class BayesBase(BaseEstimator, ClassifierMixin):
                 setattr(self, f"{key}_", value)
             else:
                 raise ValueError(f"Unexpected argument: {key}")
+        self.feature_names_in_ = self.features_
         if self.random_state is not None:
             random.seed(self.random_state)
-        if len(self.features_) != X.shape[1]:
+        if len(self.feature_names_in_) != X.shape[1]:
             raise ValueError(
                 "Number of features does not match the number of columns in X"
             )
@@ -116,13 +116,17 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         # Store the information needed to build the model
         self.X_ = X_
         self.y_ = y_
-        self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
+        self.dataset_ = pd.DataFrame(
+            self.X_, columns=self.feature_names_in_, dtype=np.int32
+        )
         self.dataset_[self.class_name_] = self.y_
         # Build the DAG
         self._build()
         # Train the model
         self._train(kwargs)
         self.fitted_ = True
+        # To keep compatiblity with the benchmark platform
+        self.nodes_leaves = self.nodes_edges
         # Return the classifier
         return self
 
@@ -189,7 +193,9 @@ class BayesBase(BaseEstimator, ClassifierMixin):
 
         # Input validation
         X = check_array(X)
-        dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
+        dataset = pd.DataFrame(
+            X, columns=self.feature_names_in_, dtype=np.int32
+        )
         return self.model_.predict(dataset).values.ravel()
 
     def plot(self, title="", node_size=800):
@@ -226,7 +232,7 @@ class TAN(BayesBase):
         The classes seen at :meth:`fit`.
     class_name_ : str
         The name of the class column
-    features_ : list
+    feature_names_in_ : list
         The list of features names
     head_ : int
         The index of the node used as head for the initial DAG
@@ -254,7 +260,7 @@ class TAN(BayesBase):
         return X, y
 
     def _build(self):
-        # est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
+        # est = TreeSearch(self.dataset_, root_node=self. feature_names_in_[self.head_])
         # self.dag_ = est.estimate(
         #     estimator_type="tan",
         #     class_node=self.class_name_,
@@ -277,9 +283,8 @@ class TAN(BayesBase):
         weights = np.delete(weights, class_node_idx, axis=1)
         reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
         D = TreeSearch._create_tree_and_dag(
-            weights, reduced_columns, self.features_[self.head_]
+            weights, reduced_columns, self.feature_names_in_[self.head_]
         )
-
         # Step 4.3: Add edges from class_node to all other nodes.
         D.add_edges_from(
             [(self.class_name_, node) for node in reduced_columns]
@@ -309,7 +314,8 @@ class KDB(BayesBase):
             if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
                 try:
                     dag.add_edge(
-                        self.features_[max_minfo], self.features_[idx]
+                        self.feature_names_in_[max_minfo],
+                        self.feature_names_in_[idx],
                     )
                     num += 1
                 except ValueError:
@@ -349,7 +355,7 @@ class KDB(BayesBase):
         # 5. 5.1
         for idx in np.argsort(mutual):
             # 5.2
-            feature = self.features_[idx]
+            feature = self.feature_names_in_[idx]
             dag.add_node(feature)
             # 5.3
             dag.add_edge(self.class_name_, feature)
@@ -396,13 +402,13 @@ class AODE(BayesBase, BaseEnsemble):
     def _train(self, kwargs):
         """Build SPODE estimators (Super Parent One Dependent Estimator)"""
         self.models_ = []
-        class_edges = [(self.class_name_, f) for f in self.features_]
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
         states = dict(state_names=kwargs.pop("state_names", []))
         for idx in range(self.n_features_in_):
             feature_edges = [
-                (self.features_[idx], f)
-                for f in self.features_
-                if f != self.features_[idx]
+                (self.feature_names_in_[idx], f)
+                for f in self.feature_names_in_
+                if f != self.feature_names_in_[idx]
             ]
             feature_edges.extend(class_edges)
             model = BayesianNetwork(
@@ -425,11 +431,13 @@ class AODE(BayesBase, BaseEnsemble):
     def predict(self, X: np.ndarray) -> np.ndarray:
         check_is_fitted(self, ["X_", "y_", "fitted_"])
         # Input validation
-        X = self._validate_data(X, reset=False)
+        X = check_array(X)
         n_samples = X.shape[0]
         n_estimators = len(self.models_)
         result = np.empty((n_samples, n_estimators))
-        dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
+        dataset = pd.DataFrame(
+            X, columns=self.feature_names_in_, dtype=np.int32
+        )
         for index, model in enumerate(self.models_):
             result[:, index] = model.predict(dataset).values.ravel()
         return mode(result, axis=1, keepdims=False).mode.ravel()
diff --git a/bayesclass/tests/test_AODE.py b/bayesclass/tests/test_AODE.py
index ec3c908..6f5e9df 100644
--- a/bayesclass/tests/test_AODE.py
+++ b/bayesclass/tests/test_AODE.py
@@ -31,7 +31,7 @@ def test_AODE_default_hyperparameters(data, clf):
     assert clf.random_state == 17
     clf.fit(*data)
     assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
         "feature_0",
         "feature_1",
         "feature_2",
@@ -56,7 +56,7 @@ def test_AODE_version(clf):
 
 
 def test_AODE_nodes_edges(clf, data):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
     clf.fit(*data)
     assert clf.nodes_leaves() == (20, 28)
 
@@ -71,7 +71,14 @@ def test_AODE_states(clf, data):
 
 def test_AODE_classifier(data, clf):
     clf.fit(*data)
-    attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+    attribs = [
+        "classes_",
+        "X_",
+        "y_",
+        "feature_names_in_",
+        "class_name_",
+        "n_features_in_",
+    ]
     for attr in attribs:
         assert hasattr(clf, attr)
     X = data[0]
diff --git a/bayesclass/tests/test_KDB.py b/bayesclass/tests/test_KDB.py
index 2d40d14..41b977d 100644
--- a/bayesclass/tests/test_KDB.py
+++ b/bayesclass/tests/test_KDB.py
@@ -34,7 +34,7 @@ def test_KDB_default_hyperparameters(data, clf):
     assert clf.k == 3
     clf.fit(*data)
     assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
         "feature_0",
         "feature_1",
         "feature_2",
@@ -48,7 +48,7 @@ def test_KDB_version(clf):
 
 
 def test_KDB_nodes_edges(clf, data):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
     clf.fit(*data)
     assert clf.nodes_leaves() == (5, 10)
 
@@ -63,7 +63,7 @@ def test_KDB_states(clf, data):
 
 def test_KDB_classifier(data, clf):
     clf.fit(*data)
-    attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
     for attr in attribs:
         assert hasattr(clf, attr)
     X = data[0]
@@ -108,14 +108,19 @@ def test_KDB_error_size_predict(data, clf):
 def test_KDB_dont_do_cycles():
     clf = KDB(k=4)
     dag = BayesianNetwork()
-    clf.features_ = ["feature_0", "feature_1", "feature_2", "feature_3"]
+    clf.feature_names_in_ = [
+        "feature_0",
+        "feature_1",
+        "feature_2",
+        "feature_3",
+    ]
     nodes = list(range(4))
     weights = np.ones((4, 4))
     for idx in range(1, 4):
-        dag.add_edge(clf.features_[0], clf.features_[idx])
-    dag.add_edge(clf.features_[1], clf.features_[2])
-    dag.add_edge(clf.features_[1], clf.features_[3])
-    dag.add_edge(clf.features_[2], clf.features_[3])
+        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
+    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
+    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
     for idx in range(4):
         clf._add_m_edges(dag, idx, nodes, weights)
         assert len(dag.edges()) == 6
diff --git a/bayesclass/tests/test_TAN.py b/bayesclass/tests/test_TAN.py
index d594bc8..ddba503 100644
--- a/bayesclass/tests/test_TAN.py
+++ b/bayesclass/tests/test_TAN.py
@@ -32,7 +32,7 @@ def test_TAN_default_hyperparameters(data, clf):
     clf.fit(*data)
     assert clf.head_ == 0
     assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
         "feature_0",
         "feature_1",
         "feature_2",
@@ -46,7 +46,7 @@ def test_TAN_version(clf):
 
 
 def test_TAN_nodes_edges(clf, data):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
     clf = TAN(random_state=17)
     clf.fit(*data, head="random")
     assert clf.nodes_leaves() == (5, 7)
@@ -68,7 +68,14 @@ def test_TAN_random_head(data):
 
 def test_TAN_classifier(data, clf):
     clf.fit(*data)
-    attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
+    attribs = [
+        "classes_",
+        "X_",
+        "y_",
+        "head_",
+        "feature_names_in_",
+        "class_name_",
+    ]
     for attr in attribs:
         assert hasattr(clf, attr)
     X = data[0]
@@ -89,7 +96,7 @@ def test_TAN_plot(data, clf):
     clf.plot("TAN Iris head=0")
 
 
-def test_KDB_wrong_num_features(data, clf):
+def test_TAN_wrong_num_features(data, clf):
     with pytest.raises(
         ValueError,
         match="Number of features does not match the number of columns in X",
diff --git a/bayesclass/tests/test_common.py b/bayesclass/tests/test_common.py
index b5334d1..d111d58 100644
--- a/bayesclass/tests/test_common.py
+++ b/bayesclass/tests/test_common.py
@@ -5,10 +5,8 @@ from sklearn.utils.estimator_checks import check_estimator
 from bayesclass.clfs import TAN, KDB, AODE
 
 
-@pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
-# @pytest.mark.parametrize("estimators", [TAN()])
-
-
+# @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
+@pytest.mark.parametrize("estimators", [AODE()])
 def test_all_estimators(estimators):
     i = 0
     for estimator, test in check_estimator(estimators, generate_only=True):