Add test_BoostAODE

Continue BoostAODE
Add BoostAODE initial model
2025-08-17 16:45:54 +00:00 · 2023-06-18 16:51:38 +02:00 · 2023-06-17 17:06:37 +02:00 · 2023-06-15 14:28:35 +02:00 · 2023-06-15 14:22:24 +02:00 · 2023-06-15 14:13:15 +02:00
21 changed files with 1362 additions and 206 deletions
--- a/6
+++ b/6
@@ -37,6 +37,12 @@ doc-clean:  ## Update documentation
 audit: ## Audit pip
 	pip-audit
 version:
 	@echo "Current Python version .....: $(shell python --version)"
 	@echo "Current Bayesclass version .: $(shell python -c "from bayesclass import _version; print(_version.__version__)")"
 	@echo "Installed Bayesclass version: $(shell pip show bayesclass | grep Version | cut -d' ' -f2)"
 	@echo "Installed pgmpy version ....: $(shell pip show pgmpy | grep Version | cut -d' ' -f2)"
 help: ## Show help message
 	@IFS=$$'\n' ; \
 	help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \
--- a/bayesclass/init.py
+++ b/bayesclass/init.py
@@ -16,4 +16,8 @@ __all__ = [
    "TAN",
    "KDB",
    "AODE",
    "KDBNew",
    "AODENew",
    "BoostAODE",
    "BoostSPODE",
 ]
--- a/bayesclass/_version.py
+++ b/bayesclass/_version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"
--- a/bayesclass/clfs.py
+++ b/bayesclass/clfs.py
@@ -1,8 +1,10 @@
 import random
 import warnings
 import numpy as np
 import pandas as pd
 from scipy.stats import mode
-from sklearn.base import ClassifierMixin, BaseEstimator
+from sklearn.base import clone, ClassifierMixin, BaseEstimator
 from sklearn.feature_selection import SelectKBest
 from sklearn.ensemble import BaseEnsemble
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import unique_labels
@@ -10,10 +12,16 @@ from sklearn.feature_selection import mutual_info_classif
 import networkx as nx
 from pgmpy.estimators import TreeSearch, BayesianEstimator
 from pgmpy.models import BayesianNetwork
 from pgmpy.base import DAG
 import matplotlib.pyplot as plt
 from fimdlp.mdlp import FImdlp
 from ._version import __version__
 def default_feature_names(num_features):
    """Return the default names ``feature_0`` .. ``feature_{n-1}``."""
    names = []
    for position in range(num_features):
        names.append(f"feature_{position}")
    return names
 class BayesBase(BaseEstimator, ClassifierMixin):
    def __init__(self, random_state, show_progress):
        self.random_state = random_state
@@ -23,7 +31,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        return {
            "requires_positive_X": True,
            "requires_positive_y": True,
-            "preserve_dtype": [np.int64, np.int32],
+            "preserve_dtype": [np.int32, np.int64],
            "requires_y": True,
        }
@@ -32,35 +40,68 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        """Return the version of the package."""
        return __version__
-    def nodes_leaves(self):
+    def nodes_edges(self):
-        """To keep compatiblity with the benchmark platform"""
+        if hasattr(self, "dag_"):
            return len(self.dag_), len(self.dag_.edges())
        return 0, 0
    @staticmethod
    def default_class_name():
        return "class"
    def build_dataset(self):
        """Assemble ``self.dataset_`` from the stored training data.

        The frame holds ``self.X_`` as int32 columns named after
        ``self.feature_names_in_``, plus a column named ``self.class_name_``
        with ``self.y_``. When ``self.sample_weight_`` is not None it is
        stored in an extra ``_weight`` column.
        """
        frame = pd.DataFrame(
            data=self.X_, columns=self.feature_names_in_, dtype=np.int32
        )
        frame[self.class_name_] = self.y_
        weights = self.sample_weight_
        if weights is not None:
            frame["_weight"] = weights
        self.dataset_ = frame
    def _check_params_fit(self, X, y, expected_args, kwargs):
        """Check the common parameters passed to fit"""
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        X = self._validate_data(X, reset=True)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        self.n_classes_ = self.classes_.shape[0]
        # Default values
-        self.class_name_ = "class"
+        self.weighted_ = False
-        self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
+        self.sample_weight_ = None
        self.class_name_ = self.default_class_name()
        self.features_ = default_feature_names(X.shape[1])
        for key, value in kwargs.items():
            if key in expected_args:
                setattr(self, f"{key}_", value)
            else:
                raise ValueError(f"Unexpected argument: {key}")
        self.feature_names_in_ = self.features_
        # used for local discretization
        self.indexed_features_ = {
            feature: i for i, feature in enumerate(self.features_)
        }
        if self.random_state is not None:
            random.seed(self.random_state)
-        if len(self.features_) != X.shape[1]:
+        if len(self.feature_names_in_) != X.shape[1]:
            raise ValueError(
                "Number of features does not match the number of columns in X"
            )
        self.n_features_in_ = X.shape[1]
        return X, y
    @property
    def states_(self):
        """Total number of states over all variables of ``self.model_``.

        Returns 0 while the ``fitted_`` attribute has not been set.
        """
        if not hasattr(self, "fitted_"):
            return 0
        total = 0
        for values in self.model_.states.values():
            total += len(values)
        return total
    @property
    def depth_(self):
        """Return the same value as :attr:`states_`."""
        n_states = self.states_
        return n_states
    def fit(self, X, y, **kwargs):
-        """A reference implementation of a fitting function for a classifier.
+        """Fit classifier
        Parameters
        ----------
@@ -97,28 +138,43 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        >>> model.fit(train_data, train_y, features=features, class_name='E')
        TAN(random_state=17)
        """
-        X_, y_ = self._check_params(X, y, kwargs)
+        self.X_, self.y_ = self._check_params(X, y, kwargs)
        # Store the information needed to build the model
-        self.X_ = X_
+        self.build_dataset()
        self.y_ = y_
        self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
        self.dataset_[self.class_name_] = self.y_
        # Build the DAG
        self._build()
        # Train the model
-        self._train()
+        self._train(kwargs)
        self.fitted_ = True
        # To keep compatiblity with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        # Return the classifier
        return self
-    def _train(self):
+    def _build(self):
        """This method should be implemented by the subclasses to
        build the DAG
        """
        ...
    def _train(self, kwargs):
        """Create a BayesianNetwork from the DAG and fit it on the dataset

        Parameters
        ----------
        kwargs : dict
            fit parameters; the optional ``state_names`` entry is removed
            from this dict and forwarded to the network's ``fit``
        """
        network = BayesianNetwork(
            self.dag_.edges(), show_progress=self.show_progress
        )
        self.model_ = network
        # pop, not get: state_names is consumed from kwargs at this point
        state_names = kwargs.pop("state_names", [])
        network.fit(
            self.dataset_,
            estimator=BayesianEstimator,
            prior_type="K2",
            weighted=self.weighted_,
            state_names=state_names,
        )
    def predict(self, X):
@@ -169,13 +225,15 @@ class BayesBase(BaseEstimator, ClassifierMixin):
        """
        # Check is fit had been called
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
        X = check_array(X)
-        dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
+        dataset = pd.DataFrame(
            X, columns=self.feature_names_in_, dtype=np.int32
        )
        return self.model_.predict(dataset).values.ravel()
    def plot(self, title="", node_size=800):
        warnings.simplefilter("ignore", UserWarning)
        nx.draw_circular(
            self.model_,
            with_labels=True,
@@ -208,7 +266,7 @@ class TAN(BayesBase):
        The classes seen at :meth:`fit`.
    class_name_ : str
        The name of the class column
-    features_ : list
+    feature_names_in_ : list
        The list of features names
    head_ : int
        The index of the node used as head for the initial DAG
@@ -227,21 +285,47 @@ class TAN(BayesBase):
    def _check_params(self, X, y, kwargs):
        self.head_ = 0
-        expected_args = ["class_name", "features", "head"]
+        expected_args = ["class_name", "features", "head", "state_names"]
        X, y = self._check_params_fit(X, y, expected_args, kwargs)
        if self.head_ == "random":
-            self.head_ = random.randint(0, len(self.features_) - 1)
+            self.head_ = random.randint(0, self.n_features_in_ - 1)
-        if self.head_ is not None and self.head_ >= len(self.features_):
+        if self.head_ is not None and self.head_ >= self.n_features_in_:
            raise ValueError("Head index out of range")
        return X, y
    def _build(self):
-        est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
+        est = TreeSearch(
            self.dataset_, root_node=self.feature_names_in_[self.head_]
        )
        self.dag_ = est.estimate(
            estimator_type="tan",
            class_node=self.class_name_,
            show_progress=self.show_progress,
        )
        # Code taken from pgmpy
        # n_jobs = -1
        # weights = TreeSearch._get_conditional_weights(
        #     self.dataset_,
        #     self.class_name_,
        #     "mutual_info",
        #     n_jobs,
        #     self.show_progress,
        # )
        # # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
        # class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
        #     0
        # ][0]
        # weights = np.delete(weights, class_node_idx, axis=0)
        # weights = np.delete(weights, class_node_idx, axis=1)
        # reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
        # D = TreeSearch._create_tree_and_dag(
        #     weights, reduced_columns, self.feature_names_in_[self.head_]
        # )
        # # Step 4.3: Add edges from class_node to all other nodes.
        # D.add_edges_from(
        #     [(self.class_name_, node) for node in reduced_columns]
        # )
        # self.dag_ = D
 class KDB(BayesBase):
@@ -253,46 +337,55 @@ class KDB(BayesBase):
        )
    def _check_params(self, X, y, kwargs):
-        expected_args = ["class_name", "features"]
+        expected_args = [
            "class_name",
            "features",
            "state_names",
            "sample_weight",
            "weighted",
        ]
        return self._check_params_fit(X, y, expected_args, kwargs)
    def _add_m_edges(self, dag, idx, S_nodes, conditional_weights):
        n_edges = min(self.k, len(S_nodes))
        cond_w = conditional_weights.copy()
        exit_cond = self.k == 0
        num = 0
        while not exit_cond:
            max_minfo = np.argmax(cond_w[idx, :])
            if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
                try:
                    dag.add_edge(
                        self.feature_names_in_[max_minfo],
                        self.feature_names_in_[idx],
                    )
                    num += 1
                except ValueError:
                    # Loops are not allowed
                    pass
            cond_w[idx, max_minfo] = -1
            exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)
    def _build(self):
        """
-        1. For each feature Xi, compute mutual information, I(X;;C), where C is the class.
+        1. For each feature Xi, compute mutual information, I(X;;C),
-        2. Compute class conditional mutual information I(Xi;XjIC), f or each pair of features Xi and Xj, where i#j.
+        where C is the class.
        2. Compute class conditional mutual information I(Xi;XjIC), f or each
        pair of features Xi and Xj, where i#j.
        3. Let the used variable list, S, be empty.
-        4. Let the Bayesian network being constructed, BN, begin with a single class node, C.
+        4. Let the DAG network being constructed, BN, begin with a single
        class node, C.
        5. Repeat until S includes all domain features
-        5.1. Select feature Xmax which is not in S and has the largest value I(Xmax;C).
+        5.1. Select feature Xmax which is not in S and has the largest value
        I(Xmax;C).
        5.2. Add a node to BN representing Xmax.
        5.3. Add an arc from C to Xmax in BN.
-        5.4. Add m =min(lSl,/c) arcs from m distinct features Xj in S with the highest value for I(Xmax;X,jC).
+        5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with
        the highest value for I(Xmax;X,jC).
        5.5. Add Xmax to S.
-        Compute the conditional probabilility infered by the structure of BN by using counts from DB, and output BN.
+        Compute the conditional probabilility infered by the structure of BN by
        using counts from DB, and output BN.
        """
        def add_m_edges(dag, idx, S_nodes, conditional_weights):
            n_edges = min(self.k, len(S_nodes))
            cond_w = conditional_weights.copy()
            exit_cond = self.k == 0
            num = 0
            while not exit_cond:
                max_minfo = np.argmax(cond_w[idx, :])
                if (
                    max_minfo in S_nodes
                    and cond_w[idx, max_minfo] > self.theta
                ):
                    try:
                        dag.add_edge(
                            self.features_[max_minfo], self.features_[idx]
                        )
                        num += 1
                    except ValueError:
                        # Loops are not allowed
                        pass
                cond_w[idx, max_minfo] = -1
                exit_cond = num == n_edges or np.all(cond_w[idx, :] <= 0)
        # 1. get the mutual information between each feature and the class
        mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
        # 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -301,73 +394,522 @@ class KDB(BayesBase):
        )._get_conditional_weights(
            self.dataset_, self.class_name_, show_progress=self.show_progress
        )
-        # 3.
+        # 3. Let the used variable list, S, be empty.
        S_nodes = []
-        # 4.
+        # 4. Let the DAG being constructed, BN, begin with a single class node
-        dag = BayesianNetwork()
+        dag = BayesianNetwork(show_progress=self.show_progress)
        dag.add_node(self.class_name_)  # , state_names=self.classes_)
-        # 5. 5.1
+        # 5. Repeat until S includes all domain features
        # 5.1 Select feature Xmax which is not in S and has the largest value
        for idx in np.argsort(mutual):
-            # 5.2
+            # 5.2 Add a node to BN representing Xmax.
-            feature = self.features_[idx]
+            feature = self.feature_names_in_[idx]
            dag.add_node(feature)
-            # 5.3
+            # 5.3 Add an arc from C to Xmax in BN.
            dag.add_edge(self.class_name_, feature)
-            # 5.4
+            # 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S
-            add_m_edges(dag, idx, S_nodes, conditional_weights)
+            self._add_m_edges(dag, idx, S_nodes, conditional_weights)
-            # 5.5
+            # 5.5 Add Xmax to S.
            S_nodes.append(idx)
        self.dag_ = dag
-class AODE(BayesBase, BaseEnsemble):
+def build_spodes(features, class_name):
-    def __init__(self, show_progress=False, random_state=None):
+    """Build SPODE estimators (Super Parent One Dependent Estimator)"""
    class_edges = [(class_name, f) for f in features]
    for idx in range(len(features)):
        feature_edges = [
            (features[idx], f) for f in features if f != features[idx]
        ]
        feature_edges.extend(class_edges)
        model = BayesianNetwork(feature_edges, show_progress=False)
        yield model
 class SPODE(BayesBase):
    """Single estimator of the AODE ensemble.

    The class only declares which keyword arguments ``fit`` accepts; it
    does not override ``_build``.
    """
    def _check_params(self, X, y, kwargs):
        """Validate the fit arguments against the keywords SPODE accepts."""
        expected_args = ["class_name", "features", "state_names"]
        expected_args += ["sample_weight", "weighted"]
        return self._check_params_fit(X, y, expected_args, kwargs)
 class AODE(ClassifierMixin, BaseEnsemble):
    """Ensemble holding one SPODE per feature.

    Predictions of the individual estimators are combined by taking the
    most frequent predicted value for every sample.
    """
    def __init__(
        self,
        show_progress=False,
        random_state=None,
        estimator=None,
    ):
        super().__init__(estimator=estimator)
        self.random_state = random_state
        self.show_progress = show_progress
    def _validate_estimator(self) -> None:
        """Set ``estimator_``, falling back to a SPODE when none is given."""
        fallback = SPODE(
            random_state=self.random_state,
            show_progress=self.show_progress,
        )
        super()._validate_estimator(default=fallback)
    def fit(self, X, y, **kwargs):
        """Fit one estimator per SPODE structure and return ``self``.

        ``features`` and ``class_name`` are read from ``kwargs`` when
        present; all keyword arguments are forwarded to every estimator.
        """
        n_features = X.shape[1]
        self.n_features_in_ = n_features
        if "features" in kwargs:
            self.feature_names_in_ = kwargs["features"]
        else:
            self.feature_names_in_ = default_feature_names(n_features)
        self.class_name_ = kwargs.get("class_name", "class")
        # build estimator
        self._validate_estimator()
        self.X_, self.y_ = X, y
        self.n_samples_ = X.shape[0]
        self.estimators_ = []
        self._train(kwargs)
        self.fitted_ = True
        # To keep compatibility with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        return self
    def _train(self, kwargs):
        """Clone and fit the base estimator once per SPODE structure."""
        spodes = build_spodes(self.feature_names_in_, self.class_name_)
        for spode_dag in spodes:
            member = clone(self.estimator_)
            # the structure is fixed beforehand: the same object is used as
            # the DAG and as the initial model of the estimator
            member.dag_ = spode_dag
            member.model_ = spode_dag
            member.fit(self.X_, self.y_, **kwargs)
            self.estimators_.append(member)
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return, per sample, the most frequent prediction of the members."""
        votes = np.empty((X.shape[0], len(self.estimators_)))
        for column, member in enumerate(self.estimators_):
            votes[:, column] = member.predict(X)
        winners = mode(votes, axis=1, keepdims=False)
        return winners.mode.ravel()
    def version(self):
        """Return the version reported by the underlying estimator."""
        if not hasattr(self, "fitted_"):
            return SPODE(None, False).version()
        return self.estimator_.version()
    @property
    def states_(self):
        """Average number of states per estimator (0 if not fitted)."""
        if not hasattr(self, "fitted_"):
            return 0
        total = 0
        for member in self.estimators_:
            for values in member.model_.states.values():
                total += len(values)
        return total / len(self.estimators_)
    @property
    def depth_(self):
        """Return the same value as :attr:`states_`."""
        n_states = self.states_
        return n_states
    def nodes_edges(self):
        """Return the node and edge totals over all estimators."""
        if not hasattr(self, "fitted_"):
            return 0, 0
        nodes = sum([len(member.dag_) for member in self.estimators_])
        edges = sum(
            [len(member.dag_.edges()) for member in self.estimators_]
        )
        return nodes, edges
    def plot(self, title=""):
        """Plot every estimator, prefixing ``title`` with its index."""
        warnings.simplefilter("ignore", UserWarning)
        for position, member in enumerate(self.estimators_):
            member.plot(title=f"{position} {title}")
 class TANNew(TAN):
    def __init__(
        self,
        show_progress=False,
        random_state=None,
        discretizer_depth=1e6,
        discretizer_length=3,
        discretizer_cuts=0,
    ):
        self.discretizer_depth = discretizer_depth
        self.discretizer_length = discretizer_length
        self.discretizer_cuts = discretizer_cuts
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )
-    def _check_params(self, X, y, kwargs):
+    def fit(self, X, y, **kwargs):
-        expected_args = ["class_name", "features"]
+        self.estimator_ = Proposal(self)
-        return self._check_params_fit(X, y, expected_args, kwargs)
+        self.estimator_.fit(X, y, **kwargs)
        return self
-    def _build(self):
+    def predict(self, X):
        return self.estimator_.predict(X)
        self.dag_ = None
-    def _train(self):
+class KDBNew(KDB):
-        """Build SPODE estimators (Super Parent One Dependent Estimator)"""
+    def __init__(
-        self.models_ = []
+        self,
-        class_edges = [(self.class_name_, f) for f in self.features_]
+        k=2,
-        for idx in range(len(self.features_)):
+        show_progress=False,
-            feature_edges = [
+        random_state=None,
-                (self.features_[idx], f)
+        discretizer_depth=1e6,
-                for f in self.features_
+        discretizer_length=3,
-                if f != self.features_[idx]
+        discretizer_cuts=0,
-            ]
+    ):
-            feature_edges.extend(class_edges)
+        self.discretizer_depth = discretizer_depth
-            model = BayesianNetwork(
+        self.discretizer_length = discretizer_length
-                feature_edges, show_progress=self.show_progress
+        self.discretizer_cuts = discretizer_cuts
-            )
+        super().__init__(
-            model.fit(
+            k=k, show_progress=show_progress, random_state=random_state
-                self.dataset_,
+        )
                estimator=BayesianEstimator,
                prior_type="K2",
            )
            self.models_.append(model)
-    def plot(self, title=""):
+    def fit(self, X, y, **kwargs):
-        for idx, model in enumerate(self.models_):
+        self.estimator_ = Proposal(self)
-            self.model_ = model
+        self.estimator_.fit(X, y, **kwargs)
-            super().plot(title=f"{idx} {title}")
+        return self
    def predict(self, X):
        return self.estimator_.predict(X)
 class SPODENew(SPODE):
    """SPODE classifier carrying discretizer settings, in the same way as
    TANNew and KDBNew.

    Parameters
    ----------
    random_state : int or None
        Passed to the parent constructor
    show_progress : bool
        Passed to the parent constructor
    discretizer_depth : number, default=1e6
        Stored as ``discretizer_depth``
    discretizer_length : int, default=3
        Stored as ``discretizer_length``
    discretizer_cuts : number, default=0
        Stored as ``discretizer_cuts``
    """
    def __init__(
        self,
        random_state,
        show_progress,
        discretizer_depth=1e6,
        discretizer_length=3,
        discretizer_cuts=0,
    ):
        self.discretizer_cuts = discretizer_cuts
        self.discretizer_length = discretizer_length
        self.discretizer_depth = discretizer_depth
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )
 class AODENew(AODE):
    def __init__(
        self,
        random_state=None,
        show_progress=False,
        discretizer_depth=1e6,
        discretizer_length=3,
        discretizer_cuts=0,
    ):
        self.discretizer_depth = discretizer_depth
        self.discretizer_length = discretizer_length
        self.discretizer_cuts = discretizer_cuts
        super().__init__(
            random_state=random_state,
            show_progress=show_progress,
            estimator=Proposal(
                SPODENew(
                    random_state=random_state,
                    show_progress=show_progress,
                    discretizer_depth=discretizer_depth,
                    discretizer_length=discretizer_length,
                    discretizer_cuts=discretizer_cuts,
                )
            ),
        )
    def _train(self, kwargs):
        for dag in build_spodes(self.feature_names_in_, self.class_name_):
            proposal = clone(self.estimator_)
            proposal.estimator.dag_ = proposal.estimator.model_ = dag
            self.estimators_.append(proposal.fit(self.X_, self.y_, **kwargs))
        self.n_estimators_ = len(self.estimators_)
    def predict(self, X: np.ndarray) -> np.ndarray:
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        # Input validation
-        X = self._validate_data(X, reset=False)
+        X = check_array(X)
-        n_samples = X.shape[0]
+        result = np.empty((X.shape[0], self.n_estimators_))
-        n_estimators = len(self.models_)
+        for index, model in enumerate(self.estimators_):
-        result = np.empty((n_samples, n_estimators))
+            result[:, index] = model.predict(X)
        dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
        for index, model in enumerate(self.models_):
            result[:, index] = model.predict(dataset).values.ravel()
        return mode(result, axis=1, keepdims=False).mode.ravel()
    @property
    def states_(self):
        if hasattr(self, "fitted_"):
            return sum(
                [
                    len(item)
                    for model in self.estimators_
                    for _, item in model.estimator.model_.states.items()
                ]
            ) / len(self.estimators_)
        return 0
    @property
    def depth_(self):
        return self.states_
    def nodes_edges(self):
        nodes = 0
        edges = 0
        if hasattr(self, "fitted_"):
            nodes = sum([len(x.estimator.dag_) for x in self.estimators_])
            edges = sum(
                [len(x.estimator.dag_.edges()) for x in self.estimators_]
            )
        return nodes, edges
    def plot(self, title=""):
        warnings.simplefilter("ignore", UserWarning)
        for idx, model in enumerate(self.estimators_):
            model.estimator.plot(title=f"{idx} {title}")
    def version(self):
        if hasattr(self, "fitted_"):
            return self.estimator_.estimator.version()
        return SPODENew(None, False).version()
 class Proposal(BaseEstimator):
    """Wrapper that discretizes the data with FImdlp around an estimator.

    ``fit`` and ``predict`` discretize ``X`` and then delegate to the
    implementation found *after* the estimator's own class in its MRO
    (``super(self.class_type, self.estimator)``), so the wrapped estimator's
    own ``fit``/``predict`` overrides are bypassed.

    The wrapped estimator must expose ``discretizer_depth``,
    ``discretizer_length`` and ``discretizer_cuts``, which are read in
    ``fit``.
    """
    def __init__(self, estimator):
        self.estimator = estimator
        # Class used as the starting point of the super() lookups below
        self.class_type = estimator.__class__
    def fit(self, X, y, **kwargs):
        """Discretize ``X``, fit the wrapped estimator and refine the
        discretization using the structure of the fitted model.

        Parameters
        ----------
        X : array-like
            Training data, handed to the discretizer
        y : array-like
            Target values
        **kwargs : dict
            fit parameters; ``features``, ``class_name`` and ``state_names``
            are filled in by ``update_kwargs`` before delegating

        Returns
        -------
        self
        """
        # Check parameters
        self.estimator._check_params(X, y, kwargs)
        # Discretize train data
        self.discretizer_ = FImdlp(
            n_jobs=1,
            max_depth=self.estimator.discretizer_depth,
            min_length=self.estimator.discretizer_length,
            max_cuts=self.estimator.discretizer_cuts,
        )
        self.Xd = self.discretizer_.fit_transform(X, y)
        kwargs = self.update_kwargs(y, kwargs)
        # Build the model
        # (fit of the class that follows class_type in the estimator's MRO)
        super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
        # Local discretization based on the model
        self._local_discretization()
        # self.check_integrity("fit", self.Xd)
        self.fitted_ = True
        return self
    def predict(self, X):
        """Discretize ``X`` with the fitted discretizer and delegate the
        prediction to the parent implementation of the wrapped estimator.
        """
        # Check that fit has been called
        check_is_fitted(self, ["fitted_"])
        # Input validation
        X = check_array(X)
        Xd = self.discretizer_.transform(X)
        # self.check_integrity("predict", Xd)
        return super(self.class_type, self.estimator).predict(Xd)
    def update_kwargs(self, y, kwargs):
        """Complete ``kwargs`` with features, class name and state names.

        Missing ``features`` / ``class_name`` entries are replaced by the
        defaults. ``state_names`` maps every feature to the states reported
        by the discretizer and the class name to the unique values of ``y``.
        The same ``kwargs`` dict is modified in place and returned; the
        states and features are also stored in ``state_names_`` and
        ``features_``.
        """
        features = (
            kwargs["features"]
            if "features" in kwargs
            else default_feature_names(self.Xd.shape[1])
        )
        states = {
            features[i]: self.discretizer_.get_states_feature(i)
            for i in range(self.Xd.shape[1])
        }
        class_name = (
            kwargs["class_name"]
            if "class_name" in kwargs
            else self.estimator.default_class_name()
        )
        states[class_name] = np.unique(y).tolist()
        kwargs["state_names"] = states
        self.state_names_ = states
        self.features_ = features
        kwargs["features"] = features
        kwargs["class_name"] = class_name
        return kwargs
    def _local_discretization(self):
        """Re-discretize each feature using its parents in the DAG.

        Features are visited in topological order of the estimator's DAG.
        A feature with more than one parent is discretized again through
        ``join_fit`` with its parents other than the class. If any feature
        was updated, the estimator's dataset is rebuilt and its model is
        fitted again.
        """
        upgrade = False
        # order of local discretization is important. no good 0, 1, 2...
        ancestral_order = list(nx.topological_sort(self.estimator.dag_))
        for feature in ancestral_order:
            if feature == self.estimator.class_name_:
                continue
            idx = self.estimator.indexed_features_[feature]
            fathers = self.estimator.dag_.get_parents(feature)
            # NOTE(review): assumes the class is one of the parents whenever
            # there is more than one; remove() raises ValueError otherwise
            if len(fathers) > 1:
                # First remove the class name as it will be added later
                fathers.remove(self.estimator.class_name_)
                # Get the fathers indices
                features = [
                    self.estimator.indexed_features_[f] for f in fathers
                ]
                # Update the discretization of the feature
                self.Xd[:, idx] = self.discretizer_.join_fit(
                    # each feature has to use previous discretization data=res
                    target=idx,
                    features=features,
                    data=self.Xd,
                )
                upgrade = True
        if upgrade:
            # Update the dataset
            self.estimator.X_ = self.Xd
            self.estimator.build_dataset()
            # NOTE(review): unlike update_kwargs, the class states are not
            # included here - confirm this is intended
            self.state_names_ = {
                key: self.discretizer_.get_states_feature(value)
                for key, value in self.estimator.indexed_features_.items()
            }
            states = {"state_names": self.state_names_}
            # Update the model
            # NOTE(review): no `weighted` argument is passed in this refit,
            # while BayesBase._train passes weighted=self.weighted_ - confirm
            self.estimator.model_.fit(
                self.estimator.dataset_,
                estimator=BayesianEstimator,
                prior_type="K2",
                **states,
            )
    # def check_integrity(self, source, X):
    #     # print(f"Checking integrity of {source} data")
    #     for i in range(X.shape[1]):
    #         if not set(np.unique(X[:, i]).tolist()).issubset(
    #             set(self.state_names_[self.features_[i]])
    #         ):
    #             print(
    #                 "i",
    #                 i,
    #                 "features[i]",
    #                 self.features_[i],
    #                 "np.unique(X[:, i])",
    #                 np.unique(X[:, i]),
    #                 "np.array(state_names[features[i]])",
    #                 np.array(self.state_names_[self.features_[i]]),
    #             )
    #             raise ValueError("Discretization error")
 class BoostSPODE(BayesBase):
    """SPODE whose super-parent feature is given at fit time.

    The ``sparent`` fit keyword names the feature used as super-parent; it
    is stored as ``sparent_`` by the common parameter check.
    """
    def _check_params(self, X, y, kwargs):
        """Validate the fit arguments against the accepted keywords."""
        expected_args = [
            "class_name",
            "features",
            "state_names",
            "sample_weight",
            "weighted",
            "sparent",
        ]
        return self._check_params_fit(X, y, expected_args, kwargs)
    def _build(self):
        """Build the DAG: arcs from the class to every feature and from
        the super-parent to every other feature.
        """
        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
        feature_edges = [
            (self.sparent_, f)
            for f in self.feature_names_in_
            if f != self.sparent_
        ]
        feature_edges.extend(class_edges)
        self.dag_ = DAG(feature_edges)
    def _train(self, kwargs):
        """Create the BayesianNetwork from the DAG and fit it

        Parameters
        ----------
        kwargs : dict
            fit parameters; ``state_names`` is read from it, if present
        """
        # The leftover breakpoint() debugging call was removed: it stopped
        # every fit inside the debugger.
        states = dict(state_names=kwargs.get("state_names", []))
        self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
        self.model_.fit(
            self.dataset_,
            estimator=BayesianEstimator,
            prior_type="K2",
            weighted=self.weighted_,
            **states,
        )
 class BoostAODE(ClassifierMixin, BaseEnsemble):
    """Boosted ensemble of SPODE estimators.

    Parameters
    ----------
    show_progress : bool, default=False
        Passed to the default ``BoostSPODE`` estimator
    random_state : int or None, default=None
        Passed to the default ``BoostSPODE`` estimator
    estimator : estimator or None, default=None
        Base estimator; a ``BoostSPODE`` is used when it is None
    n_estimators : int, default=10
        Maximum number of boosting rounds
    """
    def __init__(
        self,
        show_progress=False,
        random_state=None,
        estimator=None,
        n_estimators=10,
    ):
        self.show_progress = show_progress
        self.random_state = random_state
        # n_estimators has to go through BaseEnsemble.__init__: that
        # constructor assigns self.n_estimators itself, so setting the
        # attribute here and not forwarding it left the default in place.
        super().__init__(estimator=estimator, n_estimators=n_estimators)
    def _validate_estimator(self) -> None:
        """Check the estimator and set the estimator_ attribute."""
        super()._validate_estimator(
            default=BoostSPODE(
                random_state=self.random_state,
                show_progress=self.show_progress,
            )
        )
    def fit(self, X, y, **kwargs):
        """Fit the boosted estimators and return ``self``.

        ``features`` and ``class_name`` are read from ``kwargs`` when
        present; all keyword arguments are forwarded to every estimator.
        """
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = kwargs.get(
            "features", default_feature_names(self.n_features_in_)
        )
        self.class_name_ = kwargs.get("class_name", "class")
        self.X_ = X
        self.y_ = y
        self.n_samples_ = X.shape[0]
        self.estimators_ = []
        self._validate_estimator()
        self._train(kwargs)
        self.fitted_ = True
        # To keep compatibility with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        return self
    def nodes_edges(self):
        """Return the node and edge totals over all estimators.

        ``fit`` references this method, so it has to exist on the class.
        """
        nodes = 0
        edges = 0
        if hasattr(self, "fitted_"):
            nodes = sum(len(x.dag_) for x in self.estimators_)
            edges = sum(len(x.dag_.edges()) for x in self.estimators_)
        return nodes, edges
    def _train(self, kwargs):
        """Build boosted SPODEs

        Every round fits a clone of the base estimator with the current
        sample weights, computes its weighted error and increases the
        weight of the misclassified samples. Training stops early when an
        estimator makes no error on the training data.
        """
        y_true = np.asarray(self.y_)
        weights = np.full(self.n_samples_, 1 / self.n_samples_)
        # Step 0: Set the finish condition
        for _ in range(self.n_estimators):
            # Step 1: Build ranking with mutual information
            # WARNING: this is wrong, the ranking is not updated with the
            # sample weights, so it always selects the same feature
            feature = (
                SelectKBest(k=1)
                .fit(self.X_, self.y_)
                .get_feature_names_out(self.feature_names_in_)
                .tolist()[0]
            )
            # Step 2: Build & train spode with the first feature as sparent
            estimator = clone(self.estimator_)
            _args = kwargs.copy()
            _args["sparent"] = feature
            _args["sample_weight"] = weights
            _args["weighted"] = True
            # Step 2.1: build dataset
            # Step 2.2: Train the model
            estimator.fit(self.X_, self.y_, **_args)
            # Step 3: Compute errors (epsilon sub m & alpha sub m)
            # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
            y_pred = estimator.predict(self.X_)
            # one flag per sample: True where the prediction is wrong
            missed = y_pred != y_true
            em = np.sum(weights * missed) / np.sum(weights)
            # Step 4: Add the new model
            self.estimators_.append(estimator)
            if em <= 0:
                # No misclassified sample: alpha would be infinite and the
                # updated weights NaN, so stop boosting here
                break
            am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
            # Step 3.2: Update weights for next classifier
            # (each sample is compared with its own prediction)
            weights = weights * np.exp(am * missed)
--- a/bayesclass/tests/baseline_images/test_AODENew/line_dashes_AODENew-expected.png
+++ b/bayesclass/tests/baseline_images/test_AODENew/line_dashes_AODENew-expected.png
--- a/bayesclass/tests/baseline_images/test_AODENew/line_dashes_AODENew.png
+++ b/bayesclass/tests/baseline_images/test_AODENew/line_dashes_AODENew.png
--- a/bayesclass/tests/baseline_images/test_KDB/line_dashes_KDB.png
+++ b/bayesclass/tests/baseline_images/test_KDB/line_dashes_KDB.png
--- a/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png
+++ b/bayesclass/tests/baseline_images/test_KDBNew/line_dashes_KDBNew.png
--- a/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png
+++ b/bayesclass/tests/baseline_images/test_TANNew/line_dashes_TANNew.png
--- a/bayesclass/tests/conftest.py
+++ b/bayesclass/tests/conftest.py
@@ -0,0 +1,38 @@
 import pytest
 from sklearn.datasets import load_iris
 from fimdlp.mdlp import FImdlp
@pytest.fixture
 def iris():
    """Return the iris data, target and feature names with three cells patched."""
    bunch = load_iris()
    X, y = bunch["data"], bunch["target"]
    features = bunch["feature_names"]
    # To make the iris dataset have the same values as our iris.arff dataset;
    # only the second element of each pair is written into X
    patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
    for cell, (_, replacement) in patch.items():
        X[cell] = replacement
    return X, y, features
@pytest.fixture
 def data(iris):
    """Return the first two items of the iris fixture as an (X, y) pair."""
    samples = iris[0]
    labels = iris[1]
    return samples, labels
@pytest.fixture
 def features(iris):
    """Return the third item of the iris fixture (the feature names)."""
    feature_names = iris[2]
    return feature_names
@pytest.fixture
 def class_name():
    """Return the class column name used by the tests."""
    name = "class"
    return name
@pytest.fixture
 def data_disc(data):
    """Return (X transformed by FImdlp.fit_transform, y) for the data fixture."""
    discretizer = FImdlp()
    X, y = data
    X_disc = discretizer.fit_transform(X, y)
    return X_disc, y
--- a/bayesclass/tests/test_AODE.py
+++ b/bayesclass/tests/test_AODE.py
@@ -1,6 +1,5 @@
 import pytest
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
@@ -10,28 +9,21 @@ from bayesclass.clfs import AODE
 from .._version import __version__
@pytest.fixture
 def data():
    X, y = load_iris(return_X_y=True)
    enc = KBinsDiscretizer(encode="ordinal")
    return enc.fit_transform(X), y
@pytest.fixture
 def clf():
-    return AODE()
+    return AODE(random_state=17)
-def test_AODE_default_hyperparameters(data, clf):
+def test_AODE_default_hyperparameters(data_disc, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state is None
    clf = AODE(show_progress=True, random_state=17)
    assert clf.show_progress
    assert clf.random_state == 17
-    clf.fit(*data)
+    clf = AODE(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    clf.fit(*data_disc)
    assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
@@ -42,50 +34,66 @@ def test_AODE_default_hyperparameters(data, clf):
@image_comparison(
    baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
 )
-def test_AODE_plot(data, clf):
+def test_AODE_plot(data_disc, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
-    dataset = load_iris(as_frame=True)
+    clf.fit(*data_disc, features=features)
    clf.fit(*data, features=dataset["feature_names"])
    clf.plot("AODE Iris")
-def test_AODE_version(clf):
+def test_AODE_version(clf, features, data_disc):
    """Check AODE version."""
    assert __version__ == clf.version()
    clf.fit(*data_disc, features=features)
    assert __version__ == clf.version()
-def test_AODE_nodes_leaves(clf):
+def test_AODE_nodes_edges(clf, data_disc):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data_disc)
    assert clf.nodes_leaves() == (20, 28)
-def test_AODE_classifier(data, clf):
+def test_AODE_states(clf, data_disc):
-    clf.fit(*data)
+    assert clf.states_ == 0
-    attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+    clf.fit(*data_disc)
    assert clf.states_ == 19
    assert clf.depth_ == clf.states_
 def test_AODE_classifier(data_disc, clf):
    clf.fit(*data_disc)
    attribs = [
        "feature_names_in_",
        "class_name_",
        "n_features_in_",
        "X_",
        "y_",
    ]
    for attr in attribs:
        assert hasattr(clf, attr)
-    X = data[0]
+    X = data_disc[0]
-    y = data[1]
+    y = data_disc[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146
-def test_AODE_wrong_num_features(data, clf):
+def test_AODE_wrong_num_features(data_disc, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
-def test_AODE_wrong_hyperparam(data, clf):
+def test_AODE_wrong_hyperparam(data_disc, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
-def test_AODE_error_size_predict(data, clf):
+def test_AODE_error_size_predict(data_disc, clf):
-    X, y = data
+    X, y = data_disc
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
--- a/bayesclass/tests/test_AODENew.py
+++ b/bayesclass/tests/test_AODENew.py
@@ -0,0 +1,123 @@
 import pytest
 import numpy as np
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from bayesclass.clfs import AODENew
 from .._version import __version__
@pytest.fixture
 def clf():
    """Provide an AODENew classifier built with random_state=17."""
    classifier = AODENew(random_state=17)
    return classifier
 def test_AODENew_default_hyperparameters(data, clf):
    """Check hyperparameter values and the names set by fit."""
    X, y = data
    expected_features = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    # Fixture instance: show_progress left unset, random_state given
    assert not clf.show_progress
    assert clf.random_state == 17
    # New instance: show_progress given, random_state left unset
    verbose = AODENew(show_progress=True)
    assert verbose.show_progress
    assert verbose.random_state is None
    verbose.fit(X, y)
    assert verbose.class_name_ == "class"
    assert verbose.feature_names_in_ == expected_features
@image_comparison(
    baseline_images=["line_dashes_AODENew"],
    remove_text=True,
    extensions=["png"],
 )
 def test_AODENew_plot(data, features, clf):
    """Fit with the dataset feature names and draw the plot."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features)
    clf.plot("AODE Iris")
 def test_AODENew_version(clf, data):
    """The reported version equals the package version before and after fit."""
    X, y = data
    before_fit = clf.version()
    assert __version__ == before_fit
    clf.fit(X, y)
    after_fit = clf.version()
    assert __version__ == after_fit
 def test_AODENew_nodes_edges(clf, data):
    """Check the reported counts before and after fitting."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    clf.fit(*data)
    fitted = clf.nodes_leaves()
    assert fitted == (20, 28)
 def test_AODENew_states(clf, data):
    """Check states_ before and after fit, and that depth_ equals states_."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 17.75
    assert clf.depth_ == clf.states_
 def test_AODENew_classifier(data, clf):
    """Fit, check the fitted attributes exist and count correct predictions."""
    X, y = data
    clf.fit(X, y)
    attribs = (
        "feature_names_in_",
        "class_name_",
        "n_features_in_",
        "X_",
        "y_",
    )
    missing = [attr for attr in attribs if not hasattr(clf, attr)]
    assert not missing
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 146
 def test_AODENew_local_discretization(clf, data_disc):
    """Compare every estimator's discretizer_.target_ entries with expected.

    expected_data holds one row per estimator in clf.estimators_ and one
    entry per feature; an entry is either a scalar or a list.
    """
    expected_data = [
        [-1, [0, -1], [0, -1], [0, -1]],
        [[1, -1], -1, [1, -1], [1, -1]],
        [[2, -1], [2, -1], -1, [2, -1]],
        [[3, -1], [3, -1], [3, -1], -1],
    ]
    clf.fit(*data_disc)
    for idx, estimator in enumerate(clf.estimators_):
        expected = expected_data[idx]
        for feature in range(4):
            computed = estimator.discretizer_.target_[feature]
            if isinstance(computed, list):
                # NOTE(review): zip stops at the shorter sequence, so a
                # length mismatch would go unnoticed here
                for j, k in zip(expected[feature], computed):
                    assert j == k
            else:
                assert expected[feature] == computed
 def test_AODENew_wrong_num_features(data, clf):
    """fit must raise ValueError when features does not match X's columns."""
    message = "Number of features does not match the number of columns in X"
    wrong_features = ["feature_1", "feature_2"]
    with pytest.raises(ValueError, match=message):
        clf.fit(*data, features=wrong_features)
 def test_AODENew_wrong_hyperparam(data, clf):
    """fit must raise ValueError for an unexpected keyword argument."""
    X, y = data
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(X, y, wrong_param="wrong_param")
 def test_AODENew_error_size_predict(data, clf):
    """predict must raise ValueError when X has one column more than fitted."""
    X, y = data
    clf.fit(X, y)
    X_diff_size = np.ones((10, X.shape[1] + 1))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)
--- a/bayesclass/tests/test_BoostAODE.py
+++ b/bayesclass/tests/test_BoostAODE.py
@@ -0,0 +1,100 @@
 import pytest
 import numpy as np
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from bayesclass.clfs import BoostAODE
 from .._version import __version__
@pytest.fixture
 def clf():
    """Provide a BoostAODE classifier built with random_state=17."""
    classifier = BoostAODE(random_state=17)
    return classifier
 def test_BoostAODE_default_hyperparameters(data_disc, clf):
    """Check hyperparameter values and the names set by fit."""
    X, y = data_disc
    expected_features = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    # Fixture instance: show_progress left unset, random_state given
    assert not clf.show_progress
    assert clf.random_state == 17
    # New instance: show_progress given, random_state left unset
    verbose = BoostAODE(show_progress=True)
    assert verbose.show_progress
    assert verbose.random_state is None
    verbose.fit(X, y)
    assert verbose.class_name_ == "class"
    assert verbose.feature_names_in_ == expected_features
 # @image_comparison(
 #     baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
 # )
 # def test_BoostAODE_plot(data_disc, features, clf):
 #     # mpl_test_settings will automatically clean these internal side effects
 #     mpl_test_settings
 #     clf.fit(*data_disc, features=features)
 #     clf.plot("AODE Iris")
 # def test_BoostAODE_version(clf, features, data_disc):
 #     """Check AODE version."""
 #     assert __version__ == clf.version()
 #     clf.fit(*data_disc, features=features)
 #     assert __version__ == clf.version()
 # def test_BoostAODE_nodes_edges(clf, data_disc):
 #     assert clf.nodes_edges() == (0, 0)
 #     clf.fit(*data_disc)
 #     assert clf.nodes_leaves() == (20, 28)
 # def test_BoostAODE_states(clf, data_disc):
 #     assert clf.states_ == 0
 #     clf.fit(*data_disc)
 #     assert clf.states_ == 19
 #     assert clf.depth_ == clf.states_
 # def test_BoostAODE_classifier(data_disc, clf):
 #     clf.fit(*data_disc)
 #     attribs = [
 #         "feature_names_in_",
 #         "class_name_",
 #         "n_features_in_",
 #         "X_",
 #         "y_",
 #     ]
 #     for attr in attribs:
 #         assert hasattr(clf, attr)
 #     X = data_disc[0]
 #     y = data_disc[1]
 #     y_pred = clf.predict(X)
 #     assert y_pred.shape == (X.shape[0],)
 #     assert sum(y == y_pred) == 146
 # def test_BoostAODE_wrong_num_features(data_disc, clf):
 #     with pytest.raises(
 #         ValueError,
 #         match="Number of features does not match the number of columns in X",
 #     ):
 #         clf.fit(*data_disc, features=["feature_1", "feature_2"])
 # def test_BoostAODE_wrong_hyperparam(data_disc, clf):
 #     with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
 #         clf.fit(*data_disc, wrong_param="wrong_param")
 # def test_BoostAODE_error_size_predict(data_disc, clf):
 #     X, y = data_disc
 #     clf.fit(X, y)
 #     with pytest.raises(ValueError):
 #         X_diff_size = np.ones((10, X.shape[1] + 1))
 #         clf.predict(X_diff_size)
--- a/bayesclass/tests/test_KDB.py
+++ b/bayesclass/tests/test_KDB.py
@@ -1,28 +1,21 @@
 import pytest
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from pgmpy.models import BayesianNetwork
 from bayesclass.clfs import KDB
 from .._version import __version__
@pytest.fixture
 def data():
    X, y = load_iris(return_X_y=True)
    enc = KBinsDiscretizer(encode="ordinal")
    return enc.fit_transform(X), y
@pytest.fixture
 def clf():
-    return KDB(k=3)
+    return KDB(k=3, show_progress=False)
-def test_KDB_default_hyperparameters(data, clf):
+def test_KDB_default_hyperparameters(data_disc, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state is None
@@ -31,9 +24,9 @@ def test_KDB_default_hyperparameters(data, clf):
    assert clf.show_progress
    assert clf.random_state == 17
    assert clf.k == 3
-    clf.fit(*data)
+    clf.fit(*data_disc)
    assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
@@ -46,49 +39,85 @@ def test_KDB_version(clf):
    assert __version__ == clf.version()
-def test_KDB_nodes_leaves(clf):
+def test_KDB_nodes_edges(clf, data_disc):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data_disc)
    assert clf.nodes_leaves() == (5, 9)
-def test_KDB_classifier(data, clf):
+def test_KDB_states(clf, data_disc):
-    clf.fit(*data)
+    assert clf.states_ == 0
-    attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
+    clf.fit(*data_disc)
    assert clf.states_ == 19
    assert clf.depth_ == clf.states_
 def test_KDB_classifier(data_disc, clf):
    clf.fit(*data_disc)
    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
    for attr in attribs:
        assert hasattr(clf, attr)
-    X = data[0]
+    X = data_disc[0]
-    y = data[1]
+    y = data_disc[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 148
+    assert sum(y == y_pred) == 146
 def test_KDB_classifier_weighted(data_disc, clf):
    sample_weight = [1] * data_disc[0].shape[0]
    sample_weight[:50] = [0] * 50
    clf.fit(*data_disc, sample_weight=sample_weight, weighted=True)
    assert clf.score(*data_disc) == 0.64
@image_comparison(
    baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
 )
-def test_KDB_plot(data, clf):
+def test_KDB_plot(data_disc, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
-    dataset = load_iris(as_frame=True)
+    clf.fit(*data_disc, features=features)
    clf.fit(*data, features=dataset["feature_names"])
    clf.plot("KDB Iris")
-def test_KDB_wrong_num_features(data, clf):
+def test_KDB_wrong_num_features(data_disc, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
-def test_KDB_wrong_hyperparam(data, clf):
+def test_KDB_wrong_hyperparam(data_disc, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
-def test_KDB_error_size_predict(data, clf):
+def test_KDB_error_size_predict(data_disc, clf):
-    X, y = data
+    X, y = data_disc
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
        clf.predict(X_diff_size)
 def test_KDB_dont_do_cycles():
    clf = KDB(k=4)
    dag = BayesianNetwork(show_progress=False)
    clf.feature_names_in_ = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    nodes = list(range(4))
    weights = np.ones((4, 4))
    for idx in range(1, 4):
        dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
    dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
    dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
    for idx in range(4):
        clf._add_m_edges(dag, idx, nodes, weights)
        assert len(dag.edges()) == 6
--- a/bayesclass/tests/test_KDBNew.py
+++ b/bayesclass/tests/test_KDBNew.py
@@ -0,0 +1,132 @@
 import pytest
 import numpy as np
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from pgmpy.models import BayesianNetwork
 from bayesclass.clfs import KDBNew
 from .._version import __version__
@pytest.fixture
 def clf():
    """Provide a KDBNew classifier built with k=3 and show_progress=False."""
    classifier = KDBNew(k=3, show_progress=False)
    return classifier
 def test_KDBNew_default_hyperparameters(data, clf):
    """Check hyperparameter values and the names set by fit."""
    X, y = data
    expected_features = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    # Fixture instance: random_state and theta left unset
    assert not clf.show_progress
    assert clf.random_state is None
    assert clf.theta == 0.03
    # New instance with every checked hyperparameter given explicitly
    configured = KDBNew(show_progress=True, random_state=17, k=3)
    assert configured.show_progress
    assert configured.random_state == 17
    assert configured.k == 3
    configured.fit(X, y)
    assert configured.class_name_ == "class"
    assert configured.feature_names_in_ == expected_features
 def test_KDBNew_version(clf):
    """The version reported by KDBNew equals the package version."""
    reported = clf.version()
    assert __version__ == reported
 def test_KDBNew_nodes_edges(clf, data):
    """Check the reported counts before and after fitting."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    clf.fit(*data)
    fitted = clf.nodes_leaves()
    assert fitted == (5, 9)
 def test_KDBNew_states(clf, data):
    """Check states_ before and after fit, and that depth_ equals states_."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 22
    assert clf.depth_ == clf.states_
 def test_KDBNew_classifier(data, clf):
    """Fit, check the fitted attributes exist and count correct predictions."""
    X, y = data
    clf.fit(X, y)
    attribs = ("classes_", "X_", "y_", "feature_names_in_", "class_name_")
    missing = [attr for attr in attribs if not hasattr(clf, attr)]
    assert not missing
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 145
 def test_KDBNew_local_discretization(clf, data):
    """Compare each feature's estimator_.discretizer_.target_ entry with expected.

    An entry of expected is either a scalar or a list.
    """
    expected = [[1, -1], -1, [0, 1, 3, -1], [1, -1]]
    clf.fit(*data)
    for feature in range(4):
        computed = clf.estimator_.discretizer_.target_[feature]
        if isinstance(computed, list):
            # NOTE(review): zip stops at the shorter sequence, so a
            # length mismatch would go unnoticed here
            for j, k in zip(expected[feature], computed):
                assert j == k
        else:
            assert expected[feature] == computed
@image_comparison(
    baseline_images=["line_dashes_KDBNew"],
    remove_text=True,
    extensions=["png"],
 )
 def test_KDBNew_plot(data, features, class_name, clf):
    """Fit with the dataset feature and class names and draw the plot."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features, class_name=class_name)
    clf.plot("KDBNew Iris")
 def test_KDBNew_wrong_num_features(data, clf):
    """fit must raise ValueError when features does not match X's columns."""
    message = "Number of features does not match the number of columns in X"
    wrong_features = ["feature_1", "feature_2"]
    with pytest.raises(ValueError, match=message):
        clf.fit(*data, features=wrong_features)
 def test_KDBNew_wrong_hyperparam(data, clf):
    """fit must raise ValueError for an unexpected keyword argument."""
    X, y = data
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(X, y, wrong_param="wrong_param")
 def test_KDBNew_error_size_predict(data, clf):
    """predict must raise ValueError when X has one column more than fitted."""
    X, y = data
    clf.fit(X, y)
    X_diff_size = np.ones((10, X.shape[1] + 1))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)
 def test_KDBNew_dont_do_cycles():
    """_add_m_edges must leave the edge count at 6 on a fully connected DAG.

    The DAG is seeded with an edge from every feature to each later one
    (six edges over four features) before _add_m_edges is called per node.
    """
    clf = KDBNew(k=4)
    dag = BayesianNetwork(show_progress=False)
    names = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    clf.feature_names_in_ = names
    nodes = list(range(4))
    weights = np.ones((4, 4))
    # Same insertion order as listing the pairs by hand: 0-1, 0-2, 0-3,
    # 1-2, 1-3, 2-3
    for source in range(4):
        for target in range(source + 1, 4):
            dag.add_edge(names[source], names[target])
    for idx in nodes:
        clf._add_m_edges(dag, idx, nodes, weights)
        assert len(dag.edges()) == 6
--- a/bayesclass/tests/test_TAN.py
+++ b/bayesclass/tests/test_TAN.py
@@ -1,7 +1,5 @@
 import pytest
 import numpy as np
 from sklearn.datasets import load_iris
 from sklearn.preprocessing import KBinsDiscretizer
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
@@ -10,29 +8,22 @@ from bayesclass.clfs import TAN
 from .._version import __version__
@pytest.fixture
 def data():
    X, y = load_iris(return_X_y=True)
    enc = KBinsDiscretizer(encode="ordinal")
    return enc.fit_transform(X), y
@pytest.fixture
 def clf():
-    return TAN()
+    return TAN(random_state=17, show_progress=False)
-def test_TAN_default_hyperparameters(data, clf):
+def test_TAN_default_hyperparameters(data_disc, clf):
    # Test default values of hyperparameters
    assert not clf.show_progress
    assert clf.random_state is None
    clf = TAN(show_progress=True, random_state=17)
    assert clf.show_progress
    assert clf.random_state == 17
-    clf.fit(*data)
+    clf = TAN(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    clf.fit(*data_disc)
    assert clf.head_ == 0
    assert clf.class_name_ == "class"
-    assert clf.features_ == [
+    assert clf.feature_names_in_ == [
        "feature_0",
        "feature_1",
        "feature_2",
@@ -45,59 +36,73 @@ def test_TAN_version(clf):
    assert __version__ == clf.version()
-def test_TAN_nodes_leaves(clf):
+def test_TAN_nodes_edges(clf, data_disc):
-    assert clf.nodes_leaves() == (0, 0)
+    assert clf.nodes_edges() == (0, 0)
    clf.fit(*data_disc, head="random")
    assert clf.nodes_leaves() == (5, 7)
-def test_TAN_random_head(data):
+def test_TAN_states(clf, data_disc):
-    clf = TAN(random_state=17)
+    assert clf.states_ == 0
-    clf.fit(*data, head="random")
+    clf.fit(*data_disc)
    assert clf.states_ == 19
    assert clf.depth_ == clf.states_
 def test_TAN_random_head(clf, data_disc):
    clf.fit(*data_disc, head="random")
    assert clf.head_ == 3
-def test_TAN_classifier(data, clf):
+def test_TAN_classifier(data_disc, clf):
-    clf.fit(*data)
+    clf.fit(*data_disc)
-    attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
+    attribs = [
        "classes_",
        "X_",
        "y_",
        "head_",
        "feature_names_in_",
        "class_name_",
    ]
    for attr in attribs:
        assert hasattr(clf, attr)
-    X = data[0]
+    X = data_disc[0]
-    y = data[1]
+    y = data_disc[1]
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
-    assert sum(y == y_pred) == 147
+    assert sum(y == y_pred) == 146
@image_comparison(
    baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"]
 )
-def test_TAN_plot(data, clf):
+def test_TAN_plot(data_disc, features, clf):
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
-    dataset = load_iris(as_frame=True)
+    clf.fit(*data_disc, features=features, head=0)
    clf.fit(*data, features=dataset["feature_names"], head=0)
    clf.plot("TAN Iris head=0")
-def test_KDB_wrong_num_features(data, clf):
+def test_TAN_wrong_num_features(data_disc, clf):
    with pytest.raises(
        ValueError,
        match="Number of features does not match the number of columns in X",
    ):
-        clf.fit(*data, features=["feature_1", "feature_2"])
+        clf.fit(*data_disc, features=["feature_1", "feature_2"])
-def test_TAN_wrong_hyperparam(data, clf):
+def test_TAN_wrong_hyperparam(data_disc, clf):
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
-        clf.fit(*data, wrong_param="wrong_param")
+        clf.fit(*data_disc, wrong_param="wrong_param")
-def test_TAN_head_out_of_range(data, clf):
+def test_TAN_head_out_of_range(data_disc, clf):
    with pytest.raises(ValueError, match="Head index out of range"):
-        clf.fit(*data, head=4)
+        clf.fit(*data_disc, head=4)
-def test_TAN_error_size_predict(data, clf):
+def test_TAN_error_size_predict(data_disc, clf):
-    X, y = data
+    X, y = data_disc
    clf.fit(X, y)
    with pytest.raises(ValueError):
        X_diff_size = np.ones((10, X.shape[1] + 1))
--- a/bayesclass/tests/test_TANNew.py
+++ b/bayesclass/tests/test_TANNew.py
@@ -0,0 +1,120 @@
 import pytest
 import numpy as np
 from matplotlib.testing.decorators import image_comparison
 from matplotlib.testing.conftest import mpl_test_settings
 from bayesclass.clfs import TANNew
 from .._version import __version__
@pytest.fixture
 def clf():
    """Provide a TANNew classifier built with random_state=17."""
    classifier = TANNew(random_state=17)
    return classifier
 def test_TANNew_default_hyperparameters(data, clf):
    """Check hyperparameter values and the head and names set by fit."""
    X, y = data
    expected_features = [
        "feature_0",
        "feature_1",
        "feature_2",
        "feature_3",
    ]
    # Fixture instance: show_progress left unset, random_state given
    assert not clf.show_progress
    assert clf.random_state == 17
    # New instance: show_progress given, random_state left unset
    verbose = TANNew(show_progress=True)
    assert verbose.show_progress
    assert verbose.random_state is None
    verbose.fit(X, y)
    assert verbose.head_ == 0
    assert verbose.class_name_ == "class"
    assert verbose.feature_names_in_ == expected_features
 def test_TANNew_version(clf):
    """The version reported by TANNew equals the package version."""
    reported = clf.version()
    assert __version__ == reported
 def test_TANNew_nodes_edges(clf, data):
    """Check the reported counts before and after fitting with a random head."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    clf.fit(*data, head="random")
    fitted = clf.nodes_leaves()
    assert fitted == (5, 7)
 def test_TANNew_states(clf, data):
    """Check states_ before and after fit, and that depth_ equals states_."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 18
    assert clf.depth_ == clf.states_
 def test_TANNew_random_head(clf, data):
    """With the fixture's random_state, head="random" must select head 3."""
    X, y = data
    clf.fit(X, y, head="random")
    assert clf.head_ == 3
 def test_TANNew_local_discretization(clf, data):
    """Compare each feature's estimator_.discretizer_.target_ entry with expected."""
    expected = [-1, [0, -1], [0, -1], [1, -1]]
    clf.fit(*data)
    targets = clf.estimator_.discretizer_.target_
    for feature, wanted in enumerate(expected):
        assert wanted == targets[feature]
 def test_TANNew_classifier(data, clf):
    """Fit, check the fitted attributes exist and count correct predictions."""
    X, y = data
    clf.fit(X, y)
    attribs = (
        "classes_",
        "X_",
        "y_",
        "head_",
        "feature_names_in_",
        "class_name_",
    )
    missing = [attr for attr in attribs if not hasattr(clf, attr)]
    assert not missing
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 146
@image_comparison(
    baseline_images=["line_dashes_TANNew"],
    remove_text=True,
    extensions=["png"],
 )
 def test_TANNew_plot(data, features, clf):
    """Fit with the dataset feature names and head=0, then draw the plot."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features, head=0)
    clf.plot("TANNew Iris head=0")
 def test_TANNew_wrong_num_features(data, clf):
    """fit must raise ValueError when features does not match X's columns."""
    message = "Number of features does not match the number of columns in X"
    wrong_features = ["feature_1", "feature_2"]
    with pytest.raises(ValueError, match=message):
        clf.fit(*data, features=wrong_features)
 def test_TANNew_wrong_hyperparam(data, clf):
    """fit must raise ValueError for an unexpected keyword argument."""
    X, y = data
    with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
        clf.fit(X, y, wrong_param="wrong_param")
 def test_TANNew_head_out_of_range(data, clf):
    """fit must raise ValueError when head=4 is requested."""
    X, y = data
    with pytest.raises(ValueError, match="Head index out of range"):
        clf.fit(X, y, head=4)
 def test_TANNew_error_size_predict(data, clf):
    """predict must raise ValueError when X has one column more than fitted."""
    X, y = data
    clf.fit(X, y)
    X_diff_size = np.ones((10, X.shape[1] + 1))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)
--- a/bayesclass/tests/test_common.py
+++ b/bayesclass/tests/test_common.py
@@ -1,14 +1,29 @@
 import pytest
 import numpy as np
 from sklearn.utils.estimator_checks import check_estimator
-from bayesclass.clfs import TAN, KDB, AODE
+from bayesclass.clfs import BayesBase, TAN, KDB, AODE
-@pytest.mark.parametrize("estimator", [TAN(), KDB(k=2), AODE()])
+def test_more_tags():
-# @pytest.mark.parametrize("estimator", [AODE()])
+    expected = {
-def test_all_estimators(estimator):
+        "requires_positive_X": True,
        "requires_positive_y": True,
        "preserve_dtype": [np.int32, np.int64],
        "requires_y": True,
    }
    clf = BayesBase(None, True)
    computed = clf._more_tags()
    for key, value in expected.items():
        assert key in computed
        assert computed[key] == value
 # @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
@pytest.mark.parametrize("estimators", [AODE()])
 def test_all_estimators(estimators):
    i = 0
-    for estimator, test in check_estimator(estimator, generate_only=True):
+    for estimator, test in check_estimator(estimators, generate_only=True):
        print(i := i + 1, test)
        # test(estimator)
--- a/patch_pgmpy_0.1.22.diff
+++ b/patch_pgmpy_0.1.22.diff
@@ -0,0 +1,32 @@
 diff --git a/pgmpy/models/BayesianNetwork.py b/pgmpy/models/BayesianNetwork.py
 index bd90122d..70ae38f7 100644
 --- a/pgmpy/models/BayesianNetwork.py
 +++ b/pgmpy/models/BayesianNetwork.py
@@ -27,7 +27,7 @@ class BayesianNetwork(DAG):
     Base class for Bayesian Models.
     """
 -    def __init__(self, ebunch=None, latents=set()):
 +    def __init__(self, ebunch=None, latents=set(), show_progress=False):
         """
         Initializes a Bayesian Model.
         A models stores nodes and edges with conditional probability
@@ -95,6 +95,7 @@ class BayesianNetwork(DAG):
         >>> len(G)  # number of nodes in graph
         3
         """
 +        self.show_progress = show_progress
         super(BayesianNetwork, self).__init__(ebunch=ebunch, latents=latents)
         self.cpds = []
         self.cardinalities = defaultdict(int)
@@ -738,7 +739,9 @@ class BayesianNetwork(DAG):
                     show_progress=False,
                 )
                 for index, data_point in tqdm(
 -                    data_unique.iterrows(), total=data_unique.shape[0]
 +                    data_unique.iterrows(),
 +                    total=data_unique.shape[0],
 +                    disable=not self.show_progress,
                 )
             )
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
  "pgmpy",
  "networkx",
  "matplotlib",
  "fimdlp",
 ]
 requires-python = ">=3.8"
 classifiers = [
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 numpy
 scipy
 pandas
 scikit-learn
 matplotlib
 networkx
Author	SHA1	Message	Date
Ricardo Montañana	212f7e5584	Add test_BoostAODE	2023-06-18 16:51:38 +02:00
Ricardo Montañana	a797381c00	Continue BoostAODE	2023-06-17 17:06:37 +02:00
Ricardo Montañana	3812d271e5	Add BoostAODE initial model	2023-06-15 14:28:35 +02:00
Ricardo Montañana	923a06b3be	Patch pgmpy 0.1.22 show_progress	2023-06-15 14:22:24 +02:00
Ricardo Montañana	c906d6a361	Add weights to KDB classifier	2023-06-15 14:13:15 +02:00
Ricardo Montañana Gómez	f0f7c43944	Merge pull request #3 from Doctorado-ML/localdiscretization Localdiscretization	2023-05-15 11:42:52 +02:00
Ricardo Montañana	f9b35f61f0	Use ancest-order to process local discretization Fix local discretization Refactor tests Unify iris dataset from sklearn with iris.arff	2023-04-20 01:20:33 +02:00
Ricardo Montañana	74cd8a6aa2	Add local discretization tests	2023-04-08 11:44:25 +02:00
Ricardo Montañana	9843f5f8db	Refactor AODE & AODENew	2023-04-07 16:22:40 +02:00
Ricardo Montañana	c6390d9da9	Comment out the integrity check in Proposal	2023-03-30 23:23:23 +02:00
Ricardo Montañana	c9afafbf60	Fix AODENew tests	2023-03-30 21:03:42 +02:00
Ricardo Montañana	3af05c9511	First AODENew implementation working	2023-03-30 12:20:56 +02:00
Ricardo Montañana	80b1ab3699	Refactor AODE	2023-03-29 19:05:55 +02:00
Ricardo Montañana	5a772b0bca	Begin AODENew with tests	2023-03-29 11:18:42 +02:00
Ricardo Montañana	ea251aca05	Begin AODE implementation	2023-03-23 22:15:38 +01:00
Ricardo Montañana	7b66097728	Add messages to check_integrity	2023-03-23 22:10:03 +01:00
Ricardo Montañana	ea8c5b805e	Add KDBNew and TANNew tests	2023-03-23 14:13:01 +01:00
Ricardo Montañana	2ffc06b232	Update feature states setting for datasets	2023-02-13 17:34:15 +01:00
Ricardo Montañana	a5244f1c7f	remove trace messages for first try	2023-02-12 11:25:40 +01:00
Ricardo Montañana	42ac57eb79	Continue with New estimators	2023-02-07 18:02:35 +01:00
Ricardo Montañana	63a2feef3a	Begin refactorization of new estimators	2023-02-07 09:42:42 +01:00
Ricardo Montañana	3e049ac89d	default_features_class_name	2023-02-05 20:18:44 +01:00
Ricardo Montañana	2a6547c71d	Complete KDBNew	2023-02-05 00:30:25 +01:00
Ricardo Montañana	de45a94c9b	Add KDBNew estimator	2023-02-04 17:39:32 +01:00
Ricardo Montañana	9019b878f0	docs: 📝 Add text comment to KDB algorithm	2023-02-01 23:42:32 +01:00
Ricardo Montañana	bba9255605	Merge branch 'localdiscretization' of github.com:/doctorado-ml/bayesclass into localdiscretization	2023-02-01 23:41:40 +01:00
Ricardo Montañana	41ca6fad5e	fix: 🐛 Change exit condition in KDB add_m_edges method Change test if every conditional weight is less or equal to zero for less or equal to theta Add text comments to KDB algorithm	2023-02-01 23:40:42 +01:00
Ricardo Montañana	c88591dd64	fix: 🐛 Change exit condition in KDB add_m_edges method Change test if every conditional weight is less or equal to zero for less or equal to theta	2023-02-01 23:33:05 +01:00
Ricardo Montañana	8089e4fd57	docs: 📝 shorten comment lines length to <80	2023-01-30 19:27:27 +01:00
Ricardo Montañana	6f9488f281	Add version command to Makefile	2023-01-28 18:51:55 +01:00
Ricardo Montañana	e837c6cef7	feat: Add feature_names_in_ to classifiers	2023-01-27 19:25:01 +01:00
Ricardo Montañana	a4edc74e8d	Replace len(self.features_) by self.n_features_in_	2023-01-27 12:34:34 +01:00
Ricardo Montañana Gómez	4d416959ad	fix: 🐛 Fix depth_ property as an alias of states_	2023-01-22 14:15:19 +01:00
Ricardo Montañana Gómez	bdd3f483d9	feat: 🧐 Add nodes, edges and states info to models	2023-01-22 14:01:54 +01:00
Ricardo Montañana Gómez	8fd796155d	test: 🧪 Add cycle test in KDB to get 100% coverage	2023-01-17 11:33:55 +01:00
Ricardo Montañana Gómez	d08aea4681	fix AODE state_names mistake	2023-01-12 14:05:27 +01:00
Ricardo Montañana Gómez	dd2e0a3b7e	Update state_names hyperparameter to fit tests Add computed nodes to classifiers	2023-01-12 12:04:54 +01:00
Ricardo Montañana	65d41488cb	Fix AODE state_names	2022-12-29 00:45:10 +01:00
Ricardo Montañana	e7300366ca	Add fit_params to model fit	2022-12-28 19:15:34 +01:00