Continue BootAODE

2023-06-17 17:06:37 +02:00
parent 3812d271e5
commit a797381c00
5 changed files with 130 additions and 20 deletions

View File

@@ -19,4 +19,5 @@ __all__ = [
"KDBNew", "KDBNew",
"AODENew", "AODENew",
"BoostAODE", "BoostAODE",
"BoostSPODE",
] ]

View File

@@ -4,6 +4,7 @@ import numpy as np
 import pandas as pd
 from scipy.stats import mode
 from sklearn.base import clone, ClassifierMixin, BaseEstimator
+from sklearn.feature_selection import SelectKBest
 from sklearn.ensemble import BaseEnsemble
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import unique_labels
@@ -11,6 +12,7 @@ from sklearn.feature_selection import mutual_info_classif
 import networkx as nx
 from pgmpy.estimators import TreeSearch, BayesianEstimator
 from pgmpy.models import BayesianNetwork
+from pgmpy.base import DAG
 import matplotlib.pyplot as plt
 from fimdlp.mdlp import FImdlp
 from ._version import __version__
@@ -136,10 +138,8 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         >>> model.fit(train_data, train_y, features=features, class_name='E')
         TAN(random_state=17)
         """
-        X_, y_ = self._check_params(X, y, kwargs)
+        self.X_, self.y_ = self._check_params(X, y, kwargs)
         # Store the information needed to build the model
-        self.X_ = X_
-        self.y_ = y_
         self.build_dataset()
         # Build the DAG
         self._build()
@@ -152,11 +152,21 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         return self

     def _build(self):
+        """This method should be implemented by the subclasses to
+        build the DAG
+        """
         ...

     def _train(self, kwargs):
+        """Build and train a BayesianNetwork from the DAG and the dataset
+
+        Parameters
+        ----------
+        kwargs : dict
+            fit parameters
+        """
         self.model_ = BayesianNetwork(
-            self.dag_.edges()  # , show_progress=self.show_progress
+            self.dag_.edges(), show_progress=self.show_progress
         )
         states = dict(state_names=kwargs.pop("state_names", []))
         self.model_.fit(
@@ -363,7 +373,7 @@ class KDB(BayesBase):
     2. Compute class conditional mutual information I(Xi;Xj|C), for each
     pair of features Xi and Xj, where i != j.
     3. Let the used variable list, S, be empty.
-    4. Let the Bayesian network being constructed, BN, begin with a single
+    4. Let the DAG network being constructed, BN, begin with a single
     class node, C.
     5. Repeat until S includes all domain features
     5.1. Select feature Xmax which is not in S and has the largest value
@@ -386,8 +396,8 @@ class KDB(BayesBase):
         )
         # 3. Let the used variable list, S, be empty.
         S_nodes = []
-        # 4. Let the BN being constructed, BN, begin with a single class node
-        dag = BayesianNetwork()
+        # 4. Let the DAG being constructed, BN, begin with a single class node
+        dag = BayesianNetwork(show_progress=self.show_progress)
         dag.add_node(self.class_name_)  # , state_names=self.classes_)
         # 5. Repeat until S includes all domain features
         # 5.1 Select feature Xmax which is not in S and has the largest value
@@ -458,10 +468,11 @@ class AODE(ClassifierMixin, BaseEnsemble):
         self._validate_estimator()
         self.X_ = X
         self.y_ = y
+        self.n_samples_ = X.shape[0]
         self.estimators_ = []
         self._train(kwargs)
+        # To keep compatibility with the benchmark platform
         self.fitted_ = True
-        # To keep compatibility with the benchmark platform
         self.nodes_leaves = self.nodes_edges
         return self
@@ -783,27 +794,125 @@ class Proposal(BaseEstimator):
            # raise ValueError("Discretization error")


-class BoostAODE(AODE):
+class BoostSPODE(BayesBase):
+    def _check_params(self, X, y, kwargs):
+        expected_args = [
+            "class_name",
+            "features",
+            "state_names",
+            "sample_weight",
+            "weighted",
+            "sparent",
+        ]
+        return self._check_params_fit(X, y, expected_args, kwargs)
+
+    def _build(self):
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
+        feature_edges = [
+            (self.sparent_, f)
+            for f in self.feature_names_in_
+            if f != self.sparent_
+        ]
+        feature_edges.extend(class_edges)
+        self.dag_ = DAG(feature_edges)
+
+    def _train(self, kwargs):
+        states = dict(state_names=kwargs.get("state_names", []))
+        self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
+        self.model_.fit(
+            self.dataset_,
+            estimator=BayesianEstimator,
+            prior_type="K2",
+            weighted=self.weighted_,
+            **states,
+        )
+
+
+class BoostAODE(ClassifierMixin, BaseEnsemble):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        estimator=None,
+    ):
+        self.show_progress = show_progress
+        self.random_state = random_state
+        super().__init__(estimator=estimator)
+
+    def _validate_estimator(self) -> None:
+        """Check the estimator and set the estimator_ attribute."""
+        super()._validate_estimator(
+            default=BoostSPODE(
+                random_state=self.random_state,
+                show_progress=self.show_progress,
+            )
+        )
+
     def fit(self, X, y, **kwargs):
         self.n_features_in_ = X.shape[1]
         self.feature_names_in_ = kwargs.get(
             "features", default_feature_names(self.n_features_in_)
         )
         self.class_name_ = kwargs.get("class_name", "class")
+        # build estimator
+        self._validate_estimator()
         self.X_ = X
         self.y_ = y
+        self.n_samples_ = X.shape[0]
         self.estimators_ = []
         self._train(kwargs)
+        # To keep compatibility with the benchmark platform
         self.fitted_ = True
-        # To keep compatibility with the benchmark platform
         self.nodes_leaves = self.nodes_edges
         return self

     def _train(self, kwargs):
-        for dag in build_spodes(self.feature_names_in_, self.class_name_):
+        """Build boosted SPODEs"""
+        weights = [1 / self.n_samples_] * self.n_samples_
+        # Step 0: Set the finish condition
+        pending_features = self.feature_names_in_.copy()
+        exit_condition = len(pending_features) == 0
+        while not exit_condition:
+            # Step 1: Build ranking with mutual information
+            feature = (
+                SelectKBest(k="all")
+                .fit(self.X_, self.y_)
+                .get_feature_names_out(self.feature_names_in_)
+                .tolist()[0]
+            )
+            # Step 2: Build & train spode with the first feature as sparent
+            self._validate_estimator()
             estimator = clone(self.estimator_)
-            estimator.dag_ = estimator.model_ = dag
-            estimator.fit(self.X_, self.y_, **kwargs)
+            _args = kwargs.copy()
+            _args["sparent"] = feature
+            _args["sample_weight"] = weights
+            _args["weighted"] = True
+            _args["X"] = self.X_
+            _args["y"] = self.y_
+            # Step 2.1: build dataset
+            # Step 2.2: Train the model
+            estimator.fit(**_args)
+            # Step 3: Compute errors (epsilon sub m & alpha sub m)
+            # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
+            y_pred = estimator.predict(self.X_)
+            em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
+            am = np.log((1 - em) / em) + np.log(self.n_classes_ - 1)
+            # Step 3.2: Update weights for next classifier
+            weights = [
+                wm * np.exp(am * (ym != y_pred))
+                for wm, ym in zip(weights, self.y_)
+            ]
+            print(weights)
+            # Step 4: Add the new model
             self.estimators_.append(estimator)
+            # Final step: Update the finish condition
+            pending_features.remove(feature)
+            exit_condition = len(pending_features) == 0
+        """
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
+        feature_edges = [
+            (sparent, f) for f in self.feature_names_in_ if f != sparent
+        ]
+        self.weights_ = weights.copy() if weights is not None else None
+        feature_edges.extend(class_edges)
+        self.model_ = BayesianNetwork(feature_edges, show_progress=False)
+        return self.model_
+        """

View File

@@ -12,7 +12,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return KDB(k=3)
+    return KDB(k=3, show_progress=False)


 def test_KDB_default_hyperparameters(data_disc, clf):
@@ -104,7 +104,7 @@ def test_KDB_error_size_predict(data_disc, clf):
 def test_KDB_dont_do_cycles():
     clf = KDB(k=4)
-    dag = BayesianNetwork()
+    dag = BayesianNetwork(show_progress=False)
     clf.feature_names_in_ = [
         "feature_0",
         "feature_1",

View File

@@ -11,7 +11,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return KDBNew(k=3)
+    return KDBNew(k=3, show_progress=False)


 def test_KDBNew_default_hyperparameters(data, clf):
@@ -113,7 +113,7 @@ def test_KDBNew_error_size_predict(data, clf):
 def test_KDBNew_dont_do_cycles():
     clf = KDBNew(k=4)
-    dag = BayesianNetwork()
+    dag = BayesianNetwork(show_progress=False)
     clf.feature_names_in_ = [
         "feature_0",
         "feature_1",

View File

@@ -10,7 +10,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return TAN(random_state=17)
+    return TAN(random_state=17, show_progress=False)


 def test_TAN_default_hyperparameters(data_disc, clf):