Continue BootAODE

2023-06-17 17:06:37 +02:00
parent 3812d271e5
commit a797381c00
5 changed files with 130 additions and 20 deletions

View File

@@ -19,4 +19,5 @@ __all__ = [
"KDBNew", "KDBNew",
"AODENew", "AODENew",
"BoostAODE", "BoostAODE",
"BoostSPODE",
] ]

View File

@@ -4,6 +4,7 @@ import numpy as np
 import pandas as pd
 from scipy.stats import mode
 from sklearn.base import clone, ClassifierMixin, BaseEstimator
+from sklearn.feature_selection import SelectKBest
 from sklearn.ensemble import BaseEnsemble
 from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
 from sklearn.utils.multiclass import unique_labels
@@ -11,6 +12,7 @@ from sklearn.feature_selection import mutual_info_classif
 import networkx as nx
 from pgmpy.estimators import TreeSearch, BayesianEstimator
 from pgmpy.models import BayesianNetwork
+from pgmpy.base import DAG
 import matplotlib.pyplot as plt
 from fimdlp.mdlp import FImdlp
 from ._version import __version__
@@ -136,10 +138,8 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         >>> model.fit(train_data, train_y, features=features, class_name='E')
         TAN(random_state=17)
         """
-        X_, y_ = self._check_params(X, y, kwargs)
+        self.X_, self.y_ = self._check_params(X, y, kwargs)
         # Store the information needed to build the model
-        self.X_ = X_
-        self.y_ = y_
         self.build_dataset()
         # Build the DAG
         self._build()
@@ -152,11 +152,21 @@ class BayesBase(BaseEstimator, ClassifierMixin):
         return self

     def _build(self):
+        """This method should be implemented by the subclasses to
+        build the DAG
+        """
         ...

     def _train(self, kwargs):
+        """Build and train a BayesianNetwork from the DAG and the dataset
+
+        Parameters
+        ----------
+        kwargs : dict
+            fit parameters
+        """
         self.model_ = BayesianNetwork(
-            self.dag_.edges()  # , show_progress=self.show_progress
+            self.dag_.edges(), show_progress=self.show_progress
         )
         states = dict(state_names=kwargs.pop("state_names", []))
         self.model_.fit(
@@ -363,7 +373,7 @@ class KDB(BayesBase):
     2. Compute class conditional mutual information I(Xi;Xj|C), for each
     pair of features Xi and Xj, where i != j.
     3. Let the used variable list, S, be empty.
-    4. Let the Bayesian network being constructed, BN, begin with a single
+    4. Let the DAG network being constructed, BN, begin with a single
     class node, C.
     5. Repeat until S includes all domain features
     5.1. Select feature Xmax which is not in S and has the largest value
@@ -386,8 +396,8 @@ class KDB(BayesBase):
         )
         # 3. Let the used variable list, S, be empty.
         S_nodes = []
-        # 4. Let the BN being constructed, BN, begin with a single class node
-        dag = BayesianNetwork()
+        # 4. Let the DAG being constructed, BN, begin with a single class node
+        dag = BayesianNetwork(show_progress=self.show_progress)
         dag.add_node(self.class_name_)  # , state_names=self.classes_)
         # 5. Repeat until S includes all domain features
         # 5.1 Select feature Xmax which is not in S and has the largest value
@@ -458,10 +468,11 @@ class AODE(ClassifierMixin, BaseEnsemble):
         self._validate_estimator()
         self.X_ = X
         self.y_ = y
+        self.n_samples_ = X.shape[0]
         self.estimators_ = []
         self._train(kwargs)
+        # To keep compatibility with the benchmark platform
         self.fitted_ = True
-        # To keep compatibility with the benchmark platform
         self.nodes_leaves = self.nodes_edges
         return self
@@ -783,27 +794,125 @@ class Proposal(BaseEstimator):
            # raise ValueError("Discretization error")


-class BoostAODE(AODE):
+class BoostSPODE(BayesBase):
+    def _check_params(self, X, y, kwargs):
+        expected_args = [
+            "class_name",
+            "features",
+            "state_names",
+            "sample_weight",
+            "weighted",
+            "sparent",
+        ]
+        return self._check_params_fit(X, y, expected_args, kwargs)
+
+    def _build(self):
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
+        feature_edges = [
+            (self.sparent_, f)
+            for f in self.feature_names_in_
+            if f != self.sparent_
+        ]
+        feature_edges.extend(class_edges)
+        self.dag_ = DAG(feature_edges)
+
+    def _train(self, kwargs):
+        states = dict(state_names=kwargs.get("state_names", []))
+        self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
+        self.model_.fit(
+            self.dataset_,
+            estimator=BayesianEstimator,
+            prior_type="K2",
+            weighted=self.weighted_,
+            **states,
+        )
+
+
+class BoostAODE(ClassifierMixin, BaseEnsemble):
+    def __init__(
+        self,
+        show_progress=False,
+        random_state=None,
+        estimator=None,
+    ):
+        self.show_progress = show_progress
+        self.random_state = random_state
+        super().__init__(estimator=estimator)
+
+    def _validate_estimator(self) -> None:
+        """Check the estimator and set the estimator_ attribute."""
+        super()._validate_estimator(
+            default=BoostSPODE(
+                random_state=self.random_state,
+                show_progress=self.show_progress,
+            )
+        )
+
     def fit(self, X, y, **kwargs):
         self.n_features_in_ = X.shape[1]
         self.feature_names_in_ = kwargs.get(
             "features", default_feature_names(self.n_features_in_)
         )
         self.class_name_ = kwargs.get("class_name", "class")
+        # build estimator
+        self._validate_estimator()
         self.X_ = X
         self.y_ = y
+        self.n_samples_ = X.shape[0]
         self.estimators_ = []
         self._train(kwargs)
+        # To keep compatibility with the benchmark platform
         self.fitted_ = True
-        # To keep compatibility with the benchmark platform
         self.nodes_leaves = self.nodes_edges
         return self

     def _train(self, kwargs):
-        for dag in build_spodes(self.feature_names_in_, self.class_name_):
+        """Build boosted SPODEs"""
+        weights = [1 / self.n_samples_] * self.n_samples_
+        # Step 0: Set the finish condition
+        pending_features = self.feature_names_in_.copy()
+        exit_condition = len(pending_features) == 0
+        while not exit_condition:
+            # Step 1: Build ranking with mutual information
+            feature = (
+                SelectKBest(k="all")
+                .fit(self.X_, self.y_)
+                .get_feature_names_out(self.feature_names_in_)
+                .tolist()[0]
+            )
+            # Step 2: Build & train spode with the first feature as sparent
+            self._validate_estimator()
             estimator = clone(self.estimator_)
-            estimator.dag_ = estimator.model_ = dag
-            estimator.fit(self.X_, self.y_, **kwargs)
+            _args = kwargs.copy()
+            _args["sparent"] = feature
+            _args["sample_weight"] = weights
+            _args["weighted"] = True
+            _args["X"] = self.X_
+            _args["y"] = self.y_
+            # Step 2.1: build dataset
+            # Step 2.2: Train the model
+            estimator.fit(**_args)
+            # Step 3: Compute errors (epsilon sub m & alpha sub m)
+            # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
+            y_pred = estimator.predict(self.X_)
+            em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
+            am = np.log((1 - em) / em) + np.log(self.n_classes_ - 1)
+            # Step 3.2: Update weights for next classifier
+            weights = [
+                wm * np.exp(am * (ym != y_pred))
+                for wm, ym in zip(weights, self.y_)
+            ]
+            print(weights)
+            # Step 4: Add the new model
             self.estimators_.append(estimator)
+            # Final step: Update the finish condition
+            pending_features.remove(feature)
+            exit_condition = len(pending_features) == 0
+        """
+        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
+        feature_edges = [
+            (sparent, f) for f in self.feature_names_in_ if f != sparent
+        ]
+        self.weights_ = weights.copy() if weights is not None else None
+        feature_edges.extend(class_edges)
+        self.model_ = BayesianNetwork(feature_edges, show_progress=False)
+        return self.model_
+        """

View File

@@ -12,7 +12,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return KDB(k=3)
+    return KDB(k=3, show_progress=False)


 def test_KDB_default_hyperparameters(data_disc, clf):
@@ -104,7 +104,7 @@ def test_KDB_error_size_predict(data_disc, clf):
 def test_KDB_dont_do_cycles():
     clf = KDB(k=4)
-    dag = BayesianNetwork()
+    dag = BayesianNetwork(show_progress=False)
     clf.feature_names_in_ = [
         "feature_0",
         "feature_1",

View File

@@ -11,7 +11,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return KDBNew(k=3)
+    return KDBNew(k=3, show_progress=False)


 def test_KDBNew_default_hyperparameters(data, clf):
@@ -113,7 +113,7 @@ def test_KDBNew_error_size_predict(data, clf):
 def test_KDBNew_dont_do_cycles():
     clf = KDBNew(k=4)
-    dag = BayesianNetwork()
+    dag = BayesianNetwork(show_progress=False)
     clf.feature_names_in_ = [
         "feature_0",
         "feature_1",

View File

@@ -10,7 +10,7 @@ from .._version import __version__
 @pytest.fixture
 def clf():
-    return TAN(random_state=17)
+    return TAN(random_state=17, show_progress=False)


 def test_TAN_default_hyperparameters(data_disc, clf):