39 Commits
v0.1.0 ... main

Author SHA1 Message Date
212f7e5584 Add test_BoostAODE 2023-06-18 16:51:38 +02:00
a797381c00 Continue BoostAODE 2023-06-17 17:06:37 +02:00
3812d271e5 Add BoostAODE initial model 2023-06-15 14:28:35 +02:00
923a06b3be Patch pgmpy 0.1.22 show_progress 2023-06-15 14:22:24 +02:00
c906d6a361 Add weights to KDB classifier 2023-06-15 14:13:15 +02:00
Ricardo Montañana Gómez
f0f7c43944 Merge pull request #3 from Doctorado-ML/localdiscretization
Localdiscretization
2023-05-15 11:42:52 +02:00
f9b35f61f0 Use ancestral order to process local discretization
Fix local discretization
Refactor tests
Unify iris dataset from sklearn with iris.arff
2023-04-20 01:20:33 +02:00
74cd8a6aa2 Add local discretization tests 2023-04-08 11:44:25 +02:00
9843f5f8db Refactor AODE & AODENew 2023-04-07 16:22:40 +02:00
c6390d9da9 Comment out the integrity check in Proposal 2023-03-30 23:23:23 +02:00
c9afafbf60 Fix AODENew tests 2023-03-30 21:03:42 +02:00
3af05c9511 First AODENew implementation working 2023-03-30 12:20:56 +02:00
80b1ab3699 Refactor AODE 2023-03-29 19:05:55 +02:00
5a772b0bca Begin AODENew with tests 2023-03-29 11:18:42 +02:00
ea251aca05 Begin AODE implementation 2023-03-23 22:15:38 +01:00
7b66097728 Add messages to check_integrity 2023-03-23 22:10:03 +01:00
ea8c5b805e Add KDBNew and TANNew tests 2023-03-23 14:13:01 +01:00
2ffc06b232 Update feature states setting for datasets 2023-02-13 17:34:15 +01:00
a5244f1c7f remove trace messages for first try 2023-02-12 11:25:40 +01:00
42ac57eb79 Continue with New estimators 2023-02-07 18:02:35 +01:00
63a2feef3a Begin refactorization of new estimators 2023-02-07 09:42:42 +01:00
3e049ac89d default_features_class_name 2023-02-05 20:18:44 +01:00
2a6547c71d Complete KDBNew 2023-02-05 00:30:25 +01:00
de45a94c9b Add KDBNew estimator 2023-02-04 17:39:32 +01:00
9019b878f0 docs: 📝 Add text comment to KDB algorithm 2023-02-01 23:42:32 +01:00
bba9255605 Merge branch 'localdiscretization' of github.com:/doctorado-ml/bayesclass into localdiscretization 2023-02-01 23:41:40 +01:00
41ca6fad5e fix: 🐛 Change exit condition in KDB add_m_edges method
Change test if every conditional weight is less or equal to zero for less or equal to theta
Add text comments to KDB algorithm
2023-02-01 23:40:42 +01:00
c88591dd64 fix: 🐛 Change exit condition in KDB add_m_edges method
Change test if every conditional weight is less or equal to zero for less or equal to theta
2023-02-01 23:33:05 +01:00
8089e4fd57 docs: 📝 shorten comment lines length to <80 2023-01-30 19:27:27 +01:00
6f9488f281 Add version command to Makefile 2023-01-28 18:51:55 +01:00
e837c6cef7 feat: Add feature_names_in_ to classifiers 2023-01-27 19:25:01 +01:00
a4edc74e8d Replace len(self.features_) by self.n_features_in_ 2023-01-27 12:34:34 +01:00
Ricardo Montañana Gómez
4d416959ad fix: 🐛 Fix depth_ property as an alias of states_ 2023-01-22 14:15:19 +01:00
Ricardo Montañana Gómez
bdd3f483d9 feat: 🧐 Add nodes, edges and states info to models 2023-01-22 14:01:54 +01:00
Ricardo Montañana Gómez
8fd796155d test: 🧪 Add cycle test in KDB to get 100% coverage 2023-01-17 11:33:55 +01:00
Ricardo Montañana Gómez
d08aea4681 fix AODE state_names mistake 2023-01-12 14:05:27 +01:00
Ricardo Montañana Gómez
dd2e0a3b7e Update state_names hyperparameter to fit tests
Add computed nodes to classifiers
2023-01-12 12:04:54 +01:00
65d41488cb Fix AODE state_names 2022-12-29 00:45:10 +01:00
e7300366ca Add fit_params to model fit 2022-12-28 19:15:34 +01:00
21 changed files with 1362 additions and 206 deletions


@@ -37,6 +37,12 @@ doc-clean: ## Update documentation
audit: ## Audit pip
pip-audit
version:
@echo "Current Python version .....: $(shell python --version)"
@echo "Current Bayesclass version .: $(shell python -c "from bayesclass import _version; print(_version.__version__)")"
@echo "Installed Bayesclass version: $(shell pip show bayesclass | grep Version | cut -d' ' -f2)"
@echo "Installed pgmpy version ....: $(shell pip show pgmpy | grep Version | cut -d' ' -f2)"
help: ## Show help message
@IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \


@@ -16,4 +16,8 @@ __all__ = [
"TAN",
"KDB",
"AODE",
"KDBNew",
"AODENew",
"BoostAODE",
"BoostSPODE",
]


@@ -1 +1 @@
__version__ = "0.1.0"
__version__ = "0.1.1"


@@ -1,8 +1,10 @@
import random
import warnings
import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.base import clone, ClassifierMixin, BaseEstimator
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
@@ -10,10 +12,16 @@ from sklearn.feature_selection import mutual_info_classif
import networkx as nx
from pgmpy.estimators import TreeSearch, BayesianEstimator
from pgmpy.models import BayesianNetwork
from pgmpy.base import DAG
import matplotlib.pyplot as plt
from fimdlp.mdlp import FImdlp
from ._version import __version__
def default_feature_names(num_features):
return [f"feature_{i}" for i in range(num_features)]
class BayesBase(BaseEstimator, ClassifierMixin):
def __init__(self, random_state, show_progress):
self.random_state = random_state
@@ -23,7 +31,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
return {
"requires_positive_X": True,
"requires_positive_y": True,
"preserve_dtype": [np.int64, np.int32],
"preserve_dtype": [np.int32, np.int64],
"requires_y": True,
}
@@ -32,35 +40,68 @@ class BayesBase(BaseEstimator, ClassifierMixin):
"""Return the version of the package."""
return __version__
def nodes_leaves(self):
"""To keep compatiblity with the benchmark platform"""
def nodes_edges(self):
if hasattr(self, "dag_"):
return len(self.dag_), len(self.dag_.edges())
return 0, 0
@staticmethod
def default_class_name():
return "class"
def build_dataset(self):
self.dataset_ = pd.DataFrame(
self.X_, columns=self.feature_names_in_, dtype=np.int32
)
self.dataset_[self.class_name_] = self.y_
if self.sample_weight_ is not None:
self.dataset_["_weight"] = self.sample_weight_
def _check_params_fit(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit"""
# Check that X and y have correct shape
X, y = check_X_y(X, y)
X = self._validate_data(X, reset=True)
# Store the classes seen during fit
self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0]
# Default values
self.class_name_ = "class"
self.features_ = [f"feature_{i}" for i in range(X.shape[1])]
self.weighted_ = False
self.sample_weight_ = None
self.class_name_ = self.default_class_name()
self.features_ = default_feature_names(X.shape[1])
for key, value in kwargs.items():
if key in expected_args:
setattr(self, f"{key}_", value)
else:
raise ValueError(f"Unexpected argument: {key}")
self.feature_names_in_ = self.features_
# used for local discretization
self.indexed_features_ = {
feature: i for i, feature in enumerate(self.features_)
}
if self.random_state is not None:
random.seed(self.random_state)
if len(self.features_) != X.shape[1]:
if len(self.feature_names_in_) != X.shape[1]:
raise ValueError(
"Number of features does not match the number of columns in X"
)
self.n_features_in_ = X.shape[1]
return X, y
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum([len(item) for _, item in self.model_.states.items()])
return 0
@property
def depth_(self):
return self.states_
def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a classifier.
"""Fit classifier
Parameters
----------
@@ -97,28 +138,43 @@ class BayesBase(BaseEstimator, ClassifierMixin):
>>> model.fit(train_data, train_y, features=features, class_name='E')
TAN(random_state=17)
"""
X_, y_ = self._check_params(X, y, kwargs)
self.X_, self.y_ = self._check_params(X, y, kwargs)
# Store the information needed to build the model
self.X_ = X_
self.y_ = y_
self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
self.dataset_[self.class_name_] = self.y_
self.build_dataset()
# Build the DAG
self._build()
# Train the model
self._train()
self._train(kwargs)
self.fitted_ = True
# To keep compatibility with the benchmark platform
self.nodes_leaves = self.nodes_edges
# Return the classifier
return self
def _train(self):
def _build(self):
"""This method should be implemented by the subclasses to
build the DAG
"""
...
def _train(self, kwargs):
"""Build and train a BayesianNetwork from the DAG and the dataset
Parameters
----------
kwargs : dict
fit parameters
"""
self.model_ = BayesianNetwork(
self.dag_.edges(), show_progress=self.show_progress
)
states = dict(state_names=kwargs.pop("state_names", []))
self.model_.fit(
self.dataset_,
estimator=BayesianEstimator,
prior_type="K2",
weighted=self.weighted_,
**states,
)
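# A minimal usage sketch (hypothetical feature names and states) of how
# state_names reaches the estimator through fit():
#
#   clf = TAN(random_state=17, show_progress=False)
#   clf.fit(X, y, features=["f0", "f1"], class_name="class",
#           state_names={"f0": [0, 1, 2], "f1": [0, 1], "class": [0, 1]})
#
# Declaring state_names explicitly keeps the CPD tables aligned even when
# some discretized states never appear in the training sample.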
def predict(self, X):
@@ -169,13 +225,15 @@ class BayesBase(BaseEstimator, ClassifierMixin):
"""
# Check if fit has been called
check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation
X = check_array(X)
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
dataset = pd.DataFrame(
X, columns=self.feature_names_in_, dtype=np.int32
)
return self.model_.predict(dataset).values.ravel()
def plot(self, title="", node_size=800):
warnings.simplefilter("ignore", UserWarning)
nx.draw_circular(
self.model_,
with_labels=True,
@@ -208,7 +266,7 @@ class TAN(BayesBase):
The classes seen at :meth:`fit`.
class_name_ : str
The name of the class column
features_ : list
feature_names_in_ : list
The list of features names
head_ : int
The index of the node used as head for the initial DAG
@@ -227,21 +285,47 @@ class TAN(BayesBase):
def _check_params(self, X, y, kwargs):
self.head_ = 0
expected_args = ["class_name", "features", "head"]
expected_args = ["class_name", "features", "head", "state_names"]
X, y = self._check_params_fit(X, y, expected_args, kwargs)
if self.head_ == "random":
self.head_ = random.randint(0, len(self.features_) - 1)
if self.head_ is not None and self.head_ >= len(self.features_):
self.head_ = random.randint(0, self.n_features_in_ - 1)
if self.head_ is not None and self.head_ >= self.n_features_in_:
raise ValueError("Head index out of range")
return X, y
def _build(self):
est = TreeSearch(self.dataset_, root_node=self.features_[self.head_])
est = TreeSearch(
self.dataset_, root_node=self.feature_names_in_[self.head_]
)
self.dag_ = est.estimate(
estimator_type="tan",
class_node=self.class_name_,
show_progress=self.show_progress,
)
# Code taken from pgmpy
# n_jobs = -1
# weights = TreeSearch._get_conditional_weights(
# self.dataset_,
# self.class_name_,
# "mutual_info",
# n_jobs,
# self.show_progress,
# )
# # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
# class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
# 0
# ][0]
# weights = np.delete(weights, class_node_idx, axis=0)
# weights = np.delete(weights, class_node_idx, axis=1)
# reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
# D = TreeSearch._create_tree_and_dag(
# weights, reduced_columns, self.feature_names_in_[self.head_]
# )
# # Step 4.3: Add edges from class_node to all other nodes.
# D.add_edges_from(
# [(self.class_name_, node) for node in reduced_columns]
# )
# self.dag_ = D
class KDB(BayesBase):
@@ -253,46 +337,55 @@ class KDB(BayesBase):
)
def _check_params(self, X, y, kwargs):
expected_args = ["class_name", "features"]
expected_args = [
"class_name",
"features",
"state_names",
"sample_weight",
"weighted",
]
return self._check_params_fit(X, y, expected_args, kwargs)
def _build(self):
"""
1. For each feature Xi, compute mutual information, I(Xi;C), where C is the class.
2. Compute class conditional mutual information I(Xi;Xj|C), for each pair of features Xi and Xj, where i≠j.
3. Let the used variable list, S, be empty.
4. Let the Bayesian network being constructed, BN, begin with a single class node, C.
5. Repeat until S includes all domain features
5.1. Select feature Xmax which is not in S and has the largest value I(Xmax;C).
5.2. Add a node to BN representing Xmax.
5.3. Add an arc from C to Xmax in BN.
5.4. Add m = min(|S|, k) arcs from m distinct features Xj in S with the highest value for I(Xmax;Xj|C).
5.5. Add Xmax to S.
Compute the conditional probability inferred by the structure of BN by using counts from DB, and output BN.
"""
def add_m_edges(dag, idx, S_nodes, conditional_weights):
def _add_m_edges(self, dag, idx, S_nodes, conditional_weights):
n_edges = min(self.k, len(S_nodes))
cond_w = conditional_weights.copy()
exit_cond = self.k == 0
num = 0
while not exit_cond:
max_minfo = np.argmax(cond_w[idx, :])
if (
max_minfo in S_nodes
and cond_w[idx, max_minfo] > self.theta
):
if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
try:
dag.add_edge(
self.features_[max_minfo], self.features_[idx]
self.feature_names_in_[max_minfo],
self.feature_names_in_[idx],
)
num += 1
except ValueError:
# Loops are not allowed
pass
cond_w[idx, max_minfo] = -1
exit_cond = num == n_edges or np.all(cond_w[idx, :] <= 0)
exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)
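# Worked note on the new exit condition (hypothetical numbers): with k=2,
# theta=0.03 and cond_w[idx] = [0.10, 0.04, 0.02], at most the first two
# candidates get linked; once every remaining weight is <= theta the loop
# stops even if fewer than n_edges arcs were added.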
def _build(self):
"""
1. For each feature Xi, compute mutual information, I(Xi;C),
where C is the class.
2. Compute class conditional mutual information I(Xi;Xj|C), for each
pair of features Xi and Xj, where i≠j.
3. Let the used variable list, S, be empty.
4. Let the DAG being constructed, BN, begin with a single
class node, C.
5. Repeat until S includes all domain features
5.1. Select feature Xmax which is not in S and has the largest value
I(Xmax;C).
5.2. Add a node to BN representing Xmax.
5.3. Add an arc from C to Xmax in BN.
5.4. Add m = min(|S|, k) arcs from m distinct features Xj in S with
the highest value for I(Xmax;Xj|C).
5.5. Add Xmax to S.
Compute the conditional probability inferred by the structure of BN by
using counts from DB, and output BN.
"""
# 1. get the mutual information between each feature and the class
mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
# 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -301,73 +394,522 @@ class KDB(BayesBase):
)._get_conditional_weights(
self.dataset_, self.class_name_, show_progress=self.show_progress
)
# 3.
# 3. Let the used variable list, S, be empty.
S_nodes = []
# 4.
dag = BayesianNetwork()
# 4. Let the DAG being constructed, BN, begin with a single class node
dag = BayesianNetwork(show_progress=self.show_progress)
dag.add_node(self.class_name_) # , state_names=self.classes_)
# 5. 5.1
# 5. Repeat until S includes all domain features
# 5.1 Select feature Xmax which is not in S and has the largest value
for idx in np.argsort(mutual):
# 5.2
feature = self.features_[idx]
# 5.2 Add a node to BN representing Xmax.
feature = self.feature_names_in_[idx]
dag.add_node(feature)
# 5.3
# 5.3 Add an arc from C to Xmax in BN.
dag.add_edge(self.class_name_, feature)
# 5.4
add_m_edges(dag, idx, S_nodes, conditional_weights)
# 5.5
# 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S
self._add_m_edges(dag, idx, S_nodes, conditional_weights)
# 5.5 Add Xmax to S.
S_nodes.append(idx)
self.dag_ = dag
class AODE(BayesBase, BaseEnsemble):
def __init__(self, show_progress=False, random_state=None):
def build_spodes(features, class_name):
"""Build SPODE estimators (Super Parent One Dependent Estimator)"""
class_edges = [(class_name, f) for f in features]
for idx in range(len(features)):
feature_edges = [
(features[idx], f) for f in features if f != features[idx]
]
feature_edges.extend(class_edges)
model = BayesianNetwork(feature_edges, show_progress=False)
yield model
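# Illustrative expansion (hypothetical names): for features ["f0", "f1", "f2"]
# and class "c", the first generated SPODE has edges
#   [("f0", "f1"), ("f0", "f2"), ("c", "f0"), ("c", "f1"), ("c", "f2")]
# i.e. f0 acting as super-parent on top of the naive-Bayes arcs from "c".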
class SPODE(BayesBase):
def _check_params(self, X, y, kwargs):
expected_args = [
"class_name",
"features",
"state_names",
"sample_weight",
"weighted",
]
return self._check_params_fit(X, y, expected_args, kwargs)
class AODE(ClassifierMixin, BaseEnsemble):
def __init__(
self,
show_progress=False,
random_state=None,
estimator=None,
):
self.show_progress = show_progress
self.random_state = random_state
super().__init__(estimator=estimator)
def _validate_estimator(self) -> None:
"""Check the estimator and set the estimator_ attribute."""
super()._validate_estimator(
default=SPODE(
random_state=self.random_state,
show_progress=self.show_progress,
)
)
def fit(self, X, y, **kwargs):
self.n_features_in_ = X.shape[1]
self.feature_names_in_ = kwargs.get(
"features", default_feature_names(self.n_features_in_)
)
self.class_name_ = kwargs.get("class_name", "class")
# build estimator
self._validate_estimator()
self.X_ = X
self.y_ = y
self.n_samples_ = X.shape[0]
self.estimators_ = []
self._train(kwargs)
self.fitted_ = True
# To keep compatibility with the benchmark platform
self.nodes_leaves = self.nodes_edges
return self
def _train(self, kwargs):
for dag in build_spodes(self.feature_names_in_, self.class_name_):
estimator = clone(self.estimator_)
estimator.dag_ = estimator.model_ = dag
estimator.fit(self.X_, self.y_, **kwargs)
self.estimators_.append(estimator)
def predict(self, X: np.ndarray) -> np.ndarray:
n_samples = X.shape[0]
n_estimators = len(self.estimators_)
result = np.empty((n_samples, n_estimators))
for index, estimator in enumerate(self.estimators_):
result[:, index] = estimator.predict(X)
return mode(result, axis=1, keepdims=False).mode.ravel()
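# The ensemble prediction is a plain majority vote: e.g. a row of per-SPODE
# outputs [0, 0, 1, 0] collapses to class 0 via scipy's mode().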
def version(self):
if hasattr(self, "fitted_"):
return self.estimator_.version()
return SPODE(None, False).version()
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum(
[
len(item)
for model in self.estimators_
for _, item in model.model_.states.items()
]
) / len(self.estimators_)
return 0
@property
def depth_(self):
return self.states_
def nodes_edges(self):
nodes = 0
edges = 0
if hasattr(self, "fitted_"):
nodes = sum([len(x.dag_) for x in self.estimators_])
edges = sum([len(x.dag_.edges()) for x in self.estimators_])
return nodes, edges
def plot(self, title=""):
warnings.simplefilter("ignore", UserWarning)
for idx, model in enumerate(self.estimators_):
model.plot(title=f"{idx} {title}")
class TANNew(TAN):
def __init__(
self,
show_progress=False,
random_state=None,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
super().__init__(
show_progress=show_progress, random_state=random_state
)
def _check_params(self, X, y, kwargs):
expected_args = ["class_name", "features"]
return self._check_params_fit(X, y, expected_args, kwargs)
def fit(self, X, y, **kwargs):
self.estimator_ = Proposal(self)
self.estimator_.fit(X, y, **kwargs)
return self
def _build(self):
def predict(self, X):
return self.estimator_.predict(X)
self.dag_ = None
def _train(self):
"""Build SPODE estimators (Super Parent One Dependent Estimator)"""
self.models_ = []
class_edges = [(self.class_name_, f) for f in self.features_]
for idx in range(len(self.features_)):
feature_edges = [
(self.features_[idx], f)
for f in self.features_
if f != self.features_[idx]
]
feature_edges.extend(class_edges)
model = BayesianNetwork(
feature_edges, show_progress=self.show_progress
class KDBNew(KDB):
def __init__(
self,
k=2,
show_progress=False,
random_state=None,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
super().__init__(
k=k, show_progress=show_progress, random_state=random_state
)
model.fit(
self.dataset_,
estimator=BayesianEstimator,
prior_type="K2",
)
self.models_.append(model)
def plot(self, title=""):
for idx, model in enumerate(self.models_):
self.model_ = model
super().plot(title=f"{idx} {title}")
def fit(self, X, y, **kwargs):
self.estimator_ = Proposal(self)
self.estimator_.fit(X, y, **kwargs)
return self
def predict(self, X):
return self.estimator_.predict(X)
class SPODENew(SPODE):
"""This class implements a classifier for the SPODE algorithm similar to
TANNew and KDBNew"""
def __init__(
self,
random_state,
show_progress,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
super().__init__(
random_state=random_state, show_progress=show_progress
)
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
class AODENew(AODE):
def __init__(
self,
random_state=None,
show_progress=False,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
super().__init__(
random_state=random_state,
show_progress=show_progress,
estimator=Proposal(
SPODENew(
random_state=random_state,
show_progress=show_progress,
discretizer_depth=discretizer_depth,
discretizer_length=discretizer_length,
discretizer_cuts=discretizer_cuts,
)
),
)
def _train(self, kwargs):
for dag in build_spodes(self.feature_names_in_, self.class_name_):
proposal = clone(self.estimator_)
proposal.estimator.dag_ = proposal.estimator.model_ = dag
self.estimators_.append(proposal.fit(self.X_, self.y_, **kwargs))
self.n_estimators_ = len(self.estimators_)
def predict(self, X: np.ndarray) -> np.ndarray:
check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation
X = self._validate_data(X, reset=False)
n_samples = X.shape[0]
n_estimators = len(self.models_)
result = np.empty((n_samples, n_estimators))
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
for index, model in enumerate(self.models_):
result[:, index] = model.predict(dataset).values.ravel()
X = check_array(X)
result = np.empty((X.shape[0], self.n_estimators_))
for index, model in enumerate(self.estimators_):
result[:, index] = model.predict(X)
return mode(result, axis=1, keepdims=False).mode.ravel()
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum(
[
len(item)
for model in self.estimators_
for _, item in model.estimator.model_.states.items()
]
) / len(self.estimators_)
return 0
@property
def depth_(self):
return self.states_
def nodes_edges(self):
nodes = 0
edges = 0
if hasattr(self, "fitted_"):
nodes = sum([len(x.estimator.dag_) for x in self.estimators_])
edges = sum(
[len(x.estimator.dag_.edges()) for x in self.estimators_]
)
return nodes, edges
def plot(self, title=""):
warnings.simplefilter("ignore", UserWarning)
for idx, model in enumerate(self.estimators_):
model.estimator.plot(title=f"{idx} {title}")
def version(self):
if hasattr(self, "fitted_"):
return self.estimator_.estimator.version()
return SPODENew(None, False).version()
class Proposal(BaseEstimator):
def __init__(self, estimator):
self.estimator = estimator
self.class_type = estimator.__class__
def fit(self, X, y, **kwargs):
# Check parameters
self.estimator._check_params(X, y, kwargs)
# Discretize train data
self.discretizer_ = FImdlp(
n_jobs=1,
max_depth=self.estimator.discretizer_depth,
min_length=self.estimator.discretizer_length,
max_cuts=self.estimator.discretizer_cuts,
)
self.Xd = self.discretizer_.fit_transform(X, y)
kwargs = self.update_kwargs(y, kwargs)
# Build the model
super(self.class_type, self.estimator).fit(self.Xd, y, **kwargs)
# Local discretization based on the model
self._local_discretization()
# self.check_integrity("fit", self.Xd)
self.fitted_ = True
return self
def predict(self, X):
# Check if fit has been called
check_is_fitted(self, ["fitted_"])
# Input validation
X = check_array(X)
Xd = self.discretizer_.transform(X)
# self.check_integrity("predict", Xd)
return super(self.class_type, self.estimator).predict(Xd)
def update_kwargs(self, y, kwargs):
features = (
kwargs["features"]
if "features" in kwargs
else default_feature_names(self.Xd.shape[1])
)
states = {
features[i]: self.discretizer_.get_states_feature(i)
for i in range(self.Xd.shape[1])
}
class_name = (
kwargs["class_name"]
if "class_name" in kwargs
else self.estimator.default_class_name()
)
states[class_name] = np.unique(y).tolist()
kwargs["state_names"] = states
self.state_names_ = states
self.features_ = features
kwargs["features"] = features
kwargs["class_name"] = class_name
return kwargs
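# Shape of the kwargs built above (hypothetical cut points): for two features
# this produces
#   state_names = {"f0": [0, 1, 2], "f1": [0, 1], "class": [0, 1, 2]}
# so every discretized state is declared to the downstream estimator.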
def _local_discretization(self):
"""Discretize each feature with its fathers and the class"""
upgrade = False
# The order of local discretization matters; processing features in plain
# index order (0, 1, 2, ...) does not work
ancestral_order = list(nx.topological_sort(self.estimator.dag_))
for feature in ancestral_order:
if feature == self.estimator.class_name_:
continue
idx = self.estimator.indexed_features_[feature]
fathers = self.estimator.dag_.get_parents(feature)
if len(fathers) > 1:
# First remove the class name as it will be added later
fathers.remove(self.estimator.class_name_)
# Get the fathers indices
features = [
self.estimator.indexed_features_[f] for f in fathers
]
# Update the discretization of the feature
self.Xd[:, idx] = self.discretizer_.join_fit(
# each feature has to use the previously discretized data
target=idx,
features=features,
data=self.Xd,
)
upgrade = True
if upgrade:
# Update the dataset
self.estimator.X_ = self.Xd
self.estimator.build_dataset()
self.state_names_ = {
key: self.discretizer_.get_states_feature(value)
for key, value in self.estimator.indexed_features_.items()
}
states = {"state_names": self.state_names_}
# Update the model
self.estimator.model_.fit(
self.estimator.dataset_,
estimator=BayesianEstimator,
prior_type="K2",
**states,
)
# def check_integrity(self, source, X):
# # print(f"Checking integrity of {source} data")
# for i in range(X.shape[1]):
# if not set(np.unique(X[:, i]).tolist()).issubset(
# set(self.state_names_[self.features_[i]])
# ):
# print(
# "i",
# i,
# "features[i]",
# self.features_[i],
# "np.unique(X[:, i])",
# np.unique(X[:, i]),
# "np.array(state_names[features[i]])",
# np.array(self.state_names_[self.features_[i]]),
# )
# raise ValueError("Discretization error")
class BoostSPODE(BayesBase):
def _check_params(self, X, y, kwargs):
expected_args = [
"class_name",
"features",
"state_names",
"sample_weight",
"weighted",
"sparent",
]
return self._check_params_fit(X, y, expected_args, kwargs)
def _build(self):
class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
feature_edges = [
(self.sparent_, f)
for f in self.feature_names_in_
if f != self.sparent_
]
feature_edges.extend(class_edges)
self.dag_ = DAG(feature_edges)
def _train(self, kwargs):
states = dict(state_names=kwargs.get("state_names", []))
self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
self.model_.fit(
self.dataset_,
estimator=BayesianEstimator,
prior_type="K2",
weighted=self.weighted_,
**states,
)
class BoostAODE(ClassifierMixin, BaseEnsemble):
def __init__(
self,
show_progress=False,
random_state=None,
estimator=None,
n_estimators=10,
):
self.show_progress = show_progress
self.random_state = random_state
self.n_estimators = n_estimators
super().__init__(estimator=estimator)
def _validate_estimator(self) -> None:
"""Check the estimator and set the estimator_ attribute."""
super()._validate_estimator(
default=BoostSPODE(
random_state=self.random_state,
show_progress=self.show_progress,
)
)
def fit(self, X, y, **kwargs):
self.n_features_in_ = X.shape[1]
self.feature_names_in_ = kwargs.get(
"features", default_feature_names(self.n_features_in_)
)
self.class_name_ = kwargs.get("class_name", "class")
self.X_ = X
self.y_ = y
self.n_samples_ = X.shape[0]
self.estimators_ = []
self._validate_estimator()
self._train(kwargs)
self.fitted_ = True
# To keep compatibility with the benchmark platform
self.nodes_leaves = self.nodes_edges
return self
def _train(self, kwargs):
"""Build boosted SPODEs"""
weights = [1 / self.n_samples_] * self.n_samples_
# Step 0: Set the finish condition
for num in range(self.n_estimators):
# Step 1: Build ranking with mutual information
# WARNING, this is wrong: the ranking is not updated with the weights,
# so it will always select the same feature
feature = (
SelectKBest(k=1)
.fit(self.X_, self.y_)
.get_feature_names_out(self.feature_names_in_)
.tolist()[0]
)
# Step 2: Build & train spode with the first feature as sparent
estimator = clone(self.estimator_)
_args = kwargs.copy()
_args["sparent"] = feature
_args["sample_weight"] = weights
_args["weighted"] = True
# Step 2.1: build dataset
# Step 2.2: Train the model
estimator.fit(self.X_, self.y_, **_args)
# Step 3: Compute errors (epsilon sub m & alpha sub m)
# Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
y_pred = estimator.predict(self.X_)
em = np.sum(weights * (y_pred != self.y_)) / np.sum(weights)
am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
# Step 3.2: Update weights for next classifier
weights = [
wm * np.exp(am * (ym != y_pred))
for wm, ym in zip(weights, self.y_)
]
# Step 4: Add the new model
self.estimators_.append(estimator)
"""
class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
feature_edges = [
(sparent, f) for f in self.feature_names_in_ if f != sparent
]
self.weights_ = weights.copy() if weights is not None else None
feature_edges.extend(class_edges)
self.model_ = BayesianNetwork(feature_edges, show_progress=False)
return self.model_
"""

Five binary image files changed (not shown): four added (55 KiB, 55 KiB, 49 KiB, 44 KiB) and one updated (50 KiB → 49 KiB).


@@ -0,0 +1,38 @@
import pytest
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
@pytest.fixture
def iris():
dataset = load_iris()
X = dataset["data"]
y = dataset["target"]
features = dataset["feature_names"]
# Make the sklearn iris dataset match the values in our iris.arff dataset
patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
for key, value in patch.items():
X[key] = value[1]
return X, y, features
@pytest.fixture
def data(iris):
return iris[0], iris[1]
@pytest.fixture
def features(iris):
return iris[2]
@pytest.fixture
def class_name():
return "class"
@pytest.fixture
def data_disc(data):
clf = FImdlp()
X, y = data
return clf.fit_transform(X, y), y


@@ -1,6 +1,5 @@
import pytest
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
@@ -10,28 +9,21 @@ from bayesclass.clfs import AODE
from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture
def clf():
return AODE()
return AODE(random_state=17)
def test_AODE_default_hyperparameters(data, clf):
def test_AODE_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state is None
clf = AODE(show_progress=True, random_state=17)
assert clf.show_progress
assert clf.random_state == 17
clf.fit(*data)
clf = AODE(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data_disc)
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -42,50 +34,66 @@ def test_AODE_default_hyperparameters(data, clf):
@image_comparison(
baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
)
def test_AODE_plot(data, clf):
def test_AODE_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
dataset = load_iris(as_frame=True)
clf.fit(*data, features=dataset["feature_names"])
clf.fit(*data_disc, features=features)
clf.plot("AODE Iris")
def test_AODE_version(clf):
def test_AODE_version(clf, features, data_disc):
"""Check AODE version."""
assert __version__ == clf.version()
clf.fit(*data_disc, features=features)
assert __version__ == clf.version()
def test_AODE_nodes_leaves(clf):
assert clf.nodes_leaves() == (0, 0)
def test_AODE_nodes_edges(clf, data_disc):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc)
assert clf.nodes_leaves() == (20, 28)
def test_AODE_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
def test_AODE_states(clf, data_disc):
assert clf.states_ == 0
clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_AODE_classifier(data_disc, clf):
clf.fit(*data_disc)
attribs = [
"feature_names_in_",
"class_name_",
"n_features_in_",
"X_",
"y_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
X = data_disc[0]
y = data_disc[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 147
assert sum(y == y_pred) == 146
def test_AODE_wrong_num_features(data, clf):
def test_AODE_wrong_num_features(data_disc, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_AODE_wrong_hyperparam(data, clf):
def test_AODE_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
clf.fit(*data_disc, wrong_param="wrong_param")
def test_AODE_error_size_predict(data, clf):
X, y = data
def test_AODE_error_size_predict(data_disc, clf):
X, y = data_disc
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))


@@ -0,0 +1,123 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import AODENew
from .._version import __version__
@pytest.fixture
def clf():
return AODENew(random_state=17)
def test_AODENew_default_hyperparameters(data, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state == 17
clf = AODENew(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data)
assert clf.class_name_ == "class"
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
@image_comparison(
baseline_images=["line_dashes_AODENew"],
remove_text=True,
extensions=["png"],
)
def test_AODENew_plot(data, features, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
clf.fit(*data, features=features)
clf.plot("AODE Iris")
def test_AODENew_version(clf, data):
"""Check AODENew version."""
assert __version__ == clf.version()
clf.fit(*data)
assert __version__ == clf.version()
def test_AODENew_nodes_edges(clf, data):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data)
assert clf.nodes_leaves() == (20, 28)
def test_AODENew_states(clf, data):
assert clf.states_ == 0
clf.fit(*data)
assert clf.states_ == 17.75
assert clf.depth_ == clf.states_
def test_AODENew_classifier(data, clf):
clf.fit(*data)
attribs = [
"feature_names_in_",
"class_name_",
"n_features_in_",
"X_",
"y_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 146
def test_AODENew_local_discretization(clf, data_disc):
expected_data = [
[-1, [0, -1], [0, -1], [0, -1]],
[[1, -1], -1, [1, -1], [1, -1]],
[[2, -1], [2, -1], -1, [2, -1]],
[[3, -1], [3, -1], [3, -1], -1],
]
clf.fit(*data_disc)
for idx, estimator in enumerate(clf.estimators_):
expected = expected_data[idx]
for feature in range(4):
computed = estimator.discretizer_.target_[feature]
if type(computed) == list:
for j, k in zip(expected[feature], computed):
assert j == k
else:
assert (
expected[feature]
== estimator.discretizer_.target_[feature]
)
def test_AODENew_wrong_num_features(data, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
def test_AODENew_wrong_hyperparam(data, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
def test_AODENew_error_size_predict(data, clf):
X, y = data
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))
clf.predict(X_diff_size)


@@ -0,0 +1,100 @@
import pytest
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import BoostAODE
from .._version import __version__
@pytest.fixture
def clf():
return BoostAODE(random_state=17)
def test_BoostAODE_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state == 17
clf = BoostAODE(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data_disc)
assert clf.class_name_ == "class"
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
# @image_comparison(
# baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
# )
# def test_BoostAODE_plot(data_disc, features, clf):
# # mpl_test_settings will automatically clean these internal side effects
# mpl_test_settings
# clf.fit(*data_disc, features=features)
# clf.plot("AODE Iris")
# def test_BoostAODE_version(clf, features, data_disc):
# """Check AODE version."""
# assert __version__ == clf.version()
# clf.fit(*data_disc, features=features)
# assert __version__ == clf.version()
# def test_BoostAODE_nodes_edges(clf, data_disc):
# assert clf.nodes_edges() == (0, 0)
# clf.fit(*data_disc)
# assert clf.nodes_leaves() == (20, 28)
# def test_BoostAODE_states(clf, data_disc):
# assert clf.states_ == 0
# clf.fit(*data_disc)
# assert clf.states_ == 19
# assert clf.depth_ == clf.states_
# def test_BoostAODE_classifier(data_disc, clf):
# clf.fit(*data_disc)
# attribs = [
# "feature_names_in_",
# "class_name_",
# "n_features_in_",
# "X_",
# "y_",
# ]
# for attr in attribs:
# assert hasattr(clf, attr)
# X = data_disc[0]
# y = data_disc[1]
# y_pred = clf.predict(X)
# assert y_pred.shape == (X.shape[0],)
# assert sum(y == y_pred) == 146
# def test_BoostAODE_wrong_num_features(data_disc, clf):
# with pytest.raises(
# ValueError,
# match="Number of features does not match the number of columns in X",
# ):
# clf.fit(*data_disc, features=["feature_1", "feature_2"])
# def test_BoostAODE_wrong_hyperparam(data_disc, clf):
# with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
# clf.fit(*data_disc, wrong_param="wrong_param")
# def test_BoostAODE_error_size_predict(data_disc, clf):
# X, y = data_disc
# clf.fit(X, y)
# with pytest.raises(ValueError):
# X_diff_size = np.ones((10, X.shape[1] + 1))
# clf.predict(X_diff_size)


@@ -1,28 +1,21 @@
import pytest
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from pgmpy.models import BayesianNetwork
from bayesclass.clfs import KDB
from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture
def clf():
return KDB(k=3)
return KDB(k=3, show_progress=False)
def test_KDB_default_hyperparameters(data, clf):
def test_KDB_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state is None
@@ -31,9 +24,9 @@ def test_KDB_default_hyperparameters(data, clf):
assert clf.show_progress
assert clf.random_state == 17
assert clf.k == 3
clf.fit(*data)
clf.fit(*data_disc)
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -46,49 +39,85 @@ def test_KDB_version(clf):
assert __version__ == clf.version()
def test_KDB_nodes_leaves(clf):
assert clf.nodes_leaves() == (0, 0)
def test_KDB_nodes_edges(clf, data_disc):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc)
assert clf.nodes_leaves() == (5, 9)
def test_KDB_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "features_", "class_name_"]
def test_KDB_states(clf, data_disc):
assert clf.states_ == 0
clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_KDB_classifier(data_disc, clf):
clf.fit(*data_disc)
attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
X = data_disc[0]
y = data_disc[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 148
assert sum(y == y_pred) == 146
def test_KDB_classifier_weighted(data_disc, clf):
sample_weight = [1] * data_disc[0].shape[0]
sample_weight[:50] = [0] * 50
clf.fit(*data_disc, sample_weight=sample_weight, weighted=True)
assert clf.score(*data_disc) == 0.64
@image_comparison(
baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
)
def test_KDB_plot(data, clf):
def test_KDB_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
dataset = load_iris(as_frame=True)
clf.fit(*data, features=dataset["feature_names"])
clf.fit(*data_disc, features=features)
clf.plot("KDB Iris")
def test_KDB_wrong_num_features(data, clf):
def test_KDB_wrong_num_features(data_disc, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_KDB_wrong_hyperparam(data, clf):
def test_KDB_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
clf.fit(*data_disc, wrong_param="wrong_param")
def test_KDB_error_size_predict(data, clf):
X, y = data
def test_KDB_error_size_predict(data_disc, clf):
X, y = data_disc
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))
clf.predict(X_diff_size)
def test_KDB_dont_do_cycles():
clf = KDB(k=4)
dag = BayesianNetwork(show_progress=False)
clf.feature_names_in_ = [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
nodes = list(range(4))
weights = np.ones((4, 4))
for idx in range(1, 4):
dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
for idx in range(4):
clf._add_m_edges(dag, idx, nodes, weights)
assert len(dag.edges()) == 6


@@ -0,0 +1,132 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from pgmpy.models import BayesianNetwork
from bayesclass.clfs import KDBNew
from .._version import __version__
@pytest.fixture
def clf():
return KDBNew(k=3, show_progress=False)
def test_KDBNew_default_hyperparameters(data, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state is None
assert clf.theta == 0.03
clf = KDBNew(show_progress=True, random_state=17, k=3)
assert clf.show_progress
assert clf.random_state == 17
assert clf.k == 3
clf.fit(*data)
assert clf.class_name_ == "class"
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
def test_KDBNew_version(clf):
"""Check KDBNew version."""
assert __version__ == clf.version()
def test_KDBNew_nodes_edges(clf, data):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data)
assert clf.nodes_leaves() == (5, 9)
def test_KDBNew_states(clf, data):
assert clf.states_ == 0
clf.fit(*data)
assert clf.states_ == 22
assert clf.depth_ == clf.states_
def test_KDBNew_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 145
def test_KDBNew_local_discretization(clf, data):
expected = [[1, -1], -1, [0, 1, 3, -1], [1, -1]]
clf.fit(*data)
for feature in range(4):
computed = clf.estimator_.discretizer_.target_[feature]
if type(computed) == list:
for j, k in zip(expected[feature], computed):
assert j == k
else:
assert (
expected[feature]
== clf.estimator_.discretizer_.target_[feature]
)
@image_comparison(
baseline_images=["line_dashes_KDBNew"],
remove_text=True,
extensions=["png"],
)
def test_KDBNew_plot(data, features, class_name, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
clf.fit(*data, features=features, class_name=class_name)
clf.plot("KDBNew Iris")
def test_KDBNew_wrong_num_features(data, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
def test_KDBNew_wrong_hyperparam(data, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
def test_KDBNew_error_size_predict(data, clf):
X, y = data
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))
clf.predict(X_diff_size)
def test_KDBNew_dont_do_cycles():
clf = KDBNew(k=4)
dag = BayesianNetwork(show_progress=False)
clf.feature_names_in_ = [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
nodes = list(range(4))
weights = np.ones((4, 4))
for idx in range(1, 4):
dag.add_edge(clf.feature_names_in_[0], clf.feature_names_in_[idx])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[2])
dag.add_edge(clf.feature_names_in_[1], clf.feature_names_in_[3])
dag.add_edge(clf.feature_names_in_[2], clf.feature_names_in_[3])
for idx in range(4):
clf._add_m_edges(dag, idx, nodes, weights)
assert len(dag.edges()) == 6


@@ -1,7 +1,5 @@
import pytest
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
@@ -10,29 +8,22 @@ from bayesclass.clfs import TAN
from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture
def clf():
return TAN()
return TAN(random_state=17, show_progress=False)
def test_TAN_default_hyperparameters(data, clf):
def test_TAN_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state is None
clf = TAN(show_progress=True, random_state=17)
assert clf.show_progress
assert clf.random_state == 17
clf.fit(*data)
clf = TAN(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data_disc)
assert clf.head_ == 0
assert clf.class_name_ == "class"
assert clf.features_ == [
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
@@ -45,59 +36,73 @@ def test_TAN_version(clf):
assert __version__ == clf.version()
def test_TAN_nodes_leaves(clf):
assert clf.nodes_leaves() == (0, 0)
def test_TAN_nodes_edges(clf, data_disc):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc, head="random")
assert clf.nodes_leaves() == (5, 7)
def test_TAN_random_head(data):
clf = TAN(random_state=17)
clf.fit(*data, head="random")
def test_TAN_states(clf, data_disc):
assert clf.states_ == 0
clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_TAN_random_head(clf, data_disc):
clf.fit(*data_disc, head="random")
assert clf.head_ == 3
def test_TAN_classifier(data, clf):
clf.fit(*data)
attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"]
def test_TAN_classifier(data_disc, clf):
clf.fit(*data_disc)
attribs = [
"classes_",
"X_",
"y_",
"head_",
"feature_names_in_",
"class_name_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
X = data_disc[0]
y = data_disc[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 147
assert sum(y == y_pred) == 146
@image_comparison(
baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"]
)
def test_TAN_plot(data, clf):
def test_TAN_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
dataset = load_iris(as_frame=True)
clf.fit(*data, features=dataset["feature_names"], head=0)
clf.fit(*data_disc, features=features, head=0)
clf.plot("TAN Iris head=0")
def test_KDB_wrong_num_features(data, clf):
def test_TAN_wrong_num_features(data_disc, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_TAN_wrong_hyperparam(data, clf):
def test_TAN_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
clf.fit(*data_disc, wrong_param="wrong_param")
def test_TAN_head_out_of_range(data, clf):
def test_TAN_head_out_of_range(data_disc, clf):
with pytest.raises(ValueError, match="Head index out of range"):
clf.fit(*data, head=4)
clf.fit(*data_disc, head=4)
def test_TAN_error_size_predict(data, clf):
X, y = data
def test_TAN_error_size_predict(data_disc, clf):
X, y = data_disc
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))


@@ -0,0 +1,120 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import TANNew
from .._version import __version__
@pytest.fixture
def clf():
return TANNew(random_state=17)
def test_TANNew_default_hyperparameters(data, clf):
# Test default values of hyperparameters
assert not clf.show_progress
assert clf.random_state == 17
clf = TANNew(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data)
assert clf.head_ == 0
assert clf.class_name_ == "class"
assert clf.feature_names_in_ == [
"feature_0",
"feature_1",
"feature_2",
"feature_3",
]
def test_TANNew_version(clf):
"""Check TANNew version."""
assert __version__ == clf.version()
def test_TANNew_nodes_edges(clf, data):
assert clf.nodes_edges() == (0, 0)
clf.fit(*data, head="random")
assert clf.nodes_leaves() == (5, 7)
def test_TANNew_states(clf, data):
assert clf.states_ == 0
clf.fit(*data)
assert clf.states_ == 18
assert clf.depth_ == clf.states_
def test_TANNew_random_head(clf, data):
clf.fit(*data, head="random")
assert clf.head_ == 3
def test_TANNew_local_discretization(clf, data):
expected = [-1, [0, -1], [0, -1], [1, -1]]
clf.fit(*data)
for feature in range(4):
assert (
expected[feature] == clf.estimator_.discretizer_.target_[feature]
)
def test_TANNew_classifier(data, clf):
clf.fit(*data)
attribs = [
"classes_",
"X_",
"y_",
"head_",
"feature_names_in_",
"class_name_",
]
for attr in attribs:
assert hasattr(clf, attr)
X = data[0]
y = data[1]
y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 146
@image_comparison(
baseline_images=["line_dashes_TANNew"],
remove_text=True,
extensions=["png"],
)
def test_TANNew_plot(data, features, clf):
# mpl_test_settings will automatically clean these internal side effects
mpl_test_settings
clf.fit(*data, features=features, head=0)
clf.plot("TANNew Iris head=0")
def test_TANNew_wrong_num_features(data, clf):
with pytest.raises(
ValueError,
match="Number of features does not match the number of columns in X",
):
clf.fit(*data, features=["feature_1", "feature_2"])
def test_TANNew_wrong_hyperparam(data, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param")
def test_TANNew_head_out_of_range(data, clf):
with pytest.raises(ValueError, match="Head index out of range"):
clf.fit(*data, head=4)
def test_TANNew_error_size_predict(data, clf):
X, y = data
clf.fit(X, y)
with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1))
clf.predict(X_diff_size)


@@ -1,14 +1,29 @@
import pytest
import numpy as np
from sklearn.utils.estimator_checks import check_estimator
from bayesclass.clfs import TAN, KDB, AODE
from bayesclass.clfs import BayesBase, TAN, KDB, AODE
@pytest.mark.parametrize("estimator", [TAN(), KDB(k=2), AODE()])
# @pytest.mark.parametrize("estimator", [AODE()])
def test_all_estimators(estimator):
def test_more_tags():
expected = {
"requires_positive_X": True,
"requires_positive_y": True,
"preserve_dtype": [np.int32, np.int64],
"requires_y": True,
}
clf = BayesBase(None, True)
computed = clf._more_tags()
for key, value in expected.items():
assert key in computed
assert computed[key] == value
# @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
@pytest.mark.parametrize("estimators", [AODE()])
def test_all_estimators(estimators):
i = 0
for estimator, test in check_estimator(estimator, generate_only=True):
for estimator, test in check_estimator(estimators, generate_only=True):
print(i := i + 1, test)
# test(estimator)

patch_pgmpy_0.1.22.diff (new file, 32 lines)

@@ -0,0 +1,32 @@
diff --git a/pgmpy/models/BayesianNetwork.py b/pgmpy/models/BayesianNetwork.py
index bd90122d..70ae38f7 100644
--- a/pgmpy/models/BayesianNetwork.py
+++ b/pgmpy/models/BayesianNetwork.py
@@ -27,7 +27,7 @@ class BayesianNetwork(DAG):
Base class for Bayesian Models.
"""
- def __init__(self, ebunch=None, latents=set()):
+ def __init__(self, ebunch=None, latents=set(), show_progress=False):
"""
Initializes a Bayesian Model.
A models stores nodes and edges with conditional probability
@@ -95,6 +95,7 @@ class BayesianNetwork(DAG):
>>> len(G) # number of nodes in graph
3
"""
+ self.show_progress = show_progress
super(BayesianNetwork, self).__init__(ebunch=ebunch, latents=latents)
self.cpds = []
self.cardinalities = defaultdict(int)
@@ -738,7 +739,9 @@ class BayesianNetwork(DAG):
show_progress=False,
)
for index, data_point in tqdm(
- data_unique.iterrows(), total=data_unique.shape[0]
+ data_unique.iterrows(),
+ total=data_unique.shape[0],
+ disable=not self.show_progress,
)
)


@@ -25,6 +25,7 @@ dependencies = [
"pgmpy",
"networkx",
"matplotlib",
"fimdlp",
]
requires-python = ">=3.8"
classifiers = [


@@ -1,5 +1,6 @@
numpy
scipy
pandas
scikit-learn
matplotlib
networkx