39 Commits
v0.1.0 ... main

Author SHA1 Message Date
212f7e5584 Add test_BoostAODE 2023-06-18 16:51:38 +02:00
a797381c00 Continue BoostAODE 2023-06-17 17:06:37 +02:00
3812d271e5 Add BoostAODE initial model 2023-06-15 14:28:35 +02:00
923a06b3be Patch pgmpy 0.1.22 show_progress 2023-06-15 14:22:24 +02:00
c906d6a361 Add weights to KDB classifier 2023-06-15 14:13:15 +02:00
Ricardo Montañana Gómez
f0f7c43944 Merge pull request #3 from Doctorado-ML/localdiscretization
Localdiscretization
2023-05-15 11:42:52 +02:00
f9b35f61f0 Use ancest-order to process local discretization
Fix local discretization
Refactor tests
Unify iris dataset from sklearn with iris.arff
2023-04-20 01:20:33 +02:00
74cd8a6aa2 Add local discretization tests 2023-04-08 11:44:25 +02:00
9843f5f8db Refactor AODE & AODENew 2023-04-07 16:22:40 +02:00
c6390d9da9 Comment out the integrity check in Proposal 2023-03-30 23:23:23 +02:00
c9afafbf60 Fix AODENew tests 2023-03-30 21:03:42 +02:00
3af05c9511 First AODENew implementation working 2023-03-30 12:20:56 +02:00
80b1ab3699 Refactor AODE 2023-03-29 19:05:55 +02:00
5a772b0bca Begin AODENew with tests 2023-03-29 11:18:42 +02:00
ea251aca05 Begin AODE implementation 2023-03-23 22:15:38 +01:00
7b66097728 Add messages to check_integrity 2023-03-23 22:10:03 +01:00
ea8c5b805e Add KDBNew and TANNew tests 2023-03-23 14:13:01 +01:00
2ffc06b232 Update feature states setting for datasets 2023-02-13 17:34:15 +01:00
a5244f1c7f remove trace messages for first try 2023-02-12 11:25:40 +01:00
42ac57eb79 Continue with New estimators 2023-02-07 18:02:35 +01:00
63a2feef3a Begin refactorization of new estimators 2023-02-07 09:42:42 +01:00
3e049ac89d default_features_class_name 2023-02-05 20:18:44 +01:00
2a6547c71d Complete KDBNew 2023-02-05 00:30:25 +01:00
de45a94c9b Add KDBNew estimator 2023-02-04 17:39:32 +01:00
9019b878f0 docs: 📝 Add text comment to KDB algorithm 2023-02-01 23:42:32 +01:00
bba9255605 Merge branch 'localdiscretization' of github.com:/doctorado-ml/bayesclass into localdiscretization 2023-02-01 23:41:40 +01:00
41ca6fad5e fix: 🐛 Change exit condition in KDB add_m_edges method
Change test if every conditional weight is less or equal to zero for less or equal to theta
Add text comments to KDB algorithm
2023-02-01 23:40:42 +01:00
c88591dd64 fix: 🐛 Change exit condition in KDB add_m_edges method
Change test if every conditional weight is less or equal to zero for less or equal to theta
2023-02-01 23:33:05 +01:00
8089e4fd57 docs: 📝 shorten comment lines length to <80 2023-01-30 19:27:27 +01:00
6f9488f281 Add version command to Makefile 2023-01-28 18:51:55 +01:00
e837c6cef7 feat: Add feature_names_in_ to classifiers 2023-01-27 19:25:01 +01:00
a4edc74e8d Replace len(self.features_) by self.n_features_in_ 2023-01-27 12:34:34 +01:00
Ricardo Montañana Gómez
4d416959ad fix: 🐛 Fix depth_ property as an alias of states_ 2023-01-22 14:15:19 +01:00
Ricardo Montañana Gómez
bdd3f483d9 feat: 🧐 Add nodes, edges and states info to models 2023-01-22 14:01:54 +01:00
Ricardo Montañana Gómez
8fd796155d test: 🧪 Add cycle test in KDB to get 100% coverage 2023-01-17 11:33:55 +01:00
Ricardo Montañana Gómez
d08aea4681 fix AODE state_names mistake 2023-01-12 14:05:27 +01:00
Ricardo Montañana Gómez
dd2e0a3b7e Update state_names hyperparameter to fit tests
Add computed nodes to classifiers
2023-01-12 12:04:54 +01:00
65d41488cb Fix AODE state_names 2022-12-29 00:45:10 +01:00
e7300366ca Add fit_params to model fit 2022-12-28 19:15:34 +01:00
21 changed files with 1362 additions and 206 deletions

View File

@@ -37,6 +37,12 @@ doc-clean: ## Update documentation
audit: ## Audit pip audit: ## Audit pip
pip-audit pip-audit
version:
@echo "Current Python version .....: $(shell python --version)"
@echo "Current Bayesclass version .: $(shell python -c "from bayesclass import _version; print(_version.__version__)")"
@echo "Installed Bayesclass version: $(shell pip show bayesclass | grep Version | cut -d' ' -f2)"
@echo "Installed pgmpy version ....: $(shell pip show pgmpy | grep Version | cut -d' ' -f2)"
help: ## Show help message help: ## Show help message
@IFS=$$'\n' ; \ @IFS=$$'\n' ; \
help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \ help_lines=(`fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's/##/:/'`); \

View File

@@ -16,4 +16,8 @@ __all__ = [
"TAN", "TAN",
"KDB", "KDB",
"AODE", "AODE",
"KDBNew",
"AODENew",
"BoostAODE",
"BoostSPODE",
] ]

View File

@@ -1 +1 @@
__version__ = "0.1.0" __version__ = "0.1.1"

View File

@@ -1,8 +1,10 @@
import random import random
import warnings
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from scipy.stats import mode from scipy.stats import mode
from sklearn.base import ClassifierMixin, BaseEstimator from sklearn.base import clone, ClassifierMixin, BaseEstimator
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import BaseEnsemble from sklearn.ensemble import BaseEnsemble
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels from sklearn.utils.multiclass import unique_labels
@@ -10,10 +12,16 @@ from sklearn.feature_selection import mutual_info_classif
import networkx as nx import networkx as nx
from pgmpy.estimators import TreeSearch, BayesianEstimator from pgmpy.estimators import TreeSearch, BayesianEstimator
from pgmpy.models import BayesianNetwork from pgmpy.models import BayesianNetwork
from pgmpy.base import DAG
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from fimdlp.mdlp import FImdlp
from ._version import __version__ from ._version import __version__
def default_feature_names(num_features):
    """Return the placeholder column names ``feature_0``, ``feature_1``, ...

    Parameters
    ----------
    num_features : int
        how many names to generate

    Returns
    -------
    list of str
        one name per column, numbered from zero
    """
    names = []
    for position in range(num_features):
        names.append("feature_" + str(position))
    return names
class BayesBase(BaseEstimator, ClassifierMixin): class BayesBase(BaseEstimator, ClassifierMixin):
def __init__(self, random_state, show_progress): def __init__(self, random_state, show_progress):
self.random_state = random_state self.random_state = random_state
@@ -23,7 +31,7 @@ class BayesBase(BaseEstimator, ClassifierMixin):
return { return {
"requires_positive_X": True, "requires_positive_X": True,
"requires_positive_y": True, "requires_positive_y": True,
"preserve_dtype": [np.int64, np.int32], "preserve_dtype": [np.int32, np.int64],
"requires_y": True, "requires_y": True,
} }
@@ -32,35 +40,68 @@ class BayesBase(BaseEstimator, ClassifierMixin):
"""Return the version of the package.""" """Return the version of the package."""
return __version__ return __version__
def nodes_leaves(self): def nodes_edges(self):
"""To keep compatiblity with the benchmark platform""" if hasattr(self, "dag_"):
return len(self.dag_), len(self.dag_.edges())
return 0, 0 return 0, 0
@staticmethod
def default_class_name():
return "class"
def build_dataset(self):
self.dataset_ = pd.DataFrame(
self.X_, columns=self.feature_names_in_, dtype=np.int32
)
self.dataset_[self.class_name_] = self.y_
if self.sample_weight_ is not None:
self.dataset_["_weight"] = self.sample_weight_
def _check_params_fit(self, X, y, expected_args, kwargs): def _check_params_fit(self, X, y, expected_args, kwargs):
"""Check the common parameters passed to fit""" """Check the common parameters passed to fit"""
# Check that X and y have correct shape # Check that X and y have correct shape
X, y = check_X_y(X, y) X, y = check_X_y(X, y)
X = self._validate_data(X, reset=True)
# Store the classes seen during fit # Store the classes seen during fit
self.classes_ = unique_labels(y) self.classes_ = unique_labels(y)
self.n_classes_ = self.classes_.shape[0] self.n_classes_ = self.classes_.shape[0]
# Default values # Default values
self.class_name_ = "class" self.weighted_ = False
self.features_ = [f"feature_{i}" for i in range(X.shape[1])] self.sample_weight_ = None
self.class_name_ = self.default_class_name()
self.features_ = default_feature_names(X.shape[1])
for key, value in kwargs.items(): for key, value in kwargs.items():
if key in expected_args: if key in expected_args:
setattr(self, f"{key}_", value) setattr(self, f"{key}_", value)
else: else:
raise ValueError(f"Unexpected argument: {key}") raise ValueError(f"Unexpected argument: {key}")
self.feature_names_in_ = self.features_
# used for local discretization
self.indexed_features_ = {
feature: i for i, feature in enumerate(self.features_)
}
if self.random_state is not None: if self.random_state is not None:
random.seed(self.random_state) random.seed(self.random_state)
if len(self.features_) != X.shape[1]: if len(self.feature_names_in_) != X.shape[1]:
raise ValueError( raise ValueError(
"Number of features does not match the number of columns in X" "Number of features does not match the number of columns in X"
) )
self.n_features_in_ = X.shape[1]
return X, y return X, y
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum([len(item) for _, item in self.model_.states.items()])
return 0
@property
def depth_(self):
return self.states_
def fit(self, X, y, **kwargs): def fit(self, X, y, **kwargs):
"""A reference implementation of a fitting function for a classifier. """Fit classifier
Parameters Parameters
---------- ----------
@@ -97,28 +138,43 @@ class BayesBase(BaseEstimator, ClassifierMixin):
>>> model.fit(train_data, train_y, features=features, class_name='E') >>> model.fit(train_data, train_y, features=features, class_name='E')
TAN(random_state=17) TAN(random_state=17)
""" """
X_, y_ = self._check_params(X, y, kwargs) self.X_, self.y_ = self._check_params(X, y, kwargs)
# Store the information needed to build the model # Store the information needed to build the model
self.X_ = X_ self.build_dataset()
self.y_ = y_
self.dataset_ = pd.DataFrame(self.X_, columns=self.features_)
self.dataset_[self.class_name_] = self.y_
# Build the DAG # Build the DAG
self._build() self._build()
# Train the model # Train the model
self._train() self._train(kwargs)
self.fitted_ = True self.fitted_ = True
# To keep compatiblity with the benchmark platform
self.nodes_leaves = self.nodes_edges
# Return the classifier # Return the classifier
return self return self
def _train(self): def _build(self):
"""This method should be implemented by the subclasses to
build the DAG
"""
...
def _train(self, kwargs):
"""Build and train a BayesianNetwork from the DAG and the dataset
Parameters
----------
kwargs : dict
fit parameters
"""
self.model_ = BayesianNetwork( self.model_ = BayesianNetwork(
self.dag_.edges(), show_progress=self.show_progress self.dag_.edges(), show_progress=self.show_progress
) )
states = dict(state_names=kwargs.pop("state_names", []))
self.model_.fit( self.model_.fit(
self.dataset_, self.dataset_,
estimator=BayesianEstimator, estimator=BayesianEstimator,
prior_type="K2", prior_type="K2",
weighted=self.weighted_,
**states,
) )
def predict(self, X): def predict(self, X):
@@ -169,13 +225,15 @@ class BayesBase(BaseEstimator, ClassifierMixin):
""" """
# Check is fit had been called # Check is fit had been called
check_is_fitted(self, ["X_", "y_", "fitted_"]) check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation # Input validation
X = check_array(X) X = check_array(X)
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16") dataset = pd.DataFrame(
X, columns=self.feature_names_in_, dtype=np.int32
)
return self.model_.predict(dataset).values.ravel() return self.model_.predict(dataset).values.ravel()
def plot(self, title="", node_size=800): def plot(self, title="", node_size=800):
warnings.simplefilter("ignore", UserWarning)
nx.draw_circular( nx.draw_circular(
self.model_, self.model_,
with_labels=True, with_labels=True,
@@ -208,7 +266,7 @@ class TAN(BayesBase):
The classes seen at :meth:`fit`. The classes seen at :meth:`fit`.
class_name_ : str class_name_ : str
The name of the class column The name of the class column
features_ : list feature_names_in_ : list
The list of features names The list of features names
head_ : int head_ : int
The index of the node used as head for the initial DAG The index of the node used as head for the initial DAG
@@ -227,21 +285,47 @@ class TAN(BayesBase):
def _check_params(self, X, y, kwargs): def _check_params(self, X, y, kwargs):
self.head_ = 0 self.head_ = 0
expected_args = ["class_name", "features", "head"] expected_args = ["class_name", "features", "head", "state_names"]
X, y = self._check_params_fit(X, y, expected_args, kwargs) X, y = self._check_params_fit(X, y, expected_args, kwargs)
if self.head_ == "random": if self.head_ == "random":
self.head_ = random.randint(0, len(self.features_) - 1) self.head_ = random.randint(0, self.n_features_in_ - 1)
if self.head_ is not None and self.head_ >= len(self.features_): if self.head_ is not None and self.head_ >= self.n_features_in_:
raise ValueError("Head index out of range") raise ValueError("Head index out of range")
return X, y return X, y
def _build(self): def _build(self):
est = TreeSearch(self.dataset_, root_node=self.features_[self.head_]) est = TreeSearch(
self.dataset_, root_node=self.feature_names_in_[self.head_]
)
self.dag_ = est.estimate( self.dag_ = est.estimate(
estimator_type="tan", estimator_type="tan",
class_node=self.class_name_, class_node=self.class_name_,
show_progress=self.show_progress, show_progress=self.show_progress,
) )
# Code taken from pgmpy
# n_jobs = -1
# weights = TreeSearch._get_conditional_weights(
# self.dataset_,
# self.class_name_,
# "mutual_info",
# n_jobs,
# self.show_progress,
# )
# # Step 4.2: Construct chow-liu DAG on {data.columns - class_node}
# class_node_idx = np.where(self.dataset_.columns == self.class_name_)[
# 0
# ][0]
# weights = np.delete(weights, class_node_idx, axis=0)
# weights = np.delete(weights, class_node_idx, axis=1)
# reduced_columns = np.delete(self.dataset_.columns, class_node_idx)
# D = TreeSearch._create_tree_and_dag(
# weights, reduced_columns, self.feature_names_in_[self.head_]
# )
# # Step 4.3: Add edges from class_node to all other nodes.
# D.add_edges_from(
# [(self.class_name_, node) for node in reduced_columns]
# )
# self.dag_ = D
class KDB(BayesBase): class KDB(BayesBase):
@@ -253,46 +337,55 @@ class KDB(BayesBase):
) )
def _check_params(self, X, y, kwargs): def _check_params(self, X, y, kwargs):
expected_args = ["class_name", "features"] expected_args = [
"class_name",
"features",
"state_names",
"sample_weight",
"weighted",
]
return self._check_params_fit(X, y, expected_args, kwargs) return self._check_params_fit(X, y, expected_args, kwargs)
def _add_m_edges(self, dag, idx, S_nodes, conditional_weights):
n_edges = min(self.k, len(S_nodes))
cond_w = conditional_weights.copy()
exit_cond = self.k == 0
num = 0
while not exit_cond:
max_minfo = np.argmax(cond_w[idx, :])
if max_minfo in S_nodes and cond_w[idx, max_minfo] > self.theta:
try:
dag.add_edge(
self.feature_names_in_[max_minfo],
self.feature_names_in_[idx],
)
num += 1
except ValueError:
# Loops are not allowed
pass
cond_w[idx, max_minfo] = -1
exit_cond = num == n_edges or np.all(cond_w[idx, :] <= self.theta)
def _build(self): def _build(self):
""" """
1. For each feature Xi, compute mutual information, I(X;;C), where C is the class. 1. For each feature Xi, compute mutual information, I(X;;C),
2. Compute class conditional mutual information I(Xi;XjIC), f or each pair of features Xi and Xj, where i#j. where C is the class.
2. Compute class conditional mutual information I(Xi;XjIC), f or each
pair of features Xi and Xj, where i#j.
3. Let the used variable list, S, be empty. 3. Let the used variable list, S, be empty.
4. Let the Bayesian network being constructed, BN, begin with a single class node, C. 4. Let the DAG network being constructed, BN, begin with a single
class node, C.
5. Repeat until S includes all domain features 5. Repeat until S includes all domain features
5.1. Select feature Xmax which is not in S and has the largest value I(Xmax;C). 5.1. Select feature Xmax which is not in S and has the largest value
I(Xmax;C).
5.2. Add a node to BN representing Xmax. 5.2. Add a node to BN representing Xmax.
5.3. Add an arc from C to Xmax in BN. 5.3. Add an arc from C to Xmax in BN.
5.4. Add m =min(lSl,/c) arcs from m distinct features Xj in S with the highest value for I(Xmax;X,jC). 5.4. Add m = min(lSl,/c) arcs from m distinct features Xj in S with
the highest value for I(Xmax;X,jC).
5.5. Add Xmax to S. 5.5. Add Xmax to S.
Compute the conditional probabilility infered by the structure of BN by using counts from DB, and output BN. Compute the conditional probabilility infered by the structure of BN by
using counts from DB, and output BN.
""" """
def add_m_edges(dag, idx, S_nodes, conditional_weights):
n_edges = min(self.k, len(S_nodes))
cond_w = conditional_weights.copy()
exit_cond = self.k == 0
num = 0
while not exit_cond:
max_minfo = np.argmax(cond_w[idx, :])
if (
max_minfo in S_nodes
and cond_w[idx, max_minfo] > self.theta
):
try:
dag.add_edge(
self.features_[max_minfo], self.features_[idx]
)
num += 1
except ValueError:
# Loops are not allowed
pass
cond_w[idx, max_minfo] = -1
exit_cond = num == n_edges or np.all(cond_w[idx, :] <= 0)
# 1. get the mutual information between each feature and the class # 1. get the mutual information between each feature and the class
mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True) mutual = mutual_info_classif(self.X_, self.y_, discrete_features=True)
# 2. symmetric matrix where each element represents I(X, Y| class_node) # 2. symmetric matrix where each element represents I(X, Y| class_node)
@@ -301,73 +394,522 @@ class KDB(BayesBase):
)._get_conditional_weights( )._get_conditional_weights(
self.dataset_, self.class_name_, show_progress=self.show_progress self.dataset_, self.class_name_, show_progress=self.show_progress
) )
# 3. # 3. Let the used variable list, S, be empty.
S_nodes = [] S_nodes = []
# 4. # 4. Let the DAG being constructed, BN, begin with a single class node
dag = BayesianNetwork() dag = BayesianNetwork(show_progress=self.show_progress)
dag.add_node(self.class_name_) # , state_names=self.classes_) dag.add_node(self.class_name_) # , state_names=self.classes_)
# 5. 5.1 # 5. Repeat until S includes all domain features
# 5.1 Select feature Xmax which is not in S and has the largest value
for idx in np.argsort(mutual): for idx in np.argsort(mutual):
# 5.2 # 5.2 Add a node to BN representing Xmax.
feature = self.features_[idx] feature = self.feature_names_in_[idx]
dag.add_node(feature) dag.add_node(feature)
# 5.3 # 5.3 Add an arc from C to Xmax in BN.
dag.add_edge(self.class_name_, feature) dag.add_edge(self.class_name_, feature)
# 5.4 # 5.4 Add m = min(lSl,/c) arcs from m distinct features Xj in S
add_m_edges(dag, idx, S_nodes, conditional_weights) self._add_m_edges(dag, idx, S_nodes, conditional_weights)
# 5.5 # 5.5 Add Xmax to S.
S_nodes.append(idx) S_nodes.append(idx)
self.dag_ = dag self.dag_ = dag
class AODE(BayesBase, BaseEnsemble): def build_spodes(features, class_name):
def __init__(self, show_progress=False, random_state=None): """Build SPODE estimators (Super Parent One Dependent Estimator)"""
class_edges = [(class_name, f) for f in features]
for idx in range(len(features)):
feature_edges = [
(features[idx], f) for f in features if f != features[idx]
]
feature_edges.extend(class_edges)
model = BayesianNetwork(feature_edges, show_progress=False)
yield model
class SPODE(BayesBase):
    """BayesBase classifier that only declares which fit arguments it takes.

    No ``_build`` is defined here, so the DAG is expected to be supplied by
    whoever creates the estimator (see how the ensembles set ``dag_``).
    """

    def _check_params(self, X, y, kwargs):
        """Validate X, y and the keyword arguments received by ``fit``.

        Delegates to ``_check_params_fit`` with the list of accepted
        keyword names and returns whatever that method returns.
        """
        return self._check_params_fit(
            X,
            y,
            [
                "class_name",
                "features",
                "state_names",
                "sample_weight",
                "weighted",
            ],
            kwargs,
        )
class AODE(ClassifierMixin, BaseEnsemble):
    """Ensemble with one estimator per DAG produced by ``build_spodes``.

    Every member is a clone of ``estimator_`` (a ``SPODE`` unless another
    estimator is given) and the ensemble prediction is the most frequent
    prediction among the members.

    Parameters
    ----------
    show_progress : bool, default=False
        forwarded to the default ``SPODE`` estimator
    random_state : int or None, default=None
        forwarded to the default ``SPODE`` estimator
    estimator : estimator or None, default=None
        base estimator to clone; ``None`` selects ``SPODE``
    """

    def __init__(
        self,
        show_progress=False,
        random_state=None,
        estimator=None,
    ):
        self.show_progress = show_progress
        self.random_state = random_state
        super().__init__(estimator=estimator)

    def _validate_estimator(self) -> None:
        """Check the estimator and set the estimator_ attribute."""
        super()._validate_estimator(
            default=SPODE(
                random_state=self.random_state,
                show_progress=self.show_progress,
            )
        )

    def fit(self, X, y, **kwargs):
        """Fit one estimator per DAG yielded by ``build_spodes``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            training data, stored as given in ``X_``
        y : array-like of shape (n_samples,)
            target values, stored as given in ``y_``
        **kwargs : dict
            fit parameters forwarded to every member; ``features`` and
            ``class_name`` are also read here

        Returns
        -------
        self
        """
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = kwargs.get(
            "features", default_feature_names(self.n_features_in_)
        )
        self.class_name_ = kwargs.get("class_name", "class")
        # build estimator
        self._validate_estimator()
        self.X_ = X
        self.y_ = y
        self.n_samples_ = X.shape[0]
        self.estimators_ = []
        self._train(kwargs)
        self.fitted_ = True
        # To keep compatibility with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        return self

    def _train(self, kwargs):
        """Clone, wire and fit one member for each SPODE structure."""
        for dag in build_spodes(self.feature_names_in_, self.class_name_):
            estimator = clone(self.estimator_)
            # The structure is injected before fit; the member has no
            # _build of its own to create it.
            estimator.dag_ = estimator.model_ = dag
            estimator.fit(self.X_, self.y_, **kwargs)
            self.estimators_.append(estimator)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict by majority vote of the fitted members.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            most frequent member prediction for each sample

        Raises
        ------
        NotFittedError
            if ``fit`` has not been called
        """
        # Same guards as AODENew.predict: fail early with a clear error
        # instead of an AttributeError on estimators_.
        check_is_fitted(self, ["X_", "y_", "fitted_"])
        X = check_array(X)
        n_samples = X.shape[0]
        n_estimators = len(self.estimators_)
        result = np.empty((n_samples, n_estimators))
        for index, estimator in enumerate(self.estimators_):
            result[:, index] = estimator.predict(X)
        return mode(result, axis=1, keepdims=False).mode.ravel()

    def version(self):
        """Return the version reported by the base estimator."""
        if hasattr(self, "fitted_"):
            return self.estimator_.version()
        return SPODE(None, False).version()

    @property
    def states_(self):
        """Average number of states per member model (0 before fit)."""
        if hasattr(self, "fitted_"):
            return sum(
                [
                    len(item)
                    for model in self.estimators_
                    for _, item in model.model_.states.items()
                ]
            ) / len(self.estimators_)
        return 0

    @property
    def depth_(self):
        """Alias of ``states_``."""
        return self.states_

    def nodes_edges(self):
        """Return the total (nodes, edges) over all member DAGs."""
        nodes = 0
        edges = 0
        if hasattr(self, "fitted_"):
            nodes = sum([len(x.dag_) for x in self.estimators_])
            edges = sum([len(x.dag_.edges()) for x in self.estimators_])
        return nodes, edges

    def plot(self, title=""):
        """Plot every member model, prefixing the title with its index."""
        # Scope the filter so the process-wide warning configuration is
        # restored once the plots are drawn.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            for idx, model in enumerate(self.estimators_):
                model.plot(title=f"{idx} {title}")
class TANNew(TAN):
def __init__(
self,
show_progress=False,
random_state=None,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
super().__init__( super().__init__(
show_progress=show_progress, random_state=random_state show_progress=show_progress, random_state=random_state
) )
def _check_params(self, X, y, kwargs): def fit(self, X, y, **kwargs):
expected_args = ["class_name", "features"] self.estimator_ = Proposal(self)
return self._check_params_fit(X, y, expected_args, kwargs) self.estimator_.fit(X, y, **kwargs)
return self
def _build(self): def predict(self, X):
return self.estimator_.predict(X)
self.dag_ = None
def _train(self): class KDBNew(KDB):
"""Build SPODE estimators (Super Parent One Dependent Estimator)""" def __init__(
self.models_ = [] self,
class_edges = [(self.class_name_, f) for f in self.features_] k=2,
for idx in range(len(self.features_)): show_progress=False,
feature_edges = [ random_state=None,
(self.features_[idx], f) discretizer_depth=1e6,
for f in self.features_ discretizer_length=3,
if f != self.features_[idx] discretizer_cuts=0,
] ):
feature_edges.extend(class_edges) self.discretizer_depth = discretizer_depth
model = BayesianNetwork( self.discretizer_length = discretizer_length
feature_edges, show_progress=self.show_progress self.discretizer_cuts = discretizer_cuts
) super().__init__(
model.fit( k=k, show_progress=show_progress, random_state=random_state
self.dataset_, )
estimator=BayesianEstimator,
prior_type="K2",
)
self.models_.append(model)
def plot(self, title=""): def fit(self, X, y, **kwargs):
for idx, model in enumerate(self.models_): self.estimator_ = Proposal(self)
self.model_ = model self.estimator_.fit(X, y, **kwargs)
super().plot(title=f"{idx} {title}") return self
def predict(self, X):
return self.estimator_.predict(X)
class SPODENew(SPODE):
    """SPODE variant that carries the discretizer settings, in the same
    way TANNew and KDBNew do.

    Parameters
    ----------
    random_state : int or None
        passed to the parent constructor
    show_progress : bool
        passed to the parent constructor
    discretizer_depth : number, default=1e6
        stored as ``discretizer_depth``
    discretizer_length : int, default=3
        stored as ``discretizer_length``
    discretizer_cuts : number, default=0
        stored as ``discretizer_cuts``
    """

    def __init__(
        self,
        random_state,
        show_progress,
        discretizer_depth=1e6,
        discretizer_length=3,
        discretizer_cuts=0,
    ):
        # Discretizer settings are plain attributes read later by the
        # wrapper that builds the discretizer.
        self.discretizer_depth = discretizer_depth
        self.discretizer_length = discretizer_length
        self.discretizer_cuts = discretizer_cuts
        super().__init__(
            show_progress=show_progress, random_state=random_state
        )
class AODENew(AODE):
def __init__(
self,
random_state=None,
show_progress=False,
discretizer_depth=1e6,
discretizer_length=3,
discretizer_cuts=0,
):
self.discretizer_depth = discretizer_depth
self.discretizer_length = discretizer_length
self.discretizer_cuts = discretizer_cuts
super().__init__(
random_state=random_state,
show_progress=show_progress,
estimator=Proposal(
SPODENew(
random_state=random_state,
show_progress=show_progress,
discretizer_depth=discretizer_depth,
discretizer_length=discretizer_length,
discretizer_cuts=discretizer_cuts,
)
),
)
def _train(self, kwargs):
for dag in build_spodes(self.feature_names_in_, self.class_name_):
proposal = clone(self.estimator_)
proposal.estimator.dag_ = proposal.estimator.model_ = dag
self.estimators_.append(proposal.fit(self.X_, self.y_, **kwargs))
self.n_estimators_ = len(self.estimators_)
def predict(self, X: np.ndarray) -> np.ndarray: def predict(self, X: np.ndarray) -> np.ndarray:
check_is_fitted(self, ["X_", "y_", "fitted_"]) check_is_fitted(self, ["X_", "y_", "fitted_"])
# Input validation # Input validation
X = self._validate_data(X, reset=False) X = check_array(X)
n_samples = X.shape[0] result = np.empty((X.shape[0], self.n_estimators_))
n_estimators = len(self.models_) for index, model in enumerate(self.estimators_):
result = np.empty((n_samples, n_estimators)) result[:, index] = model.predict(X)
dataset = pd.DataFrame(X, columns=self.features_, dtype="int16")
for index, model in enumerate(self.models_):
result[:, index] = model.predict(dataset).values.ravel()
return mode(result, axis=1, keepdims=False).mode.ravel() return mode(result, axis=1, keepdims=False).mode.ravel()
@property
def states_(self):
if hasattr(self, "fitted_"):
return sum(
[
len(item)
for model in self.estimators_
for _, item in model.estimator.model_.states.items()
]
) / len(self.estimators_)
return 0
@property
def depth_(self):
return self.states_
def nodes_edges(self):
nodes = 0
edges = 0
if hasattr(self, "fitted_"):
nodes = sum([len(x.estimator.dag_) for x in self.estimators_])
edges = sum(
[len(x.estimator.dag_.edges()) for x in self.estimators_]
)
return nodes, edges
def plot(self, title=""):
warnings.simplefilter("ignore", UserWarning)
for idx, model in enumerate(self.estimators_):
model.estimator.plot(title=f"{idx} {title}")
def version(self):
if hasattr(self, "fitted_"):
return self.estimator_.estimator.version()
return SPODENew(None, False).version()
class Proposal(BaseEstimator):
    """Wrapper that trains an estimator on FImdlp-discretized data.

    The wrapped estimator must expose ``discretizer_depth``,
    ``discretizer_length`` and ``discretizer_cuts``. Fitting and predicting
    are dispatched to the parent class of the wrapped estimator.
    """

    def __init__(self, estimator):
        self.estimator = estimator
        # Kept so super(self.class_type, ...) can skip the wrapped class'
        # own fit/predict and reach the ones of its parent class.
        self.class_type = estimator.__class__

    def fit(self, X, y, **kwargs):
        """Discretize X, fit the wrapped estimator and refine the cuts.

        Returns
        -------
        self
        """
        wrapped = self.estimator
        # Validate the arguments first; the returned arrays are not used
        wrapped._check_params(X, y, kwargs)
        # Global discretization of the training data
        self.discretizer_ = FImdlp(
            n_jobs=1,
            max_depth=wrapped.discretizer_depth,
            min_length=wrapped.discretizer_length,
            max_cuts=wrapped.discretizer_cuts,
        )
        self.Xd = self.discretizer_.fit_transform(X, y)
        fit_params = self.update_kwargs(y, kwargs)
        # Structure and parameters are learned by the parent class fit
        super(self.class_type, wrapped).fit(self.Xd, y, **fit_params)
        # Re-discretize features according to the learned structure
        self._local_discretization()
        # self.check_integrity("fit", self.Xd)
        self.fitted_ = True
        return self

    def predict(self, X):
        """Discretize X with the fitted discretizer and predict."""
        # fit must have been called before
        check_is_fitted(self, ["fitted_"])
        X = check_array(X)
        discretized = self.discretizer_.transform(X)
        # self.check_integrity("predict", discretized)
        return super(self.class_type, self.estimator).predict(discretized)

    def update_kwargs(self, y, kwargs):
        """Complete the fit parameters with names and state names.

        ``features`` and ``class_name`` get default values when missing and
        ``state_names`` is always rebuilt from the discretizer and from
        the distinct values of y. The received dict is updated in place and
        returned.
        """
        n_columns = self.Xd.shape[1]
        if "features" in kwargs:
            features = kwargs["features"]
        else:
            features = default_feature_names(n_columns)
        states = {}
        for column in range(n_columns):
            states[features[column]] = self.discretizer_.get_states_feature(
                column
            )
        if "class_name" in kwargs:
            class_name = kwargs["class_name"]
        else:
            class_name = self.estimator.default_class_name()
        states[class_name] = np.unique(y).tolist()
        kwargs["state_names"] = states
        self.state_names_ = states
        self.features_ = features
        kwargs["features"] = features
        kwargs["class_name"] = class_name
        return kwargs

    def _local_discretization(self):
        """Discretize each feature with its fathers and the class"""
        wrapped = self.estimator
        class_name = wrapped.class_name_
        refit_needed = False
        # Visiting order matters: nodes are processed in topological order
        # of the DAG, not in column order 0, 1, 2...
        for node in list(nx.topological_sort(wrapped.dag_)):
            if node == class_name:
                continue
            target = wrapped.indexed_features_[node]
            parents = wrapped.dag_.get_parents(node)
            if len(parents) <= 1:
                continue
            # The class is dropped from the list because it is added later
            parents.remove(class_name)
            parent_columns = [wrapped.indexed_features_[p] for p in parents]
            # Every call works on the data already updated by earlier ones
            self.Xd[:, target] = self.discretizer_.join_fit(
                target=target,
                features=parent_columns,
                data=self.Xd,
            )
            refit_needed = True
        if not refit_needed:
            return
        # Rebuild the dataset from the re-discretized data
        wrapped.X_ = self.Xd
        wrapped.build_dataset()
        state_names = {}
        for name, column in wrapped.indexed_features_.items():
            state_names[name] = self.discretizer_.get_states_feature(column)
        self.state_names_ = state_names
        # Re-estimate the model parameters with the new states
        wrapped.model_.fit(
            wrapped.dataset_,
            estimator=BayesianEstimator,
            prior_type="K2",
            state_names=self.state_names_,
        )

    # def check_integrity(self, source, X):
    #     # print(f"Checking integrity of {source} data")
    #     for i in range(X.shape[1]):
    #         if not set(np.unique(X[:, i]).tolist()).issubset(
    #             set(self.state_names_[self.features_[i]])
    #         ):
    #             print(
    #                 "i",
    #                 i,
    #                 "features[i]",
    #                 self.features_[i],
    #                 "np.unique(X[:, i])",
    #                 np.unique(X[:, i]),
    #                 "np.array(state_names[features[i]])",
    #                 np.array(self.state_names_[self.features_[i]]),
    #             )
    #             raise ValueError("Discretization error")
class BoostSPODE(BayesBase):
    """SPODE whose super parent is received at fit time.

    The ``sparent`` fit argument names the feature that becomes parent of
    every other feature; the class node is parent of all features.
    """

    def _check_params(self, X, y, kwargs):
        """Validate X, y and the keyword arguments received by ``fit``.

        Delegates to ``_check_params_fit`` with the list of accepted
        keyword names and returns whatever that method returns.
        """
        expected_args = [
            "class_name",
            "features",
            "state_names",
            "sample_weight",
            "weighted",
            "sparent",
        ]
        return self._check_params_fit(X, y, expected_args, kwargs)

    def _build(self):
        """Build the DAG: class -> every feature, sparent -> other features."""
        class_edges = [(self.class_name_, f) for f in self.feature_names_in_]
        feature_edges = [
            (self.sparent_, f)
            for f in self.feature_names_in_
            if f != self.sparent_
        ]
        feature_edges.extend(class_edges)
        self.dag_ = DAG(feature_edges)

    def _train(self, kwargs):
        """Build and train a BayesianNetwork from the DAG and the dataset

        Parameters
        ----------
        kwargs : dict
            fit parameters; only ``state_names`` is read here
        """
        states = dict(state_names=kwargs.get("state_names", []))
        # No debugger hook here: fit must run unattended.
        self.model_ = BayesianNetwork(self.dag_.edges(), show_progress=False)
        self.model_.fit(
            self.dataset_,
            estimator=BayesianEstimator,
            prior_type="K2",
            weighted=self.weighted_,
            **states,
        )
class BoostAODE(ClassifierMixin, BaseEnsemble):
    """Boosted ensemble of ``BoostSPODE`` estimators.

    Members are trained one after another; the sample weights given to each
    member are increased on the samples the previous member misclassified.

    NOTE(review): no ``predict`` method is defined in this class.

    Parameters
    ----------
    show_progress : bool, default=False
        forwarded to the default ``BoostSPODE`` estimator
    random_state : int or None, default=None
        forwarded to the default ``BoostSPODE`` estimator
    estimator : estimator or None, default=None
        base estimator to clone; ``None`` selects ``BoostSPODE``
    n_estimators : int, default=10
        maximum number of members to train
    """

    def __init__(
        self,
        show_progress=False,
        random_state=None,
        estimator=None,
        n_estimators=10,
    ):
        self.show_progress = show_progress
        self.random_state = random_state
        self.n_estimators = n_estimators
        super().__init__(estimator=estimator)

    def _validate_estimator(self) -> None:
        """Check the estimator and set the estimator_ attribute."""
        super()._validate_estimator(
            default=BoostSPODE(
                random_state=self.random_state,
                show_progress=self.show_progress,
            )
        )

    def fit(self, X, y, **kwargs):
        """Train the boosted members.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            training data, stored as given in ``X_``
        y : array-like of shape (n_samples,)
            target values, stored as given in ``y_``
        **kwargs : dict
            fit parameters forwarded to every member; ``features`` and
            ``class_name`` are also read here

        Returns
        -------
        self
        """
        self.n_features_in_ = X.shape[1]
        self.feature_names_in_ = kwargs.get(
            "features", default_feature_names(self.n_features_in_)
        )
        self.class_name_ = kwargs.get("class_name", "class")
        self.X_ = X
        self.y_ = y
        self.n_samples_ = X.shape[0]
        self.estimators_ = []
        self._validate_estimator()
        self._train(kwargs)
        self.fitted_ = True
        # To keep compatibility with the benchmark platform
        self.nodes_leaves = self.nodes_edges
        return self

    def nodes_edges(self):
        """Return the total (nodes, edges) over all member DAGs."""
        nodes = 0
        edges = 0
        if hasattr(self, "fitted_"):
            nodes = sum([len(x.dag_) for x in self.estimators_])
            edges = sum([len(x.dag_.edges()) for x in self.estimators_])
        return nodes, edges

    def _train(self, kwargs):
        """Build boosted SPODEs

        Parameters
        ----------
        kwargs : dict
            fit parameters forwarded to every member, completed here with
            ``sparent``, ``sample_weight`` and ``weighted``
        """
        weights = np.full(self.n_samples_, 1 / self.n_samples_)
        y_true = np.asarray(self.y_)
        # Step 0: Set the finish condition
        for _ in range(self.n_estimators):
            # Step 1: Build ranking with mutual information
            # NOTE: known limitation - the ranking does not take the
            # sample weights into account, so the same feature is
            # selected on every round.
            feature = (
                SelectKBest(k=1)
                .fit(self.X_, self.y_)
                .get_feature_names_out(self.feature_names_in_)
                .tolist()[0]
            )
            # Step 2: Build & train spode with the first feature as sparent
            estimator = clone(self.estimator_)
            _args = kwargs.copy()
            _args["sparent"] = feature
            _args["sample_weight"] = weights
            _args["weighted"] = True
            estimator.fit(self.X_, self.y_, **_args)
            # Step 3: Compute errors (epsilon sub m & alpha sub m)
            # Explanation in https://medium.datadriveninvestor.com/understanding-adaboost-and-scikit-learns-algorithm-c8d8af5ace10
            y_pred = estimator.predict(self.X_)
            missed = y_pred != y_true
            em = np.sum(weights * missed) / np.sum(weights)
            # Step 4: Add the new model
            self.estimators_.append(estimator)
            if em <= 0 or em >= 1:
                # The log term below is undefined for these errors and
                # would turn every weight into inf/nan; stop boosting.
                break
            am = np.log((1 - em) / em) + np.log(estimator.n_classes_ - 1)
            # Step 3.2: Update weights for next classifier, one scalar per
            # sample: only misclassified samples are scaled by exp(am)
            weights = weights * np.exp(am * missed)

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 50 KiB

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 49 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View File

@@ -0,0 +1,38 @@
import pytest
from sklearn.datasets import load_iris
from fimdlp.mdlp import FImdlp
@pytest.fixture
def iris():
    """Return ``(X, y, feature_names)`` for the patched sklearn iris data."""
    bunch = load_iris()
    X, y = bunch["data"], bunch["target"]
    features = bunch["feature_names"]
    # To make iris dataset has the same values as our iris.arff dataset.
    # Keys are (row, column) positions; only the second element of each
    # value is written into X.
    patch = {(34, 3): (0.2, 0.1), (37, 1): (3.6, 3.1), (37, 2): (1.4, 1.5)}
    for position, (_, replacement) in patch.items():
        X[position] = replacement
    return X, y, features
@pytest.fixture
def data(iris):
    """Return the ``(X, y)`` pair taken from the ``iris`` fixture."""
    X, y, _ = iris
    return X, y
@pytest.fixture
def features(iris):
    """Return the feature names taken from the ``iris`` fixture."""
    _, _, names = iris
    return names
@pytest.fixture
def class_name():
    """Return the class variable name used by the tests."""
    name = "class"
    return name
@pytest.fixture
def data_disc(data):
    """Return ``(X_discretized, y)`` with X transformed by ``FImdlp``."""
    X, y = data
    discretizer = FImdlp()
    X_disc = discretizer.fit_transform(X, y)
    return X_disc, y

View File

@@ -1,6 +1,5 @@
import pytest import pytest
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings from matplotlib.testing.conftest import mpl_test_settings
@@ -10,28 +9,21 @@ from bayesclass.clfs import AODE
from .._version import __version__ from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture @pytest.fixture
def clf(): def clf():
return AODE() return AODE(random_state=17)
def test_AODE_default_hyperparameters(data, clf): def test_AODE_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters # Test default values of hyperparameters
assert not clf.show_progress assert not clf.show_progress
assert clf.random_state is None
clf = AODE(show_progress=True, random_state=17)
assert clf.show_progress
assert clf.random_state == 17 assert clf.random_state == 17
clf.fit(*data) clf = AODE(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data_disc)
assert clf.class_name_ == "class" assert clf.class_name_ == "class"
assert clf.features_ == [ assert clf.feature_names_in_ == [
"feature_0", "feature_0",
"feature_1", "feature_1",
"feature_2", "feature_2",
@@ -42,50 +34,66 @@ def test_AODE_default_hyperparameters(data, clf):
@image_comparison( @image_comparison(
baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"] baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
) )
def test_AODE_plot(data, clf): def test_AODE_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects # mpl_test_settings will automatically clean these internal side effects
mpl_test_settings mpl_test_settings
dataset = load_iris(as_frame=True) clf.fit(*data_disc, features=features)
clf.fit(*data, features=dataset["feature_names"])
clf.plot("AODE Iris") clf.plot("AODE Iris")
def test_AODE_version(clf): def test_AODE_version(clf, features, data_disc):
"""Check AODE version.""" """Check AODE version."""
assert __version__ == clf.version() assert __version__ == clf.version()
clf.fit(*data_disc, features=features)
assert __version__ == clf.version()
def test_AODE_nodes_leaves(clf): def test_AODE_nodes_edges(clf, data_disc):
assert clf.nodes_leaves() == (0, 0) assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc)
assert clf.nodes_leaves() == (20, 28)
def test_AODE_classifier(data, clf): def test_AODE_states(clf, data_disc):
clf.fit(*data) assert clf.states_ == 0
attribs = ["classes_", "X_", "y_", "features_", "class_name_"] clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_AODE_classifier(data_disc, clf):
clf.fit(*data_disc)
attribs = [
"feature_names_in_",
"class_name_",
"n_features_in_",
"X_",
"y_",
]
for attr in attribs: for attr in attribs:
assert hasattr(clf, attr) assert hasattr(clf, attr)
X = data[0] X = data_disc[0]
y = data[1] y = data_disc[1]
y_pred = clf.predict(X) y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],) assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 147 assert sum(y == y_pred) == 146
def test_AODE_wrong_num_features(data, clf): def test_AODE_wrong_num_features(data_disc, clf):
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="Number of features does not match the number of columns in X", match="Number of features does not match the number of columns in X",
): ):
clf.fit(*data, features=["feature_1", "feature_2"]) clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_AODE_wrong_hyperparam(data, clf): def test_AODE_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"): with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param") clf.fit(*data_disc, wrong_param="wrong_param")
def test_AODE_error_size_predict(data, clf): def test_AODE_error_size_predict(data_disc, clf):
X, y = data X, y = data_disc
clf.fit(X, y) clf.fit(X, y)
with pytest.raises(ValueError): with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1)) X_diff_size = np.ones((10, X.shape[1] + 1))

View File

@@ -0,0 +1,123 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import AODENew
from .._version import __version__
@pytest.fixture
def clf():
    """Return an ``AODENew`` classifier built with ``random_state=17``."""
    model = AODENew(random_state=17)
    return model
def test_AODENew_default_hyperparameters(data, clf):
    """Check constructor hyperparameters and the default names after fit."""
    # Values of the fixture instance
    assert not clf.show_progress
    assert clf.random_state == 17
    # Values of an instance that only sets show_progress
    clf = AODENew(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    X, y = data
    clf.fit(X, y)
    expected_features = ["feature_0", "feature_1", "feature_2", "feature_3"]
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == expected_features
@image_comparison(
    baseline_images=["line_dashes_AODENew"],
    remove_text=True,
    extensions=["png"],
)
def test_AODENew_plot(data, features, clf):
    """Fit with explicit feature names and plot for image comparison."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features)
    clf.plot("AODE Iris")
def test_AODENew_version(clf, data):
    """Check AODENew version before and after fitting."""
    before_fit = clf.version()
    assert before_fit == __version__
    X, y = data
    clf.fit(X, y)
    after_fit = clf.version()
    assert after_fit == __version__
def test_AODENew_nodes_edges(clf, data):
    """Check node/edge counts before fit and via nodes_leaves after fit."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    X, y = data
    clf.fit(X, y)
    fitted = clf.nodes_leaves()
    assert fitted == (20, 28)
def test_AODENew_states(clf, data):
    """Check ``states_`` before and after fit and that ``depth_`` matches."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 17.75
    assert clf.depth_ == clf.states_
def test_AODENew_classifier(data, clf):
    """Check fitted attributes, prediction shape and number of hits."""
    X, y = data
    clf.fit(X, y)
    attribs = [
        "feature_names_in_",
        "class_name_",
        "n_features_in_",
        "X_",
        "y_",
    ]
    assert all(hasattr(clf, attr) for attr in attribs)
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 146
def test_AODENew_local_discretization(clf, data_disc):
    """Check the discretizer targets of every estimator of the ensemble.

    ``expected_data[i][f]`` is the expected ``discretizer_.target_[f]`` of
    the i-th estimator.
    """
    expected_data = [
        [-1, [0, -1], [0, -1], [0, -1]],
        [[1, -1], -1, [1, -1], [1, -1]],
        [[2, -1], [2, -1], -1, [2, -1]],
        [[3, -1], [3, -1], [3, -1], -1],
    ]
    clf.fit(*data_disc)
    for idx, estimator in enumerate(clf.estimators_):
        expected = expected_data[idx]
        for feature in range(4):
            computed = estimator.discretizer_.target_[feature]
            if isinstance(computed, list):
                # zip() stops at the shorter sequence, so compare the
                # lengths first to avoid accepting a truncated result.
                assert len(computed) == len(expected[feature])
                for j, k in zip(expected[feature], computed):
                    assert j == k
            else:
                assert expected[feature] == computed
def test_AODENew_wrong_num_features(data, clf):
    """Fitting with too few feature names must raise ValueError."""
    X, y = data
    message = "Number of features does not match the number of columns in X"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, features=["feature_1", "feature_2"])
def test_AODENew_wrong_hyperparam(data, clf):
    """Fitting with an unknown keyword argument must raise ValueError."""
    X, y = data
    message = "Unexpected argument: wrong_param"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, wrong_param="wrong_param")
def test_AODENew_error_size_predict(data, clf):
    """Predicting with one extra column must raise ValueError."""
    X, y = data
    clf.fit(X, y)
    n_columns = X.shape[1] + 1
    X_diff_size = np.ones((10, n_columns))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)

View File

@@ -0,0 +1,100 @@
import pytest
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import BoostAODE
from .._version import __version__
@pytest.fixture
def clf():
    """Return a ``BoostAODE`` classifier built with ``random_state=17``."""
    model = BoostAODE(random_state=17)
    return model
def test_BoostAODE_default_hyperparameters(data_disc, clf):
    """Check constructor hyperparameters and the default names after fit."""
    # Values of the fixture instance
    assert not clf.show_progress
    assert clf.random_state == 17
    # Values of an instance that only sets show_progress
    clf = BoostAODE(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    X, y = data_disc
    clf.fit(X, y)
    expected_features = ["feature_0", "feature_1", "feature_2", "feature_3"]
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == expected_features
# @image_comparison(
# baseline_images=["line_dashes_AODE"], remove_text=True, extensions=["png"]
# )
# def test_BoostAODE_plot(data_disc, features, clf):
# # mpl_test_settings will automatically clean these internal side effects
# mpl_test_settings
# clf.fit(*data_disc, features=features)
# clf.plot("AODE Iris")
# def test_BoostAODE_version(clf, features, data_disc):
# """Check AODE version."""
# assert __version__ == clf.version()
# clf.fit(*data_disc, features=features)
# assert __version__ == clf.version()
# def test_BoostAODE_nodes_edges(clf, data_disc):
# assert clf.nodes_edges() == (0, 0)
# clf.fit(*data_disc)
# assert clf.nodes_leaves() == (20, 28)
# def test_BoostAODE_states(clf, data_disc):
# assert clf.states_ == 0
# clf.fit(*data_disc)
# assert clf.states_ == 19
# assert clf.depth_ == clf.states_
# def test_BoostAODE_classifier(data_disc, clf):
# clf.fit(*data_disc)
# attribs = [
# "feature_names_in_",
# "class_name_",
# "n_features_in_",
# "X_",
# "y_",
# ]
# for attr in attribs:
# assert hasattr(clf, attr)
# X = data_disc[0]
# y = data_disc[1]
# y_pred = clf.predict(X)
# assert y_pred.shape == (X.shape[0],)
# assert sum(y == y_pred) == 146
# def test_BoostAODE_wrong_num_features(data_disc, clf):
# with pytest.raises(
# ValueError,
# match="Number of features does not match the number of columns in X",
# ):
# clf.fit(*data_disc, features=["feature_1", "feature_2"])
# def test_BoostAODE_wrong_hyperparam(data_disc, clf):
# with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
# clf.fit(*data_disc, wrong_param="wrong_param")
# def test_BoostAODE_error_size_predict(data_disc, clf):
# X, y = data_disc
# clf.fit(X, y)
# with pytest.raises(ValueError):
# X_diff_size = np.ones((10, X.shape[1] + 1))
# clf.predict(X_diff_size)

View File

@@ -1,28 +1,21 @@
import pytest import pytest
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings from matplotlib.testing.conftest import mpl_test_settings
from pgmpy.models import BayesianNetwork
from bayesclass.clfs import KDB from bayesclass.clfs import KDB
from .._version import __version__ from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture @pytest.fixture
def clf(): def clf():
return KDB(k=3) return KDB(k=3, show_progress=False)
def test_KDB_default_hyperparameters(data, clf): def test_KDB_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters # Test default values of hyperparameters
assert not clf.show_progress assert not clf.show_progress
assert clf.random_state is None assert clf.random_state is None
@@ -31,9 +24,9 @@ def test_KDB_default_hyperparameters(data, clf):
assert clf.show_progress assert clf.show_progress
assert clf.random_state == 17 assert clf.random_state == 17
assert clf.k == 3 assert clf.k == 3
clf.fit(*data) clf.fit(*data_disc)
assert clf.class_name_ == "class" assert clf.class_name_ == "class"
assert clf.features_ == [ assert clf.feature_names_in_ == [
"feature_0", "feature_0",
"feature_1", "feature_1",
"feature_2", "feature_2",
@@ -46,49 +39,85 @@ def test_KDB_version(clf):
assert __version__ == clf.version() assert __version__ == clf.version()
def test_KDB_nodes_leaves(clf): def test_KDB_nodes_edges(clf, data_disc):
assert clf.nodes_leaves() == (0, 0) assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc)
assert clf.nodes_leaves() == (5, 9)
def test_KDB_classifier(data, clf): def test_KDB_states(clf, data_disc):
clf.fit(*data) assert clf.states_ == 0
attribs = ["classes_", "X_", "y_", "features_", "class_name_"] clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_KDB_classifier(data_disc, clf):
clf.fit(*data_disc)
attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
for attr in attribs: for attr in attribs:
assert hasattr(clf, attr) assert hasattr(clf, attr)
X = data[0] X = data_disc[0]
y = data[1] y = data_disc[1]
y_pred = clf.predict(X) y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],) assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 148 assert sum(y == y_pred) == 146
def test_KDB_classifier_weighted(data_disc, clf):
    """Fit with zero weight on the first 50 samples and check the score."""
    X, y = data_disc
    n_samples = X.shape[0]
    # First 50 samples get weight 0, the remaining ones weight 1
    sample_weight = [0] * 50 + [1] * (n_samples - 50)
    clf.fit(X, y, sample_weight=sample_weight, weighted=True)
    score = clf.score(X, y)
    assert score == 0.64
@image_comparison( @image_comparison(
baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"] baseline_images=["line_dashes_KDB"], remove_text=True, extensions=["png"]
) )
def test_KDB_plot(data, clf): def test_KDB_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects # mpl_test_settings will automatically clean these internal side effects
mpl_test_settings mpl_test_settings
dataset = load_iris(as_frame=True) clf.fit(*data_disc, features=features)
clf.fit(*data, features=dataset["feature_names"])
clf.plot("KDB Iris") clf.plot("KDB Iris")
def test_KDB_wrong_num_features(data, clf): def test_KDB_wrong_num_features(data_disc, clf):
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="Number of features does not match the number of columns in X", match="Number of features does not match the number of columns in X",
): ):
clf.fit(*data, features=["feature_1", "feature_2"]) clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_KDB_wrong_hyperparam(data, clf): def test_KDB_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"): with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param") clf.fit(*data_disc, wrong_param="wrong_param")
def test_KDB_error_size_predict(data, clf): def test_KDB_error_size_predict(data_disc, clf):
X, y = data X, y = data_disc
clf.fit(X, y) clf.fit(X, y)
with pytest.raises(ValueError): with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1)) X_diff_size = np.ones((10, X.shape[1] + 1))
clf.predict(X_diff_size) clf.predict(X_diff_size)
def test_KDB_dont_do_cycles():
    """``_add_m_edges`` must not add edges to an already complete DAG."""
    clf = KDB(k=4)
    dag = BayesianNetwork(show_progress=False)
    names = ["feature_0", "feature_1", "feature_2", "feature_3"]
    clf.feature_names_in_ = names
    nodes = list(range(4))
    weights = np.ones((4, 4))
    # Add an edge from every feature to each feature that follows it
    for source in range(4):
        for target in range(source + 1, 4):
            dag.add_edge(names[source], names[target])
    for idx in range(4):
        clf._add_m_edges(dag, idx, nodes, weights)
    n_edges = len(dag.edges())
    assert n_edges == 6

View File

@@ -0,0 +1,132 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from pgmpy.models import BayesianNetwork
from bayesclass.clfs import KDBNew
from .._version import __version__
@pytest.fixture
def clf():
    """Return a ``KDBNew`` classifier with ``k=3`` and no progress bar."""
    model = KDBNew(k=3, show_progress=False)
    return model
def test_KDBNew_default_hyperparameters(data, clf):
    """Check constructor hyperparameters and the default names after fit."""
    # Values of the fixture instance
    assert not clf.show_progress
    assert clf.random_state is None
    assert clf.theta == 0.03
    # Values of an instance with explicit hyperparameters
    clf = KDBNew(show_progress=True, random_state=17, k=3)
    assert clf.show_progress
    assert clf.random_state == 17
    assert clf.k == 3
    X, y = data
    clf.fit(X, y)
    expected_features = ["feature_0", "feature_1", "feature_2", "feature_3"]
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == expected_features
def test_KDBNew_version(clf):
    """Check KDBNew version."""
    reported = clf.version()
    assert reported == __version__
def test_KDBNew_nodes_edges(clf, data):
    """Check node/edge counts before fit and via nodes_leaves after fit."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    X, y = data
    clf.fit(X, y)
    fitted = clf.nodes_leaves()
    assert fitted == (5, 9)
def test_KDBNew_states(clf, data):
    """Check ``states_`` before and after fit and that ``depth_`` matches."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 22
    assert clf.depth_ == clf.states_
def test_KDBNew_classifier(data, clf):
    """Check fitted attributes, prediction shape and number of hits."""
    X, y = data
    clf.fit(X, y)
    attribs = ["classes_", "X_", "y_", "feature_names_in_", "class_name_"]
    assert all(hasattr(clf, attr) for attr in attribs)
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 145
def test_KDBNew_local_discretization(clf, data):
    """Check the discretizer target of every feature after fitting.

    ``expected[f]`` is the expected ``discretizer_.target_[f]`` of the
    fitted inner estimator.
    """
    expected = [[1, -1], -1, [0, 1, 3, -1], [1, -1]]
    clf.fit(*data)
    for feature in range(4):
        computed = clf.estimator_.discretizer_.target_[feature]
        if isinstance(computed, list):
            # zip() stops at the shorter sequence, so compare the lengths
            # first to avoid accepting a truncated result.
            assert len(computed) == len(expected[feature])
            for j, k in zip(expected[feature], computed):
                assert j == k
        else:
            assert expected[feature] == computed
@image_comparison(
    baseline_images=["line_dashes_KDBNew"],
    remove_text=True,
    extensions=["png"],
)
def test_KDBNew_plot(data, features, class_name, clf):
    """Fit with explicit names and plot the model for image comparison."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features, class_name=class_name)
    clf.plot("KDBNew Iris")
def test_KDBNew_wrong_num_features(data, clf):
    """Fitting with too few feature names must raise ValueError."""
    X, y = data
    message = "Number of features does not match the number of columns in X"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, features=["feature_1", "feature_2"])
def test_KDBNew_wrong_hyperparam(data, clf):
    """Fitting with an unknown keyword argument must raise ValueError."""
    X, y = data
    message = "Unexpected argument: wrong_param"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, wrong_param="wrong_param")
def test_KDBNew_error_size_predict(data, clf):
    """Predicting with one extra column must raise ValueError."""
    X, y = data
    clf.fit(X, y)
    n_columns = X.shape[1] + 1
    X_diff_size = np.ones((10, n_columns))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)
def test_KDBNew_dont_do_cycles():
    """``_add_m_edges`` must not add edges to an already complete DAG."""
    clf = KDBNew(k=4)
    dag = BayesianNetwork(show_progress=False)
    names = ["feature_0", "feature_1", "feature_2", "feature_3"]
    clf.feature_names_in_ = names
    nodes = list(range(4))
    weights = np.ones((4, 4))
    # Add an edge from every feature to each feature that follows it
    for source in range(4):
        for target in range(source + 1, 4):
            dag.add_edge(names[source], names[target])
    for idx in range(4):
        clf._add_m_edges(dag, idx, nodes, weights)
    n_edges = len(dag.edges())
    assert n_edges == 6

View File

@@ -1,7 +1,5 @@
import pytest import pytest
import numpy as np import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import KBinsDiscretizer
from matplotlib.testing.decorators import image_comparison from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings from matplotlib.testing.conftest import mpl_test_settings
@@ -10,29 +8,22 @@ from bayesclass.clfs import TAN
from .._version import __version__ from .._version import __version__
@pytest.fixture
def data():
X, y = load_iris(return_X_y=True)
enc = KBinsDiscretizer(encode="ordinal")
return enc.fit_transform(X), y
@pytest.fixture @pytest.fixture
def clf(): def clf():
return TAN() return TAN(random_state=17, show_progress=False)
def test_TAN_default_hyperparameters(data, clf): def test_TAN_default_hyperparameters(data_disc, clf):
# Test default values of hyperparameters # Test default values of hyperparameters
assert not clf.show_progress assert not clf.show_progress
assert clf.random_state is None
clf = TAN(show_progress=True, random_state=17)
assert clf.show_progress
assert clf.random_state == 17 assert clf.random_state == 17
clf.fit(*data) clf = TAN(show_progress=True)
assert clf.show_progress
assert clf.random_state is None
clf.fit(*data_disc)
assert clf.head_ == 0 assert clf.head_ == 0
assert clf.class_name_ == "class" assert clf.class_name_ == "class"
assert clf.features_ == [ assert clf.feature_names_in_ == [
"feature_0", "feature_0",
"feature_1", "feature_1",
"feature_2", "feature_2",
@@ -45,59 +36,73 @@ def test_TAN_version(clf):
assert __version__ == clf.version() assert __version__ == clf.version()
def test_TAN_nodes_leaves(clf): def test_TAN_nodes_edges(clf, data_disc):
assert clf.nodes_leaves() == (0, 0) assert clf.nodes_edges() == (0, 0)
clf.fit(*data_disc, head="random")
assert clf.nodes_leaves() == (5, 7)
def test_TAN_random_head(data): def test_TAN_states(clf, data_disc):
clf = TAN(random_state=17) assert clf.states_ == 0
clf.fit(*data, head="random") clf.fit(*data_disc)
assert clf.states_ == 19
assert clf.depth_ == clf.states_
def test_TAN_random_head(clf, data_disc):
clf.fit(*data_disc, head="random")
assert clf.head_ == 3 assert clf.head_ == 3
def test_TAN_classifier(data, clf): def test_TAN_classifier(data_disc, clf):
clf.fit(*data) clf.fit(*data_disc)
attribs = ["classes_", "X_", "y_", "head_", "features_", "class_name_"] attribs = [
"classes_",
"X_",
"y_",
"head_",
"feature_names_in_",
"class_name_",
]
for attr in attribs: for attr in attribs:
assert hasattr(clf, attr) assert hasattr(clf, attr)
X = data[0] X = data_disc[0]
y = data[1] y = data_disc[1]
y_pred = clf.predict(X) y_pred = clf.predict(X)
assert y_pred.shape == (X.shape[0],) assert y_pred.shape == (X.shape[0],)
assert sum(y == y_pred) == 147 assert sum(y == y_pred) == 146
@image_comparison( @image_comparison(
baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"] baseline_images=["line_dashes_TAN"], remove_text=True, extensions=["png"]
) )
def test_TAN_plot(data, clf): def test_TAN_plot(data_disc, features, clf):
# mpl_test_settings will automatically clean these internal side effects # mpl_test_settings will automatically clean these internal side effects
mpl_test_settings mpl_test_settings
dataset = load_iris(as_frame=True) clf.fit(*data_disc, features=features, head=0)
clf.fit(*data, features=dataset["feature_names"], head=0)
clf.plot("TAN Iris head=0") clf.plot("TAN Iris head=0")
def test_KDB_wrong_num_features(data, clf): def test_TAN_wrong_num_features(data_disc, clf):
with pytest.raises( with pytest.raises(
ValueError, ValueError,
match="Number of features does not match the number of columns in X", match="Number of features does not match the number of columns in X",
): ):
clf.fit(*data, features=["feature_1", "feature_2"]) clf.fit(*data_disc, features=["feature_1", "feature_2"])
def test_TAN_wrong_hyperparam(data, clf): def test_TAN_wrong_hyperparam(data_disc, clf):
with pytest.raises(ValueError, match="Unexpected argument: wrong_param"): with pytest.raises(ValueError, match="Unexpected argument: wrong_param"):
clf.fit(*data, wrong_param="wrong_param") clf.fit(*data_disc, wrong_param="wrong_param")
def test_TAN_head_out_of_range(data, clf): def test_TAN_head_out_of_range(data_disc, clf):
with pytest.raises(ValueError, match="Head index out of range"): with pytest.raises(ValueError, match="Head index out of range"):
clf.fit(*data, head=4) clf.fit(*data_disc, head=4)
def test_TAN_error_size_predict(data, clf): def test_TAN_error_size_predict(data_disc, clf):
X, y = data X, y = data_disc
clf.fit(X, y) clf.fit(X, y)
with pytest.raises(ValueError): with pytest.raises(ValueError):
X_diff_size = np.ones((10, X.shape[1] + 1)) X_diff_size = np.ones((10, X.shape[1] + 1))

View File

@@ -0,0 +1,120 @@
import pytest
import numpy as np
from matplotlib.testing.decorators import image_comparison
from matplotlib.testing.conftest import mpl_test_settings
from bayesclass.clfs import TANNew
from .._version import __version__
@pytest.fixture
def clf():
    """Return a ``TANNew`` classifier built with ``random_state=17``."""
    model = TANNew(random_state=17)
    return model
def test_TANNew_default_hyperparameters(data, clf):
    """Check constructor hyperparameters and the defaults set by fit."""
    # Values of the fixture instance
    assert not clf.show_progress
    assert clf.random_state == 17
    # Values of an instance that only sets show_progress
    clf = TANNew(show_progress=True)
    assert clf.show_progress
    assert clf.random_state is None
    X, y = data
    clf.fit(X, y)
    expected_features = ["feature_0", "feature_1", "feature_2", "feature_3"]
    assert clf.head_ == 0
    assert clf.class_name_ == "class"
    assert clf.feature_names_in_ == expected_features
def test_TANNew_version(clf):
    """Check TANNew version."""
    reported = clf.version()
    assert reported == __version__
def test_TANNew_nodes_edges(clf, data):
    """Check node/edge counts before fit and via nodes_leaves after fit."""
    unfitted = clf.nodes_edges()
    assert unfitted == (0, 0)
    X, y = data
    clf.fit(X, y, head="random")
    fitted = clf.nodes_leaves()
    assert fitted == (5, 7)
def test_TANNew_states(clf, data):
    """Check ``states_`` before and after fit and that ``depth_`` matches."""
    assert clf.states_ == 0
    X, y = data
    clf.fit(X, y)
    assert clf.states_ == 18
    assert clf.depth_ == clf.states_
def test_TANNew_random_head(clf, data):
    """With ``head="random"`` the seeded fixture must select head 3."""
    X, y = data
    clf.fit(X, y, head="random")
    assert clf.head_ == 3
def test_TANNew_local_discretization(clf, data):
    """Check the discretizer target of every feature after fitting."""
    expected = [-1, [0, -1], [0, -1], [1, -1]]
    X, y = data
    clf.fit(X, y)
    for feature, wanted in enumerate(expected):
        computed = clf.estimator_.discretizer_.target_[feature]
        assert wanted == computed
def test_TANNew_classifier(data, clf):
    """Check fitted attributes, prediction shape and number of hits."""
    X, y = data
    clf.fit(X, y)
    attribs = [
        "classes_",
        "X_",
        "y_",
        "head_",
        "feature_names_in_",
        "class_name_",
    ]
    assert all(hasattr(clf, attr) for attr in attribs)
    y_pred = clf.predict(X)
    assert y_pred.shape == (X.shape[0],)
    hits = sum(y == y_pred)
    assert hits == 146
@image_comparison(
    baseline_images=["line_dashes_TANNew"],
    remove_text=True,
    extensions=["png"],
)
def test_TANNew_plot(data, features, clf):
    """Fit with explicit feature names and head 0, then plot the model."""
    # mpl_test_settings will automatically clean these internal side effects
    mpl_test_settings
    X, y = data
    clf.fit(X, y, features=features, head=0)
    clf.plot("TANNew Iris head=0")
def test_TANNew_wrong_num_features(data, clf):
    """Fitting with too few feature names must raise ValueError."""
    X, y = data
    message = "Number of features does not match the number of columns in X"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, features=["feature_1", "feature_2"])
def test_TANNew_wrong_hyperparam(data, clf):
    """Fitting with an unknown keyword argument must raise ValueError."""
    X, y = data
    message = "Unexpected argument: wrong_param"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, wrong_param="wrong_param")
def test_TANNew_head_out_of_range(data, clf):
    """Fitting with ``head=4`` must raise ValueError."""
    X, y = data
    message = "Head index out of range"
    with pytest.raises(ValueError, match=message):
        clf.fit(X, y, head=4)
def test_TANNew_error_size_predict(data, clf):
    """Predicting with one extra column must raise ValueError."""
    X, y = data
    clf.fit(X, y)
    n_columns = X.shape[1] + 1
    X_diff_size = np.ones((10, n_columns))
    with pytest.raises(ValueError):
        clf.predict(X_diff_size)

View File

@@ -1,14 +1,29 @@
import pytest import pytest
import numpy as np
from sklearn.utils.estimator_checks import check_estimator from sklearn.utils.estimator_checks import check_estimator
from bayesclass.clfs import TAN, KDB, AODE from bayesclass.clfs import BayesBase, TAN, KDB, AODE
@pytest.mark.parametrize("estimator", [TAN(), KDB(k=2), AODE()]) def test_more_tags():
# @pytest.mark.parametrize("estimator", [AODE()]) expected = {
def test_all_estimators(estimator): "requires_positive_X": True,
"requires_positive_y": True,
"preserve_dtype": [np.int32, np.int64],
"requires_y": True,
}
clf = BayesBase(None, True)
computed = clf._more_tags()
for key, value in expected.items():
assert key in computed
assert computed[key] == value
# @pytest.mark.parametrize("estimators", [TAN(), KDB(k=2), AODE()])
@pytest.mark.parametrize("estimators", [AODE()])
def test_all_estimators(estimators):
i = 0 i = 0
for estimator, test in check_estimator(estimator, generate_only=True): for estimator, test in check_estimator(estimators, generate_only=True):
print(i := i + 1, test) print(i := i + 1, test)
# test(estimator) # test(estimator)

32
patch_pgmpy_0.1.22.diff Normal file
View File

@@ -0,0 +1,32 @@
diff --git a/pgmpy/models/BayesianNetwork.py b/pgmpy/models/BayesianNetwork.py
index bd90122d..70ae38f7 100644
--- a/pgmpy/models/BayesianNetwork.py
+++ b/pgmpy/models/BayesianNetwork.py
@@ -27,7 +27,7 @@ class BayesianNetwork(DAG):
Base class for Bayesian Models.
"""
- def __init__(self, ebunch=None, latents=set()):
+ def __init__(self, ebunch=None, latents=set(), show_progress=False):
"""
Initializes a Bayesian Model.
A models stores nodes and edges with conditional probability
@@ -95,6 +95,7 @@ class BayesianNetwork(DAG):
>>> len(G) # number of nodes in graph
3
"""
+ self.show_progress = show_progress
super(BayesianNetwork, self).__init__(ebunch=ebunch, latents=latents)
self.cpds = []
self.cardinalities = defaultdict(int)
@@ -738,7 +739,9 @@ class BayesianNetwork(DAG):
show_progress=False,
)
for index, data_point in tqdm(
- data_unique.iterrows(), total=data_unique.shape[0]
+ data_unique.iterrows(),
+ total=data_unique.shape[0],
+ disable=not self.show_progress,
)
)

View File

@@ -25,6 +25,7 @@ dependencies = [
"pgmpy", "pgmpy",
"networkx", "networkx",
"matplotlib", "matplotlib",
"fimdlp",
] ]
requires-python = ">=3.8" requires-python = ">=3.8"
classifiers = [ classifiers = [

View File

@@ -1,5 +1,6 @@
numpy numpy
scipy scipy
pandas
scikit-learn scikit-learn
matplotlib matplotlib
networkx networkx